# Import librarys

In [58]:
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import re
from textstat.textstat import textstat
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import math
import numpy as np
nltk.download(['wordnet', 'punkt', 'stopwords'])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/newscred/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/newscred/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/newscred/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
import gensim
import spacy

In [30]:
nlp = spacy.load('en')                 # You are here.
doc = nlp(u'Hello, spacy!')            # See "Using the pipeline"
print([(w.text, w.pos_) for w in doc]) # See "Doc, Span and Token"

[(u'Hello', u'INTJ'), (u',', u'PUNCT'), (u'spacy', u'NOUN'), (u'!', u'PUNCT')]


# Read data

In [31]:
data_df = pd.read_csv('./all_data.csv', index_col=0)
data_df = data_df.reset_index(drop=True)
data_df.head()

Unnamed: 0,title,text,guid,pageviews,engaged_time,avg_engaged_time,type
0,Marc Jacobs Starred in His Very Own 'Zoolander...,<p>Budding auteur Marc Jacobs decided to make ...,ad1e08c60ed4eb7775fe3cd2b6f68398,19.0,100000.0,10000.0,licensed
1,11 Ways to Sneak In Extra Steps on Your Fitnes...,"<p><a href=""http://www.purewow.com/wellness/Ho...",a68a9f01ec8b695a81817e5250d61c26,94.0,3850000.0,192500.0,licensed
2,Jaden Smith Stars in New Louis Vuitton Ads,<p>Nicolas Ghesqui&#232;re's Instagram feed ap...,bed7658b06ad4e03ebd4e0cb66595e5e,9.0,55000.0,9166.6667,licensed
3,Here's How to Walk Better in Heels,"<p>We&#8217;ve taught you <a href=""http://www....",e5d8d5f905d6b1e5c1f2394a4600a436,21.0,580000.0,30526.3158,licensed
4,Prepare to Glow Like Beyonc� in Your Next Selfie,<p>If you thought it was impossible to snap a ...,324b81625a37e7b414b0a11bdd1884f5,30.0,380000.0,25333.3333,original


In [32]:
data_df.shape

(57866, 7)

# Raw text processing

In [33]:
def preprocess_text(text, stem=False, lemmize=True):
    text = get_text(text)
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    preprocessed_words = []

    for word in words:
        if len(word) > 2 and word not in stopwords.words('english'):
            if lemmize:
                word = lemmatizer.lemmatize(word)                
            if stem:
                word = stemmer.stem(word)                
            preprocessed_words.append(word)    
    return ' '.join(preprocessed_words)

def preprocess_texts(df, column, stem=False, lemmize=True):
    count = 0
    preprocessed = []
    for text in df[column]:
        count += 1
        text = preprocess_text(text, stem, lemmize)
        preprocessed.append(text)
        if count % 100 == 0:
            print 'count: ', count
        percent = float(count)/float(len(df))*100
        if percent % 10 == 0.0:
            print "For column {}: text {} of {} processed ({}%)".format(column, count, len(df), round(percent,2))
        
    return preprocessed

In [34]:
def get_text(html):
    text = BeautifulSoup(str(html), 'lxml').get_text().encode('ascii', 'ignore')
    text = re.sub('(\\n){1,}', ' ', text)
    text = text.lower()
    return text

In [35]:
data_df['clean_text'] = [get_text(html) for html in data_df['text']]
data_df.head()

Unnamed: 0,title,text,guid,pageviews,engaged_time,avg_engaged_time,type,clean_text
0,Marc Jacobs Starred in His Very Own 'Zoolander...,<p>Budding auteur Marc Jacobs decided to make ...,ad1e08c60ed4eb7775fe3cd2b6f68398,19.0,100000.0,10000.0,licensed,budding auteur marc jacobs decided to make a l...
1,11 Ways to Sneak In Extra Steps on Your Fitnes...,"<p><a href=""http://www.purewow.com/wellness/Ho...",a68a9f01ec8b695a81817e5250d61c26,94.0,3850000.0,192500.0,licensed,you guys: a recent study says that adding an ...
2,Jaden Smith Stars in New Louis Vuitton Ads,<p>Nicolas Ghesqui&#232;re's Instagram feed ap...,bed7658b06ad4e03ebd4e0cb66595e5e,9.0,55000.0,9166.6667,licensed,nicolas ghesquire's instagram feed appears to ...
3,Here's How to Walk Better in Heels,"<p>We&#8217;ve taught you <a href=""http://www....",e5d8d5f905d6b1e5c1f2394a4600a436,21.0,580000.0,30526.3158,licensed,weve taught you how to make your heels more co...
4,Prepare to Glow Like Beyonc� in Your Next Selfie,<p>If you thought it was impossible to snap a ...,324b81625a37e7b414b0a11bdd1884f5,30.0,380000.0,25333.3333,original,if you thought it was impossible to snap a cel...


In [36]:
len(data_df[data_df['clean_text'] == ''])

80

# Feature calculation

### statistical features

In [37]:
df = data_df[data_df['clean_text'].apply(textstat.lexicon_count) != 0]

In [38]:
def textstat_stats(text):
    difficulty = textstat.flesch_reading_ease(text)
    grade_difficulty = textstat.flesch_kincaid_grade(text)
    gfog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)
    lwf = textstat.linsear_write_formula(text)
    dcrs = textstat.dale_chall_readability_score(text)
    return pd.Series([difficulty, grade_difficulty, gfog, smog, ari, cli, lwf, dcrs])

In [39]:
df[['difficulty', 'grade_difficulty','gfog','smog','ari','cli','lwf','dcrs']] = df['clean_text'].apply(textstat_stats)

In [97]:
df.head()
# df.to_csv('all_data_with_stats.csv')

Unnamed: 0,title,text,guid,pageviews,engaged_time,avg_engaged_time,type,clean_text,difficulty,grade_difficulty,...,text_h1_count,text_h2_count,text_h3_count,text_h4_count,text_h5_count,text_h6_count,text_bold_count,text_italic_count,text_table_count,_text_table_count
0,Marc Jacobs Starred in His Very Own 'Zoolander...,<p>Budding auteur Marc Jacobs decided to make ...,ad1e08c60ed4eb7775fe3cd2b6f68398,19.0,100000.0,10000.0,licensed,budding auteur marc jacobs decided to make a l...,46.78,10.7,...,0,0,0,0,0,0,0,0,0,0
1,11 Ways to Sneak In Extra Steps on Your Fitnes...,"<p><a href=""http://www.purewow.com/wellness/Ho...",a68a9f01ec8b695a81817e5250d61c26,94.0,3850000.0,192500.0,licensed,you guys: a recent study says that adding an ...,82.65,5.2,...,0,0,0,0,0,0,0,0,0,0
2,Jaden Smith Stars in New Louis Vuitton Ads,<p>Nicolas Ghesqui&#232;re's Instagram feed ap...,bed7658b06ad4e03ebd4e0cb66595e5e,9.0,55000.0,9166.6667,licensed,nicolas ghesquire's instagram feed appears to ...,52.19,10.7,...,0,0,0,0,0,0,0,0,0,0
3,Here's How to Walk Better in Heels,"<p>We&#8217;ve taught you <a href=""http://www....",e5d8d5f905d6b1e5c1f2394a4600a436,21.0,580000.0,30526.3158,licensed,weve taught you how to make your heels more co...,73.17,6.8,...,0,0,0,0,0,0,0,0,0,0
4,Prepare to Glow Like Beyonc� in Your Next Selfie,<p>If you thought it was impossible to snap a ...,324b81625a37e7b414b0a11bdd1884f5,30.0,380000.0,25333.3333,original,if you thought it was impossible to snap a cel...,63.7,8.4,...,0,0,0,0,0,0,0,0,0,0


In [98]:
df.shape

(57758, 30)

### text features

In [80]:
def html_tag_features(text, tag_name):
    if isinstance(text, float):
        return 0
    soup = BeautifulSoup(text, 'lxml')
    sz = len(soup.find_all(tag_name))
    return sz

In [81]:
text_img_count = [html_tag_features(x, 'img') for x in df['text']]
text_vid_count = [html_tag_features(x, 'video') for x in df['text']]
text_link_count = [html_tag_features(x, 'a') for x in df['text']]
text_par_count = [html_tag_features(x, 'p') for x in df['text']]
text_h1_count = [html_tag_features(x, 'h1') for x in df['text']]
text_h2_count = [html_tag_features(x, 'h2') for x in df['text']]
text_h3_count = [html_tag_features(x, 'h3') for x in df['text']]
text_h4_count = [html_tag_features(x, 'h4') for x in df['text']]
text_h5_count = [html_tag_features(x, 'h5') for x in df['text']]
text_h6_count = [html_tag_features(x, 'h6') for x in df['text']]
text_bold_count = [html_tag_features(x, 'b') for x in df['text']]
text_italic_count = [html_tag_features(x, 'i') for x in df['text']]
text_table_count = [html_tag_features(x, 'table') for x in df['text']]

In [92]:
data = {'text_img_count': text_img_count ,
        'text_vid_count': text_vid_count,
        'text_link_count': text_link_count,
        'text_par_count': text_par_count,
        'text_h1_count': text_h1_count,
        'text_h2_count': text_h2_count,
        'text_h3_count': text_h3_count,
        'text_h4_count': text_h4_count,
        'text_h5_count': text_h5_count,
        'text_h6_count': text_h6_count,
        'text_bold_count': text_bold_count,
        'text_italic_count': text_italic_count,
        'text_table_count': text_table_count
    }

df1 = pd.DataFrame(data)

In [106]:
df1.head()

Unnamed: 0,text_bold_count,text_h1_count,text_h2_count,text_h3_count,text_h4_count,text_h5_count,text_h6_count,text_img_count,text_italic_count,text_link_count,text_par_count,text_table_count,text_vid_count
0,0,0,0,0,0,0,0,0,0,4,4,0,0
1,0,0,11,0,0,0,0,12,0,3,27,0,0
2,0,0,0,0,0,0,0,0,0,5,3,0,0
3,0,0,5,0,0,0,0,5,0,4,13,0,0
4,0,0,0,0,0,0,0,0,0,3,7,0,0


In [107]:
df1.shape

(57758, 13)

In [104]:
result = pd.concat([df['guid'], df1], axis=1)

In [105]:
result.shape

(57866, 14)

In [None]:
result