In [1]:
import random

import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

%matplotlib inline
sns.set_style('whitegrid')

pd.set_option('max_colwidth', 300)
pd.set_option('max_columns', 100)

In [2]:
nlp = spacy.load('en_core_web_md')

## Get Mashable Headlines data for training model

In [92]:
mash_data_path = '../Datasets/OnlineNewsPopularity-Mashable/mashable_all_features_with_text.csv'

In [93]:
hl = pd.read_csv(mash_data_path)[['title', 'shares']].dropna()

In [94]:
ind = random.randint(0, len(hl)-30)
hl.iloc[ind:ind+20, :]

Unnamed: 0,title,shares
36367,Why is there a giant pink condom in a Sydney park?,1000
36368,Gordon Hayward backs up his trash talk by swatting LeBron James,1200
36369,"HTC Re camera is bold, but forgets that image is everything",5900
36370,Poppin' bottles: Jay Z buys part of 'Ace of Spades' champagne brand,828
36371,John Lewis unveils new Christmas ad,866
36372,Rare Steinbeck WWII story finally published,942
36373,Los Angeles art museum receives $500 million in rare paintings,952
36374,"So, uh, what's up with LeBron James and the languid Cavaliers?",2400
36375,Lorde's dancing does not disappoint in 'Yellow Flicker Beat' video,853
36376,How a TV show from the '80s shaped today's leisure cruise industry,3200


---------------

## Feature engineering
Syntactic features:
1. Total tokens
1. Average token length
1. Ratio of title cased tokens
1. Ratio of upper cased tokens
1. Presence of exclamation
1. Presence of question mark
1. Presence of quote marks



Symantic features:
1. Three consecutive nouns
1. Noun percentage
1. Verb percentage
1. Proper noun percentage
1. Adverb percentage
1. Adjective percentage
1. Interjection percentage
1. Count of non stop-words


Smarter features:
1. Pantheon, Wikipedia page view features
1. Named entity one hot encoding and using best features among all
1. Action words (do, stop, fight etc.)
1. Urgency words (immediate, now, time etc)
1. Hypernym, Hyponym one hot encoding and using best features among all

### Syntactic features

In [149]:
def get_syntactic_features(df, hl_col, nlp):
    
    def total_tokens(doc):
        return len(doc)
    
    def avg_token_len(doc):
        return np.mean(np.array([len(x) for x in doc]))
    
    def title_case_tokens(doc):
        return len([x for x in doc if x.shape_[0] == 'X'])/len(doc)
    
    def upper_case_tokens(doc):
        return len([x for x in doc if x.text.isupper()])/len(doc)
    
    def exclamation_token(doc):
        return '!' in doc.text
    
    def question_mark_token(doc):
        return '?' in doc.text
    
    def quote_mark_token(doc):
        return ("'" in doc.text) or ('"' in doc.text)
    
    def master_loop(doc):
        return [
            total_tokens(doc),
            avg_token_len(doc),
            title_case_tokens(doc),
            upper_case_tokens(doc),
            exclamation_token(doc),
            question_mark_token(doc),
            quote_mark_token(doc)
        ]
        
    ndf = (
        df
        .assign(
            ans_col = lambda x: x.apply(lambda y: master_loop(nlp(y[hl_col])), axis=1),
        )
        .assign(
            total_tokens = lambda x: (x['ans_col'].str[0]),
            avg_token_len = lambda x: (x['ans_col'].str[1]),
            title_case_tokens = lambda x: (x['ans_col'].str[2]),
            upper_case_tokens = lambda x: (x['ans_col'].str[3]),
            exclamation_token = lambda x: x['ans_col'].str[4],
            question_mark_token = lambda x: x['ans_col'].str[5],
            quote_mark_token = lambda x: x['ans_col'].str[6],
        )
        .drop(['ans_col'], axis=1)
    )
    return ndf

### Semantic features

In [150]:
def get_semantic_features(df, hl_col, nlp):
    
    def get_semantic_features_sentence(doc):
        pos_list = [x.pos_ for x in doc]
        
        def is_noun(pos):
                return (pos == 'PROPN') or (pos == 'NOUN')
        
        def three_consec_nouns(pos_list):
            nouns = [is_noun(pos) for pos in pos_list]
            for i in range(len(nouns) - 3):
                if nouns[i: i+3] == [True, True, True]:
                    return True
            return False
        
        def noun_percentage(pos_list):
            nouns = [is_noun(pos) for pos in pos_list]
            return float(np.mean(np.array(nouns)))
        
        def proper_noun_percentage(pos_list):
            return len([x for x in pos_list if x == 'PROPN'])/len(pos_list)
        
        def verb_percentage(pos_list):
            return len([x for x in pos_list if x == 'VERB'])/len(pos_list)
        
        def adverb_percentage(pos_list):
            return len([x for x in pos_list if x == 'ADV'])/len(pos_list)
        
        def adjective_percentage(pos_list):
            return len([x for x in pos_list if x == 'ADJ'])/len(pos_list)
        
        def interjection(pos_list):
            return 'INTJ' in pos_list
        
        def non_stop_percentage(doc):
            return len([x for x in doc if not x.is_stop])/len(doc)
        
        return [
            three_consec_nouns(pos_list), 
            noun_percentage(pos_list),
            proper_noun_percentage(pos_list),
            verb_percentage(pos_list), 
            adverb_percentage(pos_list),
            adjective_percentage(pos_list),
            interjection(pos_list),
            non_stop_percentage(doc)
        ]
    
    ndf = (
        df
        .assign(
            ans_col = lambda x: x.apply(
                lambda y: get_semantic_features_sentence(nlp(y[hl_col])), 
                axis=1
            ),
        )
        .assign(
            three_consec_nouns = lambda x: (x['ans_col'].str[0]),
            noun_percentage = lambda x: (x['ans_col'].str[1]),
            proper_noun_percentage = lambda x: (x['ans_col'].str[2]),
            verb_percentage = lambda x: (x['ans_col'].str[3]),
            adverb_percentage = lambda x: x['ans_col'].str[4],
            adjective_percentage = lambda x: x['ans_col'].str[5],
            interjection = lambda x: x['ans_col'].str[6],
            non_stop_percentage = lambda x: x['ans_col'].str[7],
        )
        .drop(['ans_col'], axis=1)
    )
    return ndf

In [151]:
%%time
print('Calculating syntactic features...')
syntactic_feat = get_syntactic_features(hl, 'title', nlp)
print('Calculating semantic features...')
semantic_feat = get_semantic_features(hl, 'title', nlp)
print('Joining...')
all_headline_feat = syntactic_feat.merge(semantic_feat, on=['title', 'shares'])
print('Done.')

Calculating syntactic features...
Calculating semantic features...
Joining...
Done.
CPU times: user 9min 30s, sys: 166 ms, total: 9min 30s
Wall time: 9min 30s


In [153]:
all_headline_feat.to_csv('../Datasets/OnlineNewsPopularity-Mashable/all_text_features.csv', index=False)

## Smarter features

In [3]:
# Load previous features
hl_feat = pd.read_csv('../Datasets/OnlineNewsPopularity-Mashable/all_text_features.csv')

### Wikipedia page views

In [4]:
wiki_dataset_path = '../Datasets/Pageviews/pageviews_2008-2013.tsv'

In [5]:
include_cols_wiki = ['name', 'countryCode3', 'gender', 'occupation', 'industry', 'domain']
year_cols_wiki = ['2012-01', '2012-02', '2012-03', '2012-04', '2012-05', '2012-06', 
                  '2012-07', '2012-08', '2012-09', '2012-10', '2012-11', '2012-12']

wiki = (
    pd.read_csv(wiki_dataset_path, sep='\t')
    .loc[lambda x: x['lang'] == 'en']
    [include_cols_wiki + year_cols_wiki]
)

wiki['views'] = 0
for col in year_cols_wiki:
    wiki['views'] = wiki['views'] + wiki[col]
    
wiki = wiki.drop(year_cols_wiki, axis=1)
wiki['views'] = (wiki['views'] - wiki['views'].min())/(wiki['views'].max() - wiki['views'].min())
wiki['name'] = wiki['name'].str.lower()

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
wiki.head()

Unnamed: 0,name,countryCode3,gender,occupation,industry,domain,views
30,abraham lincoln,USA,Male,POLITICIAN,GOVERNMENT,INSTITUTIONS,0.687415
160,aristotle,GRC,Male,PHILOSOPHER,PHILOSOPHY,HUMANITIES,0.212682
293,ayn rand,RUS,Female,WRITER,LANGUAGE,HUMANITIES,0.196001
354,andre agassi,USA,Male,TENNIS PLAYER,INDIVIDUAL SPORTS,SPORTS,0.085859
419,aldous huxley,GBR,Male,WRITER,LANGUAGE,HUMANITIES,0.079505


In [284]:
wiki_dict = wiki[['name', 'views']].set_index('name')['views'].to_dict()

In [314]:
def get_ngram_wiki_score(doc, wiki_dict=wiki_dict):
    unigrams = [x.text.lower() for x in doc if (x.is_alpha or x.is_digit) and not (x.is_stop or len(x) < 2)]
    bigrams = [' '.join([unigrams[i], unigrams[i+1]]) for i in range(len(unigrams)-1)]
    entities = [x.text.lower() for x in doc.ents]
    search_grams = unigrams + bigrams + entities

    score = 0
    for gram in search_grams:
        try:
            matches = wiki_dict.get(gram)
            if matches is None:
                gram_score = 0
            else:
                gram_score = np.array(matches).mean()
        except ValueError:
            gram_score = 0
        
        score += gram_score
    return score

In [349]:
%%time
wiki_score = []

for tmpdoc in nlp.pipe(hl_feat['title']):
    wiki_score.append(get_ngram_wiki_score(tmpdoc))
    
hl_feat['wiki_score'] = wiki_score

CPU times: user 3min 28s, sys: 4.04 s, total: 3min 32s
Wall time: 55.7 s


In [353]:
hl_feat.head(1)

Unnamed: 0,title,shares,total_tokens,avg_token_len,title_case_tokens,upper_case_tokens,exclamation_token,question_mark_token,quote_mark_token,three_consec_nouns,noun_percentage,proper_noun_percentage,verb_percentage,adverb_percentage,adjective_percentage,interjection,non_stop_percentage,wiki_score
0,Amazon's Streaming Video Library Now a Little Easier to Navigate,593,11,5.0,0.727273,0.0,False,False,True,True,0.363636,0.363636,0.090909,0.090909,0.181818,False,0.636364,0.0


## Training

In [214]:
# Paths
X_train_path = '../Datasets/OnlineNewsPopularity-Mashable/all_text_feat_X_train.csv'
y_train_path = '../Datasets/OnlineNewsPopularity-Mashable/all_text_feat_y_train.csv'
X_test_path = '../Datasets/OnlineNewsPopularity-Mashable/all_text_feat_X_test.csv'
y_test_path = '../Datasets/OnlineNewsPopularity-Mashable/all_text_feat_y_test.csv'

In [172]:
all_headline_feat['popular'] = all_headline_feat['shares'] > 1400

In [175]:
syntactic_features = ['total_tokens', 'avg_token_len', 'title_case_tokens', 
                      'upper_case_tokens', 'exclamation_token', 'question_mark_token', 
                      'quote_mark_token']
semantic_features = ['three_consec_nouns', 'noun_percentage', 'proper_noun_percentage', 
                     'verb_percentage' ,'adverb_percentage', 'adjective_percentage', 
                     'interjection', 'non_stop_percentage']

final_features = syntactic_features + semantic_features
label = 'popular'

In [215]:
X = all_headline_feat[final_features]
y = all_headline_feat[label]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train.to_csv(X_train_path, index=False)
X_test.to_csv(X_test_path, index=False)
y_train.to_csv(y_train_path, index=False)
y_test.to_csv(y_test_path, index=False)

In [217]:
hl_X_train, hl_X_test, hl_y_train, hl_y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [221]:
clf = RandomForestClassifier(
    n_estimators=400, 
    max_depth=5, 
)

In [222]:
%%time
# Train
clf.fit(hl_X_train, hl_y_train)

  


CPU times: user 3.58 s, sys: 7.96 ms, total: 3.58 s
Wall time: 3.58 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [223]:
pd.Series(data=clf.feature_importances_, index=final_features).sort_values(ascending=False)

title_case_tokens         0.184983
noun_percentage           0.125702
avg_token_len             0.121856
proper_noun_percentage    0.116707
non_stop_percentage       0.085985
total_tokens              0.072720
upper_case_tokens         0.069588
verb_percentage           0.066584
adjective_percentage      0.058950
adverb_percentage         0.050194
three_consec_nouns        0.017954
quote_mark_token          0.011998
question_mark_token       0.006688
exclamation_token         0.005226
interjection              0.004865
dtype: float64

In [224]:
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

acc_train = accuracy_score(pred_train, y_train)
acc_test = accuracy_score(pred_test, y_test)
print(acc_train, acc_test)

0.5416876870816938 0.5240112994350282
