In [200]:
import random

import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

%matplotlib inline
sns.set_style('whitegrid')

pd.set_option('max_colwidth', 300)
pd.set_option('max_columns', 100)

## Combine all parts of data

In [2]:
train_feature_path = '../Datasets/Headlines/guardian_main_model/guardian_train.csv'
train_labels = '../Datasets/Headlines/guardian_popularity_measures/guardian_train_popularity.csv'
train_headline_path = '../Datasets/Headlines/guardian_headlines/headlines-final.csv'

test_feature_path = '../Datasets/Headlines/guardian_main_model/guardian_test.csv'
test_labels = '../Datasets/Headlines/guardian_popularity_measures/guardian_test_popularity.csv'
test_headline_path = '../Datasets/Headlines/guardian_headlines/headlines_test-final_clean.csv'

In [3]:
def read_headline_file(path):
    return pd.read_csv(
        path, 
        sep='|', 
        names=['article_id', 'section', 'publish_date', 'title']
    )

In [4]:
def combine_data(ft_path, ll_path, hl_path):
    ft = pd.read_csv(ft_path)
    ll = pd.read_csv(ll_path)
    hl = read_headline_file(hl_path)
    
    df = (
        ft
        .merge(ll, on=['article_id'])
        .merge(hl, on=['article_id'])
    )
    return df

In [5]:
train = combine_data(train_feature_path, train_labels, train_headline_path)
test = combine_data(test_feature_path, test_labels, test_headline_path)

In [6]:
print(train.shape)
print(test.shape)

(9814, 224)
(10753, 224)


-------------

## Get Mashable Headlines data for training model

In [7]:
nlp = spacy.load('en_core_web_sm')

In [2]:
mash_data_path = '../Datasets/OnlineNewsPopularity-Mashable/mashable_all_features_with_text.csv'

In [146]:
hl = pd.read_csv(mash_data_path)[['title', 'shares']].dropna()

In [148]:
ind = random.randint(0, len(hl)-30)
hl.iloc[ind:ind+20, :]

Unnamed: 0,title,shares
26340,What We Learned From Apple WWDC 2014,400
26341,Artist Prints Van Gogh's Ear From Family DNA,7700
26342,15 Gorgeous Vacation Spots Proving Heaven Is a Place on Earth,1900
26343,This Is the Man Who Made Grumpy Cat Rich,1000
26344,'Security Concerns' Halt Bowe Bergdahl's Hometown Celebration,935
26345,'Bricksy' Reimagines Banksy's Artwork in Lego Form,1400
26346,"Yes, You Should Care About Reaching Profitability",4300
26347,An 'Honest' Take on Your Candy Crush Addiction,1400
26348,"Chinese State Media: Google, Facebook, Yahoo Are U.S. 'Pawns'",2300
26349,The Undeniable Comfort of Dogs Goes on the Road,1500


---------------

## Feature engineering
Syntactic features:
1. Total tokens
1. Average token length
1. Ratio of title cased tokens
1. Ratio of upper cased tokens
1. Presence of exclamation
1. Presence of question mark
1. Presence of quote marks



Symantic features:
1. Three consecutive nouns
1. Noun percentage
1. Verb percentage
1. Proper noun percentage
1. Adverb percentage
1. Adjective percentage
1. Count of non stop-words


Smarter features:
1. Action words (do, stop, fight etc.
1. Urgency words (immediate, now, time etc)
1. Pantheon, Wikipedia page view features

### Syntactic features

In [149]:
def get_syntactic_features(df, hl_col, nlp):
    
    def total_tokens(doc):
        return len(doc)
    
    def avg_token_len(doc):
        return np.mean(np.array([len(x) for x in doc]))
    
    def title_case_tokens(doc):
        return len([x for x in doc if x.shape_[0] == 'X'])/len(doc)
    
    def upper_case_tokens(doc):
        return len([x for x in doc if x.text.isupper()])/len(doc)
    
    def exclamation_token(doc):
        return '!' in doc.text
    
    def question_mark_token(doc):
        return '?' in doc.text
    
    def quote_mark_token(doc):
        return ("'" in doc.text) or ('"' in doc.text)
    
    def master_loop(doc):
        return [
            total_tokens(doc),
            avg_token_len(doc),
            title_case_tokens(doc),
            upper_case_tokens(doc),
            exclamation_token(doc),
            question_mark_token(doc),
            quote_mark_token(doc)
        ]
        
    ndf = (
        df
        .assign(
            ans_col = lambda x: x.apply(lambda y: master_loop(nlp(y[hl_col])), axis=1),
        )
        .assign(
            total_tokens = lambda x: (x['ans_col'].str[0]),
            avg_token_len = lambda x: (x['ans_col'].str[1]),
            title_case_tokens = lambda x: (x['ans_col'].str[2]),
            upper_case_tokens = lambda x: (x['ans_col'].str[3]),
            exclamation_token = lambda x: x['ans_col'].str[4],
            question_mark_token = lambda x: x['ans_col'].str[5],
            quote_mark_token = lambda x: x['ans_col'].str[6],
        )
        .drop(['ans_col'], axis=1)
    )
    return ndf

### Semantic features

In [150]:
def get_semantic_features(df, hl_col, nlp):
    
    def get_semantic_features_sentence(doc):
        pos_list = [x.pos_ for x in doc]
        
        def is_noun(pos):
                return (pos == 'PROPN') or (pos == 'NOUN')
        
        def three_consec_nouns(pos_list):
            nouns = [is_noun(pos) for pos in pos_list]
            for i in range(len(nouns) - 3):
                if nouns[i: i+3] == [True, True, True]:
                    return True
            return False
        
        def noun_percentage(pos_list):
            nouns = [is_noun(pos) for pos in pos_list]
            return float(np.mean(np.array(nouns)))
        
        def proper_noun_percentage(pos_list):
            return len([x for x in pos_list if x == 'PROPN'])/len(pos_list)
        
        def verb_percentage(pos_list):
            return len([x for x in pos_list if x == 'VERB'])/len(pos_list)
        
        def adverb_percentage(pos_list):
            return len([x for x in pos_list if x == 'ADV'])/len(pos_list)
        
        def adjective_percentage(pos_list):
            return len([x for x in pos_list if x == 'ADJ'])/len(pos_list)
        
        def interjection(pos_list):
            return 'INTJ' in pos_list
        
        def non_stop_percentage(doc):
            return len([x for x in doc if not x.is_stop])/len(doc)
        
        return [
            three_consec_nouns(pos_list), 
            noun_percentage(pos_list),
            proper_noun_percentage(pos_list),
            verb_percentage(pos_list), 
            adverb_percentage(pos_list),
            adjective_percentage(pos_list),
            interjection(pos_list),
            non_stop_percentage(doc)
        ]
    
    ndf = (
        df
        .assign(
            ans_col = lambda x: x.apply(
                lambda y: get_semantic_features_sentence(nlp(y[hl_col])), 
                axis=1
            ),
        )
        .assign(
            three_consec_nouns = lambda x: (x['ans_col'].str[0]),
            noun_percentage = lambda x: (x['ans_col'].str[1]),
            proper_noun_percentage = lambda x: (x['ans_col'].str[2]),
            verb_percentage = lambda x: (x['ans_col'].str[3]),
            adverb_percentage = lambda x: x['ans_col'].str[4],
            adjective_percentage = lambda x: x['ans_col'].str[5],
            interjection = lambda x: x['ans_col'].str[6],
            non_stop_percentage = lambda x: x['ans_col'].str[7],
        )
        .drop(['ans_col'], axis=1)
    )
    return ndf

In [151]:
%%time
print('Calculating syntactic features...')
syntactic_feat = get_syntactic_features(hl, 'title', nlp)
print('Calculating semantic features...')
semantic_feat = get_semantic_features(hl, 'title', nlp)
print('Joining...')
all_headline_feat = syntactic_feat.merge(semantic_feat, on=['title', 'shares'])
print('Done.')

Calculating syntactic features...
Calculating semantic features...
Joining...
Done.
CPU times: user 9min 30s, sys: 166 ms, total: 9min 30s
Wall time: 9min 30s


In [153]:
all_headline_feat.to_csv('../Datasets/OnlineNewsPopularity-Mashable/all_text_features.csv', index=False)

## Training

In [172]:
all_headline_feat['popular'] = all_headline_feat['shares'] > 1400

In [175]:
syntactic_features = ['total_tokens', 'avg_token_len', 'title_case_tokens', 
                      'upper_case_tokens', 'exclamation_token', 'question_mark_token', 
                      'quote_mark_token']
semantic_features = ['three_consec_nouns', 'noun_percentage', 'proper_noun_percentage', 
                     'verb_percentage' ,'adverb_percentage', 'adjective_percentage', 
                     'interjection', 'non_stop_percentage']

final_features = syntactic_features + semantic_features
label = 'popular'

In [184]:
X = all_headline_feat[final_features].values
y = all_headline_feat[label].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [207]:
clf = RandomForestClassifier(
    n_estimators=400, 
    max_depth=5, 
)

In [208]:
%%time
# Train
clf.fit(X_train, y_train)

  


CPU times: user 3.82 s, sys: 12 ms, total: 3.83 s
Wall time: 3.83 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [209]:
pd.Series(data=clf.feature_importances_, index=final_features).sort_values(ascending=False)

title_case_tokens         0.197324
proper_noun_percentage    0.115969
noun_percentage           0.115965
avg_token_len             0.102659
non_stop_percentage       0.094424
total_tokens              0.075347
upper_case_tokens         0.075122
verb_percentage           0.062522
adjective_percentage      0.062492
adverb_percentage         0.051490
three_consec_nouns        0.019450
question_mark_token       0.008628
quote_mark_token          0.008515
interjection              0.006021
exclamation_token         0.004072
dtype: float64

In [210]:
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

acc_train = accuracy_score(pred_train, y_train)
acc_test = accuracy_score(pred_test, y_test)
print(acc_train, acc_test)

0.5431876328780945 0.526568017487809
