In [3]:
import numpy as np
import pandas as pd
import os, itertools, csv
from bs4 import BeautifulSoup
import re
from pytrends.request import TrendReq
from datetime import *
import time
import nltk

# load the dataset
df = pd.read_csv('./data/train.csv')
df.head()

Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


In [4]:
def get_title(text: str) -> str:    
    soup = BeautifulSoup(text, "html.parser")
    title = soup.find('h1',{'class':'title'})
    return title.get_text()
# test
print(get_title(df['Page content'][0]))

NASA's Grand Challenge: Stop Asteroids From Destroying Earth


In [27]:
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')

#stop = nltk.corpus.stopwords.words('english')
stop = set(stopwords.words("english"))
new_words = ["\'s"] 
stop = stop.union(new_words)

def tokenizer_stem(text: str) -> list:
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    
    porter = PorterStemmer()
    res = [porter.stem(token) for token in tokens if token not in stop]
    
    return " ".join(res).lower()

# test
print(tokenizer_stem(get_title(df['Page content'][0])))
headline = []
for i in range(df.shape[0]):
    headline.append(tokenizer_stem(get_title(df['Page content'][i])))
headline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuwei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yuwei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


nasa grand challeng : stop asteroid from destroy earth


['nasa grand challeng : stop asteroid from destroy earth',
 "googl new open sourc patent pledg : we wo n't sue unless attack first",
 "ballin ' : 2014 nfl draft pick get choos their own walk-out music",
 'cameraperson fail deliv slapstick laugh',
 'nfl star help young fan prove friendship with ador video',
 'the underdog internet provid head washington',
 "6-second vine video is funniest parodi iggi azalea 'fanci '",
 'bill cosbi still plan perform comedi show florida',
 'vend machin use cloud technolog person your purchas',
 "onlin subscrib up , advertis still down 'new york time '",
 'saturn photobomb moon dramat celesti moment',
 "if caricatur artist anim 'mad men '",
 'uber suspend oper nevada rule',
 'appl new spaceship campu could cost $ 5 billion build',
 "women world cup : team usa drawn 'group death '",
 "thi is how 'go-around ' avoid disast airport",
 'nhtsa site will tell you if your car ha been recal',
 "unconvent way open wine bottl , when you 're desper",
 '5 argument aga

In [54]:
from sklearn.model_selection import GridSearchCV, train_test_split
param_C = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]

#clf = LogisticRegression(random_state=0)
pipe3 = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,3))), 
                  ('clf', LogisticRegression())])

# set the param_grid parameter of GridSearchCV to a list of dictionaries
param_grid = [{'clf__C': param_C,
               'clf__solver': ['liblinear']}]
gs = GridSearchCV(estimator=pipe3, 
                  param_grid=param_grid, 
                  scoring='accuracy')

gs = gs.fit(np.array(headline), df['Popularity'])
print(gs.best_score_)
print(gs.best_params_)



0.5270050283977861
{'clf__C': 0.1, 'clf__solver': 'liblinear'}


In [39]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

names = ['LogisticRegression', 
         'LogisticRegression+(1,2)gram',
         'LogisticRegression+preprocess',
         'LogisticRegression+preprocess+hash']
# without preprocessing
pipe1 = Pipeline([('vect', CountVectorizer()), 
                  ('clf', LogisticRegression(solver = "liblinear"))])
# without preprocessing, use larger ngram range
pipe2 = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))), 
                  ('clf', LogisticRegression(solver = "liblinear"))])
# with preprocessing
pipe3 = Pipeline([('vect', TfidfVectorizer()), 
                  ('clf', LogisticRegression(solver = "liblinear"))])
# with preprocessing and hasing
pipe4 = Pipeline([('vect', HashingVectorizer(n_features=2**10)), 
                  ('clf', LogisticRegression(solver = "liblinear"))])

clf_labels = ['LogisticRegression', 
         'LogisticRegression+(1,2)gram',
         'LogisticRegression+preprocess',
         'LogisticRegression+preprocess+hash']
print('[Individual]')
for pipe, label in zip([pipe1, pipe2, pipe3, pipe3], clf_labels):
    scores = cross_val_score(estimator=pipe, X=np.array(headline), y=df['Popularity'], cv=10, scoring='roc_auc')
    print('%s: %.3f (+/- %.3f)' % (label, scores.mean(), scores.std()))

# CV
# print('[auc (10-fold cv)]')
# for name, clf in zip(names, [pipe1, pipe2, pipe3, pipe4]):
#     scores = cross_val_score(estimator=clf, X=np.array(headline), y=df['Popularity'], \
#                          cv=10, scoring='roc_auc')
#     print('%s: %.3f (+/-%.3f)' % (name, scores.mean(), scores.std()))

[Individual]
LogisticRegression: 0.518 (+/- 0.011)
LogisticRegression+(1,2)gram: 0.517 (+/- 0.011)
LogisticRegression+preprocess: 0.524 (+/- 0.010)
LogisticRegression+preprocess+hash: 0.524 (+/- 0.010)


In [42]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier

print('[Voting]')
best_vt, best_w, best_score = None, (), -1
for a, b, c, d in list(itertools.permutations(range(0,4))): # try some weight combination
    clf = VotingClassifier(estimators=[('LogisticRegression', pipe1), ('LogisticRegression+(1,2)gram', pipe2), 
                                       ('LogisticRegression+preprocess', pipe3), ('LogisticRegression+preprocess+hash', pipe4)], 
                           voting='soft', weights=[a,b,c,d])
    scores = cross_val_score(estimator=clf, X=np.array(headline), y=df['Popularity'], cv=10, scoring='roc_auc')
    print('%s: %.3f (+/- %.3f)' % ((a,b,c,d), scores.mean(), scores.std()))
    if best_score < scores.mean():
        best_vt, best_w, best_score = clf, (a, b, c, d), scores.mean()

print('\nBest %s: %.3f' % (best_w, best_score))

[Voting]
(0, 1, 2, 3): 0.523 (+/- 0.009)
(0, 1, 3, 2): 0.524 (+/- 0.010)
(0, 2, 1, 3): 0.522 (+/- 0.010)
(0, 2, 3, 1): 0.522 (+/- 0.010)
(0, 3, 1, 2): 0.520 (+/- 0.010)
(0, 3, 2, 1): 0.520 (+/- 0.011)
(1, 0, 2, 3): 0.523 (+/- 0.009)
(1, 0, 3, 2): 0.524 (+/- 0.010)
(1, 2, 0, 3): 0.521 (+/- 0.010)
(1, 2, 3, 0): 0.521 (+/- 0.011)
(1, 3, 0, 2): 0.519 (+/- 0.010)
(1, 3, 2, 0): 0.519 (+/- 0.011)
(2, 0, 1, 3): 0.522 (+/- 0.010)
(2, 0, 3, 1): 0.522 (+/- 0.010)
(2, 1, 0, 3): 0.521 (+/- 0.010)
(2, 1, 3, 0): 0.521 (+/- 0.011)
(2, 3, 0, 1): 0.519 (+/- 0.011)
(2, 3, 1, 0): 0.519 (+/- 0.011)


KeyboardInterrupt: 