In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import time
import datetime
from collections import Counter

# Load the training data; 'Popular' is the prediction target.
df = pd.read_csv('../NYTimesBlogTrain.csv')

y = df['Popular']

# Convert the publication date to a numeric feature: seconds since the Unix
# epoch, divided by a million to get a more readable decimal.
# The timestamp strings are treated as naive wall-clock values, so the result
# does not depend on the timezone/DST rules of the machine running the notebook
# (time.mktime would interpret them as local time).
pub_dates = pd.to_datetime(df['PubDate'], format='%Y-%m-%d %H:%M:%S')
df['PubDate'] = (pub_dates - pd.Timestamp('1970-01-01')).dt.total_seconds() / 1000000

# Replace missing values with the middle entry of the column's most common values.

def nan_to_meaningful(x, col):
    """Return a replacement for a missing cell, or ``x`` itself if it is present.

    Parameters
    ----------
    x : scalar
        The cell value to inspect.
    col : iterable
        All values of the column ``x`` belongs to (may contain missing values).

    Returns
    -------
    ``x`` unchanged when it is not null. Otherwise the string form of the
    middle element of the (up to 8) most common values of ``col`` after
    missing values have been dropped from that list. If no non-null
    candidate exists, ``x`` is returned unchanged instead of raising.

    Notes
    -----
    The replacement depends only on ``col``; callers filling many cells of the
    same column can compute it once and reuse it.
    """
    if not pd.isnull(x):
        return x

    # Rank first, then discard missing entries, so the candidate list is the
    # same as if the missing marker had been removed from the top-8 ranking.
    ranked = [value for value, _count in Counter(col).most_common(8)]
    candidates = [str(value) for value in ranked if not pd.isnull(value)]

    if not candidates:
        # Nothing meaningful to substitute (e.g. the column is entirely null).
        return x

    return candidates[len(candidates) // 2]
        
# Categorical / free-text columns whose missing values get imputed.
my_list = ["NewsDesk","SectionName","SubsectionName","Headline","Snippet","Abstract"]

for f in my_list:
    # The replacement depends only on the column contents, so compute it once
    # per column instead of re-counting the whole column for every missing cell.
    if df[f].isnull().any():
        fill_value = nan_to_meaningful(np.nan, df[f])
        df[f] = df[f].fillna(fill_value)

def string_to_series_of_words(x):
    """Split ``x`` on single spaces and return the pieces as a pandas Series.

    Consecutive spaces produce empty-string entries, because the split is on
    the literal " " separator rather than on arbitrary whitespace.
    """
    words = x.split(" ")
    return pd.Series(words)


# Reduce each free-text column to its first space-delimited token.
# Assigning the multi-column result of `.apply(<function returning a Series>)`
# to a single column is rejected by current pandas; the feature names recorded
# from the earlier run are single tokens, so that run appears to have kept only
# the first word. That behaviour is made explicit here.
# TODO: use a real bag-of-words encoding if every word should become a feature.
for f in my_list[3:]:
    df[f] = df[f].str.split(" ").str[0]


# One-hot encode the categorical columns and the first-token text columns.
df = pd.get_dummies(
    df,
    columns=["NewsDesk", "SectionName", "SubsectionName", "Headline", "Snippet", "Abstract"],
    drop_first=False,
)

# Every remaining column except the target is used as a predictor.
# NOTE(review): this includes 'UniqueID', which looks like a row identifier --
# confirm it is meant to be a feature.
new_features = list(df.columns)
new_features.remove('Popular')

X = df[new_features]

print(new_features)





['WordCount', 'PubDate', 'UniqueID', 'NewsDesk_Business', 'NewsDesk_Culture', 'NewsDesk_Foreign', 'NewsDesk_Magazine', 'NewsDesk_Metro', 'NewsDesk_National', 'NewsDesk_OpEd', 'NewsDesk_Science', 'NewsDesk_Sports', 'NewsDesk_Styles', 'NewsDesk_TStyle', 'NewsDesk_Travel', 'SectionName_Arts', 'SectionName_Business Day', 'SectionName_Crosswords/Games', 'SectionName_Health', 'SectionName_Magazine', 'SectionName_Multimedia', 'SectionName_N.Y. / Region', 'SectionName_Open', 'SectionName_Opinion', 'SectionName_Sports', 'SectionName_Style', 'SectionName_Technology', 'SectionName_Travel', 'SectionName_U.S.', 'SectionName_World', 'SubsectionName_Asia Pacific', 'SubsectionName_Dealbook', 'SubsectionName_Education', 'SubsectionName_Fashion & Style', 'SubsectionName_Politics', 'SubsectionName_Room For Debate', 'SubsectionName_Small Business', 'SubsectionName_The Public Editor', 'Headline_', 'Headline_"An', 'Headline_"Gigi"', 'Headline_"Honey', 'Headline_"Please', 'Headline_$1', 'Headline_$1.5', 'Hea

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 10% of the rows for evaluation; the split is seeded so that the
# later cells score their models on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

# Random forest: 50 bootstrapped trees, entropy split criterion.
rf = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    bootstrap=True,
    random_state=0,
)
rf.fit(X_train, y_train)

# Score on the held-out rows.
y_predict_randF = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_predict_randF)

print("\nAccuracy: ", accuracy)



Accuracy:  0.915902140673


In [21]:
# Single decision tree (entropy criterion) fitted on the same training split.
decTree = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=4,
    random_state=0,
)
decTree.fit(X_train, y_train)

# Score on the held-out rows.
y_predict_decTree = decTree.predict(X_test)
accuracy_decTree = accuracy_score(y_test, y_predict_decTree)

print("\nAccuracy Decision Tree: ", accuracy_decTree)





Accuracy Decision Tree:  0.906727828746


In [22]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble: 52 unpruned trees, entropy split criterion.
etc = ExtraTreesClassifier(
    n_estimators=52,
    criterion='entropy',
    max_depth=None,
    min_samples_split=5,
    random_state=0,
)
etc.fit(X_train, y_train)

# Score on the held-out rows.
predictions_etc = etc.predict(X_test)
accuracy_etc = accuracy_score(y_test, predictions_etc)

print("\nAccuracy ExtraTreesClassifier: ", accuracy_etc)





Accuracy ExtraTreesClassifier:  0.915902140673


In [23]:

# Majority vote over the three classifiers.
# Each model's predictions form one row, so every column is one test sample.
model_outputs = [y_predict_randF, y_predict_decTree, predictions_etc]
combined_predictions = pd.DataFrame(model_outputs)

# DataFrame.mode() works per column; its first row holds the most frequent
# label for each sample (presumably unique with three voters on a two-class
# target -- verify if the number of models or classes changes).
per_sample_mode = combined_predictions.mode()
consolidated_predictions = list(per_sample_mode.values[0])

acc = accuracy_score(y_test, consolidated_predictions)

print("\n Combined Accuracy: ", acc)







 Combined Accuracy:  0.920489296636
