In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import time
import datetime
from collections import Counter

df = pd.read_csv('../NYTimesBlogTrain.csv')
test_df = pd.read_csv('../NYTimesBlogTest.csv')

y = df['Popular'] 

# convert date to timestamp (milliseconds, divide by a million to get a more readable decimal)
df['PubDate'] = df['PubDate'].apply(lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S'))/1000000)
test_df['PubDate'] = test_df['PubDate'].apply(lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S'))/1000000)

#replace empty values with the middle most common word
def nan_to_meaningful(x, col):
    if pd.isnull(x):
        mc = Counter(col).most_common(8)
        mc = [str(i[0]) for i in mc]
        if 'nan' in mc:
            mc.remove('nan')
        rep = mc.pop((len(mc))//2)
        return rep
        
    return x
        
my_list = ["NewsDesk","SectionName","SubsectionName","Headline","Snippet","Abstract"]

for f in my_list: 
    df[f] = df[f].apply(nan_to_meaningful, args=(df[f],))
    test_df[f] = test_df[f].apply(nan_to_meaningful, args=(test_df[f],))

# split sentences into series of words so each word can be a column
def string_to_series_of_words(x):    
    return pd.Series(x.split(" "))

# only apply it to the last 3 features which are sentences, index 3 to last
for f in my_list[3:]:
    df[f] = df[f].apply(string_to_series_of_words)
    test_df[f] = test_df[f].apply(string_to_series_of_words)


# for both testing/training data generate columns in the form of feature_word so that each word is a column
df=pd.get_dummies(df,columns=["NewsDesk","SectionName","SubsectionName", "Headline","Snippet","Abstract"],
                  drop_first=False)

test_df=pd.get_dummies(test_df,columns=["NewsDesk","SectionName","SubsectionName", "Headline","Snippet","Abstract"],
                  drop_first=False)

# we now have new features added, get them and remove the label
new_features = list(df.dtypes.index)
new_features.remove('Popular')

#do the same for the testing data
new_features_test = list(test_df.dtypes.index)

# Make sure both the training and testing dataframe have the same number of columns/features
# add the columns not in the other with all zeros

for f in new_features:
    if f not in new_features_test:
        test_df[f] = 0
        
new_features_test = list(test_df.dtypes.index)
        
for g in new_features_test:
    if g not in new_features:
        df[g] = 0
        
new_features = list(df.dtypes.index)
new_features.remove('Popular')
        
X = df[new_features]
Y = test_df[new_features_test]

#make sure at the end the testing and training dataframe have the same number of features
print(len(new_features))
print(len(new_features_test))

X.head()

print("Done with data munging...")

6549
6549
Done with data munging...


In [28]:
from sklearn.ensemble import RandomForestClassifier

rf =  RandomForestClassifier(n_estimators = 50, bootstrap = True, random_state=0, criterion='entropy')
rf.fit(X, y)

y_predict_randF = rf.predict(Y)

np.set_printoptions(threshold=np.inf)

print(np.array(y_predict_randF))



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [23]:
decTree =  DecisionTreeClassifier(criterion='entropy', min_samples_split=4, random_state=0)
decTree.fit(X, y)
y_predict_decTree = decTree.predict(Y)

print(np.array(y_predict_decTree))



[0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0
 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1
 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 1 0 0 

In [26]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=52, max_depth=None, min_samples_split=5, random_state=0, criterion='entropy')
etc.fit(X,y)

predictions_etc = etc.predict(Y)

print(np.array(predictions_etc))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 