In [77]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import numpy as np
import pandas as pd
import my_globals
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from utils import get_sub_featured_datasets


In [78]:
DATA_PATH = "/".join([my_globals.DATA_DIR, my_globals.MAIN_DATA_NAME])
data = get_sub_featured_datasets(size = 10000, random_seed=4)
data.head(5)



Unnamed: 0,index,target,ids,date,user,text,weekday_Mon,weekday_Tue,weekday_Wed,weekday_Thu,...,tfidf_ðºð¾,tfidf_ðºð¾ñ,tfidf_ð¼ð,tfidf_ð¼ðµð½ñ,tfidf_ð¼ð¾ð¼ðµð½ñ,tfidf_ð½ðµ,tfidf_ð¾,tfidf_ð¾ð,tfidf_ð¾ð²ð,tfidf_ð¾ñ
0,199234,0,1971388984,Sat May 30 06:59:43 PDT 2009,spydie,@beckmermaid sorry to hear that,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,562522,0,2205682694,Wed Jun 17 05:14:30 PDT 2009,mcgheee,someone tap us an oasis ticket man,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,657457,0,2241033139,Fri Jun 19 10:27:37 PDT 2009,thedoosra,I don't wanna go alone to the community cookou...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,622900,0,2229371850,Thu Jun 18 15:38:27 PDT 2009,donnadodd,OMG I just cried so much watching grey's anatomy,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1455522,4,2063421483,Sun Jun 07 02:35:27 PDT 2009,FlissTee,@deanomarr just testing,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
features_col = [
    col for col in data.columns 
    if (
        (col.startswith("weekday") or 
        #  col.startswith("count") or 
         col.startswith("tfidf")
         ) and col != "target")
]

other_features = ["exclaim_freq", "mention_count", "cap_freq"]
features_col += other_features
print("Non-token features:")
print([col for col in features_col if not (col.startswith("count") or col.startswith("tfidf"))])

XX = data[features_col]
yy = data[["target"]]
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2)

Non-token features:
['weekday_Mon', 'weekday_Tue', 'weekday_Wed', 'weekday_Thu', 'weekday_Fri', 'weekday_Sat', 'weekday_Sun', 'exclaim_freq', 'mention_count', 'cap_freq']


## Benchmarking with BernoulliNB

In [80]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

bnb = BernoulliNB()
X_input = X_train
bnb.fit(X_input, y_train)
y_pred = bnb.predict(
    X_test
)

print("confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print()
print("accuracy_score:")
print(accuracy_score(y_test, y_pred))
print()
print("classification report:")
print(classification_report(y_test,y_pred))
print()

  y = column_or_1d(y, warn=True)


confusion matrix:
[[746 260]
 [296 698]]

accuracy_score:
0.722

classification report:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      1006
           4       0.73      0.70      0.72       994

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000




In [90]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

X_input = X_train

param_grid = {
    'n_estimators': [100, 200, 400],
    'max_features': [
        'sqrt', 
        'log2', 
        # None
    ],
    'max_depth': [10, 50, 100],
    # 'max_leaf_nodes': [3, 6, 9],
}


grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid, verbose=4)
grid_search.fit(X_input, y_train)

rfc = grid_search.best_estimator_
print(rfc)
rfc.fit(X_input, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END max_depth=10, max_features=sqrt, n_estimators=100;, score=0.719 total time=   2.8s
[CV 2/5] END max_depth=10, max_features=sqrt, n_estimators=100;, score=0.717 total time=   2.7s
[CV 3/5] END max_depth=10, max_features=sqrt, n_estimators=100;, score=0.700 total time=   2.8s
[CV 4/5] END max_depth=10, max_features=sqrt, n_estimators=100;, score=0.704 total time=   2.7s
[CV 5/5] END max_depth=10, max_features=sqrt, n_estimators=100;, score=0.726 total time=   2.8s
[CV 1/5] END max_depth=10, max_features=sqrt, n_estimators=200;, score=0.723 total time=   4.8s
[CV 2/5] END max_depth=10, max_features=sqrt, n_estimators=200;, score=0.723 total time=   4.8s
[CV 3/5] END max_depth=10, max_features=sqrt, n_estimators=200;, score=0.707 total time=   4.8s
[CV 4/5] END max_depth=10, max_features=sqrt, n_estimators=200;, score=0.729 total time=   4.8s
[CV 5/5] END max_depth=10, max_features=sqrt, n_estimators=200;, score=0.71

In [100]:
import matplotlib.pyplot as plt
import pandas as pd

feature_names = rfc.feature_names_in_
importances = rfc.feature_importances_
idx = np.argsort(importances)
feature_names[idx][-100:]

array(['tfidf_way', 'tfidf_life', 'tfidf_headache', 'tfidf_best',
       'tfidf_stupid', 'tfidf_wait', 'tfidf_follow', 'tfidf_it',
       'tfidf_cry', 'tfidf_school', 'tfidf_well', 'tfidf_ha',
       'tfidf_watching', 'tfidf_doe', 'tfidf_beautiful', 'tfidf_my',
       'tfidf_missing', 'tfidf_would', 'tfidf_no', 'tfidf_hope',
       'tfidf_birthday', 'tfidf_last', 'tfidf_anymore', 'tfidf_ugh',
       'tfidf_away', 'tfidf_oh', 'tfidf_look', 'tfidf_already',
       'tfidf_night', 'tfidf_yay', 'tfidf_much', 'tfidf_missed',
       'tfidf_cool', 'tfidf_poor', 'tfidf_could', 'tfidf_yes',
       'tfidf_feeling', 'tfidf_see', 'tfidf_twitter', 'tfidf_think',
       'tfidf_damn', 'tfidf_nice', 'tfidf_sleep', 'tfidf_back',
       'tfidf_one', 'tfidf_happy', 'tfidf_fun', 'tfidf_got',
       'tfidf_tomorrow', 'tfidf_home', 'tfidf_still', 'tfidf_why',
       'tfidf_morning', 'tfidf_welcome', 'tfidf_hurt', 'tfidf_today',
       'tfidf_though', 'tfidf_not', 'tfidf_need', 'tfidf_feel',
       'tfidf_lik