In [8]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import numpy as np
import pandas as pd
import my_globals
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from utils import get_sub_featured_datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
DATA_PATH = "/".join([my_globals.DATA_DIR, my_globals.MAIN_DATA_NAME])
data = get_sub_featured_datasets(size = 5000, random_seed=4)
data.head(5)



Unnamed: 0,index,target,ids,date,user,text,weekday_Mon,weekday_Tue,weekday_Wed,weekday_Thu,...,tfidf_ð²ñ,tfidf_ðµð³ð,tfidf_ðºð¾ð²ñ,tfidf_ðºñ,tfidf_ð¼ð,tfidf_ð½ð,tfidf_ð¾,tfidf_ð¾ð¼,tfidf_øµø,tfidf_ø¹
0,1000570,4,1880006855,Thu May 21 23:48:34 PDT 2009,JohnnyEugenio2,Omgosh I put my phone back on the hook so the ...,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,991406,4,1835115440,Mon May 18 05:10:48 PDT 2009,BalaSN,leavin ma office,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1534391,4,2178760886,Mon Jun 15 08:10:05 PDT 2009,eltorgie,thunder! ... 399/1000 words,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1426117,4,2059166942,Sat Jun 06 16:22:57 PDT 2009,naughtymeg,@chasesterling guess its just me and you!!,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,120705,0,1833295442,Sun May 17 22:51:09 PDT 2009,Rachel_Butts,@zeneth7 Keen-o! I'm gonna miss you too I'm g...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
features_col = [
    col for col in data.columns 
    if (
        (col.startswith("weekday") or 
        #  col.startswith("count") or 
         col.startswith("tfidf")
         ) and col != "target")
]

other_features = ["exclaim_freq", "mention_count", "cap_freq"]
features_col += other_features
print("Non-token features:")
print([col for col in features_col if not (col.startswith("count") or col.startswith("tfidf"))])

XX = data[features_col]
yy = data[["target"]]
X_train, X_test, y_train, y_test = train_test_split(XX, yy, test_size=0.2)

Non-token features:
['weekday_Mon', 'weekday_Tue', 'weekday_Wed', 'weekday_Thu', 'weekday_Fri', 'weekday_Sat', 'weekday_Sun', 'exclaim_freq', 'mention_count', 'cap_freq']


## Benchmarking with BernoulliNB

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

bnb = BernoulliNB()
X_input = X_train
bnb.fit(X_input, y_train)
y_pred = bnb.predict(
    X_test
)

def assess(y_true, y_pred):
    print("confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print("accuracy_score:")
    print(accuracy_score(y_true, y_pred))
    print()
    print("classification report:")
    print(classification_report(y_true,y_pred))
    print()

# print("confusion matrix:")
# print(confusion_matrix(y_test, y_pred))
# print()
# print("accuracy_score:")
# print(accuracy_score(y_test, y_pred))
# print()
# print("classification report:")
# print(classification_report(y_test,y_pred))
# print()
assess(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

X_input = X_train

param_grid = {
    'n_estimators': [100, 200, 400],
    'max_features': [
        'sqrt', 
        'log2', 
        # None
    ],
    'max_depth': [10, 50, 100],
    # 'max_leaf_nodes': [3, 6, 9],
}


grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid, verbose=4)
grid_search.fit(X_input, y_train)

rfc = grid_search.best_estimator_
print(rfc)
rfc.fit(X_input, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
rfc.fit(X_input, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
def assess(y_true, y_pred):
    print("confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print("accuracy_score:")
    print(accuracy_score(y_true, y_pred))
    print()
    print("classification report:")
    print(classification_report(y_true,y_pred))
    print()

assess(y_test, y_pred)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

feature_names = rfc.feature_names_in_
importances = rfc.feature_importances_
idx = np.argsort(importances)
feature_names[idx][-100:]

In [11]:
from sklearn.linear_model import LogisticRegression

X_input = X_train

logit_reg = LogisticRegression(max_iter=1000)
logit_reg.fit(X_input, y_train)

y_pred = logit_reg.predict(X_test)

print(accuracy_score(y_test, y_pred))
y_pred_prob=logit_reg.predict_proba(X_test)


def assess(y_true, y_pred):
    print("confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print("accuracy_score:")
    print(accuracy_score(y_true, y_pred))
    print()
    print("classification report:")
    print(classification_report(y_true,y_pred))
    print()

assess(y_test, y_pred)

  y = column_or_1d(y, warn=True)


0.737
confusion matrix:
[[333 142]
 [121 404]]

accuracy_score:
0.737

classification report:
              precision    recall  f1-score   support

           0       0.73      0.70      0.72       475
           4       0.74      0.77      0.75       525

    accuracy                           0.74      1000
   macro avg       0.74      0.74      0.74      1000
weighted avg       0.74      0.74      0.74      1000


