    In this notebook, we model our TFIDF data with Logistic Regression and Multinomial Naive Bayes.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('../data/tfidf_df.csv')
clean_df = pd.read_csv('../data/clean_data.csv')

In [3]:
df.head()

Unnamed: 0,abus,accept,acknowledg,acn,action,activ,actual,admir,adult,advic,...,word,work,world,worst,worth,wrong,year,year old,young,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
X = df.drop(columns='target')
y = clean_df['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7, stratify=y)

In [6]:
lr = LogisticRegression(random_state=7)

In [7]:
params = {'penalty': ['l1'],
          'solver': ['liblinear'],
          'C': list(np.logspace(-0.1, 0.1, 20)),
         'max_iter': [100, 150]}

In [8]:
gs = GridSearchCV(lr, param_grid=params, cv=5)

In [9]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=7, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1'], 'solver': ['liblinear'], 'C': [0.7943282347242815, 0.8138161719423083, 0.8337822234717892, 0.8542381193020124, 0.875195877204174, 0.8966678097917787, 0.9186665317542597, 0.9412049672680667, 0.9642963575895775, 0.9879542688342918, 1.0121925999468793, 1.037025590866764, 1.0624678308940412, 1.0885342672606437, 1.1152402139117905, 1.142601360502879, 1.1706337816171068, 1.1993539462092342, 1.2287787272810429, 1.2589254117941673], 'max_iter': [100, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
gs.best_params_

{'C': 1.1706337816171068,
 'max_iter': 100,
 'penalty': 'l1',
 'solver': 'liblinear'}

In [11]:
gs.best_score_

0.655099894847529

In [12]:
y_pred_train = gs.predict(X_train)
y_pred_test = gs.predict(X_test)

train_score = accuracy_score(y_train, y_pred_train)
test_score = accuracy_score(y_test, y_pred_test)

print(f"Train accuracy: {round(train_score, 4)}")
print(f"Test accuracy: \t{round(test_score, 4)}")

Train accuracy: 0.735
Test accuracy: 	0.6384


In [13]:
mnb = MultinomialNB()

params = {'alpha': list(np.logspace(0, 2, 100)),
         'fit_prior': [True, False]}

gsnb = GridSearchCV(mnb, param_grid=params, cv=5)

gsnb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [1.0, 1.0476157527896648, 1.0974987654930561, 1.1497569953977358, 1.2045035402587823, 1.2618568830660204, 1.321941148466029, 1.384886371393873, 1.4508287784959397, 1.5199110829529336, 1.5922827933410924, 1.6681005372000588, 1.7475284000076838, 1.8307382802953682, 1.9179102616724...319748, 86.97490026177834, 91.11627561154896, 95.45484566618342, 100.0], 'fit_prior': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
gsnb.best_params_

{'alpha': 2.205130739903046, 'fit_prior': True}

In [15]:
gsnb.best_score_

0.6403785488958991

In [16]:
y_pred_train_nb = gsnb.predict(X_train)
y_pred_test_nb = gsnb.predict(X_test)

train_score = accuracy_score(y_train, y_pred_train_nb)
test_score = accuracy_score(y_test, y_pred_test_nb)

print(f"Train accuracy: {round(train_score, 4)}")
print(f"Test accuracy: \t{round(test_score, 4)}")

Train accuracy: 0.8254
Test accuracy: 	0.6635
