In [36]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
## Load the cleaned training split from disk
train_df = pd.read_csv(filepath_or_buffer='training_data.csv')

## Report (rows, columns) as a quick sanity check on what was loaded
print(train_df.shape)

(80800, 3)


In [38]:
## Load the cleaned validation split from disk
valid_df = pd.read_csv(filepath_or_buffer='validation_data.csv')

## Report (rows, columns) as a quick sanity check on what was loaded
print(valid_df.shape)

(673316, 3)


In [39]:
## TF-IDF features for the question text.
## min_df=10 is passed to TfidfVectorizer as its minimum document-frequency threshold.
vectorizer = TfidfVectorizer(min_df=10)

## Learn the vocabulary / IDF weights from the training questions only ...
xtrain = vectorizer.fit_transform(raw_documents=train_df['question_text'])

## ... and re-use that fitted vectorizer (no re-fitting) for the validation questions.
xvalid = vectorizer.transform(raw_documents=valid_df['question_text'])

In [40]:
## Show the summary repr of the training TF-IDF matrix (shape, dtype, stored elements)
xtrain

<80800x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 517716 stored elements in Compressed Sparse Row format>

In [41]:
## Training labels: the 'target' column of the training frame
ytrain = train_df['target']

In [42]:
## Show the summary repr of the validation TF-IDF matrix (shape, dtype, stored elements)
xvalid

<673316x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 3510212 stored elements in Compressed Sparse Row format>

In [43]:
## Validation labels: the 'target' column of the validation frame
yvalid = valid_df['target']

In [44]:
%%time
logit = LogisticRegression(max_iter=1000, solver='liblinear')
parameters = {'C':[1, 3, 5, 10, 15, 20, 30, 50]}
logit_clf = GridSearchCV(estimator=logit, param_grid = parameters, scoring='f1', refit='f1', n_jobs=-1)
logit_clf.fit(xtrain, ytrain)

CPU times: user 899 ms, sys: 695 ms, total: 1.59 s
Wall time: 16.4 s


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 3, 5, 10, 15, 20, 30, 50]},
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring='f1', verbose=0)

In [45]:
## Show the best estimator selected by the grid search, with all of its hyperparameters
logit_clf.best_estimator_

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
## Show only the best hyperparameter values found by the grid search
logit_clf.best_params_

{'C': 3}

In [47]:
## Show the best score for the metric passed to the grid search (scoring='f1')
logit_clf.best_score_

0.8704357184365253

In [48]:
## Train the final model with the best C selected by the grid search
## (C=3 in the recorded run). Reading it from best_params_ keeps this cell
## in sync with the search instead of relying on a hand-copied constant.
## LogisticRegression is already imported in the notebook's import cell.
best_C = logit_clf.best_params_['C']
logit_reg_clf = LogisticRegression(C=best_C, max_iter=1000, solver='liblinear')
logit_reg_clf.fit(xtrain, ytrain)

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
## Predict labels for the validation features with the fitted logistic regression
y_pred = logit_reg_clf.predict(xvalid)

In [50]:
## Confusion matrix on the validation set, wrapped in a labelled DataFrame
## so the printed table is readable.
confmtrx = confusion_matrix(y_true=yvalid, y_pred=y_pred)
confmatrix_df = pd.DataFrame(
    data=confmtrx,
    index=['Sincere(0)', 'Insincere(1)'],
    columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'],
)
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                  554239                   78678
Insincere(1)                  5348                   35051


In [51]:
## Text classification report for the validation predictions
report = classification_report(y_true=yvalid, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.88      0.93    632917
           1       0.31      0.87      0.45     40399

    accuracy                           0.88    673316
   macro avg       0.65      0.87      0.69    673316
weighted avg       0.95      0.88      0.90    673316

