In [21]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB

In [4]:
## read in the cleaned data
df = pd.read_csv('cleaned_quora_data.csv')

print(df.shape)

(1306122, 3)


In [5]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0


In [6]:
## check if the dataframe has any null values
df = df.dropna()

In [7]:
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,technical skill need computer science undergra...,0
1306118,ffffd431801e5a2f4861,m ece good job prospect usa like india job pre...,0
1306119,ffffd48fb36b63db010c,foam insulation toxic,0
1306120,ffffec519fa37cf60c78,one start research project based biochemistry ...,0


In [8]:
vectorizer = TfidfVectorizer(min_df=100)
xtrain = vectorizer.fit_transform(df['question_text'])

In [9]:
xtrain

<1305904x7936 sparse matrix of type '<class 'numpy.float64'>'
	with 7071952 stored elements in Compressed Sparse Row format>

In [10]:
#vectorizer.get_feature_names()
print(xtrain[1])

  (0, 140)	0.4671362077183567
  (0, 2172)	0.3325866826682194
  (0, 7868)	0.2069784894059109
  (0, 2408)	0.46249737355181386
  (0, 5242)	0.2134334299359018
  (0, 139)	0.45204767744682783
  (0, 6475)	0.40561683228375484


In [11]:
ytrain = df['target']

In [25]:
naivebayes_clf = ComplementNB()
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)
cv_pred = cross_val_predict(naivebayes_clf, xtrain, ytrain, cv=skf, n_jobs=-1)

In [26]:
cv_pred

array([1, 1, 0, ..., 0, 0, 0])

In [27]:
## Print out the confusion matrix
confmtrx = confusion_matrix(ytrain, cv_pred)
confmatrix_df = pd.DataFrame(confmtrx, index=['Sincere(0)','Insincere(1)'],
columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'])
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                  982028                  243077
Insincere(1)                  8721                   72078


In [28]:
report = classification_report(ytrain, cv_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.80      0.89   1225105
           1       0.23      0.89      0.36     80799

    accuracy                           0.81   1305904
   macro avg       0.61      0.85      0.63   1305904
weighted avg       0.94      0.81      0.85   1305904

