In [1]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the cleaned training split from CSV and print its (rows, columns) shape.
train_df = pd.read_csv(filepath_or_buffer='train_data.csv')
print(train_df.shape)

(80800, 3)


In [3]:
# Load the cleaned validation split from CSV and print its (rows, columns) shape.
valid_df = pd.read_csv(filepath_or_buffer='valid_data.csv')
print(valid_df.shape)

(673316, 3)


In [4]:
# TF-IDF features over the question text; terms appearing in fewer than
# 10 training documents are dropped from the vocabulary.
vectorizer = TfidfVectorizer(min_df=10)

# The vocabulary and IDF weights are learned from the training split only ...
xtrain = vectorizer.fit_transform(raw_documents=train_df['question_text'])
# ... and then applied unchanged to the validation split.
xvalid = vectorizer.transform(raw_documents=valid_df['question_text'])

In [5]:
# Training targets: the 'label' column of the training frame.
ytrain = train_df.loc[:, 'label']

In [6]:
# Echo the vectorized validation matrix so the notebook shows its repr
# (shape, dtype and sparse storage format) as the cell output.
xvalid

<673316x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 3510212 stored elements in Compressed Sparse Row format>

In [7]:
# Validation targets: the 'label' column of the validation frame.
yvalid = valid_df.loc[:, 'label']

In [8]:
## Defining all classifiers for voting classifier
log_clf = LogisticRegression(C=3, max_iter=1000, solver='liblinear')
svc_clf = SVC(C=1, gamma=1, kernel='rbf', probability=True)
dt_clf = DecisionTreeClassifier(max_depth=500)
rf_clf = RandomForestClassifier(bootstrap=False, max_depth=500, n_estimators=2000)
nb_clf = MultinomialNB()

In [11]:
# Pair each short estimator name with its (unfitted) model; the result is the
# list of (name, estimator) tuples that VotingClassifier expects.
estimators = list(zip(
    ('lr', 'svm', 'dt', 'rf', 'nb'),
    (log_clf, svc_clf, dt_clf, rf_clf, nb_clf),
))

# Soft voting: the ensemble is configured with voting='soft'.
voting_clf = VotingClassifier(voting='soft', estimators=estimators)

In [12]:
# Fit the ensemble on the TF-IDF training features and their labels;
# the fitted estimator is echoed as the cell output.
voting_clf.fit(X=xtrain, y=ytrain)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=3, max_iter=1000,
                                                 solver='liblinear')),
                             ('svm', SVC(C=1, gamma=1, probability=True)),
                             ('dt', DecisionTreeClassifier(max_depth=500)),
                             ('rf',
                              RandomForestClassifier(bootstrap=False,
                                                     max_depth=500,
                                                     n_estimators=2000)),
                             ('nb', MultinomialNB())],
                 voting='soft')

In [13]:
# Predicted class labels for every row of the validation feature matrix.
y_pred = voting_clf.predict(X=xvalid)

In [14]:
## Print out the confusion matrix
confmtrx = confusion_matrix(yvalid, y_pred)
confmatrix_df = pd.DataFrame(confmtrx, index=['Sincere(0)','Insincere(1)'],
columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'])
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                  542128                   90789
Insincere(1)                  4546                   35853


In [15]:
# Per-class precision / recall / F1 on the validation split, as formatted text.
report = classification_report(y_true=yvalid, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.86      0.92    632917
           1       0.28      0.89      0.43     40399

    accuracy                           0.86    673316
   macro avg       0.64      0.87      0.67    673316
weighted avg       0.95      0.86      0.89    673316

