In [2]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
## read in the cleaned training data (path is relative to the notebook's working directory)
train_df = pd.read_csv('training_data.csv')

## sanity check: (rows, columns) of the loaded training frame
print(train_df.shape)

(80800, 3)


In [4]:
## read in the cleaned validation data (path is relative to the notebook's working directory)
valid_df = pd.read_csv('validation_data.csv')

## sanity check: (rows, columns) of the loaded validation frame
print(valid_df.shape)

(673316, 3)


In [5]:
## Build TF-IDF features from the question text.
## min_df=10: terms seen in fewer than 10 training documents are left out of the vocabulary.
vectorizer = TfidfVectorizer(min_df=10)
## Fit the vocabulary and IDF weights on the training questions only ...
xtrain = vectorizer.fit_transform(train_df['question_text'])
## ... and reuse the fitted vectorizer on the validation questions, so both matrices
## share the same columns and nothing is learned from the validation text.
xvalid = vectorizer.transform(valid_df['question_text'])

In [6]:
## inspect the training feature matrix (shape, dtype, number of stored non-zeros)
xtrain

<80800x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 517716 stored elements in Compressed Sparse Row format>

In [7]:
## training labels, taken from the 'target' column
ytrain = train_df['target']

In [8]:
## inspect the validation feature matrix; its column count should match xtrain's
xvalid

<673316x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 3510212 stored elements in Compressed Sparse Row format>

In [9]:
## validation labels, taken from the 'target' column
yvalid = valid_df['target']

In [None]:
%%time
rf_clf = RandomForestClassifier(bootstrap=False)
parameters = {'max_depth': np.arange(500, 6638, 500), 'n_estimators': [500, 1000, 2000]}
random_forest_clf = GridSearchCV(estimator=rf_clf, param_grid = parameters, scoring='f1', refit='f1', n_jobs=-1)
random_forest_clf.fit(xtrain, ytrain)

In [28]:
## get complete details of the best model
## NOTE(review): the saved output for this cell shows a DecisionTreeClassifier and its
## execution count is out of sequence with the neighbouring cells, so it looks stale
## (left over from a different search) -- re-run this cell after the grid search finishes.
random_forest_clf.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=500, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
## get only the best hyperparameter values found by the grid search
random_forest_clf.best_params_

{'max_depth': 500, 'n_estimators': 2000}

In [12]:
## get the best cross-validated score for the metric passed to the grid search (scoring='f1')
random_forest_clf.best_score_

0.8586047549941872

In [14]:
## train the final random forest with the best grid-search values (max_depth=500, n_estimators=2000)
## NOTE(review): these values are hard-coded copies of the best_params_ output above; keep them
## in sync if the search is re-run. No random_state is set here, so refits are not bit-for-bit
## repeatable -- confirm whether that is intended.
random_forest = RandomForestClassifier(bootstrap=False, max_depth=500, n_estimators=2000)
random_forest.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=500, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
## predict class labels for the validation feature matrix with the final model
y_pred = random_forest.predict(xvalid)

In [16]:
## Print out the confusion matrix (rows = true class, columns = predicted class).
## labels=[0, 1] pins the matrix to 2x2 in 0-then-1 order so it always lines up with the
## hard-coded row/column names below, even if one class is missing from yvalid or y_pred.
confmtrx = confusion_matrix(yvalid, y_pred, labels=[0, 1])
confmatrix_df = pd.DataFrame(
    confmtrx,
    index=['Sincere(0)', 'Insincere(1)'],
    columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'],
)
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                  523941                  108976
Insincere(1)                  4688                   35711


In [17]:
## per-class precision / recall / F1 on the validation set
## NOTE(review): the saved report shows precision 0.25 for class 1 here, against a
## cross-validated F1 of about 0.86 on the training data -- presumably the training set has a
## different class balance than the validation set; confirm before quoting either number.
report = classification_report(yvalid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.83      0.90    632917
           1       0.25      0.88      0.39     40399

    accuracy                           0.83    673316
   macro avg       0.62      0.86      0.64    673316
weighted avg       0.95      0.83      0.87    673316

