In [19]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
## read in the cleaned data
train_df = pd.read_csv('training_data.csv')

print(train_df.shape)

(80800, 3)


In [21]:
## read in the cleaned data
valid_df = pd.read_csv('validation_data.csv')

print(valid_df.shape)

(673316, 3)


In [22]:
vectorizer = TfidfVectorizer(min_df=10)
xtrain = vectorizer.fit_transform(train_df['question_text'])
xvalid = vectorizer.transform(valid_df['question_text'])

In [23]:
xtrain

<80800x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 517716 stored elements in Compressed Sparse Row format>

In [24]:
ytrain = train_df['target']

In [25]:
xvalid

<673316x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 3510212 stored elements in Compressed Sparse Row format>

In [26]:
yvalid = valid_df['target']

In [27]:
%%time
decision_clf = DecisionTreeClassifier()
parameters = {'max_depth': np.arange(500, 6638, 500)}
decision_tree_clf = GridSearchCV(estimator=decision_clf, param_grid = parameters, scoring='f1', refit='f1', n_jobs=-1)
decision_tree_clf.fit(xtrain, ytrain)

CPU times: user 32.4 s, sys: 524 ms, total: 32.9 s
Wall time: 3min 12s


GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': array([ 500, 1000, 1500, 2000, 2500, 30

In [28]:
## get complete details of the best model
decision_tree_clf.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=500, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [29]:
## get only the best hyperparamater values 
decision_tree_clf.best_params_

{'max_depth': 500}

In [30]:
## get the best score for the metric that was passed in grid search function
decision_tree_clf.best_score_

0.8182418684018472

In [33]:
## train model using C=30
decision_tree = DecisionTreeClassifier(max_depth=500)
decision_tree.fit(xtrain, ytrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=500, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [34]:
y_pred = decision_tree.predict(xvalid)

In [35]:
## Print out the confusion matrix
confmtrx = confusion_matrix(yvalid, y_pred)
confmatrix_df = pd.DataFrame(confmtrx, index=['Sincere(0)','Insincere(1)'],
columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'])
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                  526144                  106773
Insincere(1)                  7643                   32756


In [36]:
report = classification_report(yvalid, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.83      0.90    632917
           1       0.23      0.81      0.36     40399

    accuracy                           0.83    673316
   macro avg       0.61      0.82      0.63    673316
weighted avg       0.94      0.83      0.87    673316

