In [3]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

In [4]:
## read in the cleaned data
df = pd.read_csv('cleaned_quora_data.csv')

print(df.shape)

(1306122, 3)


In [5]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0


In [6]:
## check if the dataframe has any null values
df = df.dropna()

In [8]:
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,technical skill need computer science undergra...,0
1306118,ffffd431801e5a2f4861,m ece good job prospect usa like india job pre...,0
1306119,ffffd48fb36b63db010c,foam insulation toxic,0
1306120,ffffec519fa37cf60c78,one start research project based biochemistry ...,0


In [9]:
vectorizer = TfidfVectorizer(min_df=100)
xtrain = vectorizer.fit_transform(df['question_text'])

In [10]:
xtrain

<1305904x7936 sparse matrix of type '<class 'numpy.float64'>'
	with 7071952 stored elements in Compressed Sparse Row format>

In [11]:
#vectorizer.get_feature_names()
print(xtrain[1])

  (0, 140)	0.4671362077183567
  (0, 2172)	0.3325866826682194
  (0, 7868)	0.2069784894059109
  (0, 2408)	0.46249737355181386
  (0, 5242)	0.2134334299359018
  (0, 139)	0.45204767744682783
  (0, 6475)	0.40561683228375484


In [12]:
ytrain = df['target']

In [13]:
%%time
decision_clf = DecisionTreeClassifier()
param = {'max_depth': np.arange(0, 7936, 500)}
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)
grid_search_decisiontree = GridSearchCV(estimator=decision_clf,
                     param_grid=param,
                     scoring='f1',
                     refit='f1',
                     cv=skf.split(xtrain, ytrain),
                     verbose=4,
                     n_jobs=-1)

CPU times: user 699 µs, sys: 8 µs, total: 707 µs
Wall time: 691 µs


In [13]:
grid_search_decisiontree.fit(xtrain, ytrain)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 48.0min
[Parallel(n_jobs=-1)]: Done 130 out of 160 | elapsed: 248.1min remaining: 57.2min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 270.9min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x2aab21323fc0>,
             error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             pa

In [19]:
## get complete details of the best model
grid_search_decisiontree.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=500, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [20]:
## get only the best hyperparamater values 
grid_search_decisiontree.best_params_

{'max_depth': 500}

In [21]:
## get the best score for the metric that was passed in grid search function
grid_search_decisiontree.best_score_

0.44810425773753765

In [22]:
## 
df = pd.DataFrame(grid_search_decisiontree.cv_results_)

In [23]:
df = df.sort_values("rank_test_score")
df.to_csv('decision_tree_grid_search_results.csv', index=False)

In [14]:
## train model using C=30
decision_clf = DecisionTreeClassifier(max_depth=500)
cv_results = cross_val_predict(decision_clf, xtrain, ytrain, cv=skf, n_jobs=-1)

In [15]:
## Print out the confusion matrix
confmtrx = confusion_matrix(ytrain, cv_results)
confmatrix_df = pd.DataFrame(confmtrx, index=['Sincere(0)','Insincere(1)'],
columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'])
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                 1191826                   33279
Insincere(1)                 47735                   33064


In [16]:
report = classification_report(ytrain, cv_results)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97   1225105
           1       0.50      0.41      0.45     80799

    accuracy                           0.94   1305904
   macro avg       0.73      0.69      0.71   1305904
weighted avg       0.93      0.94      0.94   1305904

