In [106]:
import pandas as pd
import multiprocessing as mp
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In the TF-IDF method, firstly, individual words in each document are collected to construct a feature set for each document. Secondly, for each individual word, we compute its TF-IDF score in each document. Thirdly, all the individual words in a document are sorted by their TF-IDF scores. Then different percentages of individual words with top TF-IDF scores are retained to construct the feature set (vocabulary) for representation.

In the LSI method, the original terms for the collection are those individual words whose term frequency in the document is more than two. The original term weight of an individual word in a document is set as the corresponding term frequency of that individual word in that document. Then, SVD is used to decompose the original term-document matrix. Next, we retain a certain percentage of singular values in R of Eq. (8) to produce the approximation matrix, which has lower dimensions than the original term-document matrix.

For TF-IDF, term percentage means the percentage of individual words with top TF-IDF values that will be retained to construct the feature set for the whole document collection. For LSI, term percentage means the percentage of top singular values in R that will be retained to construct the approximation matrix.

In [None]:
Support Vector Machine (SVM) is chosen in our experiment; it is very popular and
has proved to be one of the best classification algorithms for text classification [18,21].
SVM was originally introduced by Vapnik in 1995 for solving the two-class pattern recognition problem [16].
[16] Vapnik, V., The Nature of Statistical Learning Theory, Springer, 1995
[18] Wu, H. & Gunopulos, D, Evaluating the Utility of Statistical Phrases and Latent Semantic Indexing for Text Classification, Proc. of ICDM’02, pp. 713-716, 2002

In [2]:
## read in the cleaned data (path is relative to the working directory)
df = pd.read_csv('cleaned_quora_data.csv')

## report the (rows, columns) shape of the loaded frame
print(df.shape)

(1306122, 3)


In [3]:
## preview the first rows of the loaded frame
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0


In [4]:
## drop every row that contains a null value
## (no check is performed here: rows with nulls are removed unconditionally and df is rebound)
df = df.dropna()

In [5]:
## display the frame after the null rows were dropped
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,quebec nationalist see province nation s,0
1,000032939017120e6e44,adopted dog would encourage people adopt shop,0
2,0000412ca6e4628ce2cf,velocity affect time velocity affect space geo...,0
3,000042bf85aa498cd78e,otto von guericke used magdeburg hemisphere,0
4,0000455dfa3e01eae3af,convert montra helicon mountain bike changing ...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,technical skill need computer science undergra...,0
1306118,ffffd431801e5a2f4861,m ece good job prospect usa like india job pre...,0
1306119,ffffd48fb36b63db010c,foam insulation toxic,0
1306120,ffffec519fa37cf60c78,one start research project based biochemistry ...,0


In [119]:
## build TF-IDF features from question_text; min_df=100 ignores terms that occur in fewer than 100 documents
## NOTE(review): the vectorizer is fitted on all rows before cross-validation, so the IDF weights see every fold — confirm this is acceptable
vectorizer = TfidfVectorizer(min_df=100)
xtrain = vectorizer.fit_transform(df['question_text'])

In [120]:
## inspect the sparse TF-IDF matrix summary (shape, dtype, number of stored elements)
xtrain

<1305904x7936 sparse matrix of type '<class 'numpy.float64'>'
	with 7071952 stored elements in Compressed Sparse Row format>

In [121]:
## print the stored (non-zero) TF-IDF entries of the row at index 1
#vectorizer.get_feature_names()
print(xtrain[1])

  (0, 140)	0.4671362077183567
  (0, 2172)	0.3325866826682194
  (0, 7868)	0.2069784894059109
  (0, 2408)	0.46249737355181386
  (0, 5242)	0.2134334299359018
  (0, 139)	0.45204767744682783
  (0, 6475)	0.40561683228375484


In [122]:
## labels: the `target` column of df, used as y for the grid search and cross-validation below
ytrain = df['target']

In [131]:
%%time
logit = LogisticRegression(max_iter=1000, solver='liblinear')
param = {'C':[1, 3, 5, 10, 15, 20, 30, 50, 70, 100]}
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)
grid_search_model = GridSearchCV(estimator=logit,
                     param_grid=param,
                     scoring='f1',
                     refit='f1',
                     cv=skf.split(xtrain, ytrain),
                     verbose =  4,
                     n_jobs=-1)

CPU times: user 407 µs, sys: 39 µs, total: 446 µs
Wall time: 458 µs


In [132]:
## run the grid search: every C candidate is fitted on every fold, then the best one is refit
grid_search_model.fit(xtrain, ytrain)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  55 out of 100 | elapsed:  3.8min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done  81 out of 100 | elapsed:  5.7min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.9min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x2aab363509e8>,
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 3, 5, 10, 15, 20, 30, 50, 70, 100]},
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring='f1', verbose=4)

In [133]:
## get complete details (all constructor parameters) of the refit best model
grid_search_model.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [134]:
## get only the best hyperparameter values
grid_search_model.best_params_

{'C': 100}

In [135]:
## get the best mean cross-validated score for the metric passed to the grid search (scoring='f1')
grid_search_model.best_score_

0.5084215235727425

In [136]:
## 
df = pd.DataFrame(grid_search_model.cv_results_)


In [139]:
df = df.sort_values("rank_test_score")
df.to_csv('Logistic_Regression_grid_search_results.csv', index=False)

In [140]:
## out-of-fold predictions from logistic regression with C=50, reusing the skf splitter
## NOTE(review): this comment previously said C=30 and the grid search reported best_params_ {'C': 100}; C=50 is what the code uses — confirm which value is intended
logit = LogisticRegression(C=50, max_iter=1000, solver='liblinear')
cv_results = cross_val_predict(logit, xtrain, ytrain, cv=skf, n_jobs=-1)

In [141]:
## Tabulate true labels (rows) against cross-validated predictions (columns)
confmtrx = confusion_matrix(ytrain, cv_results)
confmatrix_df = pd.DataFrame(
    data=confmtrx,
    index=['Sincere(0)', 'Insincere(1)'],
    columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'],
)
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                 1208398                   16707
Insincere(1)                 47587                   33212


In [142]:
## Per-class precision / recall / F1 for the cross-validated predictions
report = classification_report(y_true=ytrain, y_pred=cv_results)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97   1225105
           1       0.67      0.41      0.51     80799

    accuracy                           0.95   1305904
   macro avg       0.81      0.70      0.74   1305904
weighted avg       0.94      0.95      0.95   1305904



In [None]:
## NOTE(review): dead, commented-out code — it refers to `kfold` and `cross_val_score`,
## neither of which is defined or imported in the visible cells; remove it or restore it properly
#results = cross_val_score(logit, xtrain, ytrain, cv=kfold)
#print("Accuracy:", results.mean()*100)