In [2]:
import multiprocessing as mp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [3]:
## Load the training split (described by the author as already cleaned)
## and report its (rows, columns) as a quick sanity check.
train_df = pd.read_csv(filepath_or_buffer='training_data.csv')
print(train_df.shape)

(80800, 3)


In [5]:
## Load the validation split (described by the author as already cleaned)
## and report its (rows, columns) as a quick sanity check.
valid_df = pd.read_csv(filepath_or_buffer='validation_data.csv')
print(valid_df.shape)

(673316, 3)


In [6]:
## TF-IDF features for the question text.
## min_df=10 drops terms that appear in fewer than 10 training documents.
vectorizer = TfidfVectorizer(min_df=10)

## The vocabulary and IDF weights are learned from the training text only;
## the validation text is projected onto that same fitted vocabulary.
xtrain = vectorizer.fit_transform(raw_documents=train_df['question_text'])
xvalid = vectorizer.transform(raw_documents=valid_df['question_text'])

In [7]:
## display the repr of the TF-IDF training matrix (bare last expression)
xtrain

<80800x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 517716 stored elements in Compressed Sparse Row format>

In [8]:
## training labels: the 'target' column of the training frame
ytrain = train_df['target']

In [9]:
## display the repr of the TF-IDF validation matrix (bare last expression)
xvalid

<673316x6638 sparse matrix of type '<class 'numpy.float64'>'
	with 3510212 stored elements in Compressed Sparse Row format>

In [10]:
## validation labels: the 'target' column of the validation frame
yvalid = valid_df['target']

In [None]:
%%time
svm_clf = svm.SVC()
parameters = {
    'kernel': ['sigmoid', 'poly', 'rbf', 'linear'],
    'C': [1, 3, 5, 10, 20],
    'gamma': [0.001, 0.01, 0.001, 0.1, 1]   
 }
support_vector_clf = GridSearchCV(estimator=svm_clf, param_grid = parameters, scoring='f1', refit='f1', n_jobs=-1)
support_vector_clf.fit(xtrain, ytrain)

In [25]:
## get complete details of the best model (the estimator refit by the grid search)
## NOTE(review): the output saved under this cell shows a LogisticRegression and the
## cell's execution count (25) is out of sequence with its neighbours, so that output
## looks stale -- TODO: restart the kernel and re-run after the SVC grid search.
support_vector_clf.best_estimator_

LogisticRegression(C=3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
## get only the best hyperparameter values found by the grid search
support_vector_clf.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [13]:
## get the best cross-validated score for the metric passed to the grid search (scoring='f1')
support_vector_clf.best_score_

0.8753981994144935

In [14]:
## Train the final model with the best hyperparameters reported by the grid
## search: C=1, gamma=1, kernel='rbf'.
##
## The estimator is built from the `SVC` class imported at the top rather than
## via `svm.SVC`: this cell rebinds the name `svm` to the fitted estimator, so
## `svm.SVC(...)` would raise AttributeError the second time the cell is run.
## NOTE(review): `svm` still shadows the `sklearn.svm` module from here on; the
## name is kept only because the prediction cell calls `svm.predict`. Consider
## renaming both together (e.g. to `svm_final`).
svm = SVC(C=1, gamma=1, kernel='rbf')
svm.fit(xtrain, ytrain)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [15]:
## predict labels for the validation features
## NOTE: `svm` here is the fitted SVC instance bound in the training cell,
## not the `sklearn.svm` module imported at the top.
y_pred = svm.predict(xvalid)

In [16]:
## Build the validation confusion matrix and print it as a labelled table
## (rows = actual class, columns = predicted class).
confmtrx = confusion_matrix(y_true=yvalid, y_pred=y_pred)
confmatrix_df = pd.DataFrame(
    data=confmtrx,
    index=['Sincere(0)', 'Insincere(1)'],
    columns=['Predicted_Sincere(0)', 'Predicted_Insincere(1)'],
)
print(confmatrix_df)

              Predicted_Sincere(0)  Predicted_Insincere(1)
Sincere(0)                  555337                   77580
Insincere(1)                  4952                   35447


In [17]:
## Per-class precision / recall / F1 on the validation set, as plain text.
report = classification_report(y_true=yvalid, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.88      0.93    632917
           1       0.31      0.88      0.46     40399

    accuracy                           0.88    673316
   macro avg       0.65      0.88      0.70    673316
weighted avg       0.95      0.88      0.90    673316

