In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import os.path
import pickle

Reading test and train data from already preprocessed pickle file

In [4]:
X_train = pd.read_pickle('../../../Preprocessing/Data/X_train.pkl')
X_test = pd.read_pickle('../../../Preprocessing/Data/X_test.pkl')
y_train = pd.read_pickle('../../../Preprocessing/Data/y_train.pkl')
y_test = pd.read_pickle('../../../Preprocessing/Data/y_test.pkl')

Performing TF-IDF over the dataset for word embedding

In [5]:
vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3,
                      max_df=0.9, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1)
df = pd.read_pickle('../../../Preprocessing/Data/preprocess_pickle.pkl')
df['comment_text_final'] = [" ".join(text) for text in df['comment_text_final'].values]
vec = vec.fit(df['comment_text_final'])

In [6]:
train_term_doc = vec.transform(X_train)
test_term_doc = vec.transform(X_test)

Scaling the input data using MaxAbsScaler

In [7]:
scaler = MaxAbsScaler()
train_term_doc = scaler.fit_transform(train_term_doc)
test_term_doc = scaler.fit_transform(test_term_doc)

Performing Cross Validation for multilpe values of C and also using Ridge and Lasso Regularization for Logistics Regression

In [12]:
def logistic_regression_with_CV(label):
    if os.path.isfile('Models/ridge_lr_' + label + '.sav') and os.path.isfile('Models/lasso_lr_' + label + '.sav'):
        ridge_logistic_regressor_grid_cv = pickle.load(open('Models/ridge_lr_' + label + '.sav', 'rb'))
        lasso_logistic_regressor_grid_cv = pickle.load(open('Models/lasso_lr_' + label + '.sav', 'rb'))
    else:
        ridge_logistic_regressor = LogisticRegression(penalty="l2", solver="liblinear", max_iter = 2000)
        lasso_logistic_regressor = LogisticRegression(penalty="l1", solver="liblinear", max_iter = 2000)

        ridge_logistic_regressor_grid_cv = GridSearchCV(estimator=ridge_logistic_regressor, 
                                                     param_grid={'C':np.logspace(-4, 4, 20)}, cv= 5, iid=False, n_jobs=-1)
        lasso_logistic_regressor_grid_cv = GridSearchCV(estimator=lasso_logistic_regressor, 
                                                     param_grid={'C':np.logspace(-4, 4, 20)}, cv= 5, iid=False, n_jobs=-1)

        ridge_logistic_regressor_grid_cv.fit(train_term_doc, y_train[label])
        lasso_logistic_regressor_grid_cv.fit(train_term_doc, y_train[label])

        pickle.dump(ridge_logistic_regressor_grid_cv, open('Models/ridge_lr_' + label + '.sav', 'wb'))
        pickle.dump(lasso_logistic_regressor_grid_cv, open('Models/lasso_lr_' + label + '.sav', 'wb'))
    
    ridge_train_pred = ridge_logistic_regressor_grid_cv.predict(train_term_doc)
    lasso_train_pred = lasso_logistic_regressor_grid_cv.predict(train_term_doc)

    ridge_test_pred = ridge_logistic_regressor_grid_cv.predict(test_term_doc)
    lasso_test_pred = lasso_logistic_regressor_grid_cv.predict(test_term_doc)

    print(label + " Ridge Train Accuracy - " + str(ridge_logistic_regressor_grid_cv.score(train_term_doc, y_train[label])))
    print(label + " Lasso Train Accuracy - " + str(lasso_logistic_regressor_grid_cv.score(train_term_doc, y_train[label])) + '\n')

    print(label + " Ridge Train F1 Score - " + str(f1_score(y_train[label], ridge_train_pred)))
    print(label + " Lasso Train F1 Score - " + str(f1_score(y_train[label], lasso_train_pred)) + '\n')
    
    print(label + " Ridge Train ROC-AUC Score - " + str(roc_auc_score(y_train[label], ridge_train_pred)))
    print(label + " Lasso Train ROC-AUC Score - " + str(roc_auc_score(y_train[label], lasso_train_pred)) + '\n')

    print(label + " Ridge Test Accuracy - " + str(ridge_logistic_regressor_grid_cv.score(test_term_doc, y_test[label])))
    print(label + " Lasso Test Accuracy - " + str(lasso_logistic_regressor_grid_cv.score(test_term_doc, y_test[label])) + '\n')

    print(label + " Ridge Test F1 Score - " + str(f1_score(y_test[label], ridge_test_pred)))
    print(label + " Lasso Test F1 Score - " + str(f1_score(y_test[label], lasso_test_pred)) + '\n')
    
    print(label + " Ridge Test ROC-AUC Score - " + str(roc_auc_score(y_test[label], ridge_test_pred)))
    print(label + " Lasso Test ROC-AUC Score - " + str(roc_auc_score(y_test[label], lasso_test_pred)) + '\n\n')

Accuracy, F1 and ROC-AUC score for each label using Binary Relevance Technique

In [14]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in label_cols:
    logistic_regression_with_CV(label)

toxic Ridge Train Accuracy - 0.9980958107837242
toxic Lasso Train Accuracy - 0.9608889557025456

toxic Ridge Train F1 Score - 0.9963052989790958
toxic Lasso Train F1 Score - 0.9214828227956341

toxic Ridge Train ROC-AUC Score - 0.9970702556227102
toxic Lasso Train ROC-AUC Score - 0.9375781202131243

toxic Ridge Test Accuracy - 0.8998423114095325
toxic Lasso Test Accuracy - 0.9124573986469302

toxic Ridge Test F1 Score - 0.7907322776065469
toxic Lasso Test F1 Score - 0.8218610909843701

toxic Ridge Test ROC-AUC Score - 0.8487089790281281
toxic Lasso Test ROC-AUC Score - 0.8736658098360226


severe_toxic Ridge Train Accuracy - 0.9798306273802365
severe_toxic Lasso Train Accuracy - 0.9747945480056124

severe_toxic Ridge Train F1 Score - 0.4459738472126635
severe_toxic Lasso Train F1 Score - 0.3119015047879617

severe_toxic Ridge Train ROC-AUC Score - 0.6514190451932962
severe_toxic Lasso Train ROC-AUC Score - 0.604954842315839

severe_toxic Ridge Test Accuracy - 0.9750241619614426
severe_