In [1]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# import spacy
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, classification_report, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
data = pd.read_csv("../Data/preprocessed.csv")

In [3]:
data = data.dropna()

In [4]:
vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=5000)
vectorizer.fit(data['comment_text'])
vectorizer

In [5]:
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 5),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [6]:
val_border = int(len(data)*0.6)
validation_set = data[val_border:].copy()

In [7]:
data.columns

Index(['Unnamed: 0', 'id', 'comment_text', 'toxic', 'severe_toxic', 'obscene',
       'threat', 'insult', 'identity_hate', 'num_tokens'],
      dtype='object')

In [8]:
x_train = vectorizer.transform(data['comment_text'])
target_columns = ['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'identity_hate']

In [9]:
lr_classifier = LogisticRegression(solver='liblinear')
average_roc = 0
    
for label in target_columns:
    lr_classifier.fit(x_train[:val_border], data[label][:val_border])
    predictions = lr_classifier.predict(x_train[val_border:])
    print(f'Label = {label}')
    print(classification_report(validation_set[label], predictions))
    print(f'AUC: {roc_auc_score(validation_set[label], predictions)}')
    average_roc += roc_auc_score(validation_set[label], predictions)
                                 
print(f'Average AUC: {average_roc/len(target_columns)}\n\n')

Label = toxic
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     57681
           1       0.90      0.58      0.71      6079

    accuracy                           0.95     63760
   macro avg       0.93      0.79      0.84     63760
weighted avg       0.95      0.95      0.95     63760

AUC: 0.786612570646289
Label = severe_toxic
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     63125
           1       0.58      0.25      0.35       635

    accuracy                           0.99     63760
   macro avg       0.79      0.63      0.67     63760
weighted avg       0.99      0.99      0.99     63760

AUC: 0.625073360879395
Label = obscene
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     60405
           1       0.90      0.63      0.74      3355

    accuracy                           0.98     63760
   macro avg       0.94      0.

In [11]:
# SVM-classifier based on TF-IDF

sv_classifier = SVC()
average_roc = 0
    
for label in target_columns:
    sv_classifier.fit(x_train[:val_border], data[label][:val_border])
    predictions = sv_classifier.predict(x_train[val_border:])
    print(f'Label = {label}')
    print(classification_report(validation_set[label], predictions))
    print(f'AUC: {roc_auc_score(validation_set[label], predictions)}')
    average_roc += roc_auc_score(validation_set[label], predictions)
                                 
print(f'Average AUC: {average_roc/len(target_columns)}\n\n')

Label = toxic
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     57681
           1       0.91      0.59      0.71      6079

    accuracy                           0.95     63760
   macro avg       0.93      0.79      0.84     63760
weighted avg       0.95      0.95      0.95     63760

AUC: 0.791132105924126
Label = severe_toxic
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     63125
           1       0.60      0.10      0.17       635

    accuracy                           0.99     63760
   macro avg       0.80      0.55      0.58     63760
weighted avg       0.99      0.99      0.99     63760

AUC: 0.5492736259452716
Label = obscene
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     60405
           1       0.90      0.68      0.77      3355

    accuracy                           0.98     63760
   macro avg       0.94      0

In [12]:
# Multi-Layer Perceptron Classifier based on TF-IDF

mlp_classifier = MLPClassifier(max_iter=500)
average_roc = 0
    
for label in target_columns:
    mlp_classifier.fit(x_train[:val_border], data[label][:val_border])
    predictions = mlp_classifier.predict(x_train[val_border:])
    print(f'Label = {label}')
    print(classification_report(validation_set[label], predictions))
    print(f'AUC: {roc_auc_score(validation_set[label], predictions)}')
    average_roc += roc_auc_score(validation_set[label], predictions)
                                 
print(f'Average AUC: {average_roc/len(target_columns)}\n\n')

Label = toxic
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     57681
           1       0.77      0.65      0.71      6079

    accuracy                           0.95     63760
   macro avg       0.87      0.82      0.84     63760
weighted avg       0.95      0.95      0.95     63760

AUC: 0.8166472028418869
Label = severe_toxic
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     63125
           1       0.43      0.30      0.35       635

    accuracy                           0.99     63760
   macro avg       0.71      0.65      0.67     63760
weighted avg       0.99      0.99      0.99     63760

AUC: 0.6468307788259141
Label = obscene
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     60405
           1       0.81      0.71      0.75      3355

    accuracy                           0.98     63760
   macro avg       0.90      