# Hate_speech and fake_news_labelling

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, auc
from sklearn.metrics import average_precision_score, precision_recall_curve,roc_curve,roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import time

In [2]:
# Load the dataset
#load train and test data
df1=pd.read_csv(r'/Users/sk/Documents/Liverpool/jigsaw-toxic-comment-classification-challenge/train.csv/trains.csv')
df2=pd.read_csv(r'/Users/sk/Documents/Liverpool/jigsaw-toxic-comment-classification-challenge/test.csv/test.csv')
df3=pd.read_csv(r'/Users/sk/Documents/Liverpool/jigsaw-toxic-comment-classification-challenge/test_labels.csv/test_labels.csv')

In [3]:
# Display the first few rows of the train data
df1.sample(2)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
140445,ef86792ec6d8ac8b,"acceptable, as far as anyone can know",0,0,0,0,0,0
52269,8be27c1f932ceee6,Friendly notification\nPlease note that Wikipe...,0,0,0,0,0,0


In [4]:
# Prepare the target labels (only the training set has labels)
X = df1['comment_text']
y = df1[df1.columns[2:]]


# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [5]:
# Test and evaluate the model
#X_test = df2['comment_text']

# Prepare the test labels for evaluation
test_labels = df3[df3.iloc[:,2:].sum(axis=1) != -6] 
X_test = df2[df2['id'].isin(test_labels['id'])]['comment_text']
y_test = test_labels.drop(columns=['id'])
#z_test = test_labels[test_labels.columns[1:]]


In [6]:
test_labels.count()

id               153164
toxic            153164
severe_toxic     153164
obscene          153164
threat           153164
insult           153164
identity_hate    153164
dtype: int64

In [7]:
X_val_split

119105    Geez, are you forgetful!  We've already discus...
131631    Carioca RFA \n\nThanks for your support on my ...
125326    "\n\n Birthday \n\nNo worries, It's what I do ...
111256    Pseudoscience category? \n\nI'm assuming that ...
83590     (and if such phrase exists, it would be provid...
                                ...                        
121162    "  Would you claim them to be part of the ""ig...
34019     The lyrics is found in the German version, so ...
83938     Encyclopedia Titanica references do not source...
78687              A silly fat cow who won't leave me alone
127984    Shazbot now your lieing you already stated you...
Name: comment_text, Length: 31915, dtype: object

# SVM (linear) model

In [8]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

In [9]:
# Define the feature extraction and model pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=10000)),
    ('svm', OneVsRestClassifier(LinearSVC(multi_class='ovr')))
])


In [10]:
%%time
# Train a separate SVM model for each label
svm_models = []
for label in y:
    print(f'Training model for {label}')
    pipeline.fit(X_train_split, y_train_split[label])
    svm_models.append(pipeline.named_steps['svm'])

    # Validate the model
    svm_y_val_pred = pipeline.predict(X_val_split)
    print(f'ROCAUC for {label}: {roc_auc_score(y_val_split[label], svm_y_val_pred):.2f}')
    print(f'Accuracy for {label}: {accuracy_score(y_val_split[label], svm_y_val_pred):.2f}')
    print(f'Recall for {label}: {recall_score(y_val_split[label], svm_y_val_pred):.2f}')
    print(f'Precision for {label}: {precision_score(y_val_split[label], svm_y_val_pred):.2f}')
    print(f'Confusion Matrix for {label}:\n{confusion_matrix(y_val_split[label], svm_y_val_pred)}\n')


Training model for toxic




ROCAUC for toxic: 0.83
Accuracy for toxic: 0.96
Recall for toxic: 0.68
Precision for toxic: 0.86
Confusion Matrix for toxic:
[[28533   326]
 [  975  2081]]

Training model for severe_toxic




ROCAUC for severe_toxic: 0.64
Accuracy for severe_toxic: 0.99
Recall for severe_toxic: 0.27
Precision for severe_toxic: 0.54
Confusion Matrix for severe_toxic:
[[31519    75]
 [  233    88]]

Training model for obscene




ROCAUC for obscene: 0.84
Accuracy for obscene: 0.98
Recall for obscene: 0.69
Precision for obscene: 0.89
Confusion Matrix for obscene:
[[30056   144]
 [  526  1189]]

Training model for threat




ROCAUC for threat: 0.62
Accuracy for threat: 1.00
Recall for threat: 0.24
Precision for threat: 0.58
Confusion Matrix for threat:
[[31828    13]
 [   56    18]]

Training model for insult




ROCAUC for insult: 0.78
Accuracy for insult: 0.97
Recall for insult: 0.57
Precision for insult: 0.79
Confusion Matrix for insult:
[[30063   238]
 [  692   922]]

Training model for identity_hate




ROCAUC for identity_hate: 0.64
Accuracy for identity_hate: 0.99
Recall for identity_hate: 0.28
Precision for identity_hate: 0.73
Confusion Matrix for identity_hate:
[[31590    31]
 [  211    83]]

CPU times: user 19.8 s, sys: 298 ms, total: 20.1 s
Wall time: 20.4 s


In [11]:
%%time
# Predict and evaluate on the test set
svm_predictions = []
for i, label in enumerate (y.columns):
    svm_y_test_pred = svm_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))
    print(f'Accuracy for {label} on test set: {accuracy_score(y_test[label], svm_y_test_pred):.2f}')
    print(f'Recall for {label} on test set: {recall_score(y_test[label], svm_y_test_pred, average="macro"):.2f}')
    print(f'Precision for {label} on test set: {precision_score(y_test[label], svm_y_test_pred, average= "macro"):.2f}')
    print(f'Confusion Matrix for {label} on test set:\n{confusion_matrix(y_test[label], svm_y_test_pred)}\n')

Accuracy for toxic on test set: 0.38
Recall for toxic on test set: 0.35
Precision for toxic on test set: 0.20
Confusion Matrix for toxic on test set:
[[    0 88167  1019]
 [    0 57803    85]
 [    0  5785   305]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for severe_toxic on test set: 0.41
Recall for severe_toxic on test set: 0.40
Precision for severe_toxic on test set: 0.16
Confusion Matrix for severe_toxic on test set:
[[    0 88167  1019]
 [    0 63297   314]
 [    0   291    76]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for obscene on test set: 0.39
Recall for obscene on test set: 0.35
Precision for obscene on test set: 0.19
Confusion Matrix for obscene on test set:
[[    0 88167  1019]
 [    0 60128   159]
 [    0  3460   231]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for threat on test set: 0.41
Recall for threat on test set: 0.36
Precision for threat on test set: 0.14
Confusion Matrix for threat on test set:
[[    0 88167  1019]
 [    0 63397   370]
 [    0   191    20]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for insult on test set: 0.40
Recall for insult on test set: 0.36
Precision for insult on test set: 0.19
Confusion Matrix for insult on test set:
[[    0 88167  1019]
 [    0 60406   145]
 [    0  3182   245]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for identity_hate on test set: 0.41
Recall for identity_hate on test set: 0.45
Precision for identity_hate on test set: 0.20
Confusion Matrix for identity_hate on test set:
[[    0 88167  1019]
 [    0 63124   142]
 [    0   464   248]]

CPU times: user 15.6 s, sys: 144 ms, total: 15.7 s
Wall time: 15.9 s


  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# Create output DataFrame with 'id' column and predicted labels
output_svm_df = df2[['id','comment_text']].copy()
for i, label in enumerate(y.columns):
    output_svm_df[label] = svm_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))


In [51]:
# Save predictions to CSV
output_svm_df.to_csv('/Users/sk/Documents/Liverpool/svm_predictions.csv', index=False)


# SVC

In [14]:
from sklearn.svm import SVC

In [15]:
# Define the feature extraction and model pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=10000)),
    ('svc', OneVsRestClassifier(SVC(kernel='linear')))
])


In [16]:
%%time
# Train a separate SVM model for each label
svc_models = []
for label in y:
    print(f'Training model for {label}')
    pipeline.fit(X_train_split, y_train_split[label])
    svc_models.append(pipeline.named_steps['svc'])

    # Validate the model
    svc_y_val_pred = pipeline.predict(X_val_split)
    print(f'ROCAUC for {label}: {roc_auc_score(y_val_split[label], svc_y_val_pred):.2f}')
    print(f'Accuracy for {label}: {accuracy_score(y_val_split[label], svc_y_val_pred):.2f}')
    print(f'Recall for {label}: {recall_score(y_val_split[label], svc_y_val_pred):.2f}')
    print(f'Precision for {label}: {precision_score(y_val_split[label], svc_y_val_pred):.2f}')
    print(f'Confusion Matrix for {label}:\n{confusion_matrix(y_val_split[label], svc_y_val_pred)}\n')
    

Training model for toxic
ROCAUC for toxic: 0.83
Accuracy for toxic: 0.96
Recall for toxic: 0.66
Precision for toxic: 0.89
Confusion Matrix for toxic:
[[28620   239]
 [ 1040  2016]]

Training model for severe_toxic
ROCAUC for severe_toxic: 0.52
Accuracy for severe_toxic: 0.99
Recall for severe_toxic: 0.04
Precision for severe_toxic: 0.76
Confusion Matrix for severe_toxic:
[[31590     4]
 [  308    13]]

Training model for obscene
ROCAUC for obscene: 0.85
Accuracy for obscene: 0.98
Recall for obscene: 0.70
Precision for obscene: 0.89
Confusion Matrix for obscene:
[[30053   147]
 [  521  1194]]

Training model for threat
ROCAUC for threat: 0.58
Accuracy for threat: 1.00
Recall for threat: 0.16
Precision for threat: 0.63
Confusion Matrix for threat:
[[31834     7]
 [   62    12]]

Training model for insult
ROCAUC for insult: 0.78
Accuracy for insult: 0.97
Recall for insult: 0.57
Precision for insult: 0.80
Confusion Matrix for insult:
[[30077   224]
 [  696   918]]

Training model for ident

In [17]:
%%time
# Predict and evaluate on the test set
svc_predictions = []
for i, label in enumerate (y.columns):
    svc_y_test_pred = svc_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))
    print(f'Accuracy for {label} on test set: {accuracy_score(y_test[label], svc_y_test_pred):.2f}')
    print(f'Recall for {label} on test set: {recall_score(y_test[label], svc_y_test_pred, average="macro"):.2f}')
    print(f'Precision for {label} on test set: {precision_score(y_test[label], svc_y_test_pred, average= "macro"):.2f}')
    print(f'Confusion Matrix for {label} on test set:\n{confusion_matrix(y_test[label], svc_y_test_pred)}\n')

Accuracy for toxic on test set: 0.38
Recall for toxic on test set: 0.34
Precision for toxic on test set: 0.20
Confusion Matrix for toxic on test set:
[[    0 88572   614]
 [    0 57846    42]
 [    0  5907   183]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for severe_toxic on test set: 0.41
Recall for severe_toxic on test set: 0.37
Precision for severe_toxic on test set: 0.15
Confusion Matrix for severe_toxic on test set:
[[    0 88572   614]
 [    0 63426   185]
 [    0   327    40]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for obscene on test set: 0.39
Recall for obscene on test set: 0.34
Precision for obscene on test set: 0.18
Confusion Matrix for obscene on test set:
[[    0 88572   614]
 [    0 60196    91]
 [    0  3557   134]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for threat on test set: 0.41
Recall for threat on test set: 0.35
Precision for threat on test set: 0.14
Confusion Matrix for threat on test set:
[[    0 88572   614]
 [    0 63551   216]
 [    0   202     9]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for insult on test set: 0.40
Recall for insult on test set: 0.35
Precision for insult on test set: 0.19
Confusion Matrix for insult on test set:
[[    0 88572   614]
 [    0 60480    71]
 [    0  3273   154]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for identity_hate on test set: 0.41
Recall for identity_hate on test set: 0.41
Precision for identity_hate on test set: 0.20
Confusion Matrix for identity_hate on test set:
[[    0 88572   614]
 [    0 63205    61]
 [    0   548   164]]

CPU times: user 4min 21s, sys: 1.02 s, total: 4min 22s
Wall time: 4min 26s


  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
# Create output DataFrame with 'id' column and predicted labels
output_svc_df = df2[['id','comment_text']].copy()
for i, label in enumerate(y.columns):
    output_svc_df[label] = svm_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))


In [49]:
# Save predictions to CSV
output_svc_df.to_csv('/Users/sk/Documents/Liverpool/svc_predictions.csv', index=False)


# logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression



In [21]:
# Define the feature extraction and model pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=10000)),
    ('logisticRegression', OneVsRestClassifier(LogisticRegression(C=1e5, max_iter=100000))) #penalty, default = l2, ridge
])

In [22]:
%%time
# Train a separate LogisticRegression model for each label
logistic_models = []
for label in y.columns:
    print(f'Training model for {label}')
    pipeline.fit(X_train_split, y_train_split[label])
    logistic_models.append(pipeline.named_steps['logisticRegression'])

    # Validate the model
    log_y_val_pred = pipeline.predict(X_val_split)
    print(f'ROCAUC for {label}: {roc_auc_score(y_val_split[label], log_y_val_pred):.2f}')
    print(f'Accuracy for {label}: {accuracy_score(y_val_split[label], log_y_val_pred):.2f}')
    print(f'Recall for {label}: {recall_score(y_val_split[label], log_y_val_pred):.2f}')
    print(f'Precision for {label}: {precision_score(y_val_split[label], log_y_val_pred):.2f}')
    print(f'Confusion Matrix for {label}:\n{confusion_matrix(y_val_split[label], log_y_val_pred)}\n')
    

Training model for toxic
ROCAUC for toxic: 0.83
Accuracy for toxic: 0.94
Recall for toxic: 0.68
Precision for toxic: 0.72
Confusion Matrix for toxic:
[[28065   794]
 [  983  2073]]

Training model for severe_toxic
ROCAUC for severe_toxic: 0.66
Accuracy for severe_toxic: 0.98
Recall for severe_toxic: 0.33
Precision for severe_toxic: 0.17
Confusion Matrix for severe_toxic:
[[31083   511]
 [  216   105]]

Training model for obscene
ROCAUC for obscene: 0.82
Accuracy for obscene: 0.96
Recall for obscene: 0.66
Precision for obscene: 0.65
Confusion Matrix for obscene:
[[29581   619]
 [  586  1129]]

Training model for threat
ROCAUC for threat: 0.65
Accuracy for threat: 0.99
Recall for threat: 0.30
Precision for threat: 0.14
Confusion Matrix for threat:
[[31709   132]
 [   52    22]]

Training model for insult
ROCAUC for insult: 0.76
Accuracy for insult: 0.95
Recall for insult: 0.53
Precision for insult: 0.55
Confusion Matrix for insult:
[[29602   699]
 [  753   861]]

Training model for ident

In [23]:
%%time
# Predict and evaluate on the test set
predictions = []
for i, label in enumerate (y.columns):
    log_y_test_pred = logistic_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))
    print(f'Accuracy for {label} on test set: {accuracy_score(y_test[label], log_y_test_pred):.2f}')
    print(f'Recall for {label} on test set: {recall_score(y_test[label], log_y_test_pred, average="macro"):.2f}')
    print(f'Precision for {label} on test set: {precision_score(y_test[label], log_y_test_pred, average= "macro"):.2f}')
    print(f'Confusion Matrix for {label} on test set:\n{confusion_matrix(y_test[label], log_y_test_pred)}\n')

Accuracy for toxic on test set: 0.38
Recall for toxic on test set: 0.36
Precision for toxic on test set: 0.17
Confusion Matrix for toxic on test set:
[[    0 86323  2863]
 [    0 56976   912]
 [    0  5548   542]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for severe_toxic on test set: 0.41
Recall for severe_toxic on test set: 0.40
Precision for severe_toxic on test set: 0.15
Confusion Matrix for severe_toxic on test set:
[[    0 86323  2863]
 [    0 62235  1376]
 [    0   289    78]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for obscene on test set: 0.39
Recall for obscene on test set: 0.36
Precision for obscene on test set: 0.16
Confusion Matrix for obscene on test set:
[[    0 86323  2863]
 [    0 59191  1096]
 [    0  3333   358]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for threat on test set: 0.41
Recall for threat on test set: 0.38
Precision for threat on test set: 0.14
Confusion Matrix for threat on test set:
[[    0 86323  2863]
 [    0 62350  1417]
 [    0   174    37]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for insult on test set: 0.39
Recall for insult on test set: 0.36
Precision for insult on test set: 0.16
Confusion Matrix for insult on test set:
[[    0 86323  2863]
 [    0 59467  1084]
 [    0  3057   370]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for identity_hate on test set: 0.41
Recall for identity_hate on test set: 0.44
Precision for identity_hate on test set: 0.16
Confusion Matrix for identity_hate on test set:
[[    0 86323  2863]
 [    0 62044  1222]
 [    0   480   232]]

CPU times: user 15.7 s, sys: 179 ms, total: 15.9 s
Wall time: 16.3 s


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
len(log_y_test_pred)

153164

In [46]:
# Create output DataFrame with 'id' column and predicted labels
output_log_df = df2[['id','comment_text']].copy()
for i, label in enumerate(y.columns):
    output_log_df[label] = logistic_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))
    

In [47]:
# Save predictions to CSV
output_log_df.to_csv('/Users/sk/Documents/Liverpool/log_predictions.csv', index=False)


# Multinomial NB

In [27]:
from sklearn.naive_bayes import MultinomialNB


In [28]:
# Define the feature extraction and model pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=10000)),
    ('multinomialNB', OneVsRestClassifier(MultinomialNB()))
])


In [29]:
%%time
# Train a separate MultinomialNaiveBayes model for each label
naive_bayes_models = []
for label in y.columns:
    print(f'Training model for {label}')
    pipeline.fit(X_train_split, y_train_split[label])
    naive_bayes_models.append(pipeline.named_steps['multinomialNB'])

    # Validate the model
    nb_y_val_pred = pipeline.predict(X_val_split)
    print(f'ROCAUC for {label}: {roc_auc_score(y_val_split[label], nb_y_val_pred):.2f}')
    print(f'Accuracy for {label}: {accuracy_score(y_val_split[label], nb_y_val_pred):.2f}')
    print(f'Recall for {label}: {recall_score(y_val_split[label], nb_y_val_pred):.2f}')
    print(f'Precision for {label}: {precision_score(y_val_split[label], nb_y_val_pred):.2f}')
    print(f'Confusion Matrix for {label}:\n{confusion_matrix(y_val_split[label], nb_y_val_pred)}\n')
    

Training model for toxic
ROCAUC for toxic: 0.75
Accuracy for toxic: 0.95
Recall for toxic: 0.51
Precision for toxic: 0.92
Confusion Matrix for toxic:
[[28731   128]
 [ 1493  1563]]

Training model for severe_toxic
ROCAUC for severe_toxic: 0.55
Accuracy for severe_toxic: 0.99
Recall for severe_toxic: 0.10
Precision for severe_toxic: 0.73
Confusion Matrix for severe_toxic:
[[31582    12]
 [  289    32]]

Training model for obscene
ROCAUC for obscene: 0.75
Accuracy for obscene: 0.97
Recall for obscene: 0.50
Precision for obscene: 0.91
Confusion Matrix for obscene:
[[30116    84]
 [  864   851]]

Training model for threat
ROCAUC for threat: 0.50
Accuracy for threat: 1.00
Recall for threat: 0.00
Precision for threat: 0.00
Confusion Matrix for threat:
[[31840     1]
 [   74     0]]

Training model for insult
ROCAUC for insult: 0.70
Accuracy for insult: 0.97
Recall for insult: 0.41
Precision for insult: 0.83
Confusion Matrix for insult:
[[30164   137]
 [  960   654]]

Training model for ident

In [30]:
%%time
# Predict and evaluate on the test set
predictions = []
for i, label in enumerate (y.columns):
    nb_y_test_pred = naive_bayes_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))
    print(f'Accuracy for {label} on test set: {accuracy_score(y_test[label], nb_y_test_pred):.2f}')
    print(f'Recall for {label} on test set: {recall_score(y_test[label], nb_y_test_pred, average="macro"):.2f}')
    print(f'Precision for {label} on test set: {precision_score(y_test[label], nb_y_test_pred, average= "macro"):.2f}')
    print(f'Confusion Matrix for {label} on test set:\n{confusion_matrix(y_test[label], nb_y_test_pred)}\n')
    

Accuracy for toxic on test set: 0.38
Recall for toxic on test set: 0.33
Precision for toxic on test set: 0.21
Confusion Matrix for toxic on test set:
[[    0 89111    75]
 [    0 57888     0]
 [    0  6063    27]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for severe_toxic on test set: 0.42
Recall for severe_toxic on test set: 0.35
Precision for severe_toxic on test set: 0.20
Confusion Matrix for severe_toxic on test set:
[[    0 89111    75]
 [    0 63602     9]
 [    0   349    18]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for obscene on test set: 0.39
Recall for obscene on test set: 0.34
Precision for obscene on test set: 0.22
Confusion Matrix for obscene on test set:
[[    0 89111    75]
 [    0 60286     1]
 [    0  3665    26]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for threat on test set: 0.42
Recall for threat on test set: 0.34
Precision for threat on test set: 0.15
Confusion Matrix for threat on test set:
[[    0 89111    75]
 [    0 63743    24]
 [    0   208     3]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for insult on test set: 0.40
Recall for insult on test set: 0.34
Precision for insult on test set: 0.22
Confusion Matrix for insult on test set:
[[    0 89111    75]
 [    0 60550     1]
 [    0  3401    26]]



  _warn_prf(average, modifier, msg_start, len(result))


Accuracy for identity_hate on test set: 0.41
Recall for identity_hate on test set: 0.34
Precision for identity_hate on test set: 0.21
Confusion Matrix for identity_hate on test set:
[[    0 89111    75]
 [    0 63262     4]
 [    0   689    23]]

CPU times: user 15.7 s, sys: 124 ms, total: 15.8 s
Wall time: 16 s


  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
# Create output DataFrame with 'id' column and predicted labels
output_NB_df = df2[['id','comment_text']].copy()
for i, label in enumerate(y.columns):
    output_NB_df[label] = naive_bayes_models[i].predict(pipeline.named_steps['tfidf'].transform(X_test))
    

In [45]:
# Save predictions to CSV
output_NB_df.to_csv('/Users/sk/Documents/Liverpool/nb_predictions.csv', index=False)


# Evaluate models performance

In [41]:
# Evaluate model performance for training
model_names = ['svm_models', 'svc_models', 'logistic_models', 'naive_bayes_models']
accuracies = [[accuracy_score(y_val_split[label], svm_y_val_pred)], [accuracy_score(y_val_split[label], svc_y_val_pred)],
              [accuracy_score(y_val_split[label], log_y_val_pred)], [accuracy_score(y_val_split[label], nb_y_val_pred)]]
precisions = [[precision_score(y_val_split[label], svm_y_val_pred)], [precision_score(y_val_split[label], svc_y_val_pred)],
              [precision_score(y_val_split[label], log_y_val_pred)], [precision_score(y_val_split[label], nb_y_val_pred)]]
recalls = [[recall_score(y_val_split[label], svm_y_val_pred)], [recall_score(y_val_split[label], svc_y_val_pred)],
              [recall_score(y_val_split[label], log_y_val_pred)], [recall_score(y_val_split[label], nb_y_val_pred)]]
roc_auc = [[roc_auc_score(y_val_split[label], svm_y_val_pred)], [roc_auc_score(y_val_split[label], svc_y_val_pred)],
              [roc_auc_score(y_val_split[label], log_y_val_pred)], [roc_auc_score(y_val_split[label], nb_y_val_pred)]]

# Evaluate model performance for predictions
model_names = ['svm_models', 'svc_models', 'logistic_models', 'naive_bayes_models']
accuracies_pred = [[accuracy_score(y_test[label], svm_y_test_pred)], [accuracy_score(y_test[label], svc_y_test_pred)],
              [accuracy_score(y_test[label], log_y_test_pred)], [accuracy_score(y_test[label], nb_y_test_pred)]]
precisions_pred = [[precision_score(y_test[label], svm_y_test_pred,average='macro')], [precision_score(y_test[label], svc_y_test_pred,average='macro')],
              [precision_score(y_test[label], log_y_test_pred,average='macro')], [precision_score(y_test[label], nb_y_test_pred,average='macro')]]
recalls_pred = [[recall_score(y_test[label], svm_y_test_pred,average='macro')], [recall_score(y_test[label], svc_y_test_pred,average='macro')],
              [recall_score(y_test[label], log_y_test_pred,average='macro')], [recall_score(y_test[label], nb_y_test_pred, average='macro')]]
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# Create a DataFrame to store train scores
train_results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'ROC_AUC': roc_auc
})

# Create a DataFrame to store prediction scores
pred_results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies_pred,
    'Precision': precisions_pred,
    'Recall': recalls_pred,
})

print('Train scores')
print(train_results_df.to_string(index=False))

print('__________________________________________________________________________')

print('Prediction scores')
print(pred_results_df.to_string(index=False))

Train scores
             Model             Accuracy             Precision                 Recall              ROC_AUC
        svm_models [0.9924173586088046]  [0.7280701754385965]    [0.282312925170068] [0.6406662820088346]
        svc_models [0.9918846937176876]  [0.8181818181818182]  [0.15306122448979592]  [0.576372489478382]
   logistic_models [0.9795707347642174] [0.17454545454545456]  [0.32653061224489793]  [0.656086532522626]
naive_bayes_models [0.9909760300798998]  [0.7142857142857143] [0.034013605442176874] [0.5169435536144821]
__________________________________________________________________________
Prediction scores
             Model             Accuracy             Precision                Recall
        svm_models [0.4137525789349978]  [0.1973237636645179] [0.44869003840984817]
        svc_models [0.4137329920869134] [0.20346865669267133]  [0.4097909652260694]
   logistic_models  [0.406596850434828]  [0.1568572430714416]  [0.4355091415343882]
naive_bayes_models   [0.4131

In [37]:
#output count
print(f'Count of label 1 using SVM model is: {np.sum((svm_y_test_pred)==1):.2f}')
print(f'Count of label 0 using SVM model is: {np.sum((svm_y_test_pred)==0):.2f}')
print(f'Count of label 1 using SVC model is: {np.sum((svc_y_test_pred)==1):.2f}') 
print(f'Count of label 0 using SVC model is: {np.sum((svc_y_test_pred)==0):.2f}')
print(f'Count of label 1 using Logistic Regression model is: {np.sum((log_y_test_pred)==1):.2f}')
print(f'Count of label 0 using Logistic Regression model is: {np.sum((log_y_test_pred)==0):.2f}')
print(f'Count of label 1 using MultinomialNB model is: {np.sum((nb_y_test_pred)==1):.2f}')
print(f'Count of label 0 using MultinomialNB model is: {np.sum((nb_y_test_pred)==0):.2f}')

#print(f'Confusion Matrix for {label} on test set:\n{confusion_matrix(y_test[label], nb_y_test_pred)}\n')
 #print(f'Precision for {label} on test set: {precision_score(y_test[label], nb_y_test_pred, average= "macro"):.2f}')

Count of label 1 using SVM model is: 1409.00
Count of label 0 using SVM model is: 151755.00
Count of label 1 using SVC model is: 839.00
Count of label 0 using SVC model is: 152325.00
Count of label 1 using Logistic Regression model is: 4317.00
Count of label 0 using Logistic Regression model is: 148847.00
Count of label 1 using MultinomialNB model is: 102.00
Count of label 0 using MultinomialNB model is: 153062.00
