In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score
import re, string

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
X = np.array(data['comment_text'])
y = np.array(data[label_cols])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [7]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    return re_tok.sub(r'\1', s).split()

In [8]:
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )

In [9]:
trn_vec = vec.fit_transform(X_train)
tst_vec = vec.transform(X_test)

In [10]:
def model(y):
    m=ComplementNB()
    m.fit(trn_vec, y)
    return m

In [22]:
pred = np.zeros((len(X_test), len(label_cols)))

for i in range(len(label_cols)):
    print('fitting ', label_cols[i])
    m = model(y_train[:, i])
    pred[:,i] =m.predict_proba(tst_vec)[:,1]

fitting  toxic
fitting  severe_toxic
fitting  obscene
fitting  threat
fitting  insult
fitting  identity_hate


In [23]:
pred.shape == y_test.shape

True

In [24]:
p = pred.round()
for i in range(len(label_cols)):
    print(label_cols[i])
    print(confusion_matrix(y_test[:, i], p[:, i]))
    print(accuracy_score(y_test[:, i], p[:, i]))
    print(recall_score(y_test[:, i], p[:, i]))
    print(precision_score(y_test[:, i], p[:, i]))
    print(f1_score(y_test[:, i], p[:, i]))
    print('')

toxic
[[42976   314]
 [ 2776  1806]]
0.9354528743315508
0.3941510257529463
0.8518867924528302
0.5389435989256938

severe_toxic
[[47272   114]
 [  409    77]]
0.98907503342246
0.15843621399176955
0.4031413612565445
0.22747415066469717

obscene
[[45051   265]
 [ 1617   939]]
0.9606868315508021
0.3673708920187793
0.7799003322259136
0.49946808510638296

threat
[[47696    40]
 [  135     1]]
0.9963444184491979
0.007352941176470588
0.024390243902439025
0.011299435028248588

insult
[[45163   320]
 [ 1627   762]]
0.9593290441176471
0.3189619087484303
0.7042513863216266
0.4390665514261019

identity_hate
[[47372    68]
 [  408    24]]
0.9900568181818182
0.05555555555555555
0.2608695652173913
0.0916030534351145

