In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import   roc_auc_score,multilabel_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords

In [2]:
# Word Vectorizer
vect = TfidfVectorizer(lowercase = True,ngram_range = (1,1),
                       use_idf = True,sublinear_tf = True,
                       stop_words='english',max_features=10000)

In [3]:
Xtrain = pd.read_csv('../.data/train_new_features.csv',index_col='id')
Xtest = pd.read_csv('../.data/test_new_features.csv',index_col='id')

Xtr = Xtrain[['comment_text']]
ytr = Xtrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']]

Xts = Xtest[['comment_text']]

In [4]:
xtrain, xtest, ytrain, ytest = train_test_split(
    Xtr,ytr,test_size=0.20)
print(xtrain.shape)

(127656, 1)


## For now we want to have a glance at a baseline model, so we are not using nothing besides the comment_text with TF-IDF and a Logistic Regression with no tunning

In [5]:
print(ytrain.apply(pd.value_counts))
print(ytest.apply(pd.value_counts))

    toxic  severe_toxic  obscene  threat  insult  identity_hate
0  115458        126389   120909  127275  121384         126544
1   12198          1267     6747     381    6272           1112
   toxic  severe_toxic  obscene  threat  insult  identity_hate
0  28819         31587    30213   31818   30310          31622
1   3096           328     1702      97    1605            293


In [6]:
from sklearn.linear_model import LogisticRegressionCV 
clf = OneVsRestClassifier(LogisticRegression(multi_class="multinomial"))

modelo = Pipeline([('vetorizador',vect),
                   ('classificador',clf)
                  ])

In [7]:
modelo.fit(xtrain.comment_text,ytrain);

In [8]:
ypred = np.array(modelo.predict_proba(xtest.comment_text))

roc_auc_score(ytest,ypred,average='macro')

0.9771638893839931

In [32]:
y_pred_sub = np.array(modelo.predict_proba(Xts.comment_text))
y_pred_sub.shape

(153164, 6)

# submiting

In [21]:
submission_file = pd.read_csv( '../.data/sample_submission.csv')

In [23]:
submission_file.iloc[:,1:] = y_pred_sub

In [31]:
submission_file.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998929,0.239878,0.998264,0.036925,0.939904,0.29467
1,0000247867823ef7,0.006549,0.002097,0.003238,0.00152,0.006079,0.002413
2,00013b17ad220c46,0.009531,0.001093,0.004561,0.00051,0.009164,0.001566
3,00017563c3f7919a,0.002203,0.001748,0.002639,0.000732,0.002366,0.000515
4,00017695ad8997eb,0.027388,0.002633,0.005453,0.001517,0.009465,0.00234


In [29]:
submission_file.to_csv('../.data/submission.csv', index=False,header=True)

### Score in our test set : 0.97716
### Score in Kaggle Public Leaderboard: 0.97340

---------------------------

### Confusion metrix per label

In [40]:
ypred_class = np.array(
    modelo.predict(xtest.comment_text)
)
multilabel_confusion_matrix(ytest,ypred_class)

array([[[28620,   199],
        [ 1071,  2025]],

       [[31509,    78],
        [  244,    84]],

       [[30057,   156],
        [  545,  1157]],

       [[31811,     7],
        [   76,    21]],

       [[30101,   209],
        [  699,   906]],

       [[31596,    26],
        [  229,    64]]])