In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training CSV into a DataFrame.
train = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Preview the first five training rows.
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Replace missing comment text with a placeholder so the text column that is
# later vectorized contains no NaN. Only the text column is filled: a blanket
# fillna("unknown") would also write a string into any missing label cell and
# silently turn that numeric label column into object dtype.
train = train.fillna({"comment_text": "unknown"})

In [5]:
# Read the test CSV into a DataFrame.
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [6]:
# Preview the first five test rows.
test.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [7]:
# Substitute the placeholder string "unknown" for every missing value in the
# test frame.
test = test.fillna(value="unknown")

In [8]:
# Names of the label columns to predict; this order is also used for the
# columns of the prediction matrix and the submission frame.
target_labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Number of labels, derived from the list so the two cannot drift apart.
num_classes = len(target_labels)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Model input: the comment text column of the training frame.
train_x = train.loc[:, "comment_text"]
train_x.shape

(159571,)

In [11]:
# Targets: every training column except the id and the comment text.
train_y = train.drop(columns=["id", "comment_text"])
train_y.shape

(159571, 6)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV

In [13]:
# Scratch containers: an empty list and a zero matrix with one row per
# training comment and one column per label. Neither is read by any later
# cell visible in this notebook.
loss = list()
preds_train = np.zeros(shape=(len(train_x), len(target_labels)))

In [14]:
# Stack the train and test comments into one Series; the TF-IDF vectorizer
# and the scaler are fitted on this combined corpus.
# Series.append was removed in pandas 2.0; pd.concat produces the same
# result (the original index labels of both frames are kept).
corpus = pd.concat([train["comment_text"], test["comment_text"]])

In [15]:
# TF-IDF over word unigrams and bigrams, capped at 10000 features, fitted on
# the combined corpus. The trailing expression shows the fitted estimator.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
)
vectorizer.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# Fit a max-abs scaler on the TF-IDF matrix of the whole corpus.
# fit() returns the estimator itself, so `scaler` ends up as the fitted object.
scaler = MaxAbsScaler()
scaler = scaler.fit(vectorizer.transform(corpus))

In [17]:
# Vectorize the training comments with the fitted TF-IDF model and show the
# first row of the resulting sparse matrix.
train_x_tf = vectorizer.transform(raw_documents=train_x)
train_x_tf[0]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 49 stored elements in Compressed Sparse Row format>

In [18]:
# Scale the training features. The input is re-derived from the raw text
# rather than from train_x_tf itself, so re-running this cell cannot apply
# the scaler a second time to already-scaled values.
train_x_tf = scaler.transform(vectorizer.transform(train_x))

In [19]:
# Baseline: 5-fold cross-validated log loss of a multinomial Naive Bayes
# classifier, evaluated separately for each target label.
for label in target_labels:
    print("Fitting {}".format(label))
    model = MultinomialNB()
    score = cross_val_score(model, train_x_tf, train_y[label], cv=5, scoring='neg_log_loss')
    # The 'neg_log_loss' scorer returns negated losses; flip the sign so the
    # printed numbers are plain log losses (lower is better).
    fold_loss = -1 * score
    # The original format string had a single placeholder, so the mean passed
    # as the second argument was silently dropped.
    print("Score: {} Mean: {}".format(fold_loss, fold_loss.mean()))

Fitting toxic
Score: [ 0.1483651   0.14835159  0.15008625  0.1508239   0.14876484]
Fitting severe_toxic
Score: [ 0.03640955  0.03678665  0.03549326  0.03490874  0.034756  ]
Fitting obscene
Score: [ 0.09638473  0.09654212  0.09180878  0.09286437  0.09314745]
Fitting threat
Score: [ 0.01837779  0.01488188  0.01622753  0.01697065  0.01652912]
Fitting insult
Score: [ 0.1025637   0.10004183  0.09794684  0.09983171  0.10012616]
Fitting identity_hate
Score: [ 0.03881105  0.03775829  0.03802477  0.03619605  0.03700685]


In [20]:
# Grid-search the C parameter of a logistic regression separately for each
# label (5-fold CV, scored by negated log loss) and record the best setting
# per label, in the same order as target_labels.
params = {'C': [1, 5]}
param_per_model = []

for label in target_labels:
    # Use str.format so the label is substituted into the message instead of
    # printing a literal "{}" followed by the value.
    print("Fitting {}".format(label))
    model = GridSearchCV(LogisticRegression(), params, scoring="neg_log_loss", cv=5, n_jobs=4)
    model.fit(train_x_tf, train_y[label])
    # best_score_ is the mean negated log loss, so values closer to 0 are better.
    print("Score: {}".format(model.best_score_))
    print(model.best_params_)
    param_per_model.append(model.best_params_)

Fitting {} toxic
Score: {} -0.118908600458
{'C': 5}
Fitting {} severe_toxic
Score: {} -0.0274020120904
{'C': 1}
Fitting {} obscene
Score: {} -0.0679711217436
{'C': 5}
Fitting {} threat
Score: {} -0.00919538351821
{'C': 5}
Fitting {} insult
Score: {} -0.0798041947252
{'C': 1}
Fitting {} identity_hate
Score: {} -0.0268871541221
{'C': 5}


In [21]:
# Fit one logistic regression per label on the full training matrix, using
# the parameters stored for that label in param_per_model. The fitted
# estimators are collected in target_labels order.
models = []
for i, label in enumerate(target_labels):
    # Use str.format so the label is substituted into the message instead of
    # printing a literal "{}" followed by the value.
    print("Fitting {}".format(label))
    model = LogisticRegression(**param_per_model[i])
    model.fit(train_x_tf, train_y[label])
    models.append(model)

Fitting {} toxic
Fitting {} severe_toxic
Fitting {} obscene
Fitting {} threat
Fitting {} insult
Fitting {} identity_hate


In [22]:
# Vectorize the test comments with the fitted TF-IDF model and show the
# first row of the resulting sparse matrix.
test_x_tf = vectorizer.transform(raw_documents=test["comment_text"])
test_x_tf[0]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 66 stored elements in Compressed Sparse Row format>

In [23]:
# Scale the test features with the scaler fitted earlier. The input is
# re-derived from the raw text rather than from test_x_tf itself, so
# re-running this cell cannot apply the scaler a second time to
# already-scaled values.
test_x_tf = scaler.transform(vectorizer.transform(test["comment_text"]))

In [24]:
# Zero-initialised prediction matrix: one row per test comment, one column
# per label.
preds = np.zeros(shape=(len(test), len(target_labels)))
preds.shape

(153164, 6)

In [25]:
# For each label, store the second column of that label's predict_proba
# output in the matching column of the prediction matrix.
for col in range(len(target_labels)):
    preds[:, col] = models[col].predict_proba(test_x_tf)[:, 1]

In [26]:
# Inspect the predicted values for the first test comment.
preds[0, :]

array([ 0.99992732,  0.20124534,  0.99957553,  0.19709229,  0.91740575,
        0.2283741 ])

In [27]:
# Assemble the submission table: rows indexed by test id, one column per
# label holding the predicted values; then preview the first five rows.
submission = pd.DataFrame(
    preds,
    index=test["id"],
    columns=target_labels,
)
submission.head(5)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.999927,0.201245,0.999576,0.197092,0.917406,0.228374
0000247867823ef7,0.000249,0.001643,9.8e-05,7.6e-05,0.00506,0.000538
00013b17ad220c46,0.003892,0.002042,0.006129,0.000199,0.008413,0.00054
00017563c3f7919a,0.000843,0.001547,0.00049,0.000311,0.001773,0.000238
00017695ad8997eb,0.020676,0.004117,0.002206,0.000242,0.007394,0.000382


In [28]:
# Write the submission frame (including its id index) to a CSV file in the
# current directory.
submission.to_csv(path_or_buf="./submission_tf_lr.csv")