In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled training comments from the local data directory.
train = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [3]:
# Show the first five training rows.
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
# Replace missing comment text with a placeholder string so the vectorizer
# always receives text. Only `comment_text` is filled: a frame-wide
# fillna("unknown") would also write the string into any label/id column that
# happens to contain a missing value, turning it into an object column.
train = train.assign(comment_text=train["comment_text"].fillna("unknown"))

In [5]:
# Load the unlabelled test comments from the local data directory.
test = pd.read_csv(filepath_or_buffer="./data/test.csv")

In [6]:
# Show the first five test rows.
test.head(5)

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [7]:
# Replace missing comment text with a placeholder string so the vectorizer
# always receives text. Only `comment_text` is filled so that a missing value
# in any other column (e.g. `id`) is not silently replaced by a string.
test = test.assign(comment_text=test["comment_text"].fillna("unknown"))

In [8]:
# Names of the binary label columns; one classifier is trained per entry.
target_labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Number of label columns.
num_classes = len(target_labels)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Model input: the comment text of every training row.
train_x = train.loc[:, "comment_text"]
train_x.shape

(95851,)

In [11]:
# Label frame: every training column except the row id and the comment text.
train_y = train.drop(labels=["id", "comment_text"], axis="columns")
train_y.shape

(95851, 6)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV

In [13]:
# Empty list for loss values, plus a zero-filled matrix with one row per
# training comment and one column per target label.
loss = list()
preds_train = np.zeros(shape=(len(train_x), len(target_labels)))

In [15]:
# Train and test comment text stacked into one Series; the vectorizer and the
# scaler are fitted on this combined corpus.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. The original row labels are kept, as append did.
corpus = pd.concat([train["comment_text"], test["comment_text"]])

In [89]:
# TF-IDF features over word unigrams and bigrams, limited to 10000 features.
# The vocabulary and IDF weights are learned from the combined corpus.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
)
vectorizer.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [90]:
# Fit a max-abs scaler on the TF-IDF matrix of the combined corpus.
scaler = MaxAbsScaler()
scaler.fit(vectorizer.transform(corpus));  # trailing ';' keeps the cell output empty

In [91]:
# Turn the training comments into TF-IDF rows using the fitted vectorizer,
# then display the first row as a sanity check.
train_x_tf = vectorizer.transform(raw_documents=train_x)
train_x_tf[0]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [92]:
# Scale the training TF-IDF features with the fitted scaler.
# The features are recomputed from `train_x` rather than from `train_x_tf`,
# so re-running this cell does not apply the scaling a second time to
# already-scaled values.
train_x_tf = scaler.transform(vectorizer.transform(train_x))

In [93]:
# Baseline: 5-fold cross-validated log loss of a Multinomial Naive Bayes
# classifier, evaluated separately for each target label.
for label in target_labels:
    # str.format fills the placeholder; passing `label` as a second print
    # argument left a literal "{}" in the output.
    print("Fitting {}".format(label))
    model = MultinomialNB()
    # scoring='neg_log_loss' returns negated losses, so flip the sign back.
    score = cross_val_score(model, train_x_tf, train_y[label], cv=5, scoring='neg_log_loss')
    print("Score: {} {}".format(-score, (-score).mean()))

Fitting {} toxic
Score: {} [ 0.15356393  0.15417913  0.1513692   0.15535793  0.14873275] 0.152640588423
Fitting {} severe_toxic
Score: {} [ 0.03545153  0.03881223  0.03755481  0.0331255   0.03710016] 0.0364088444663
Fitting {} obscene
Score: {} [ 0.09184717  0.09212082  0.09412534  0.09552576  0.09102754] 0.0929293249004
Fitting {} threat
Score: {} [ 0.02497257  0.02594421  0.02411483  0.02267872  0.02793391] 0.0251288478869
Fitting {} insult
Score: {} [ 0.09967515  0.09701069  0.09663224  0.10033653  0.10268839] 0.0992685989441
Fitting {} identity_hate
Score: {} [ 0.04144205  0.04448202  0.0428516   0.0407856   0.04503145] 0.042918543917


Fitting {} toxic
Score: {} [ 0.15356393  0.15417913  0.1513692   0.15535793  0.14873275] 0.152640588423
Fitting {} severe_toxic
Score: {} [ 0.03545153  0.03881223  0.03755481  0.0331255   0.03710016] 0.0364088444663
Fitting {} obscene
Score: {} [ 0.09184717  0.09212082  0.09412534  0.09552576  0.09102754] 0.0929293249004
Fitting {} threat
Score: {} [ 0.02497257  0.02594421  0.02411483  0.02267872  0.02793391] 0.0251288478869
Fitting {} insult
Score: {} [ 0.09967515  0.09701069  0.09663224  0.10033653  0.10268839] 0.0992685989441
Fitting {} identity_hate
Score: {} [ 0.04144205  0.04448202  0.0428516   0.0407856   0.04503145] 0.042918543917

Fitting {} toxic
Score: {} -0.308137506872
{'C': 1}
Fitting {} severe_toxic
Score: {} -0.0543838253589
{'C': 5}
Fitting {} obscene
Score: {} -0.202364584295
{'C': 1}
Fitting {} threat
Score: {} -0.0209387781268
{'C': 5}
Fitting {} insult
Score: {} -0.191509132656
{'C': 1}
Fitting {} identity_hate
Score: {} -0.0474659824791
{'C': 1}

In [95]:
# Grid-search the LogisticRegression regularisation strength C per label,
# using 5-fold cross-validation scored by negated log loss.
params = {'C': [1, 5]}
# Best parameter dict per label, in the same order as `target_labels`.
param_per_model = []

for label in target_labels:
    # str.format fills the placeholder; passing `label` as a second print
    # argument left a literal "{}" in the output.
    print("Fitting {}".format(label))
    model = GridSearchCV(LogisticRegression(), params, scoring="neg_log_loss", cv=5, n_jobs=4)
    model.fit(train_x_tf, train_y[label])
    # best_score_ is the negated log loss, so values closer to 0 are better.
    print("Score: {}".format(model.best_score_))
    print(model.best_params_)
    param_per_model.append(model.best_params_)

Fitting {} toxic
Score: {} -0.131041726583
{'C': 1}
Fitting {} severe_toxic
Score: {} -0.0281628220841
{'C': 1}
Fitting {} obscene
Score: {} -0.0746218410702
{'C': 1}
Fitting {} threat
Score: {} -0.0109110940556
{'C': 5}
Fitting {} insult
Score: {} -0.0857094427731
{'C': 1}
Fitting {} identity_hate
Score: {} -0.0281420181409
{'C': 5}


In [96]:
# Fit one LogisticRegression per label on the full training set, using the
# parameters selected by the grid search. `models[i]` corresponds to
# `target_labels[i]`.
models = []
for i, label in enumerate(target_labels):
    # str.format fills the placeholder; passing `label` as a second print
    # argument left a literal "{}" in the output.
    print("Fitting {}".format(label))
    model = LogisticRegression(**param_per_model[i])
    model.fit(train_x_tf, train_y[label])
    models.append(model)

Fitting {} toxic
Fitting {} severe_toxic
Fitting {} obscene
Fitting {} threat
Fitting {} insult
Fitting {} identity_hate


In [97]:
# Turn the test comments into TF-IDF rows using the fitted vectorizer,
# then display the first row as a sanity check.
test_x_tf = vectorizer.transform(raw_documents=test["comment_text"])
test_x_tf[0]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [98]:
# Scale the test TF-IDF features with the fitted scaler.
# The features are recomputed from the raw test comments rather than from
# `test_x_tf`, so re-running this cell does not apply the scaling a second
# time to already-scaled values.
test_x_tf = scaler.transform(vectorizer.transform(test["comment_text"]))

In [99]:
# Zero-filled prediction matrix: one row per test comment, one column per label.
preds = np.zeros(shape=(len(test), len(target_labels)))
preds.shape

(226998, 6)

In [100]:
# Fill column i of `preds` with the second predict_proba column of the model
# fitted for target_labels[i] (presumably the positive class, as the labels
# shown are 0/1 -- confirm against models[i].classes_).
for i, label in enumerate(target_labels):
    preds[:, i] = models[i].predict_proba(test_x_tf)[:, 1]

In [101]:
# Predicted probabilities for the first test comment, one value per label.
preds[0, :]

array([ 0.0307636 ,  0.00515364,  0.01735106,  0.00057656,  0.01658191,
        0.00143104])

In [102]:
# Wrap the prediction matrix in a frame indexed by test id, with one column
# per label, and preview the first five rows.
submission = pd.DataFrame(
    preds,
    index=test["id"],
    columns=target_labels,
)
submission.head(5)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6044863,0.030764,0.005154,0.017351,0.000577,0.016582,0.001431
6102620,0.019686,0.000787,0.00618,0.000128,0.001536,0.001183
14563293,0.004901,0.001204,0.003725,0.000103,0.002727,0.000125
21086297,0.065876,0.003786,0.031758,0.000348,0.021949,0.000915
22982444,0.012266,0.00248,0.007358,0.000327,0.004049,0.00031


In [33]:
# Uncomment to write the predictions to a CSV file. `submission` must already
# exist in the kernel (the recorded NameError came from running this before
# `submission` was defined).
# submission.to_csv("./submission2_tf_lr.csv")

NameError: name 'submission' is not defined