In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit

In [2]:
# Label columns; one binary target per entry.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [3]:
# Read both CSV files; every missing value is replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('train.csv', 'test.csv')
)

In [4]:
# Exploratory check, left disabled: sort the training rows by the six label columns.
# train.sort_values(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [5]:
# Preview only the first rows instead of asking for the whole test frame to be rendered.
test.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


In [6]:
# Extract the comment column from each frame and stack them (train rows first)
# into a single Series; the original row indices of both frames are kept.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat((train_text, test_text))
all_text

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
5         "\n\nCongratulations from me as well, use the ...
6              COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
7         Your vandalism to the Matt Shirvington article...
8         Sorry if the word 'nonsense' was offensive to ...
9         alignment on this subject and which are contra...
10        "\nFair use rationale for Image:Wonju.jpg\n\nT...
11        bbq \n\nbe a man and lets discuss it-maybe ove...
12        Hey... what is it..\n@ | talk .\nWhat is it......
13        Before you start throwing accusations and warn...
14        Oh, and the girl above started her arguments w...
15        "\n\nJuelz Santanas Age\n\nIn 2002, Juelz Sant...
16        Bye! \n\nDon't look, come or t

### TF-IDF (word)

In [7]:
# Word-level TF-IDF over unigrams, capped at 10,000 features.
# The vectorizer is fitted on the combined train + test text, then applied
# to each set separately.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
).fit(all_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(corpus) for corpus in (train_text, test_text)
)

### TF-IDF (character)

In [8]:
# Character-level TF-IDF over n-grams of length 1 to 5, capped at 25,000
# features. Fitted on the combined train + test text, then applied to each set.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=25000,
).fit(all_text)
train_char_features, test_char_features = map(
    char_vectorizer.transform, (train_text, test_text)
)

# Concatenate character and word features column-wise (character columns first).
# hstack returns COO format by default, which is not row-indexable and would be
# converted again by each downstream estimator call; request CSR once up front.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [9]:
# Show the sparse-matrix summary (shape, dtype, stored elements) of the word features.
train_word_features

<159571x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 6703542 stored elements in Compressed Sparse Row format>

In [15]:
# Fit one independent logistic-regression classifier per label and store the
# predicted test-set probabilities in `predictions`, keyed by label name.
# losses = []
predictions = {'id': test['id']}
for class_name in class_names:
    print(class_name)  # progress indicator: one line per label
    train_target = train[class_name]
    # The 'sag' solver shuffles the data, so pin random_state to make the
    # fitted model (and the submission) reproducible across runs.
    classifier = LogisticRegression(solver='sag', random_state=0)

    # Optional 3-fold ROC-AUC cross-validation, left disabled:
    # cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    # losses.append(cv_loss)
    # print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(train_features, train_target)
    # Column 1 of predict_proba is the second entry of classifier.classes_
    # (the positive label, assuming the targets are 0/1 -- TODO confirm).
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [27]:
# Assemble the submission frame ('id' first, then the label columns) and
# write it to submission.csv without the index.
# print('Total CV score is {}'.format(np.mean(losses)))
submission = pd.DataFrame.from_dict(predictions)
# Build the column order locally instead of inserting 'id' into the shared
# class_names list: mutating it made this cell non-idempotent (a second run
# duplicated the 'id' column) and broke re-running the training loop.
# The filter keeps the cell correct even if 'id' was already inserted earlier.
submission_columns = ['id'] + [name for name in class_names if name != 'id']
submission = submission.loc[:, submission_columns]
submission.to_csv('submission.csv', index=False)

In [28]:
# Preview only the first rows instead of asking for the whole submission frame to be rendered.
submission.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999897,0.263781,0.999568,0.061297,0.979454,0.312269
1,0000247867823ef7,0.005179,0.001534,0.002360,0.000255,0.003411,0.001891
2,00013b17ad220c46,0.008686,0.001459,0.006133,0.000372,0.003099,0.000860
3,00017563c3f7919a,0.003147,0.001694,0.002292,0.000808,0.002747,0.000552
4,00017695ad8997eb,0.016339,0.001614,0.004566,0.000869,0.005480,0.001068
5,0001ea8717f6de06,0.004553,0.000569,0.002021,0.001085,0.005929,0.000930
6,00024115d4cbde0f,0.003811,0.000602,0.003921,0.000236,0.003894,0.000928
7,000247e83dcc1211,0.371495,0.001673,0.016347,0.001570,0.033973,0.002631
8,00025358d4737918,0.004597,0.001374,0.006357,0.000568,0.003315,0.001220
9,00026d1092fe71cc,0.003291,0.000766,0.002647,0.000446,0.003711,0.000700
