In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


##### Cleaning Text

In [3]:
from utils import clean_text

df["comment_text"] = df["comment_text"].apply(clean_text)

##### Splitting in Train and Val Split

In [4]:
from sklearn.model_selection import train_test_split

df_train,df_test = train_test_split(df,test_size=0.2)

In [5]:
x_train = df_train["comment_text"]
y_train = df_train.iloc[:,2:-1].values
x_test = df_test["comment_text"]
y_test = df_test.iloc[:,2:-1].values

TF-IDF Vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

word_vectorizer = TfidfVectorizer(stop_words='english',
                                  sublinear_tf=True,
                                  strip_accents='unicode',
                                  analyzer='word',
                                  token_pattern=r'\w{2,}',  #vectorize 2-character words or more
                                  ngram_range=(1, 3),
                                  max_features=5000)

char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                  strip_accents='unicode',
                                  analyzer='char',
                                  stop_words='english',
                                  ngram_range=(2, 6),
                                  max_features=5000)

In [7]:
word_vectorizer.fit(pd.concat([x_train,x_test]))
x_train_word_features = word_vectorizer.transform(x_train)
x_test_word_features = word_vectorizer.transform(x_test)

In [8]:
char_vectorizer.fit(pd.concat([x_train,x_test]))
x_train_char_features = char_vectorizer.transform(x_train)
x_test_char_features = char_vectorizer.transform(x_test)



In [25]:
from scipy.sparse import hstack

x_train_features = hstack([x_train_word_features,x_train_char_features])
x_test_features = hstack([x_test_word_features,x_test_char_features])

##### Define holdout features

In [26]:
test_hold_out = pd.read_csv("../data/test.csv")

hold_out_word_features = word_vectorizer.transform(test_hold_out["comment_text"])
hold_out_char_features = char_vectorizer.transform(test_hold_out["comment_text"])
hold_out_features = hstack([hold_out_word_features,hold_out_char_features])

In [28]:
submission_df = pd.read_csv("../data/sample_submission.csv")
submission_df.shape,test_hold_out.shape

((153164, 7), (153164, 2))

Building a per-class Logistic_train_word_features

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

CLASS_NAMES = list(df.columns)[-6:]

cv_scores = []

for class_name in CLASS_NAMES:
    
    train_target = df_train[class_name]
    classifier = LogisticRegression(C=0.1,solver='sag')
    
    #Step 1. Cross-Validation
    cv_score = np.mean(cross_val_score(classifier, x_train_features, train_target, cv=3, scoring='roc_auc'))
    cv_scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))
    
    #Step 2. Fit the classifier on the train_data
    classifier.fit(x_train_features, train_target)
    
    #Step 3. Predict on the test data
    test_hold_out[class_name] = classifier.predict_proba(hold_out_features)[:,1]


CV score for class toxic is 0.9608753827456878
CV score for class severe_toxic is 0.9855058104539448
CV score for class obscene is 0.9801828497361633
CV score for class threat is 0.9776942472047808
CV score for class insult is 0.9721762877502429
CV score for class identity_hate is 0.9676307977437486


In [33]:
test_hold_out

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.995477,0.171234,0.988351,0.015602,0.961125,0.150784
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.023289,0.003987,0.014420,0.001975,0.014591,0.006141
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.016945,0.004197,0.011127,0.002284,0.009777,0.004171
3,00017563c3f7919a,":If you have a look back at the source, the in...",0.017524,0.003734,0.010730,0.002097,0.010115,0.002788
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.051908,0.003942,0.018272,0.001912,0.018847,0.004505
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",0.298740,0.004772,0.071945,0.002284,0.035350,0.006685
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,0.059954,0.004789,0.030752,0.002493,0.022128,0.007938
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",0.026295,0.003327,0.017247,0.002077,0.014279,0.005229
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",0.031734,0.002389,0.011892,0.001478,0.014272,0.009613
