In [None]:
import pandas as pd
import numpy as np
import re, string

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the data

In [None]:
# Label columns of the training set; a separate model is fitted for each one
# further down.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Read the training CSV and expose the comment body under the shorter column
# name `text`; `df` itself is left untouched.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df_train = df.rename(columns={'comment_text': 'text'})

# Display ten randomly chosen rows as a sanity check.
df_train.sample(10)

In [None]:
# Look at a single raw training comment (index label 6).
df_train['text'][6]

# Tokenization
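Split each comment into tokens (punctuation marks become separate tokens), then build TF-IDF features over unigrams and bigrams.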

In [None]:
# Characters that should become stand-alone tokens.  The ASCII punctuation is
# passed through re.escape so that characters with a special meaning inside a
# character class are matched literally; without it the `\]` sequence in
# string.punctuation is parsed as an escaped `]` and the backslash itself is
# never recognised as punctuation.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split a string into tokens, isolating every punctuation mark.

    Each character matched by `re_tok` is surrounded with spaces and the
    result is split on whitespace, so punctuation marks come back as separate
    single-character tokens.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in their original order; empty for empty or whitespace-only
        input.
    """
    return re_tok.sub(r' \1 ', s).split()

# Number of training rows (kept for reference; not used by the cells below).
n = df_train.shape[0]

# TF-IDF over unigrams and bigrams produced by the custom `tokenize` function.
# The idf/tf switches are passed as real booleans: scikit-learn validates them
# as booleans, integer stand-ins were deprecated in 1.2 and are rejected by
# later releases.  token_pattern=None states explicitly that the custom
# tokenizer replaces the default token pattern, which also avoids the
# "token_pattern will not be used" warning.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize, token_pattern=None,
                      min_df=3, max_df=0.9, strip_accents='unicode',
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

# Fit the vocabulary and idf weights on the training comments and vectorise
# them in one pass; the last expression displays the resulting matrix.
trn_term_doc = vec.fit_transform(df_train['text'])
trn_term_doc

In [None]:
# Training data: the TF-IDF matrix of the training comments.
# `X` is read as a module-level global by pr() and get_mdl() below.
X = trn_term_doc

# Model

We will calculate the log-count ratio r. See this [paper](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf) for its definition.

In [None]:
def pr(y_i, y):
    """Smoothed per-feature total for the rows of the global `X` labelled `y_i`.

    Sums every column of `X` over the rows where `y == y_i`, then applies
    add-one smoothing: (column sums + 1) / (number of matching rows + 1).

    Parameters
    ----------
    y_i : label value selecting the rows.
    y : array of labels aligned with the rows of `X`.
    """
    selected = y == y_i
    column_totals = X[selected].sum(0)
    return (column_totals + 1) / (selected.sum() + 1)

def log_count(y):
    """Return the log-count ratio: log of pr(1, y) divided by pr(0, y), per feature."""
    ratio = pr(1, y) / pr(0, y)
    return np.log(ratio)
    

After calculating the log-count ratio r, fit a logistic regression on the element-wise product of X and r against the target y.

In the paper, the authors use an SVM instead of logistic regression.

In [None]:
def get_mdl(y):
    """Fit a logistic regression on the global `X` scaled by the log-count ratio.

    Parameters
    ----------
    y : object exposing `.values` (e.g. a DataFrame column) holding the
        labels for one target, aligned with the rows of `X`.

    Returns
    -------
    tuple
        `(model, r)`: the fitted classifier and the log-count ratio it was
        trained with.  New data must be multiplied by the same `r` before
        being passed to the model.
    """
    labels = y.values
    ratio = log_count(labels)
    # Element-wise scaling of every feature column by its log-count ratio.
    scaled_features = X.multiply(ratio)
    classifier = LogisticRegression(C=4, dual=False, solver='liblinear')
    classifier.fit(scaled_features, labels)
    return classifier, ratio

# Validation

In [None]:
# Validation pairs; the `less_toxic` and `more_toxic` text columns are
# vectorised in the next cell.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Vectorise both sides of every validation pair with the vectorizer that was
# fitted on the training comments (transform only, no refit).
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
# One column per label: probability of class index 1 for each side of every
# validation pair.  Kept under separate names so the matrices are not
# overwritten by the aggregated scores below.
probs_less_toxic = np.zeros((len(df_val), len(label_cols)))
probs_more_toxic = np.zeros((len(df_val), len(label_cols)))

# Fit one model per label column and score both comments of every pair.
for i, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_mdl(df_train[label])
    # New data has to be scaled by the same log-count ratio used in training.
    probs_less_toxic[:, i] = m.predict_proba(X_less_toxic.multiply(r))[:, 1]
    probs_more_toxic[:, i] = m.predict_proba(X_more_toxic.multiply(r))[:, 1]

# Collapse the per-label probabilities into one score per comment.  The
# submission cell scores comments with the maximum over labels, so the same
# rule is used here; otherwise the accuracy below would describe a different
# scoring function from the one that is submitted.
preds_less_toxic = probs_less_toxic.max(axis=1)
preds_more_toxic = probs_more_toxic.max(axis=1)

# Fraction of pairs in which the `more_toxic` comment gets the strictly
# higher score.
acc = (preds_less_toxic < preds_more_toxic).mean()

In [None]:
# Share of validation pairs where the `more_toxic` comment received the
# strictly higher score.
print('Validation accuracy: ', acc)

# Submission

In [None]:
# Load the comments to score and vectorise them with the vectorizer fitted on
# the training set.
df_sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
X_test = vec.transform(df_sub['text'])

# Column k of `preds` holds the class-index-1 probability for label_cols[k].
preds = np.zeros((len(df_sub), len(label_cols)))
for col, label in enumerate(label_cols):
    model, ratio = get_mdl(df_train[label])
    # Scale the test features by the ratio the model was trained with.
    scaled_test = X_test.multiply(ratio)
    preds[:, col] = model.predict_proba(scaled_test)[:, 1]

# Final toxicity score: the largest probability across the label columns.
preds_test = preds.max(axis=1)


In [None]:
# Per-label probabilities for the comments to score, one column per entry of
# label_cols.
pd.DataFrame(data=preds, columns=label_cols)

In [None]:
# Attach the final score to the comments, then write only the id and score
# columns to the submission file (without the index).
df_sub['score'] = preds_test
submission_columns = ['comment_id', 'score']
df_sub[submission_columns].to_csv("submission.csv", index=False)