In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

import cufflinks as cf
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re, string

In [None]:
# Load the training data from the competition's zipped CSV.
train_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
train_df.head()  # preview the first rows

In [None]:
# Load the test data from the competition's zipped CSV.
test_df = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_df.head()  # preview the first rows

In [None]:
# Show the raw text of the training comment at index label 0.
train_df['comment_text'][0]

In [None]:
# Character length of every training comment, followed by its mean, standard deviation and maximum.
comment_len = train_df['comment_text'].str.len()
comment_len.mean(), comment_len.std(), comment_len.max()

In [None]:
# Histogram of the comment lengths; the trailing ';' suppresses the cell's return-value display.
comment_len.iplot(kind='hist');

In [None]:
# Target columns; the prediction loop further down fits one model per entry.
label = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
# 'none' is 1 minus the row-wise maximum over the label columns, i.e. 1 only when
# every label is 0 (assumes the labels are 0/1 indicators — TODO confirm).
train_df['none'] = 1 - train_df[label].max(axis=1)
train_df.head()

In [None]:
# (rows, columns) of the training and test frames.
train_df.shape, test_df.shape

In [None]:
# Number of missing comment texts in the training frame (the test frame is not checked here).
train_df['comment_text'].isna().sum()

In [None]:
# Positions of training comments whose character length is zero.
np.where(comment_len==0)

In [None]:
# Character class of everything that becomes a standalone token: ASCII punctuation plus a
# handful of typographic symbols. string.punctuation goes through re.escape because it
# contains the sequence `\]`; interpolated raw, the regex parser reads that as an escaped
# `]` and the backslash itself is left out of the class.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split text into tokens, emitting each punctuation mark as its own token.

    Every character matched by ``re_tok`` is surrounded with spaces and the
    result is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [None]:
# TF-IDF over unigrams and bigrams, using the custom punctuation-aware tokenizer.
tfidf_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize)
# Missing comments are replaced with '' on the fly (the frames themselves are not modified):
# a NaN entry is a float, not a string, so it cannot go through tokenize's regex substitution.
# Only the training comments are inspected for NaN elsewhere in the notebook, so both
# frames are guarded here.
train_term_doc = tfidf_vec.fit_transform(train_df['comment_text'].fillna(''))
# The vocabulary and IDF weights come from the training comments only; the test comments
# are just projected onto them. (The name's spelling is kept because a later cell reads it.)
test_trem_doc = tfidf_vec.transform(test_df['comment_text'].fillna(''))

In [None]:
def pr(y_i, y, features=None):
    """Add-one-smoothed per-feature sum for the rows whose label equals ``y_i``.

    Sums the feature rows selected by ``y == y_i`` column-wise, adds 1 to every
    column total, and divides by the number of selected rows plus 1.

    Parameters
    ----------
    y_i : scalar
        Label value that selects the rows.
    y : array-like
        One label per row of ``features``; ``y == y_i`` must yield a boolean
        mask usable for row indexing.
    features : matrix or array, optional
        Row-per-sample feature matrix. When omitted, the notebook-level global
        ``x`` is used, which keeps existing ``pr(y_i, y)`` calls working.

    Returns
    -------
    Column-wise smoothed ratio; its concrete type follows from ``features``
    (whatever ``features[mask].sum(0)`` returns).
    """
    if features is None:
        # Fall back to the global set up in a separate notebook cell.
        features = x
    in_class = (y == y_i)
    p = features[in_class].sum(0)
    return (p + 1) / (in_class.sum() + 1)

In [None]:
# Short aliases for the train/test term-document matrices; pr() and get_mdl() read `x`
# as a global, and the prediction loop reads `test_x`.
x = train_term_doc
test_x = test_trem_doc

In [None]:
def get_mdl(y, random_state=0):
    """Fit a logistic regression on the global features ``x`` scaled by a log-ratio.

    The ratio ``r`` is ``log(pr(1, y) / pr(0, y))``; every row of ``x`` is
    multiplied element-wise by ``r`` before fitting.

    Parameters
    ----------
    y : pandas.Series or array-like
        Labels, one per row of the global ``x``; the ratio is computed for
        the values 1 and 0.
    random_state : int or None, default 0
        Seed passed to ``LogisticRegression`` so repeated runs give the same
        fit; pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    (model, r)
        The fitted ``LogisticRegression`` and the log-ratio ``r``. Callers must
        apply the same ``.multiply(r)`` scaling to any features they predict on.
    """
    # np.asarray accepts a Series as well as a plain array or list.
    y = np.asarray(y)
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True, solver='liblinear', random_state=random_state)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [None]:
# One row per test comment, one column per label; filled column by column below.
preds = np.zeros((len(test_df), len(label)))

for i, j in enumerate(label):
    print('fit', j)
    # Fit a separate model for this label; r is the scaling get_mdl applied to the
    # training features, so the test features are scaled by it as well before predicting.
    m,r = get_mdl(train_df[j])
    # Keep the second predict_proba column (presumably the probability of label 1,
    # given 0/1 targets — TODO confirm).
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

In [None]:
# Take the id column from the sample submission file and attach one prediction column per label.
subm = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')
submid = pd.DataFrame({'id': subm["id"]})
# NOTE(review): the ids and the rows of `preds` are paired purely by position; this assumes the
# sample submission lists ids in the same order as test_df — confirm.
submission = pd.concat([submid, pd.DataFrame(preds, columns = label)], axis=1)
submission.to_csv('./submission.csv', index=False)