In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from sklearn.svm import LinearSVC
from sklearn import decomposition, preprocessing
from scipy import sparse
import xgboost as xgb

In [None]:
# Read the training comments, the test comments and the test labels from the
# zipped competition CSVs, in that order.
train, test, test_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
        '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip',
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# (rows, columns) of the training frame and of the test frame.
(train.shape, test.shape)

In [None]:
# Character length of every training comment, plotted as a histogram.
lens = train['comment_text'].str.len()
lens.hist()

In [None]:
# Per-row total of the six label columns. skipna=False keeps the semantics of
# chained `+`: a missing value in any label column yields a missing total.
train["label_count"] = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].sum(axis=1, skipna=False)

In [None]:
# Number of rows with exactly one label, with no label, and with more than one label.
tuple(
    int(mask.sum())
    for mask in (
        train["label_count"] == 1,
        train["label_count"] == 0,
        train["label_count"] > 1,
    )
)

In [None]:
# Name of the column holding the raw comment text.
COMMENT = 'comment_text'

# Replace missing comments with a placeholder token. The filled column is
# assigned back to the frame rather than calling fillna(inplace=True) on the
# `frame[COMMENT]` selection: that chained in-place form operates on an
# intermediate object and does not update the frame under pandas Copy-on-Write.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

### Building Model

In [None]:
import re
import string

# Additive smoothing constant that get_count adds to every per-class feature total.
alpha = 0.0001

# Matches a single punctuation character (ASCII punctuation plus a few
# typographic symbols). The ASCII punctuation is escaped before being placed
# in the character class: interpolated raw, its `\]` is parsed as an escaped
# bracket and the backslash itself is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, emitting every punctuation mark as its own token.

    Each character matched by ``re_tok`` is surrounded with spaces, then the
    string is split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty for an empty or all-whitespace string.
    """
    return re_tok.sub(r' \1 ', s).split()

In [None]:
# Number of training rows.
n = len(train)

# TF-IDF over 1- and 2-grams of the custom tokenizer's tokens. The vocabulary
# and IDF weights are fitted on the training comments and then applied
# unchanged to the test comments.
vec = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [None]:
# count_vec = CountVectorizer(ngram_range=(1,2), tokenizer=tokenize,
#                             min_df=3, max_df=0.9, strip_accents='unicode', binary=True)
# trn_term_bi_doc = count_vec.fit_transform(train[COMMENT])
# test_term_bi_doc = count_vec.transform(test[COMMENT])
# # # Verify when TfidfVectorizer = 0, CountVectorizer = 0 => SUCCESS
# # set(np.array(vec.get_feature_names())[trn_term_doc[10,:].nonzero()[1]]) \
# #     - set(np.array(count_vec.get_feature_names())[trn_term_bi_doc[10,:].nonzero()[1]])

# ================
# The CountVectorizer approach above generates the SAME word index as TfidfVectorizer.
# The following procedure is faster.
# Binary (presence/absence) versions of the TF-IDF matrices: 1 wherever the
# TF-IDF weight is positive, 0 elsewhere, stored as int8.
# One vectorised comparison builds each sparse result directly. Writing the
# ones element-by-element into an empty csr_matrix changes its sparsity
# structure on assignment, which scipy flags with SparseEfficiencyWarning.
trn_term_bi_doc = (trn_term_doc > 0).astype(np.int8)
test_term_bi_doc = (test_term_doc > 0).astype(np.int8)
# # Verification
# print(set(trn_term_bi_doc.nonzero()[0]) - set(trn_term_doc.nonzero()[0]))
# print(set(trn_term_bi_doc.nonzero()[1]) - set(trn_term_doc.nonzero()[1]))
# print(set(test_term_bi_doc.nonzero()[0]) - set(test_term_doc.nonzero()[0]))
# print(set(test_term_bi_doc.nonzero()[1]) - set(test_term_doc.nonzero()[1]))

In [None]:
def get_count(feature_x, y, label, smoothing=None):
    """Return the smoothed, L1-normalised per-feature totals of one class.

    The rows of ``feature_x`` whose target equals ``label`` are summed column
    by column, ``smoothing`` is added to every total, and the result is divided
    by the sum of its absolute values so that the entries add up to 1.

    Parameters
    ----------
    feature_x : scipy sparse matrix, shape (n_samples, n_features)
        Feature matrix; assumed non-negative (e.g. TF-IDF or binary features).
    y : array-like, shape (n_samples,)
        Target value for each row of ``feature_x``.
    label : scalar
        Target value selecting the rows to aggregate.
    smoothing : float, optional
        Additive smoothing term. Defaults to the module-level ``alpha``.

    Returns
    -------
    Row of ``n_features`` normalised totals, in whatever container
    ``feature_x[rows, :].sum(0)`` produces (a 1 x n_features ``numpy.matrix``
    for ``scipy.sparse`` matrix inputs).
    """
    if smoothing is None:
        smoothing = alpha
    index_label = (np.asarray(y) == label).nonzero()[0]
    count = feature_x[index_label, :].sum(0) + smoothing
    # Divide by the plain sum of absolute values. np.linalg.norm(count, ord=1)
    # is not the L1 norm here: on a 2-D input it is the induced matrix norm
    # (largest column sum), which for a single row is just the largest entry.
    return count / np.abs(count).sum()

In [None]:
def get_trained_model(model, feature_x, x, y):
    """Fit ``model`` on ``x`` after scaling its columns by a log-count ratio.

    The ratio ``r`` is the element-wise log of ``get_count(feature_x, y, 1)``
    divided by ``get_count(feature_x, y, 0)``. ``x`` is multiplied by ``r``
    and, for a model whose class is named ``XGBClassifier``, converted to CSC
    before fitting.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``
    feature_x : sparse matrix used to compute the ratio
    x : sparse matrix used as model input (must support ``multiply``)
    y : array of 0/1 targets

    Returns
    -------
    tuple
        ``(result of model.fit, r)``.
    """
    positive = get_count(feature_x, y, 1)
    negative = get_count(feature_x, y, 0)
    r = np.log(positive / negative)

    weighted = x.multiply(r)
    is_xgb = type(model).__name__ == "XGBClassifier"
    fitted = model.fit(weighted.tocsc() if is_xgb else weighted, y)
    return fitted, r

In [None]:
def get_accuracy(model, x_feature, x, test_x):
    """Train one model per label and return the pooled test accuracy.

    For each of the six label columns, ``model`` is re-fitted through
    ``get_trained_model`` on the global ``train`` targets. ``test_x`` is scaled
    by the returned ratio and the predictions are compared with the global
    ``test_label`` column. Rows whose test label is ``-1`` are excluded
    (presumably rows that are not scored -- confirm against the data source).
    Running totals are printed after every label.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    x_feature : sparse matrix used to compute the log-count ratio
    x : sparse training input
    test_x : sparse test input; rows are assumed to line up with ``test_label``

    Returns
    -------
    float
        Correct predictions divided by compared predictions, pooled over all labels.
    """
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    # Loop-invariant: the model's type does not change between labels.
    needs_csc = type(model).__name__ == "XGBClassifier"

    correct_count = 0
    total_count = 0
    for j in label_names:
        print('fit', j)
        y = train[j].values
        m, r = get_trained_model(model, x_feature, x, y)

        x_test_nb = test_x.multiply(r)
        if needs_csc:
            x_test_nb = x_test_nb.tocsc()

        y_pred = m.predict(x_test_nb)

        y_test = test_label[j].values
        graded_mask = y_test != -1
        # Vectorised counts; the built-in sum() would iterate the array in Python.
        correct_count += int(np.count_nonzero(y_test[graded_mask] == y_pred[graded_mask]))
        total_count += int(np.count_nonzero(graded_mask))
        print(correct_count, total_count)
    return correct_count / total_count

In [None]:
# Logistic regression: ratio computed from TF-IDF features, TF-IDF model input.
model = LogisticRegression(max_iter=500, C=4)
get_accuracy(model, x_feature=trn_term_doc, x=trn_term_doc, test_x=test_term_doc)

In [None]:
# Logistic regression: ratio computed from binary features, binary model input.
model = LogisticRegression(max_iter=500, C=4)
get_accuracy(model, x_feature=trn_term_bi_doc, x=trn_term_bi_doc, test_x=test_term_bi_doc)

In [None]:
# Logistic regression: ratio computed from TF-IDF features, binary model input.
model = LogisticRegression(max_iter=500, C=4)
get_accuracy(model, x_feature=trn_term_doc, x=trn_term_bi_doc, test_x=test_term_bi_doc)

In [None]:
# Can't use this here: TruncatedSVD produces negative feature values, so the
# log-count ratio r = np.log(p_count / q_count) can no longer be computed
# =================
# svd = decomposition.TruncatedSVD(n_components=50)
# svd.fit(trn_term_doc)
# xtrain_svd = svd.transform(trn_term_doc)
# xtest_svd = svd.transform(test_term_doc)

# scl = preprocessing.StandardScaler()
# scl.fit(xtrain_svd)
# xtrain_svd_scl = sparse.csr_matrix(scl.transform(xtrain_svd))
# xtest_svd_scl = sparse.csr_matrix(scl.transform(xtest_svd))

In [None]:
# Linear SVM: ratio computed from TF-IDF features, TF-IDF model input.
model = LinearSVC(max_iter=1000, C=4)
get_accuracy(model, x_feature=trn_term_doc, x=trn_term_doc, test_x=test_term_doc)

In [None]:
# Linear SVM: ratio computed from binary features, binary model input.
model = LinearSVC(max_iter=1000, C=4)
get_accuracy(model, x_feature=trn_term_bi_doc, x=trn_term_bi_doc, test_x=test_term_bi_doc)

In [None]:
# Linear SVM: ratio computed from TF-IDF features, binary model input.
model = LinearSVC(max_iter=1000, C=4)
get_accuracy(model, x_feature=trn_term_doc, x=trn_term_bi_doc, test_x=test_term_bi_doc)

In [None]:
# XGBoost: ratio computed from TF-IDF features, TF-IDF model input.
model = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
)
get_accuracy(model, x_feature=trn_term_doc, x=trn_term_doc, test_x=test_term_doc.tocsc())

In [None]:
# binary feature, binary input with xgboost model
# (the ratio is computed from the binary matrix and the model is trained and
# evaluated on the binary matrices)
model = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                          subsample=0.8, nthread=10, learning_rate=0.1)
get_accuracy(model, trn_term_bi_doc, trn_term_bi_doc, test_term_bi_doc.tocsc())

In [None]:
# TF-IDF feature, binary input with xgboost model
# (the ratio is computed from the TF-IDF matrix while the model is trained and
# evaluated on the binary matrices)
model = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                          subsample=0.8, nthread=10, learning_rate=0.1)
get_accuracy(model, trn_term_doc, trn_term_bi_doc, test_term_bi_doc.tocsc())