In [34]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

In [35]:
# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

In [36]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [37]:
train = train.fillna("unknown")
test = test.fillna("unknown")

In [38]:
train_mes, valid_mes, train_l, valid_l = train_test_split(train['comment_text'],train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']], test_size=0.2, random_state=20)

In [39]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [40]:
transform_com = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1).fit(train['comment_text'])
'''comments_train = transform_com.transform(train['comment_text'])'''
comments_train = transform_com.transform(train_mes)
comments_valid = transform_com.transform(valid_mes)
comments_test = transform_com.transform(test['comment_text'])



In [41]:
gc.collect()

10

In [42]:
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [43]:
data = [train_mes, valid_mes, test]

In [44]:
for element in data:
    element['total_length'] = element['comment_text'].apply(len)
    element['capitals'] = element['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    element['caps_vs_length'] = element.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    element['num_exclamation_marks'] = element['comment_text'].apply(lambda comment: comment.count('!'))
    element['num_question_marks'] = element['comment_text'].apply(lambda comment: comment.count('?'))
    element['num_punctuation'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    element['num_symbols'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
    element['num_words'] = element['comment_text'].apply(lambda comment: len(comment.split()))
    element['num_unique_words'] = element['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    element['words_vs_unique'] = element['num_unique_words'] / element['num_words']
    element['num_smilies'] = element['comment_text'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))

col = ['total_length', 'capitals', 'caps_vs_length',
       'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
       'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
       'num_smilies']

In [45]:
train_mes = scipy.sparse.csr_matrix(train_mes[col].values)
valid_mes = scipy.sparse.csr_matrix(valid_mes[col].values)
test = scipy.sparse.csr_matrix(test[col].values)


In [46]:
comments_train = scipy.sparse.hstack([train_mes.tocsr(),comments_train.tocsr()])
comments_valid = scipy.sparse.hstack([valid_mes,comments_valid])
comments_test = scipy.sparse.hstack([test,comments_test])

In [47]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model

In [48]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

In [49]:
for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(comments_train, train_l[j], comments_valid,valid_l[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)

fit toxic




Parameters: { "silent" } are not used.

[0]	train-auc:0.67043	test-auc:0.67182
[1]	train-auc:0.73550	test-auc:0.73324
[2]	train-auc:0.76069	test-auc:0.76152
[3]	train-auc:0.77394	test-auc:0.77752
[4]	train-auc:0.83334	test-auc:0.83544
[5]	train-auc:0.83349	test-auc:0.83543
[6]	train-auc:0.83369	test-auc:0.83587
[7]	train-auc:0.83836	test-auc:0.84122
[8]	train-auc:0.85255	test-auc:0.85346
[9]	train-auc:0.85582	test-auc:0.85692
[10]	train-auc:0.87869	test-auc:0.87521
[11]	train-auc:0.87863	test-auc:0.87503
[12]	train-auc:0.87948	test-auc:0.87664
[13]	train-auc:0.88556	test-auc:0.88098
[14]	train-auc:0.88861	test-auc:0.88530
[15]	train-auc:0.89482	test-auc:0.89072
[16]	train-auc:0.89952	test-auc:0.89535
[17]	train-auc:0.90268	test-auc:0.89885
[18]	train-auc:0.90917	test-auc:0.90456
[19]	train-auc:0.91060	test-auc:0.90602
[20]	train-auc:0.91196	test-auc:0.90796
[21]	train-auc:0.91292	test-auc:0.90824
[22]	train-auc:0.91440	test-auc:0.90895
[23]	train-auc:0.91517	test-auc:0.90931
[24]	train



fit severe_toxic
Parameters: { "silent" } are not used.

[0]	train-auc:0.81015	test-auc:0.81533
[1]	train-auc:0.86518	test-auc:0.86037
[2]	train-auc:0.89946	test-auc:0.91095
[3]	train-auc:0.90720	test-auc:0.91385
[4]	train-auc:0.91664	test-auc:0.92000
[5]	train-auc:0.92916	test-auc:0.92765
[6]	train-auc:0.93675	test-auc:0.93841
[7]	train-auc:0.93740	test-auc:0.93851
[8]	train-auc:0.93816	test-auc:0.93986
[9]	train-auc:0.94300	test-auc:0.94287
[10]	train-auc:0.94306	test-auc:0.94279
[11]	train-auc:0.94646	test-auc:0.94587
[12]	train-auc:0.94731	test-auc:0.94589
[13]	train-auc:0.94937	test-auc:0.94889
[14]	train-auc:0.94938	test-auc:0.94904
[15]	train-auc:0.94947	test-auc:0.94923
[16]	train-auc:0.94995	test-auc:0.94922
[17]	train-auc:0.95046	test-auc:0.95092
[18]	train-auc:0.95050	test-auc:0.95087
[19]	train-auc:0.95055	test-auc:0.95103
[20]	train-auc:0.95111	test-auc:0.95257
[21]	train-auc:0.95127	test-auc:0.95262
[22]	train-auc:0.96157	test-auc:0.95898
[23]	train-auc:0.96197	test-auc:0

In [50]:
gc.collect()

82

In [51]:
subm = pd.read_csv('../data/sample_submission.csv')
sample_submission_id = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([sample_submission_id, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('xgb.csv', index=False)