In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer



In [5]:
# Load both CSV files and replace every missing value with the placeholder
# string "unknown" in one step per frame.
train = pd.read_csv('train.csv').fillna("unknown")
test = pd.read_csv('test.csv').fillna("unknown")

In [6]:
# Preview the last five rows of the training frame.
train.tail(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
95846,999977655955,"""\nI have discussed it, unlike most of those w...",0,0,0,0,0,0
95847,999982426659,"ps. Almost forgot, Paine don't reply back to t...",1,0,1,0,0,0
95848,999982764066,Mamoun Darkazanli\nFor some reason I am unable...,0,0,0,0,0,0
95849,999986890563,Salafi would be a better term. It is more poli...,0,0,0,0,0,0
95850,999988164717,making wikipedia a better and more inviting pl...,0,0,0,0,0,0


In [7]:
# Hold out 20% of the training rows for validation. The comment text is the
# feature column; the six label columns are kept together in one frame.
# random_state is fixed so the split is reproducible.
train_mes, valid_mes, train_l, valid_l = train_test_split(
    train['comment_text'],
    train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.2,
    random_state=2,
)

In [12]:
import re, string

# Character class matching any single ASCII punctuation mark or one of the
# listed typographic symbols. string.punctuation is passed through re.escape
# because it contains characters that are special inside a character class
# (backslash, ']', '^', '-'); unescaped, the backslash would be consumed as an
# escape for ']' and never be matched itself.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into whitespace-separated tokens, one token per punctuation mark.

    Every character matched by ``re_tok`` is padded with spaces so that it
    becomes its own token; the padded string is then split on whitespace.
    Returns a list of strings (empty for an empty or whitespace-only input).
    """
    return re_tok.sub(r' \1 ', s).split()

In [14]:
# Fit a word 1-2 gram TF-IDF vectorizer on the combined train and test
# comments, using the custom punctuation-aware tokenizer.
# The three flags are given as real booleans: they are documented as bool, and
# scikit-learn releases that validate parameter types reject integer 0/1 here.
transform_com = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
).fit(pd.concat([train['comment_text'], test['comment_text']], axis=0))

In [15]:
# Display the fitted vectorizer's repr to check its configuration.
transform_com

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize at 0x125ca88c8>, use_idf=1,
        vocabulary=None)

In [16]:
# Turn each text collection into a TF-IDF matrix with the already-fitted
# vectorizer, in the order: training split, validation split, test set.
comments_train, comments_valid, comments_test = (
    transform_com.transform(texts)
    for texts in (train_mes, valid_mes, test['comment_text'])
)
# Free intermediate objects; the number of collected objects is the cell output.
gc.collect()

153

In [17]:
import xgboost as xgb

In [18]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=400):
    """Train a binary-logistic XGBoost booster and return it.

    Parameters
    ----------
    train_X, train_y
        Training features and labels, wrapped in an ``xgb.DMatrix``.
    test_X
        Evaluation features. Only used when ``test_y`` is given.
    test_y
        Optional labels for ``test_X``. When provided, train and test sets are
        passed as a watchlist and training stops early once the test logloss
        has not improved for 20 rounds.
    feature_names
        Accepted for backward compatibility; not used.
    seed_val
        Random seed passed to XGBoost.
    num_rounds
        Maximum number of boosting rounds.

    Returns
    -------
    The booster returned by ``xgb.train``. Callers run their own predictions
    on it, so no prediction is computed here.
    """
    # NOTE(review): 'silent' may be superseded by 'verbosity' in newer XGBoost
    # releases - confirm against the installed version.
    params = {
        'objective': 'binary:logistic',
        'eta': 0.12,
        'max_depth': 5,
        'silent': 1,
        'eval_metric': 'logloss',
        'min_child_weight': 1,
        'subsample': 0.5,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is None:
        # No labelled evaluation set: train for the full number of rounds.
        return xgb.train(params, xgtrain, num_rounds)

    xgtest = xgb.DMatrix(test_X, label=test_y)
    # The last watchlist entry is the one monitored for early stopping.
    watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
    return xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)

In [19]:
# Names of the six target labels, one model is fitted per entry.
col = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
# Prediction buffer: one row per test comment, one column per label.
preds = np.zeros(shape=(len(test), len(col)))

In [21]:
# Number of rows in the prediction buffer.
preds.shape[0]

226998

In [None]:
# The test features are the same for every label, so wrap them in a DMatrix
# once instead of rebuilding it on each iteration.
test_dmatrix = xgb.DMatrix(comments_test)

# Fit one binary model per label (validation split drives early stopping) and
# store its test-set predictions in the matching column of `preds`.
for label_idx, label_name in enumerate(col):
    print('fit '+label_name)
    model = runXGB(comments_train, train_l[label_name], comments_valid, valid_l[label_name])
    preds[:, label_idx] = model.predict(test_dmatrix)
    gc.collect()

fit toxic
[0]	train-logloss:0.612228	test-logloss:0.613133
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 20 rounds.
[1]	train-logloss:0.546457	test-logloss:0.547469
[2]	train-logloss:0.493659	test-logloss:0.494705
[3]	train-logloss:0.448541	test-logloss:0.449591
[4]	train-logloss:0.412804	test-logloss:0.414211
[5]	train-logloss:0.381571	test-logloss:0.383408
[6]	train-logloss:0.355397	test-logloss:0.357351
[7]	train-logloss:0.333238	test-logloss:0.335384
[8]	train-logloss:0.314213	test-logloss:0.3165
[9]	train-logloss:0.298311	test-logloss:0.300988
[10]	train-logloss:0.284769	test-logloss:0.287681
[11]	train-logloss:0.272807	test-logloss:0.275729
[12]	train-logloss:0.262565	test-logloss:0.265578
[13]	train-logloss:0.253654	test-logloss:0.256788
[14]	train-logloss:0.245623	test-logloss:0.248743
[15]	train-logloss:0.238288	test-logloss:0.24194
[16]	train-logloss:0.232278	test-logloss:0.236012
[17]

[161]	train-logloss:0.117001	test-logloss:0.135477
[162]	train-logloss:0.116789	test-logloss:0.13537
[163]	train-logloss:0.116603	test-logloss:0.135216
[164]	train-logloss:0.116372	test-logloss:0.135017
[165]	train-logloss:0.116176	test-logloss:0.134802
[166]	train-logloss:0.115948	test-logloss:0.134663
[167]	train-logloss:0.115749	test-logloss:0.134493
[168]	train-logloss:0.115538	test-logloss:0.134392
[169]	train-logloss:0.115364	test-logloss:0.134196
[170]	train-logloss:0.115181	test-logloss:0.134017
[171]	train-logloss:0.114994	test-logloss:0.133913
[172]	train-logloss:0.114772	test-logloss:0.13387
[173]	train-logloss:0.114545	test-logloss:0.13374
[174]	train-logloss:0.11432	test-logloss:0.133639
[175]	train-logloss:0.114115	test-logloss:0.133434
[176]	train-logloss:0.113859	test-logloss:0.133256
[177]	train-logloss:0.113679	test-logloss:0.133175
[178]	train-logloss:0.113466	test-logloss:0.13308
[179]	train-logloss:0.113274	test-logloss:0.132946
[180]	train-logloss:0.113129	test-lo

[323]	train-logloss:0.091922	test-logloss:0.122137
[324]	train-logloss:0.091801	test-logloss:0.122072
[325]	train-logloss:0.09172	test-logloss:0.122042
[326]	train-logloss:0.091637	test-logloss:0.121938
[327]	train-logloss:0.091507	test-logloss:0.121884
[328]	train-logloss:0.091425	test-logloss:0.121771
[329]	train-logloss:0.091331	test-logloss:0.12177
[330]	train-logloss:0.091235	test-logloss:0.121696
[331]	train-logloss:0.091137	test-logloss:0.12165
[332]	train-logloss:0.091003	test-logloss:0.121579
[333]	train-logloss:0.090878	test-logloss:0.121566
[334]	train-logloss:0.090789	test-logloss:0.121488
[335]	train-logloss:0.090663	test-logloss:0.121502
[336]	train-logloss:0.090578	test-logloss:0.12146
[337]	train-logloss:0.090457	test-logloss:0.121489
[338]	train-logloss:0.090328	test-logloss:0.12147
[339]	train-logloss:0.090246	test-logloss:0.121402
[340]	train-logloss:0.090157	test-logloss:0.121349
[341]	train-logloss:0.090069	test-logloss:0.12128
[342]	train-logloss:0.089968	test-log

[84]	train-logloss:0.018351	test-logloss:0.031392
[85]	train-logloss:0.018274	test-logloss:0.031395
[86]	train-logloss:0.018133	test-logloss:0.031295
[87]	train-logloss:0.018057	test-logloss:0.031264
[88]	train-logloss:0.017999	test-logloss:0.031269
[89]	train-logloss:0.017929	test-logloss:0.031237
[90]	train-logloss:0.017885	test-logloss:0.031215
[91]	train-logloss:0.0178	test-logloss:0.031157
[92]	train-logloss:0.017752	test-logloss:0.031135
[93]	train-logloss:0.017699	test-logloss:0.031123
[94]	train-logloss:0.017645	test-logloss:0.03113
[95]	train-logloss:0.017588	test-logloss:0.031149
[96]	train-logloss:0.017546	test-logloss:0.031167
[97]	train-logloss:0.017462	test-logloss:0.031159
[98]	train-logloss:0.017409	test-logloss:0.031072
[99]	train-logloss:0.017347	test-logloss:0.031027
[100]	train-logloss:0.017274	test-logloss:0.030995
[101]	train-logloss:0.017219	test-logloss:0.031014
[102]	train-logloss:0.017123	test-logloss:0.030952
[103]	train-logloss:0.017031	test-logloss:0.030962

[84]	train-logloss:0.061486	test-logloss:0.075216
[85]	train-logloss:0.061218	test-logloss:0.075142
[86]	train-logloss:0.060987	test-logloss:0.074979
[87]	train-logloss:0.060789	test-logloss:0.074865
[88]	train-logloss:0.060576	test-logloss:0.074517
[89]	train-logloss:0.060335	test-logloss:0.074357
[90]	train-logloss:0.060106	test-logloss:0.074125
[91]	train-logloss:0.059841	test-logloss:0.073943
[92]	train-logloss:0.059557	test-logloss:0.073752
[93]	train-logloss:0.059346	test-logloss:0.073594
[94]	train-logloss:0.059118	test-logloss:0.073516
[95]	train-logloss:0.058892	test-logloss:0.073359
[96]	train-logloss:0.058657	test-logloss:0.073229
[97]	train-logloss:0.058472	test-logloss:0.073156
[98]	train-logloss:0.058212	test-logloss:0.072954
[99]	train-logloss:0.058016	test-logloss:0.0728
[100]	train-logloss:0.057778	test-logloss:0.072574
[101]	train-logloss:0.05762	test-logloss:0.072457
[102]	train-logloss:0.057441	test-logloss:0.072396
[103]	train-logloss:0.057252	test-logloss:0.072234

[246]	train-logloss:0.041284	test-logloss:0.065049
[247]	train-logloss:0.041233	test-logloss:0.06505
[248]	train-logloss:0.041154	test-logloss:0.064971
[249]	train-logloss:0.041104	test-logloss:0.064991
[250]	train-logloss:0.041022	test-logloss:0.064986
[251]	train-logloss:0.040967	test-logloss:0.064943
[252]	train-logloss:0.040862	test-logloss:0.06491
[253]	train-logloss:0.040802	test-logloss:0.064907
[254]	train-logloss:0.040753	test-logloss:0.064896
[255]	train-logloss:0.04067	test-logloss:0.064887
[256]	train-logloss:0.040574	test-logloss:0.06492
[257]	train-logloss:0.040481	test-logloss:0.064935
[258]	train-logloss:0.040424	test-logloss:0.064922
[259]	train-logloss:0.040348	test-logloss:0.064905
[260]	train-logloss:0.040278	test-logloss:0.064912
[261]	train-logloss:0.040224	test-logloss:0.064924
[262]	train-logloss:0.04015	test-logloss:0.064909
[263]	train-logloss:0.040085	test-logloss:0.064915
[264]	train-logloss:0.040029	test-logloss:0.064918
[265]	train-logloss:0.039986	test-lo

In [None]:
# Build the submission: the id column first, then one prediction column per label.
# Ids are taken from `test` itself: `preds` has one row per row of `test`, so
# the two are aligned by construction and no separate sample-submission file
# (which lived under a different directory than the other inputs) is needed.
# NOTE(review): assumes test.csv has an 'id' column like train.csv - confirm.
submission = pd.DataFrame(preds, columns=col)
submission.insert(0, 'id', test['id'].values)
submission.to_csv('xgb.csv', index=False)