In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
from collections import OrderedDict
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

import os

In [2]:
df = pd.read_csv('../input/train.csv')

In [3]:
submission_df = pd.read_csv('../input/test.csv')

In [4]:
df.shape, submission_df.shape

In [5]:
df.columns

In [6]:
submission_df.columns

In [7]:
label_cols = df.columns[2:]

In [8]:
label_cols

In [9]:
labels = df[label_cols]

In [10]:
df = df.drop(label_cols, axis=1)

In [11]:
labels

In [12]:
joined = pd.concat((df, submission_df))

In [13]:
joined['num_capital'] = joined['comment_text'].str.count('[A-Z]')

In [14]:
joined['comment_text'] = joined['comment_text'].str.replace('\n\n', '')

In [15]:
joined['comment_text'] = joined['comment_text'].str.lower()

In [16]:
joined['comment_text'] = joined['comment_text'].str.strip()

In [17]:
joined['elipses'] = joined.comment_text.str.contains(r'\.{3,}').astype(np.int8)

In [18]:
joined['num_explanation'] = joined.comment_text.str.count('!')

In [19]:
joined['quotes'] = joined.comment_text.str.count('"')

In [20]:
joined['your_mom'] = joined.comment_text.str.contains('your mom').astype(np.int8)

In [21]:
joined['comment_length'] = joined.comment_text.str.len()

In [22]:
joined['comment_text'] = joined.comment_text.str.replace(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '')

In [23]:
joined['capital_ratio'] = joined['num_capital'] / joined['comment_length']

In [24]:
joined['num_question'] = joined.comment_text.str.count(r'\?')

In [25]:
joined['slur'] = joined.comment_text.str.contains('niggers?|jews?|cunts?').astype(np.int8)

In [26]:
joined['comment_text'] = joined['comment_text'].str.replace(r'f\*ck|f\*\*k', 'fuck')

In [27]:
tfvectorizer = TfidfVectorizer(max_features=2**15, ngram_range=(1,2),
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)

In [28]:
tfvectorizer.fit(joined.comment_text)

In [29]:
tfidf_feats = tfvectorizer.transform(joined.comment_text)

In [30]:
tfidf_train = tfidf_feats[:df.shape[0], :]

In [31]:
tfidf_test = tfidf_feats[df.shape[0]:, :]

In [32]:
train = joined.iloc[:df.shape[0], :]

In [33]:
train.shape, df.shape

In [34]:
submit = joined.iloc[df.shape[0]:, :]

In [35]:
ids = submit.pop('id')

In [36]:
submit = submit.drop('comment_text', axis=1)

In [37]:
submit_sparse = csr_matrix(submit)

In [38]:
train = train.drop(['id', 'comment_text'], axis=1)

In [39]:
train_sparse = csr_matrix(train)

In [40]:
train_sparse.shape

In [41]:
tfidf_train.shape

In [42]:
train_merged = hstack((train_sparse, tfidf_train))

In [43]:
submit_merged = hstack((submit_sparse, tfidf_test))

In [44]:
labels

In [45]:
train_merged.shape, labels.toxic.shape

In [46]:
train_merged = csr_matrix(train_merged)

In [None]:
# Training matrix is created without labels; they are attached per target via set_label.
dm = xgb.DMatrix(data=train_merged)
# Matrix the trained boosters predict on.
dtest = xgb.DMatrix(data=submit_merged)

In [None]:
# Output columns in insertion order: the ids first, then one prediction column per target.
data = OrderedDict([('id', ids)])
for col in label_cols:
    # One independent binary classifier is trained per target column.
    current_class = labels[col]
    # Mean of this target column (the positive rate if the labels are 0/1),
    # used as the booster's starting prediction.
    base_score = current_class.mean()
    params = dict(
        objective='binary:logistic',
        colsample_bytree=.8,
        learning_rate=.1,
        max_depth=8,
        min_child_weight=4,
        nthread=-1,
        base_score=base_score,
    )
    # dm is reused for every target; only its label vector is swapped.
    dm.set_label(current_class)
    booster = xgb.train(params, dm, num_boost_round=300, verbose_eval=False)
    data[col] = booster.predict(dtest)

In [None]:
# Collect the id column and the per-target predictions into one frame.
final = pd.DataFrame(data)
# Write the frame with an explicit column order, a header row and no index column.
final.to_csv(
    'submission.csv',
    columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    header=True,
    index=False,
)