In [182]:
import numpy as np
import pandas as pd
import xgboost as xgb
import re, string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [183]:
stop_words_list = set(stopwords.words('english'))

In [184]:
sample_submission_set = pd.read_csv("../data/sample_submission.csv")
train_set = pd.read_csv("../data/train.csv")
test_set = pd.read_csv("../data/test.csv")

In [185]:
def prepare_data_set(data_set):
    data_set = data_set.fillna("unknown")
    # data_set['comment_text'] = \
    #     data_set['comment_text'].apply(preprocess_text)
    data_set['total_length'] = \
        data_set['comment_text'].apply(len)
    data_set['is_upper'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(1 for sen in comment if sen.isupper()))
    data_set['is_exclamation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('!'))
    data_set['is_question'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('?'))
    data_set['sum_of_punctuation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '.,;:'))
    data_set['sum_of_another_symbols'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '*&$%'))
    data_set['sum_of_words'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(comment.split()))
    data_set['count_if_unique'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(set(word for word in comment.split())))
    data_set['diff_sums_unique_and_words'] = \
        data_set['count_if_unique'] / data_set['sum_of_words']
    data_set['if_positive'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in (':-)', ':)', ';-)', ';)')))
    return data_set

In [186]:
# def preprocess_text(sen):
#     sentence = sen.lower()
#     # sentence = re.sub('\[.*?\]', '', sentence)
#     # sentence = re.sub('https?://\S+|www\.\S+', '', sentence)
#     # sentence = re.sub('<.*?>+', '', sentence)
#     # sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)
#     # sentence = re.sub('\n', '', sentence)
#     # sentence = re.sub('\w*\d\w*', '', sentence)
#     # Удаление символов пунктуации и специальных символов
#     sentence = re.sub('[^a-zA-Z0-9]', ' ', sentence)
#     # Заменяем любые пробелы на один обычный
#     sentence = re.sub(r'\s+', ' ', sentence)
#     # Удаляем стоп-слова
#     sentence = " ".join([word for word in sentence.split() if word not in stop_words_list])
#     return sentence

In [187]:
train_set = prepare_data_set(train_set)
test_set = prepare_data_set(test_set)

In [188]:
train_set.head(100)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,total_length,is_upper,is_exclamation,is_question,sum_of_punctuation,sum_of_another_symbols,sum_of_words,count_if_unique,diff_sums_unique_and_words,if_positive
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,265,17,0,1,6,0,43,41,0.953488,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112,8,1,0,5,0,17,17,1.000000,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233,4,0,0,4,0,42,39,0.928571,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,626,11,0,0,6,0,113,82,0.725664,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67,2,0,1,3,0,13,13,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,003b9f448ee4a29d,"""\r\n\r\nThanks. I can see that violating clea...",0,0,0,0,0,0,557,25,0,0,7,0,84,72,0.857143,0
96,003bd094feef5263,"""\r\nHi\r\nThanks for our kind words. See you ...",0,0,0,0,0,0,57,4,0,0,1,0,13,12,0.923077,0
97,003caacc6ce6c9e9,Collusion in poker \r\n\r\nThis is regarded as...,0,0,0,0,0,0,138,2,0,1,1,0,23,20,0.869565,0
98,003d77a20601cec1,"Thanks much - however, if it's been resolved, ...",0,0,0,0,0,0,174,8,0,1,4,0,31,29,0.935484,0


In [190]:
token_template = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenizer_template(s):
    return token_template.sub(r' \1 ', s).split()

In [191]:
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2),
                              tokenizer=tokenizer_template,
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1)),
])

In [192]:
numeric_features = ['total_length', 'is_upper', 'is_exclamation', 'is_question',
                    'sum_of_punctuation', 'sum_of_another_symbols', 'sum_of_words',
                    'count_if_unique', 'diff_sums_unique_and_words', 'if_positive']

In [193]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

In [194]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'comment_text'),
        ('num', numeric_transformer, numeric_features),
    ])

In [195]:
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_set,
    train_set[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.2,
    random_state=20)



In [196]:
preprocessor.fit(train_X)



In [197]:
tf_idf_train = preprocessor.transform(train_X)
tf_idf_valid = preprocessor.transform(valid_X)
tf_idf_test = preprocessor.transform(test_set)

In [169]:
columns_toxic = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions = np.zeros((test_set.shape[0], len(columns_toxic)))

In [200]:
def trainByXGBoost(train_X, train_y, test_X, test_y=None,
                   feature_names=None,
                   seed_val=2017,
                   num_rounds=1200):
    param = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'eval_metric': 'auc',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val
    }
    # num_rounds = num_rounds

    params_list = list(param.items())
    xgb_train = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgb_train,'train'), (xgtest, 'test') ]
        model = xgb.train(params_list, xgb_train, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(params_list, xgb_train, num_rounds)

    return model

In [202]:
for i, j in enumerate(columns_toxic):
    print('fit for column '+ j)
    model = trainByXGBoost(tf_idf_train, train_Y[j], tf_idf_valid, valid_Y[j])
    predictions[:, i] = model.predict(xgb.DMatrix(tf_idf_test), ntree_limit = model.best_ntree_limit)

fit for column toxic




Parameters: { "silent" } are not used.

[0]	train-auc:0.63885	test-auc:0.63234
[1]	train-auc:0.68052	test-auc:0.67346
[2]	train-auc:0.75357	test-auc:0.74738
[3]	train-auc:0.75472	test-auc:0.74799
[4]	train-auc:0.75936	test-auc:0.75225
[5]	train-auc:0.76521	test-auc:0.76079
[6]	train-auc:0.83144	test-auc:0.82748
[7]	train-auc:0.83215	test-auc:0.82824
[8]	train-auc:0.86866	test-auc:0.86285
[9]	train-auc:0.87264	test-auc:0.86833
[10]	train-auc:0.87760	test-auc:0.87292
[11]	train-auc:0.87767	test-auc:0.87324
[12]	train-auc:0.88083	test-auc:0.87539
[13]	train-auc:0.88169	test-auc:0.87646
[14]	train-auc:0.88908	test-auc:0.88378
[15]	train-auc:0.89521	test-auc:0.88759
[16]	train-auc:0.90162	test-auc:0.89330
[17]	train-auc:0.90397	test-auc:0.89607
[18]	train-auc:0.90652	test-auc:0.89815
[19]	train-auc:0.90776	test-auc:0.89955
[20]	train-auc:0.90856	test-auc:0.89996
[21]	train-auc:0.91052	test-auc:0.90160
[22]	train-auc:0.91416	test-auc:0.90439
[23]	train-auc:0.91551	test-auc:0.90576
[24]	train



fit for column severe_toxic
Parameters: { "silent" } are not used.

[0]	train-auc:0.75827	test-auc:0.75617
[1]	train-auc:0.85152	test-auc:0.84557
[2]	train-auc:0.90122	test-auc:0.90182
[3]	train-auc:0.90687	test-auc:0.90827
[4]	train-auc:0.90784	test-auc:0.90838
[5]	train-auc:0.91204	test-auc:0.90988
[6]	train-auc:0.92607	test-auc:0.92200
[7]	train-auc:0.92634	test-auc:0.92222
[8]	train-auc:0.92955	test-auc:0.92368
[9]	train-auc:0.93259	test-auc:0.92843
[10]	train-auc:0.93256	test-auc:0.92994
[11]	train-auc:0.93412	test-auc:0.92986
[12]	train-auc:0.93424	test-auc:0.92997
[13]	train-auc:0.93473	test-auc:0.93005
[14]	train-auc:0.93478	test-auc:0.93005
[15]	train-auc:0.93492	test-auc:0.93005
[16]	train-auc:0.94131	test-auc:0.93771
[17]	train-auc:0.94130	test-auc:0.93762
[18]	train-auc:0.94299	test-auc:0.93928
[19]	train-auc:0.94309	test-auc:0.93925
[20]	train-auc:0.94308	test-auc:0.93925
[21]	train-auc:0.94311	test-auc:0.93922
[22]	train-auc:0.94618	test-auc:0.94379
[23]	train-auc:0.94627

In [203]:
submission_samples = pd.read_csv('../data/sample_submission.csv')
sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
submission_output = pd.concat([sample_submission_id, pd.DataFrame(predictions, columns = columns_toxic)], axis=1)
submission_output.to_csv('submission_8.csv', index=False)