In [95]:
import numpy as np
import pandas as pd
import xgboost as xgb
import re, string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [96]:
stop_words_list = set(stopwords.words('english'))

In [97]:
sample_submission_set = pd.read_csv("../data/sample_submission.csv")
train_set = pd.read_csv("../data/train.csv")
test_set = pd.read_csv("../data/test.csv")

In [98]:
def prepare_data_set(data_set):
    data_set = data_set.fillna("unknown")
    data_set['comment_text'] = \
        data_set['comment_text'].apply(preprocess_text)
    data_set['total_length'] = \
        data_set['comment_text'].apply(len)
    data_set['is_upper'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(1 for sen in comment if sen.isupper()))
    data_set['is_exclamation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('!'))
    data_set['is_question'] = \
        data_set['comment_text'].apply(
            lambda comment:
                comment.count('?'))
    data_set['sum_of_punctuation'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '.,;:'))
    data_set['sum_of_another_symbols'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in '*&$%'))
    data_set['sum_of_words'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(comment.split()))
    data_set['count_if_unique'] = \
        data_set['comment_text'].apply(
            lambda comment:
                len(set(word for word in comment.split())))
    data_set['diff_sums_unique_and_words'] = \
        data_set['count_if_unique'] / data_set['sum_of_words']
    data_set['if_positive'] = \
        data_set['comment_text'].apply(
            lambda comment:
                sum(comment.count(word) for word in (':-)', ':)', ';-)', ';)')))
    return data_set

In [99]:
def preprocess_text(sen):
    sentence = sen.lower()
    sentence = re.sub('\[.*?\]', '', sentence)
    sentence = re.sub('https?://\S+|www\.\S+', '', sentence)
    sentence = re.sub('<.*?>+', '', sentence)
    sentence = re.sub('[%s]' % re.escape(string.punctuation), '', sentence)
    sentence = re.sub('\n', '', sentence)
    sentence = re.sub('\w*\d\w*', '', sentence)
    # Удаление символов пунктуации и специальных символов
    sentence = re.sub('[^a-zA-Z0-9]', ' ', sen)
    # Заменяем любые пробелы на один обычный
    sentence = re.sub(r'\s+', ' ', sentence)
    # Удаляем стоп-слова
    sentence = " ".join([word for word in sentence.split() if word not in stop_words_list])
    return sentence

In [100]:
train_set = prepare_data_set(train_set)
test_set = prepare_data_set(test_set)

In [101]:
token_template = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenizer_template(s):
    return token_template.sub(r' \1 ', s).split()

In [102]:
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2),
                              tokenizer=tokenizer_template,
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1)),
])

In [103]:
numeric_features = ['total_length', 'is_upper', 'is_exclamation', 'is_question',
                    'sum_of_punctuation', 'sum_of_another_symbols', 'sum_of_words',
                    'count_if_unique', 'diff_sums_unique_and_words', 'if_positive']

In [104]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

In [105]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'comment_text'),
        ('num', numeric_transformer, numeric_features),
    ])

In [106]:
train_X, valid_X, train_Y, valid_Y = train_test_split(
    train_set,
    train_set[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.2,
    random_state=20)



In [107]:
preprocessor.fit(train_X)



In [108]:
tf_idf_train = preprocessor.transform(train_X)
tf_idf_valid = preprocessor.transform(valid_X)
tf_idf_test = preprocessor.transform(test_set)

In [109]:
columns_toxic = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions = np.zeros((test_set.shape[0], len(columns_toxic)))

In [110]:
def trainByXGBoost(train_X, train_y, test_X, test_y=None,
                   feature_names=None,
                   seed_val=2017,
                   num_rounds=600):
    param = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'eval_metric': 'auc',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val
    }
    # num_rounds = num_rounds

    params_list = list(param.items())
    xgb_train = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgb_train,'train'), (xgtest, 'test') ]
        model = xgb.train(params_list, xgb_train, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(params_list, xgb_train, num_rounds)

    return model

In [111]:
for i, j in enumerate(columns_toxic):
    print('fit for column '+ j)
    model = trainByXGBoost(tf_idf_train, train_Y[j], tf_idf_valid, valid_Y[j])
    predictions[:, i] = model.predict(xgb.DMatrix(tf_idf_test), ntree_limit = model.best_ntree_limit)

fit for column toxic
Parameters: { "silent" } are not used.





[0]	train-auc:0.66118	test-auc:0.66211
[1]	train-auc:0.70037	test-auc:0.69965
[2]	train-auc:0.73412	test-auc:0.73690
[3]	train-auc:0.75774	test-auc:0.75917
[4]	train-auc:0.76269	test-auc:0.76351
[5]	train-auc:0.76376	test-auc:0.76465
[6]	train-auc:0.77513	test-auc:0.77586
[7]	train-auc:0.80436	test-auc:0.80425
[8]	train-auc:0.81055	test-auc:0.81010
[9]	train-auc:0.81070	test-auc:0.81017
[10]	train-auc:0.84692	test-auc:0.84308
[11]	train-auc:0.84737	test-auc:0.84371
[12]	train-auc:0.84810	test-auc:0.84482
[13]	train-auc:0.84860	test-auc:0.84489
[14]	train-auc:0.85321	test-auc:0.84937
[15]	train-auc:0.86394	test-auc:0.85771
[16]	train-auc:0.86679	test-auc:0.86178
[17]	train-auc:0.86954	test-auc:0.86571
[18]	train-auc:0.87004	test-auc:0.86645
[19]	train-auc:0.87870	test-auc:0.87610
[20]	train-auc:0.88111	test-auc:0.87782
[21]	train-auc:0.88554	test-auc:0.88097
[22]	train-auc:0.88793	test-auc:0.88424
[23]	train-auc:0.88773	test-auc:0.88364
[24]	train-auc:0.89056	test-auc:0.88570
[25]	train



fit for column severe_toxic
Parameters: { "silent" } are not used.

[0]	train-auc:0.80870	test-auc:0.81149
[1]	train-auc:0.89002	test-auc:0.88272
[2]	train-auc:0.90913	test-auc:0.90898
[3]	train-auc:0.90968	test-auc:0.91107
[4]	train-auc:0.91134	test-auc:0.91259
[5]	train-auc:0.91864	test-auc:0.91552
[6]	train-auc:0.92011	test-auc:0.92023
[7]	train-auc:0.92286	test-auc:0.92038
[8]	train-auc:0.92291	test-auc:0.92048
[9]	train-auc:0.92373	test-auc:0.92048
[10]	train-auc:0.92862	test-auc:0.92503
[11]	train-auc:0.93107	test-auc:0.92815
[12]	train-auc:0.93383	test-auc:0.93117
[13]	train-auc:0.93809	test-auc:0.94030
[14]	train-auc:0.93812	test-auc:0.94031
[15]	train-auc:0.94002	test-auc:0.94188
[16]	train-auc:0.94004	test-auc:0.94185
[17]	train-auc:0.94046	test-auc:0.94333
[18]	train-auc:0.94055	test-auc:0.94339
[19]	train-auc:0.94270	test-auc:0.94930
[20]	train-auc:0.94270	test-auc:0.94935
[21]	train-auc:0.94877	test-auc:0.95544
[22]	train-auc:0.94882	test-auc:0.95547
[23]	train-auc:0.94886

In [112]:
submission_samples = pd.read_csv('../data/sample_submission.csv')
sample_submission_id = pd.DataFrame({'id': submission_samples["id"]})
submission_output = pd.concat([sample_submission_id, pd.DataFrame(predictions, columns = columns_toxic)], axis=1)
submission_output.to_csv('submission_5.csv', index=False)