Based on this article: https://habr.com/ru/post/555064/

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re

from catboost import CatBoostClassifier, Pool
from catboost.text_processing import Tokenizer, Dictionary

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Import the top-level package so that its version attribute can be inspected.
import catboost
# Bare expression: as the last statement of a notebook cell it displays the
# installed catboost version.
catboost.__version__

# Constants

In [None]:
# Seed reused for row sampling, the train/validation/test split and model training.
SEED = 42

# Target columns; one binary classifier is trained for each of them.
y_label = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Load Data

In [None]:
# Read the competition files; pandas infers zip compression from the ".zip" suffix.
# Labelled training data: provides `comment_text` plus the y_label columns used below.
src = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
# Sample submission, used later as the template that receives the predictions.
ss = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')
# NOTE(review): test_labels is loaded but never used in the visible code.
test_labels = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
# Unlabelled competition test comments that the final models score.
test_src = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

# Preprocessing

In [None]:
# Contraction suffixes are dropped only when they END a word, so an opening
# quote such as "'maybe'" is no longer damaged ("'m" used to match there).
_CONTRACTION_RE = re.compile(r"(?<=\w)(?:n't|'m|'ve|'s)\b")
# Stop words are matched as WHOLE words; the previous substring patterns
# (' a', ' is', ' the', ...) mangled words like "apple", "island" or "there".
_STOPWORD_RE = re.compile(r"\b(?:is|are|have|has|a|the)\b")
# Digits and '%' are deleted outright (neighbouring letters are joined).
_DIGIT_OR_PERCENT_RE = re.compile(r"[0-9%]")
# Everything else that is not an ASCII letter becomes a token separator.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")


def cleanString(comment: str) -> str:
    """Normalise a raw comment into space-separated ASCII-letter tokens.

    Steps, in order:

    1. Strip the contraction suffixes ``n't``, ``'m``, ``'ve`` and ``'s`` from
       the end of words (``"don't"`` -> ``"do"``, ``"it's"`` -> ``"it"``).
    2. Remove the stop words ``is``, ``are``, ``have``, ``has``, ``a`` and
       ``the`` when they appear as whole words (matching is case-sensitive).
    3. Delete digits and ``%`` characters.
    4. Replace every remaining non-letter character (punctuation, newlines,
       non-ASCII symbols) with a space.
    5. Collapse runs of whitespace and trim both ends.

    Args:
        comment: Raw comment text.

    Returns:
        The cleaned text; an empty string if nothing but noise was present.
    """
    comment = _CONTRACTION_RE.sub('', comment)
    # Substitute a space (not '') so the neighbouring words stay separated.
    comment = _STOPWORD_RE.sub(' ', comment)
    # The original called str.replace() with regex patterns here, which is a
    # literal (no-op) replacement; digits are now really deleted. Its
    # repeated-punctuation collapse is omitted because all punctuation is
    # discarded by the next step anyway.
    comment = _DIGIT_OR_PERCENT_RE.sub('', comment)
    comment = _NON_LETTER_RE.sub(' ', comment)
    # Only letters and whitespace remain: split/join collapses and trims.
    return ' '.join(comment.split())

In [None]:
# Run the same cleaning over the labelled data and the competition test data.
for frame in (src, test_src):
    frame['comment_text'] = frame['comment_text'].map(cleanString)

In [None]:
# Display five rows drawn with the fixed SEED, so the same rows are shown on every run.
src.sample(5, random_state=SEED)

# Tokenizer

In [None]:
# Display the comment_text of the row at position 131631 (hard-coded example row).
src.iloc[131631].comment_text

In [None]:
# tokenizer = Tokenizer(lowercasing=True,
#                       number_process_policy=None,
#                       separator_type='BySense',
#                       skip_empty=True,
#                       token_types=['Word'],
#                       sub_tokens_policy='SeveralTokens')

In [None]:
# tokenizer.tokenize(src.iloc[131631].comment_text)

In [None]:
# Text-processing options that are unpacked into every CatBoostClassifier below.
text_proc_param = dict(
    # How raw text is split into tokens.
    tokenizers=[
        dict(
            tokenizer_id='Sense',
            separator_type='BySense',
            lowercasing='True',
            token_types=['Word'],
            sub_tokens_policy='SeveralTokens',
        ),
    ],
    # Token dictionary built from the tokenized text.
    dictionaries=[
        dict(dictionary_id='Word', max_dictionary_size='4000'),
    ],
    # Features computed from the dictionary tokens.
    feature_calcers=['BoW:top_tokens_count=3000'],
)

# Train test split

In [None]:
# Hold out 15% of the labelled rows, then halve that hold-out into a
# validation part and an out-of-sample test part.
df_train, df_holdout = train_test_split(src, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=SEED)

for caption, frame in (
    ('train shape:', df_train),
    ('valid shape:', df_val),
    ('test  shape:', df_test),
):
    print(caption, frame.shape)

In [None]:
def _pools_by_label(frame):
    """Return a {label: Pool} mapping built from ``frame`` for every entry of y_label.

    Each Pool holds the single text feature ``comment_text`` and uses the
    corresponding label column of ``frame`` as its target.
    """
    return {
        label: Pool(frame[['comment_text']], label=frame[label], text_features=['comment_text'])
        for label in y_label
    }


train_pools = _pools_by_label(df_train)
val_pools = _pools_by_label(df_val)
oos_pools = _pools_by_label(df_test)

In [None]:
# Notebook display of the label -> training Pool mapping built above.
train_pools

# Training

In [None]:
# Constructor arguments shared by every per-label classifier; the
# text-processing options are merged in from text_proc_param.
classifier_params = dict(
    learning_rate=0.3,
    task_type='GPU',
    iterations=5000,
    eval_metric='AUC',
    od_type='Iter',
    od_wait=500,
    random_state=SEED,
    **text_proc_param,
)

# One independent binary classifier per target label, keyed by label name.
models = {}
print('===Start Train===')
for label in y_label:
    print('Train model for label', label)
    classifier = CatBoostClassifier(**classifier_params)
    # Register the model before fitting, exactly as the original loop did.
    models[label] = classifier
    classifier.fit(
        train_pools[label],
        eval_set=val_pools[label],
        early_stopping_rounds=500,
        verbose=100,
        use_best_model=True,
    )

In [None]:
# Positive-class probability of each label's model on the out-of-sample split,
# stacked as columns in y_label order (float64, like the original zero-filled buffer).
oos_columns = [
    models[label].predict_proba(oos_pools[label])[:, 1]
    for label in y_label
]
y_pred_oos = np.column_stack(oos_columns).astype(np.float64, copy=False)
print(roc_auc_score(df_test[y_label], y_pred_oos))

# Predict and save

In [None]:
# Pool over the competition test comments; no label is passed because none is needed to predict.
test_pool = Pool(test_src[['comment_text']], text_features=['comment_text'])

In [None]:
# Positive-class probability of each label's model on the competition test set,
# stacked as columns in y_label order (float64, like the original zero-filled buffer).
test_columns = [
    models[label].predict_proba(test_pool)[:, 1]
    for label in y_label
]
y_pred_test = np.column_stack(test_columns).astype(np.float64, copy=False)

In [None]:
# Copy the predicted probabilities into the submission template; the columns of
# y_pred_test follow the order of y_label.
ss[y_label] = y_pred_test

In [None]:
# Preview the first rows of the filled-in submission.
ss.head()

In [None]:
# Write the submission file to the working directory, omitting the DataFrame index.
ss.to_csv('submission.csv', index=False)