# Машинное обучение для текстов. Проект с CatBoost text_features

In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import re
import string
from sklearn.metrics import roc_auc_score

In [None]:
# Load the comments dataset from a CSV file in the working directory;
# later cells read its 'text' and 'toxic' columns.
df = pd.read_csv('toxic_comments.csv')

Применим функцию для чистки текста от лишних символов

In [None]:
def text_cleaning(text):
    """Normalise a raw comment into lowercase text without markup or digits.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop
    HTML-like ``<...>`` tags; turn every remaining non-word character
    (newlines included) into a space; strip underscores; drop every token
    that contains a digit.

    URL and tag removal must run before the generic non-word substitution,
    because that substitution erases the ``://``, ``.``, ``<`` and ``>``
    characters those patterns rely on.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)
    # After the \W pass the only string.punctuation character that can still
    # be present is '_' (it counts as a word character), so this strips it.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [None]:
# Clean every comment in place, element by element.
df['text'] = df['text'].map(text_cleaning)

Разделим данные на обучающую, валидационную и тестовую выборки (60/20/20)

In [None]:
# Keep 60% of the rows for training and hold out 40%; the fixed seed makes
# the split reproducible.
# .copy() detaches each part from df, so adding columns to it later cannot
# trigger pandas' SettingWithCopyWarning.
train, test = (
    part.copy()
    for part in train_test_split(df, test_size=0.4, random_state=42)
)

In [None]:
# Split the held-out rows in half: validation and final test.
# .copy() detaches each part from its parent frame, so adding score and
# prediction columns later cannot trigger pandas' SettingWithCopyWarning.
val, test = (
    part.copy()
    for part in train_test_split(test, test_size=0.5, random_state=42)
)

In [None]:
# Fraction of all rows that ended up in the training split.
train.shape[0] / df.shape[0]

0.5999962399182809

In [None]:
# Fraction of all rows that ended up in the validation split.
val.shape[0] / df.shape[0]

0.19999874663942696

In [None]:
# Fraction of all rows that ended up in the test split.
test.shape[0] / df.shape[0]

0.20000501344229216

In [None]:
# Column-name lists used to slice the frames: the columns passed to CatBoost
# as text features, the model inputs, and the target.
text_features = ['text']
X = list(text_features)  # separate list object with the same column names
y = ['toxic']

Зададим параметры CatBoost и обучим модель

In [None]:
# Keyword arguments for the CatBoostClassifier constructor.
params = dict(
    eval_metric='AUC',
    text_features=text_features,
    task_type='GPU',
    learning_rate=0.1,
    verbose=100,
)

In [None]:
# Build the classifier from the params dict defined above; nothing is
# trained until fit() is called.
cbc = CatBoostClassifier(**params)

In [None]:
# Train on the training split; the validation split is the eval set whose
# metric is reported in the training log.
cbc.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

0:	test: 0.8624353	best: 0.8624353 (0)	total: 83.4ms	remaining: 1m 23s
100:	test: 0.9611021	best: 0.9611021 (100)	total: 1.52s	remaining: 13.6s
200:	test: 0.9645854	best: 0.9645854 (200)	total: 2.7s	remaining: 10.7s
300:	test: 0.9657083	best: 0.9657083 (300)	total: 3.89s	remaining: 9.04s
400:	test: 0.9665153	best: 0.9665297 (392)	total: 5.08s	remaining: 7.58s
500:	test: 0.9671564	best: 0.9671564 (500)	total: 6.28s	remaining: 6.25s
600:	test: 0.9672574	best: 0.9672916 (594)	total: 7.46s	remaining: 4.95s
700:	test: 0.9675327	best: 0.9675825 (688)	total: 8.64s	remaining: 3.68s
800:	test: 0.9676971	best: 0.9676971 (800)	total: 9.81s	remaining: 2.44s
900:	test: 0.9679151	best: 0.9679221 (899)	total: 11s	remaining: 1.21s
999:	test: 0.9678916	best: 0.9679292 (909)	total: 12.2s	remaining: 0us
bestTest = 0.9679291844
bestIteration = 909
Shrink model to first 910 iterations.


<catboost.core.CatBoostClassifier at 0x241d893ec40>

In [None]:
# Store the model score for each training row; column index 1 of
# predict_proba is presumably the probability of the positive class
# (toxic == 1) -- confirm against cbc.classes_.
train['y_score'] = cbc.predict_proba(train[X])[:, 1]

In [None]:
# Store the model score for each test row; column index 1 of predict_proba
# is presumably the probability of the positive class (toxic == 1) --
# confirm against cbc.classes_.
test['y_score'] = cbc.predict_proba(test[X])[:, 1]

ROC-AUC на трейне

In [None]:
# ROC-AUC of the stored scores against the true labels on the training split.
roc_auc_score(y_true=train['toxic'], y_score=train['y_score'])

0.9882368816678246

ROC-AUC на тесте

In [None]:
# ROC-AUC of the stored scores against the true labels on the test split.
roc_auc_score(y_true=test['toxic'], y_score=test['y_score'])

0.9717235506226992

Для F1 найдём оптимальный порог на валидационной выборке

In [None]:
# Store the model score for each validation row; these scores drive the
# threshold search below. Column index 1 of predict_proba is presumably the
# probability of the positive class (toxic == 1) -- confirm against
# cbc.classes_.
val['y_score'] = cbc.predict_proba(val[X])[:, 1]

In [None]:
# Candidate thresholds: 0 followed by every distinct validation score in
# ascending order.
thrs = [0, *sorted(val['y_score'].unique())]

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Evaluate F1 on the validation split for every candidate threshold and
# collect (threshold, f1) pairs.
res = []
for thr in tqdm(thrs):
    # Strict comparison: a score equal to the threshold is predicted as 0.
    val['y_pred'] = val['y_score'].gt(thr).astype(int)
    res.append((thr, f1_score(val['toxic'], val['y_pred'])))

  0%|          | 0/31238 [00:00<?, ?it/s]

In [None]:
# Tabulate the (threshold, f1) pairs collected in res, one row per
# candidate threshold.
f1s = pd.DataFrame(res,columns=['thr','f1'])

In [None]:
# Show every row whose F1 equals the maximum (more than one row on ties).
f1s.loc[f1s['f1'].eq(f1s['f1'].max())]

Unnamed: 0,thr,f1
28169,0.351264,0.796402


В итоге на тесте будет такой F1

In [None]:
# Binarise the test scores with the threshold that maximised F1 on the
# validation split. The value is read from f1s instead of being typed in by
# hand, so it cannot drift from the search result (and a row index cannot be
# mistaken for the threshold). idxmax returns the first row on ties.
test['y_pred'] = (test['y_score'] > f1s.loc[f1s['f1'].idxmax(), 'thr']) * 1

In [None]:
# F1 on the test split for the predictions stored in test['y_pred'].
f1_score(y_true=test['toxic'], y_pred=test['y_pred'])

0.7832188713048671

F1 со стандартным порогом 0.5 

In [None]:
# F1 on the test split using the labels returned directly by cbc.predict.
f1_score(y_true=test['toxic'], y_pred=cbc.predict(test[X]))

0.780084388185654