In [None]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from matplotlib import pyplot as plt
import scipy as sp
from scipy.sparse import csr_matrix
from scipy import sparse

%matplotlib inline

In [None]:
#!pip install catboost

In [None]:
from catboost import Pool
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix
from matplotlib import pyplot as plt
import pickle

In [None]:
# Load the train/test feature tables and their label tables from /content/drive/MyDrive.
# The order of the paths matches the order of the names being assigned.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train_200k_texts.csv',
        '/content/drive/MyDrive/test_50k_texts.csv',
        '/content/drive/MyDrive/ytrain_200k_texts.csv',
        '/content/drive/MyDrive/ytest_50k_texts.csv',
    )
)

In [None]:
# Remove the 'Unnamed: 0' column from both feature frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
X_test = X_test.drop(columns='Unnamed: 0', errors='ignore')
X_train = X_train.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Remove the 'Unnamed: 0' column from both label frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
y_test = y_test.drop(columns='Unnamed: 0', errors='ignore')
y_train = y_train.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Show the column names of the test label frame.
y_test.columns

Index(['topic_le'], dtype='object')

In [None]:
# Show the column names of the train label frame.
y_train.columns

Index(['topic_le'], dtype='object')

In [None]:
# Remove the listed topics from the training data, keeping features and labels aligned.
#
# X_train and y_train are matched row by row, so a single boolean mask is computed
# once from X_train and applied to both. The previous loop fed index *labels* from
# X_train into y_train.iloc (positional): after the first topic was dropped the
# positions shifted, so the second pass removed the wrong label rows.
excluded_topics = ['Библиотека', 'Оружие']

assert len(X_train) == len(y_train), 'X_train and y_train must have the same number of rows'

keep_mask = ~X_train['topic'].isin(excluded_topics)
# Apply the mask positionally to y_train (via a plain array) so the result does not
# depend on the two frames sharing index labels.
y_train = y_train.loc[keep_mask.to_numpy()].reset_index(drop=True)
X_train = X_train.loc[keep_mask].reset_index(drop=True)

# NOTE(review): the test set is not filtered here, so rows of the excluded topics
# can still appear in X_test / y_test — confirm whether that is intended.
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [None]:
# Mapping from category name to its encoded target value, ordered by the encoded value.
dict_topic = {
    name: code
    for name, code in sorted(
        dict(zip(X_test.topic, X_test.topic_le)).items(),
        key=lambda pair: pair[1],
    )
}

In [None]:
# Unigram TF-IDF: learn the vocabulary on the training texts and vectorize them.
vec = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=0.0001,
    max_df=0.8,
)
bow = vec.fit_transform(X_train['text_str'])

In [None]:
# One-hot encode 'seanos' and 'year' (first level of each dropped), then append
# the encoded columns to the TF-IDF matrix and store the result in CSR format.
one_hot_enc = OneHotEncoder(drop='first')
arr_enc = one_hot_enc.fit_transform(X_train[['seanos', 'year']])

X = csr_matrix(sp.sparse.hstack((bow, arr_enc)))

In [None]:
# Build the test feature matrix with the transformers fitted on the training data.
bow_test = vec.transform(X_test['text_str'])

# Use transform, not fit_transform: re-fitting the encoder on the test set would
# derive the one-hot columns from the test categories, so they would not be
# guaranteed to line up with the columns the models were trained on.
# With the encoder's default settings, a category unseen during fit raises an error here.
dummies = one_hot_enc.transform(X_test[['seanos', 'year']])

# CSR format, consistent with the training matrix X.
Xtest = csr_matrix(sp.sparse.hstack((bow_test, dummies)))
Xtest.shape

(41512, 34224)

In [None]:
# (rows, columns) of the training labels.
y_train.shape

(207543, 1)

In [None]:
# (rows, columns) of the training features.
X_train.shape

(207543, 14)

In [None]:
# (rows, columns) of the test labels.
y_test.shape

(41512, 1)

In [None]:
# (rows, columns) of the test features.
X_test.shape

(41512, 14)

Попробуем применить стекинг, используя гиперпараметры, подобранные с помощью Optuna:
Подход использует понятие базовых классификаторов, каждый из которых независимо обучается на некотором (возможно, одном и том же) множестве признаков, а также мета-классификатора, использующего предсказания базовых классификаторов как признаки.
Чтобы избежать переобучения, будем разбивать обучающую выборку на фолды.

Возьмём лучшую модель из чекпоинта 4 и сравним её со стекингом, в котором базовые модели — логистическая регрессия и случайный лес.


In [None]:
# CatBoost classifier that is fitted directly on the DataFrame columns.
clf = CatBoostClassifier(
    n_estimators=1600,
    depth=9,
    random_state=123,
    task_type='GPU',
)

# Columns passed to CatBoost as categorical features and as text features.
cat_cols = ['year', 'day_of_week', 'seanos']
text_cols = ['text_str', 'title_lemmas']


In [None]:
# Fit CatBoost on the selected columns, declaring which of them are text and
# which are categorical features.
clf.fit(
    X=X_train[['text_str', 'title_lemmas', 'date_enc', 'year', 'seanos', 'day_of_week']],
    y=y_train[['topic_le']],
    cat_features=cat_cols,
    text_features=text_cols,
)


Learning rate set to 0.130299
0:	learn: 1.8002995	total: 127ms	remaining: 3m 23s
1:	learn: 1.6142798	total: 239ms	remaining: 3m 10s
2:	learn: 1.4664547	total: 337ms	remaining: 2m 59s
3:	learn: 1.3438438	total: 447ms	remaining: 2m 58s
4:	learn: 1.2682933	total: 545ms	remaining: 2m 53s
5:	learn: 1.2040593	total: 643ms	remaining: 2m 50s
6:	learn: 1.1434679	total: 753ms	remaining: 2m 51s
7:	learn: 1.0920126	total: 863ms	remaining: 2m 51s
8:	learn: 1.0531108	total: 973ms	remaining: 2m 51s
9:	learn: 1.0074971	total: 1.09s	remaining: 2m 53s
10:	learn: 0.9703810	total: 1.21s	remaining: 2m 54s
11:	learn: 0.9399726	total: 1.32s	remaining: 2m 54s
12:	learn: 0.9119222	total: 1.43s	remaining: 2m 54s
13:	learn: 0.8904931	total: 1.53s	remaining: 2m 53s
14:	learn: 0.8672947	total: 1.62s	remaining: 2m 51s
15:	learn: 0.8481109	total: 1.74s	remaining: 2m 52s
16:	learn: 0.8321983	total: 1.84s	remaining: 2m 51s
17:	learn: 0.8155355	total: 1.92s	remaining: 2m 49s
18:	learn: 0.8017095	total: 2.02s	remaining:

<catboost.core.CatBoostClassifier at 0x7f5679764eb0>

In [None]:
# Predict on the test set using the same columns the model was fitted on.
preds = clf.predict(X_test[['text_str', 'title_lemmas', 'date_enc', 'year', 'seanos', 'day_of_week']])

# Evaluate quality: micro-averaged F1 plus a per-class report.
f1 = f1_score(y_test[['topic_le']], preds, average='micro')
print(f'f1-score: {f1}')
# target_names gets an explicit list of category names (dict_topic keys, ordered by
# encoded value). zero_division=0 reports 0.0 for classes without predictions, as
# before, but without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test[['topic_le']], preds,
                            target_names=list(dict_topic), zero_division=0))

f1-score: 0.8642802081325882
                   precision    recall  f1-score   support

   69-я параллель       0.67      0.69      0.68       121
       Библиотека       0.00      0.00      0.00         2
           Бизнес       0.73      0.70      0.72       268
      Бывший СССР       0.87      0.90      0.88      3376
              Дом       0.87      0.85      0.86       797
    Забота о себе       0.80      0.91      0.85       114
         Из жизни       0.75      0.76      0.75      1613
   Интернет и СМИ       0.80      0.78      0.79      2333
             Крым       0.71      0.21      0.32        24
    Культпросвет        1.00      0.17      0.29        12
         Культура       0.87      0.89      0.88      2584
          Легпром       0.00      0.00      0.00         4
              Мир       0.86      0.87      0.86      7048
       Моя страна       0.86      0.91      0.88       221
  Наука и техника       0.92      0.88      0.90      2722
       Нацпроекты       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Base learners of the stack: a random forest and a logistic regression.
estimators = [
    (
        'rf',
        RandomForestClassifier(
            n_estimators=300,
            max_depth=97,
            min_samples_split=120,
            min_samples_leaf=1,
            random_state=42,
        ),
    ),
    (
        'logreg',
        LogisticRegression(
            solver='sag',
            C=2,
            max_iter=500,
            warm_start=True,
            random_state=42,
            n_jobs=-1,
        ),
    ),
]

# Stacking classifier (named `reg`): 10-fold CV over the base learners, with CatBoost
# as the final estimator.
reg = StackingClassifier(
    estimators=estimators,
    final_estimator=CatBoostClassifier(
        n_estimators=1600,
        depth=9,
        random_state=123,
        task_type='GPU',
    ),
    cv=10,
    n_jobs=-1,
)


In [None]:
# Fit the stack on the sparse training matrix. The target is passed as a 1-D Series
# ('topic_le' is the only label column) instead of a one-column DataFrame, which
# made scikit-learn emit DataConversionWarning.
reg.fit(X, y_train['topic_le'])
y_pred = reg.predict(Xtest)

# scikit-learn metrics take (y_true, y_pred) in that order. Micro-F1 is symmetric,
# so the value is the same, but the call now follows the documented signature.
score = f1_score(y_test['topic_le'], y_pred, average="micro")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Learning rate set to 0.130299
0:	learn: 1.5682786	total: 39.9ms	remaining: 1m 3s
1:	learn: 1.3889909	total: 78.6ms	remaining: 1m 2s
2:	learn: 1.2195219	total: 128ms	remaining: 1m 8s
3:	learn: 1.1148448	total: 165ms	remaining: 1m 5s
4:	learn: 1.0398560	total: 203ms	remaining: 1m 4s
5:	learn: 0.9770969	total: 240ms	remaining: 1m 3s
6:	learn: 0.9266571	total: 275ms	remaining: 1m 2s
7:	learn: 0.8819166	total: 313ms	remaining: 1m 2s
8:	learn: 0.8382273	total: 349ms	remaining: 1m 1s
9:	learn: 0.7993682	total: 384ms	remaining: 1m
10:	learn: 0.7668775	total: 417ms	remaining: 1m
11:	learn: 0.7420903	total: 451ms	remaining: 59.6s
12:	learn: 0.7200446	total: 484ms	remaining: 59.1s
13:	learn: 0.6986170	total: 518ms	remaining: 58.7s
14:	learn: 0.6810809	total: 551ms	remaining: 58.2s
15:	learn: 0.6633353	total: 583ms	remaining: 57.7s
16:	learn: 0.6464026	total: 616ms	remaining: 57.3s
17:	learn: 0.6313756	total: 648ms	remaining: 56.9s
18:	learn: 0.6193871	total: 680ms	remaining: 56.5s
19:	learn: 0.60

  y = column_or_1d(y, warn=True)


In [None]:
# Print the micro-F1 of the stacking model and its per-class report.
print(f'f1-score: {score}')
# target_names gets an explicit list of category names (dict_topic keys, ordered by
# encoded value). zero_division=0 reports 0.0 for classes without predictions, as
# before, but without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred,
                            target_names=list(dict_topic), zero_division=0))

f1-score: 0.853078627866641
                   precision    recall  f1-score   support

   69-я параллель       0.72      0.58      0.64       121
       Библиотека       0.00      0.00      0.00         2
           Бизнес       0.72      0.61      0.66       268
      Бывший СССР       0.85      0.89      0.87      3376
              Дом       0.85      0.84      0.85       797
    Забота о себе       0.86      0.82      0.84       114
         Из жизни       0.78      0.71      0.74      1613
   Интернет и СМИ       0.79      0.77      0.78      2333
             Крым       0.75      0.25      0.38        24
    Культпросвет        1.00      0.17      0.29        12
         Культура       0.87      0.90      0.89      2584
          Легпром       0.00      0.00      0.00         4
              Мир       0.84      0.86      0.85      7048
       Моя страна       0.82      0.89      0.85       221
  Наука и техника       0.90      0.89      0.89      2722
       Нацпроекты       0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
