In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import hstack

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor, Pool


pd.options.display.float_format = '{:,.2f}'.format
sns.set_theme()
nltk.download("stopwords");

RANDOM_STATE = 44

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv("data/train_dataset_train.csv", index_col=0)
test_df = pd.read_csv("data/test_dataset_test.csv", index_col=0)

train_df["is_train"] = 1
test_df["is_train"] = 0

df = pd.concat([train_df, test_df])

## Попробуем препроцессор, с добавлением embedings

In [48]:
embedings = np.load("embedings/embedings tiny.npy", allow_pickle=True)

In [49]:
embedings = np.array([x for x in embedings])

In [50]:
class RBKpreprocessor(BaseEstimator):
    def __init__(self):
        self._vectorizer_tags = TfidfVectorizer()
        self._vectorizer_authors = TfidfVectorizer()
        self._category_ohe = OneHotEncoder()
        self._category_from_title_ohe = OneHotEncoder()
        self._stop_words = stopwords.words("russian")
        self._stemmer = SnowballStemmer("russian")
        self._vectorizer_title = TfidfVectorizer()
        self._embedings = embedings
        
        
    def _clean_list(self, title):
        return(title.
           replace("[", "").
           replace("]", "").
           replace(".", "").
           replace("'", "").
           replace(",", " ")
          )
    
    def _clean_title(self, title):
        if title.find("\n")>0:
            title = title[0:title.find("\n\n")].lower()
        title = " ".join([self._stemmer.stem(w) for w in title.split() if w not in self._stop_words])
        return title

    def _find_category_in_title(self, title):
        if title.find("\n")>0:
            title = title[title.find("\n\n"):].lower().strip()
        else:
            title = ""
        if "," in title:
            title = title[0:title.index(",")]
        else:
            title = ""

        return title
    
    def fit(self, df):
        self._category_ohe.fit(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        self._vectorizer_tags.fit(tags_clean)
        self._vectorizer_authors.fit(authors_clean);
        
        title_clean = df.title.apply(self._clean_title)
        category_from_title =  df.title.apply(self._find_category_in_title)
        self._category_from_title_ohe.fit(category_from_title.values.reshape(-1,1))
        self._vectorizer_title.fit(title_clean)
        
        return(self)
        
        
    def transform(self, df, embedings):
        ctr_zero = (df.ctr == 0)
        ctr_log = np.log(df.ctr)
        mean_ctr_log = np.mean(ctr_log.values, where=(ctr_log != -np.inf))
        ctr_log = np.where(df["ctr"] == 0, mean_ctr_log, ctr_log)
        
        category_sparse = self._category_ohe.transform(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        
        authors_count = authors_clean.apply(lambda x: len(x.split()))
        tags_count = tags_clean.apply(lambda x: len(x.split()))
        
        tags_sparse = self._vectorizer_tags.transform(tags_clean)
        authors_sparse = self._vectorizer_authors.transform(authors_clean)
        
        publish_date = pd.to_datetime(df.publish_date)
        publish_year = publish_date.dt.year * 100 + publish_date.dt.month
        publish_day = publish_date.dt.day
        publish_weekday = publish_date.dt.weekday
        publish_hour = publish_date.dt.hour
        
        title_clean = df.title.apply(self._clean_title)
        title_sparse  = self._vectorizer_title.transform(title_clean)
        
        category_from_title =  df.title.apply(self._find_category_in_title)
        category_from_title_sparse = self._category_from_title_ohe.transform(category_from_title.values.reshape(-1,1))
        
        return hstack([
            ctr_zero.values[:,None],
            ctr_log[:,None],
            category_sparse,
            authors_count.values[:,None],
            tags_count.values[:,None],
            tags_sparse,
            authors_sparse,
            publish_year.values[:,None],
            publish_day.values[:,None],
            publish_weekday.values[:,None],
            publish_hour.values[:,None],
            title_sparse,
            category_from_title_sparse,
            embedings
        ])
        
def my_metric(y_real, preds, detail=True):
    overall = (
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]) + 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]) + 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
    if not detail:
        return overall
    else:
        return (
            overall,
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]), 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]), 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
def write_down_predictions(preds, output_file = "output.csv"):
    solution = pd.read_csv("data\sample_solution.csv")
    solution.iloc[:,1:4] = preds
    solution.to_csv(output_file, index=False)

In [51]:
%%time
preprocess = RBKpreprocessor()
preprocess.fit(df)

Wall time: 3.53 s


RBKpreprocessor()

In [52]:
valid_train_df, valid_test_df, embedings_train, embedings_test = train_test_split(
    train_df, 
    embedings[:7000], 
    test_size=0.2, 
    random_state=RANDOM_STATE, 
    shuffle=True)

In [53]:
%%time
X_train = preprocess.transform(valid_train_df, embedings_train)
y_train = valid_train_df[["views", "depth", "full_reads_percent"]]

X_test = preprocess.transform(valid_test_df, embedings_test)
y_test = valid_test_df[["views", "depth", "full_reads_percent"]]



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 2.89 s


In [56]:
X_train.shape, X_test.shape

((5600, 17689), (1400, 17689))

## Попробуем небольшой catboost, будет ли он лучше работать с ембедингами

In [58]:
model = CatBoostRegressor(iterations=20,
                          early_stopping_rounds=3,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          loss_function='MultiRMSE',
                          verbose=True)

In [59]:
%%time
model.fit(X_train, y_train, eval_set=Pool(data=X_test, label=y_test))

Got unsafe target value = 2.5542e+06 at object #334 of dataset learn
Got unsafe target value = 2.5542e+06 at object #164 of dataset test #0


0:	learn: 85541.4617249	test: 125537.2162878	best: 125537.2162878 (0)	total: 2.63s	remaining: 50.1s
1:	learn: 84731.9992927	test: 125414.0482532	best: 125414.0482532 (1)	total: 5.07s	remaining: 45.7s
2:	learn: 83779.2919589	test: 124892.8179410	best: 124892.8179410 (2)	total: 7.7s	remaining: 43.6s
3:	learn: 83289.6734415	test: 124632.0069448	best: 124632.0069448 (3)	total: 8.93s	remaining: 35.7s
4:	learn: 82741.1636623	test: 124633.4067864	best: 124632.0069448 (3)	total: 9.74s	remaining: 29.2s
5:	learn: 82149.5295496	test: 124631.9672215	best: 124631.9672215 (5)	total: 12.3s	remaining: 28.8s
6:	learn: 81754.5316248	test: 124539.8748868	best: 124539.8748868 (6)	total: 14.9s	remaining: 27.8s
7:	learn: 81161.7464590	test: 124324.4408038	best: 124324.4408038 (7)	total: 17.4s	remaining: 26.1s
8:	learn: 80581.7081048	test: 124317.9693731	best: 124317.9693731 (8)	total: 20s	remaining: 24.5s
9:	learn: 80106.6290974	test: 124299.0692449	best: 124299.0692449 (9)	total: 22.6s	remaining: 22.6s
10:

<catboost.core.CatBoostRegressor at 0x28e2a0a8f40>

In [60]:
preds = model.predict(X_test)
my_metric(y_test, preds)

(0.05225998769756309,
 0.01744562962851819,
 0.02765104269898977,
 0.007163315370055134)

## И другую модель (по одному на каждый признак)

In [61]:
model1 = CatBoostRegressor(iterations=20,
                          early_stopping_rounds=3,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          eval_metric="R2",
                          loss_function="RMSE")
model2 = model1.copy()
model3 = model1.copy()

In [62]:
%%time
model1.fit(X_train, y_train.iloc[:,0], eval_set=Pool(data=X_test, label=y_test.iloc[:,0]))

Learning rate set to 0.5
0:	learn: 0.1467092	test: 0.0671466	best: 0.0671466 (0)	total: 888ms	remaining: 16.9s
1:	learn: 0.2522587	test: 0.0657459	best: 0.0671466 (0)	total: 1.76s	remaining: 15.9s
2:	learn: 0.3454789	test: 0.1114195	best: 0.1114195 (2)	total: 2.58s	remaining: 14.6s
3:	learn: 0.4794136	test: 0.1208319	best: 0.1208319 (3)	total: 3.5s	remaining: 14s
4:	learn: 0.6460549	test: 0.1771254	best: 0.1771254 (4)	total: 4.43s	remaining: 13.3s
5:	learn: 0.6767582	test: 0.1764642	best: 0.1771254 (4)	total: 5.29s	remaining: 12.3s
6:	learn: 0.6969219	test: 0.2000917	best: 0.2000917 (6)	total: 6.14s	remaining: 11.4s
7:	learn: 0.7718086	test: 0.2324043	best: 0.2324043 (7)	total: 7.16s	remaining: 10.7s
8:	learn: 0.8122202	test: 0.2582038	best: 0.2582038 (8)	total: 8.1s	remaining: 9.9s
9:	learn: 0.8201114	test: 0.2583382	best: 0.2583382 (9)	total: 8.97s	remaining: 8.97s
10:	learn: 0.8316302	test: 0.2716182	best: 0.2716182 (10)	total: 9.95s	remaining: 8.14s
11:	learn: 0.8370980	test: 0.280

<catboost.core.CatBoostRegressor at 0x28e2a0a89a0>

In [63]:
%%time
model2.fit(X_train, y_train.iloc[:,1], eval_set=Pool(data=X_test, label=y_test.iloc[:,1]))

Learning rate set to 0.5
0:	learn: 0.4405624	test: 0.4225773	best: 0.4225773 (0)	total: 1.39s	remaining: 26.4s
1:	learn: 0.5940880	test: 0.5537544	best: 0.5537544 (1)	total: 2.63s	remaining: 23.7s
2:	learn: 0.6878806	test: 0.6657538	best: 0.6657538 (2)	total: 4.16s	remaining: 23.6s
3:	learn: 0.7465693	test: 0.7283427	best: 0.7283427 (3)	total: 5.6s	remaining: 22.4s
4:	learn: 0.7611980	test: 0.7312734	best: 0.7312734 (4)	total: 6.94s	remaining: 20.8s
5:	learn: 0.7674473	test: 0.7310818	best: 0.7312734 (4)	total: 8.49s	remaining: 19.8s
6:	learn: 0.7790798	test: 0.7333795	best: 0.7333795 (6)	total: 9.89s	remaining: 18.4s
7:	learn: 0.7832381	test: 0.7333558	best: 0.7333795 (6)	total: 11.5s	remaining: 17.2s
8:	learn: 0.7962158	test: 0.7405250	best: 0.7405250 (8)	total: 12.8s	remaining: 15.7s
9:	learn: 0.8007889	test: 0.7403604	best: 0.7405250 (8)	total: 14.2s	remaining: 14.2s
10:	learn: 0.8037543	test: 0.7394252	best: 0.7405250 (8)	total: 15.8s	remaining: 12.9s
11:	learn: 0.8129746	test: 0.

<catboost.core.CatBoostRegressor at 0x28e2a0a8130>

In [64]:
%%time
model3.fit(X_train, y_train.iloc[:,2], eval_set=Pool(data=X_test, label=y_test.iloc[:,2]))

Learning rate set to 0.5
0:	learn: 0.2029518	test: 0.1569214	best: 0.1569214 (0)	total: 1.3s	remaining: 24.6s
1:	learn: 0.2274240	test: 0.1573228	best: 0.1573228 (1)	total: 2.69s	remaining: 24.2s
2:	learn: 0.2948394	test: 0.2000421	best: 0.2000421 (2)	total: 4.27s	remaining: 24.2s
3:	learn: 0.3088192	test: 0.1997155	best: 0.2000421 (2)	total: 5.72s	remaining: 22.9s
4:	learn: 0.3220353	test: 0.2015711	best: 0.2015711 (4)	total: 7.23s	remaining: 21.7s
5:	learn: 0.3306185	test: 0.2020422	best: 0.2020422 (5)	total: 8.84s	remaining: 20.6s
6:	learn: 0.3743167	test: 0.2189132	best: 0.2189132 (6)	total: 10.3s	remaining: 19.2s
7:	learn: 0.3802777	test: 0.2190205	best: 0.2190205 (7)	total: 12.1s	remaining: 18.1s
8:	learn: 0.3845505	test: 0.2204704	best: 0.2204704 (8)	total: 13.6s	remaining: 16.6s
9:	learn: 0.4096838	test: 0.2326239	best: 0.2326239 (9)	total: 15.2s	remaining: 15.2s
10:	learn: 0.4384074	test: 0.2406781	best: 0.2406781 (10)	total: 16.6s	remaining: 13.6s
11:	learn: 0.4517904	test: 0

<catboost.core.CatBoostRegressor at 0x28e2a0a81c0>

## Модель с ембедингами сходится хуже, чем простая модель

Но давайте попрбуем все таки натенировать полную модель, дойти до конца, взяв самый сложный признак.  
В прошлый раз он сошелся до 0.368 на тесте

In [66]:
GOAL_NUM = 2

In [68]:
model = CatBoostRegressor(iterations=1000,
                          early_stopping_rounds=30,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          eval_metric="R2",
                          loss_function="RMSE")

In [69]:
model.fit(X_train, 
          y_train.iloc[:,GOAL_NUM], 
          eval_set=Pool(
              data=X_test, 
              label=y_test.iloc[:,GOAL_NUM])
         )

Learning rate set to 0.063811
0:	learn: 0.0320621	test: 0.0256319	best: 0.0256319 (0)	total: 839ms	remaining: 13m 57s
1:	learn: 0.0587656	test: 0.0473987	best: 0.0473987 (1)	total: 1.68s	remaining: 13m 59s
2:	learn: 0.0801676	test: 0.0649458	best: 0.0649458 (2)	total: 2.54s	remaining: 14m 4s
3:	learn: 0.1070928	test: 0.0866210	best: 0.0866210 (3)	total: 3.49s	remaining: 14m 28s
4:	learn: 0.1300672	test: 0.1040182	best: 0.1040182 (4)	total: 4.45s	remaining: 14m 44s
5:	learn: 0.1473207	test: 0.1180416	best: 0.1180416 (5)	total: 5.33s	remaining: 14m 43s
6:	learn: 0.1637411	test: 0.1293597	best: 0.1293597 (6)	total: 6.29s	remaining: 14m 51s
7:	learn: 0.1819696	test: 0.1415651	best: 0.1415651 (7)	total: 7.19s	remaining: 14m 51s
8:	learn: 0.1932137	test: 0.1502187	best: 0.1502187 (8)	total: 8.08s	remaining: 14m 49s
9:	learn: 0.2075894	test: 0.1592449	best: 0.1592449 (9)	total: 8.95s	remaining: 14m 45s
10:	learn: 0.2110157	test: 0.1595851	best: 0.1595851 (10)	total: 9.82s	remaining: 14m 42s
1

91:	learn: 0.4546281	test: 0.2501681	best: 0.2501681 (91)	total: 1m 53s	remaining: 18m 43s
92:	learn: 0.4573533	test: 0.2509478	best: 0.2509478 (92)	total: 1m 55s	remaining: 18m 45s
93:	learn: 0.4581978	test: 0.2513203	best: 0.2513203 (93)	total: 1m 57s	remaining: 18m 48s
94:	learn: 0.4598664	test: 0.2523300	best: 0.2523300 (94)	total: 1m 58s	remaining: 18m 52s
95:	learn: 0.4628649	test: 0.2540140	best: 0.2540140 (95)	total: 2m	remaining: 18m 52s
96:	learn: 0.4636255	test: 0.2537049	best: 0.2540140 (95)	total: 2m 2s	remaining: 18m 56s
97:	learn: 0.4660885	test: 0.2541316	best: 0.2541316 (97)	total: 2m 3s	remaining: 18m 59s
98:	learn: 0.4706278	test: 0.2563724	best: 0.2563724 (98)	total: 2m 5s	remaining: 19m 3s
99:	learn: 0.4730283	test: 0.2565096	best: 0.2565096 (99)	total: 2m 7s	remaining: 19m 3s
100:	learn: 0.4752582	test: 0.2565648	best: 0.2565648 (100)	total: 2m 8s	remaining: 19m 7s
101:	learn: 0.4788289	test: 0.2582381	best: 0.2582381 (101)	total: 2m 10s	remaining: 19m 8s
102:	lea

180:	learn: 0.6119870	test: 0.3231008	best: 0.3231008 (180)	total: 4m 18s	remaining: 19m 29s
181:	learn: 0.6147054	test: 0.3233322	best: 0.3233322 (181)	total: 4m 20s	remaining: 19m 29s
182:	learn: 0.6162701	test: 0.3243587	best: 0.3243587 (182)	total: 4m 21s	remaining: 19m 28s
183:	learn: 0.6165633	test: 0.3243137	best: 0.3243587 (182)	total: 4m 23s	remaining: 19m 28s
184:	learn: 0.6176619	test: 0.3247917	best: 0.3247917 (184)	total: 4m 25s	remaining: 19m 27s
185:	learn: 0.6189204	test: 0.3253321	best: 0.3253321 (185)	total: 4m 26s	remaining: 19m 27s
186:	learn: 0.6208413	test: 0.3258090	best: 0.3258090 (186)	total: 4m 28s	remaining: 19m 27s
187:	learn: 0.6228256	test: 0.3264861	best: 0.3264861 (187)	total: 4m 30s	remaining: 19m 27s
188:	learn: 0.6230527	test: 0.3266659	best: 0.3266659 (188)	total: 4m 31s	remaining: 19m 25s
189:	learn: 0.6258945	test: 0.3264985	best: 0.3266659 (188)	total: 4m 33s	remaining: 19m 24s
190:	learn: 0.6279221	test: 0.3263601	best: 0.3266659 (188)	total: 4m 

269:	learn: 0.7593231	test: 0.3486852	best: 0.3488952 (268)	total: 6m 44s	remaining: 18m 13s
270:	learn: 0.7615173	test: 0.3484120	best: 0.3488952 (268)	total: 6m 45s	remaining: 18m 11s
271:	learn: 0.7621505	test: 0.3488178	best: 0.3488952 (268)	total: 6m 47s	remaining: 18m 10s
272:	learn: 0.7627833	test: 0.3489982	best: 0.3489982 (272)	total: 6m 49s	remaining: 18m 9s
273:	learn: 0.7641677	test: 0.3484384	best: 0.3489982 (272)	total: 6m 50s	remaining: 18m 8s
274:	learn: 0.7662862	test: 0.3481706	best: 0.3489982 (272)	total: 6m 52s	remaining: 18m 6s
275:	learn: 0.7675107	test: 0.3484287	best: 0.3489982 (272)	total: 6m 53s	remaining: 18m 5s
276:	learn: 0.7692957	test: 0.3481328	best: 0.3489982 (272)	total: 6m 55s	remaining: 18m 4s
277:	learn: 0.7699222	test: 0.3483699	best: 0.3489982 (272)	total: 6m 56s	remaining: 18m 2s
278:	learn: 0.7709958	test: 0.3483122	best: 0.3489982 (272)	total: 6m 58s	remaining: 18m 1s
279:	learn: 0.7734078	test: 0.3479908	best: 0.3489982 (272)	total: 6m 59s	rem

358:	learn: 0.8468884	test: 0.3557192	best: 0.3560615 (353)	total: 9m 7s	remaining: 16m 16s
359:	learn: 0.8472155	test: 0.3559244	best: 0.3560615 (353)	total: 9m 8s	remaining: 16m 15s
360:	learn: 0.8480276	test: 0.3559337	best: 0.3560615 (353)	total: 9m 10s	remaining: 16m 14s
361:	learn: 0.8487716	test: 0.3561777	best: 0.3561777 (361)	total: 9m 12s	remaining: 16m 13s
362:	learn: 0.8488487	test: 0.3561976	best: 0.3561976 (362)	total: 9m 13s	remaining: 16m 11s
363:	learn: 0.8506572	test: 0.3557766	best: 0.3561976 (362)	total: 9m 15s	remaining: 16m 10s
364:	learn: 0.8507855	test: 0.3557845	best: 0.3561976 (362)	total: 9m 17s	remaining: 16m 9s
365:	learn: 0.8518732	test: 0.3559560	best: 0.3561976 (362)	total: 9m 18s	remaining: 16m 7s
366:	learn: 0.8529007	test: 0.3557905	best: 0.3561976 (362)	total: 9m 20s	remaining: 16m 6s
367:	learn: 0.8533429	test: 0.3559955	best: 0.3561976 (362)	total: 9m 22s	remaining: 16m 5s
368:	learn: 0.8541493	test: 0.3565337	best: 0.3565337 (368)	total: 9m 23s	re

446:	learn: 0.8972975	test: 0.3591486	best: 0.3592648 (445)	total: 11m 33s	remaining: 14m 17s
447:	learn: 0.8975233	test: 0.3591941	best: 0.3592648 (445)	total: 11m 35s	remaining: 14m 16s
448:	learn: 0.8982753	test: 0.3591448	best: 0.3592648 (445)	total: 11m 36s	remaining: 14m 15s
449:	learn: 0.8988890	test: 0.3587999	best: 0.3592648 (445)	total: 11m 38s	remaining: 14m 14s
450:	learn: 0.8993862	test: 0.3588244	best: 0.3592648 (445)	total: 11m 40s	remaining: 14m 12s
451:	learn: 0.9000718	test: 0.3590212	best: 0.3592648 (445)	total: 11m 42s	remaining: 14m 11s
452:	learn: 0.9002790	test: 0.3590086	best: 0.3592648 (445)	total: 11m 44s	remaining: 14m 10s
453:	learn: 0.9006182	test: 0.3588765	best: 0.3592648 (445)	total: 11m 45s	remaining: 14m 8s
454:	learn: 0.9009350	test: 0.3589021	best: 0.3592648 (445)	total: 11m 47s	remaining: 14m 7s
455:	learn: 0.9015096	test: 0.3588148	best: 0.3592648 (445)	total: 11m 49s	remaining: 14m 5s
456:	learn: 0.9018976	test: 0.3588649	best: 0.3592648 (445)	tot

<catboost.core.CatBoostRegressor at 0x28e2a257790>

In [74]:
model2 = model

In [71]:
GOAL_NUM = 0

In [72]:
model0 = CatBoostRegressor(iterations=1000,
                          early_stopping_rounds=30,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          eval_metric="R2",
                          loss_function="RMSE")

In [73]:
model0.fit(X_train, 
          y_train.iloc[:,GOAL_NUM], 
          eval_set=Pool(
              data=X_test, 
              label=y_test.iloc[:,GOAL_NUM])
         )

Learning rate set to 0.063811
0:	learn: 0.0198965	test: 0.0086408	best: 0.0086408 (0)	total: 883ms	remaining: 14m 42s
1:	learn: 0.0380604	test: 0.0087190	best: 0.0087190 (1)	total: 1.73s	remaining: 14m 21s
2:	learn: 0.0564822	test: 0.0087558	best: 0.0087558 (2)	total: 2.61s	remaining: 14m 28s
3:	learn: 0.0910003	test: 0.0185691	best: 0.0185691 (3)	total: 3.5s	remaining: 14m 32s
4:	learn: 0.1083865	test: 0.0272397	best: 0.0272397 (4)	total: 4.48s	remaining: 14m 51s
5:	learn: 0.1248559	test: 0.0271430	best: 0.0272397 (4)	total: 5.38s	remaining: 14m 50s
6:	learn: 0.1411084	test: 0.0351500	best: 0.0351500 (6)	total: 6.25s	remaining: 14m 46s
7:	learn: 0.1688225	test: 0.0358593	best: 0.0358593 (7)	total: 7.14s	remaining: 14m 45s
8:	learn: 0.1833760	test: 0.0358999	best: 0.0358999 (8)	total: 8.05s	remaining: 14m 46s
9:	learn: 0.1974070	test: 0.0359155	best: 0.0359155 (9)	total: 8.94s	remaining: 14m 45s
10:	learn: 0.2239785	test: 0.0440600	best: 0.0440600 (10)	total: 9.85s	remaining: 14m 45s
1

91:	learn: 0.8149441	test: 0.3305907	best: 0.3305907 (91)	total: 2m 14s	remaining: 22m 12s
92:	learn: 0.8160765	test: 0.3306072	best: 0.3306072 (92)	total: 2m 16s	remaining: 22m 15s
93:	learn: 0.8209533	test: 0.3344960	best: 0.3344960 (93)	total: 2m 18s	remaining: 22m 14s
94:	learn: 0.8223606	test: 0.3346317	best: 0.3346317 (94)	total: 2m 20s	remaining: 22m 15s
95:	learn: 0.8242512	test: 0.3366609	best: 0.3366609 (95)	total: 2m 21s	remaining: 22m 15s
96:	learn: 0.8287706	test: 0.3386705	best: 0.3386705 (96)	total: 2m 23s	remaining: 22m 16s
97:	learn: 0.8297179	test: 0.3396108	best: 0.3396108 (97)	total: 2m 25s	remaining: 22m 17s
98:	learn: 0.8306154	test: 0.3397658	best: 0.3397658 (98)	total: 2m 27s	remaining: 22m 17s
99:	learn: 0.8314458	test: 0.3406479	best: 0.3406479 (99)	total: 2m 28s	remaining: 22m 18s
100:	learn: 0.8321608	test: 0.3415211	best: 0.3415211 (100)	total: 2m 30s	remaining: 22m 16s
101:	learn: 0.8329896	test: 0.3424140	best: 0.3424140 (101)	total: 2m 31s	remaining: 22m

180:	learn: 0.9212908	test: 0.4096431	best: 0.4096431 (180)	total: 4m 37s	remaining: 20m 57s
181:	learn: 0.9215318	test: 0.4096431	best: 0.4096431 (180)	total: 4m 39s	remaining: 20m 55s
182:	learn: 0.9217513	test: 0.4096423	best: 0.4096431 (180)	total: 4m 41s	remaining: 20m 54s
183:	learn: 0.9219774	test: 0.4096344	best: 0.4096431 (180)	total: 4m 42s	remaining: 20m 53s
184:	learn: 0.9225522	test: 0.4101159	best: 0.4101159 (184)	total: 4m 44s	remaining: 20m 51s
185:	learn: 0.9227559	test: 0.4101128	best: 0.4101159 (184)	total: 4m 45s	remaining: 20m 49s
186:	learn: 0.9236096	test: 0.4101546	best: 0.4101546 (186)	total: 4m 47s	remaining: 20m 48s
187:	learn: 0.9239331	test: 0.4101742	best: 0.4101742 (187)	total: 4m 48s	remaining: 20m 46s
188:	learn: 0.9241696	test: 0.4101889	best: 0.4101889 (188)	total: 4m 50s	remaining: 20m 45s
189:	learn: 0.9251650	test: 0.4107912	best: 0.4107912 (189)	total: 4m 51s	remaining: 20m 44s
190:	learn: 0.9255758	test: 0.4108514	best: 0.4108514 (190)	total: 4m 

269:	learn: 0.9504072	test: 0.4215644	best: 0.4215644 (269)	total: 7m 2s	remaining: 19m 1s
270:	learn: 0.9505148	test: 0.4215694	best: 0.4215694 (270)	total: 7m 3s	remaining: 18m 59s
271:	learn: 0.9511191	test: 0.4221337	best: 0.4221337 (271)	total: 7m 5s	remaining: 18m 58s
272:	learn: 0.9511974	test: 0.4221274	best: 0.4221337 (271)	total: 7m 6s	remaining: 18m 57s
273:	learn: 0.9512778	test: 0.4221140	best: 0.4221337 (271)	total: 7m 8s	remaining: 18m 55s
274:	learn: 0.9513489	test: 0.4221066	best: 0.4221337 (271)	total: 7m 10s	remaining: 18m 54s
275:	learn: 0.9523157	test: 0.4217049	best: 0.4221337 (271)	total: 7m 11s	remaining: 18m 52s
276:	learn: 0.9526409	test: 0.4217038	best: 0.4221337 (271)	total: 7m 13s	remaining: 18m 51s
277:	learn: 0.9527185	test: 0.4217036	best: 0.4221337 (271)	total: 7m 15s	remaining: 18m 50s
278:	learn: 0.9533933	test: 0.4220249	best: 0.4221337 (271)	total: 7m 16s	remaining: 18m 48s
279:	learn: 0.9536226	test: 0.4222022	best: 0.4222022 (279)	total: 7m 18s	re

358:	learn: 0.9668852	test: 0.4272909	best: 0.4273447 (354)	total: 9m 32s	remaining: 17m 1s
359:	learn: 0.9669663	test: 0.4272836	best: 0.4273447 (354)	total: 9m 33s	remaining: 17m
360:	learn: 0.9670038	test: 0.4272882	best: 0.4273447 (354)	total: 9m 35s	remaining: 16m 58s
361:	learn: 0.9670435	test: 0.4272818	best: 0.4273447 (354)	total: 9m 37s	remaining: 16m 57s
362:	learn: 0.9672111	test: 0.4273063	best: 0.4273447 (354)	total: 9m 38s	remaining: 16m 55s
363:	learn: 0.9672831	test: 0.4273059	best: 0.4273447 (354)	total: 9m 40s	remaining: 16m 54s
364:	learn: 0.9673213	test: 0.4272998	best: 0.4273447 (354)	total: 9m 42s	remaining: 16m 52s
365:	learn: 0.9673889	test: 0.4273058	best: 0.4273447 (354)	total: 9m 43s	remaining: 16m 51s
366:	learn: 0.9678731	test: 0.4277406	best: 0.4277406 (366)	total: 9m 45s	remaining: 16m 49s
367:	learn: 0.9679128	test: 0.4277367	best: 0.4277406 (366)	total: 9m 46s	remaining: 16m 47s
368:	learn: 0.9679499	test: 0.4277367	best: 0.4277406 (366)	total: 9m 48s	r

446:	learn: 0.9747705	test: 0.4289942	best: 0.4290719 (440)	total: 11m 55s	remaining: 14m 45s
447:	learn: 0.9749835	test: 0.4290011	best: 0.4290719 (440)	total: 11m 57s	remaining: 14m 43s
448:	learn: 0.9750545	test: 0.4291362	best: 0.4291362 (448)	total: 11m 58s	remaining: 14m 42s
449:	learn: 0.9750791	test: 0.4291335	best: 0.4291362 (448)	total: 12m	remaining: 14m 40s
450:	learn: 0.9751797	test: 0.4293249	best: 0.4293249 (450)	total: 12m 2s	remaining: 14m 38s
451:	learn: 0.9752043	test: 0.4293269	best: 0.4293269 (451)	total: 12m 3s	remaining: 14m 37s
452:	learn: 0.9752444	test: 0.4293401	best: 0.4293401 (452)	total: 12m 5s	remaining: 14m 35s
453:	learn: 0.9752676	test: 0.4293371	best: 0.4293401 (452)	total: 12m 6s	remaining: 14m 34s
454:	learn: 0.9753161	test: 0.4293998	best: 0.4293998 (454)	total: 12m 8s	remaining: 14m 32s
455:	learn: 0.9753395	test: 0.4293996	best: 0.4293998 (454)	total: 12m 10s	remaining: 14m 30s
456:	learn: 0.9753725	test: 0.4293931	best: 0.4293998 (454)	total: 12

534:	learn: 0.9804092	test: 0.4308916	best: 0.4308916 (534)	total: 14m 21s	remaining: 12m 29s
535:	learn: 0.9804238	test: 0.4308907	best: 0.4308916 (534)	total: 14m 23s	remaining: 12m 27s
536:	learn: 0.9804517	test: 0.4308896	best: 0.4308916 (534)	total: 14m 25s	remaining: 12m 25s
537:	learn: 0.9806668	test: 0.4307843	best: 0.4308916 (534)	total: 14m 26s	remaining: 12m 24s
538:	learn: 0.9807652	test: 0.4307681	best: 0.4308916 (534)	total: 14m 28s	remaining: 12m 22s
539:	learn: 0.9807812	test: 0.4307667	best: 0.4308916 (534)	total: 14m 30s	remaining: 12m 21s
540:	learn: 0.9808582	test: 0.4308156	best: 0.4308916 (534)	total: 14m 31s	remaining: 12m 19s
541:	learn: 0.9808929	test: 0.4308010	best: 0.4308916 (534)	total: 14m 33s	remaining: 12m 18s
542:	learn: 0.9809085	test: 0.4308012	best: 0.4308916 (534)	total: 14m 35s	remaining: 12m 16s
543:	learn: 0.9809694	test: 0.4308821	best: 0.4308916 (534)	total: 14m 36s	remaining: 12m 15s
544:	learn: 0.9810968	test: 0.4308773	best: 0.4308916 (534)	

622:	learn: 0.9850362	test: 0.4322752	best: 0.4323753 (609)	total: 16m 57s	remaining: 10m 15s
623:	learn: 0.9851233	test: 0.4323753	best: 0.4323753 (609)	total: 16m 59s	remaining: 10m 14s
624:	learn: 0.9851343	test: 0.4323792	best: 0.4323792 (624)	total: 17m	remaining: 10m 12s
625:	learn: 0.9851449	test: 0.4323791	best: 0.4323792 (624)	total: 17m 2s	remaining: 10m 10s
626:	learn: 0.9851554	test: 0.4323775	best: 0.4323792 (624)	total: 17m 4s	remaining: 10m 9s
627:	learn: 0.9852087	test: 0.4324050	best: 0.4324050 (627)	total: 17m 5s	remaining: 10m 7s
628:	learn: 0.9852514	test: 0.4324049	best: 0.4324050 (627)	total: 17m 7s	remaining: 10m 5s
629:	learn: 0.9853643	test: 0.4323042	best: 0.4324050 (627)	total: 17m 8s	remaining: 10m 4s
630:	learn: 0.9854517	test: 0.4324113	best: 0.4324113 (630)	total: 17m 10s	remaining: 10m 2s
631:	learn: 0.9854616	test: 0.4324091	best: 0.4324113 (630)	total: 17m 12s	remaining: 10m 1s
632:	learn: 0.9855718	test: 0.4326559	best: 0.4326559 (632)	total: 17m 13s	

711:	learn: 0.9889344	test: 0.4335565	best: 0.4335565 (711)	total: 19m 28s	remaining: 7m 52s
712:	learn: 0.9889459	test: 0.4335536	best: 0.4335565 (711)	total: 19m 29s	remaining: 7m 50s
713:	learn: 0.9889532	test: 0.4335539	best: 0.4335565 (711)	total: 19m 31s	remaining: 7m 49s
714:	learn: 0.9889708	test: 0.4335512	best: 0.4335565 (711)	total: 19m 33s	remaining: 7m 47s
715:	learn: 0.9890451	test: 0.4335675	best: 0.4335675 (715)	total: 19m 34s	remaining: 7m 46s
716:	learn: 0.9890523	test: 0.4335665	best: 0.4335675 (715)	total: 19m 36s	remaining: 7m 44s
717:	learn: 0.9890624	test: 0.4335661	best: 0.4335675 (715)	total: 19m 38s	remaining: 7m 42s
718:	learn: 0.9891413	test: 0.4334411	best: 0.4335675 (715)	total: 19m 39s	remaining: 7m 41s
719:	learn: 0.9891482	test: 0.4334407	best: 0.4335675 (715)	total: 19m 41s	remaining: 7m 39s
720:	learn: 0.9891603	test: 0.4334400	best: 0.4335675 (715)	total: 19m 42s	remaining: 7m 37s
721:	learn: 0.9891936	test: 0.4334407	best: 0.4335675 (715)	total: 19m

800:	learn: 0.9916084	test: 0.4344199	best: 0.4344199 (800)	total: 21m 48s	remaining: 5m 25s
801:	learn: 0.9916143	test: 0.4344185	best: 0.4344199 (800)	total: 21m 50s	remaining: 5m 23s
802:	learn: 0.9916533	test: 0.4344477	best: 0.4344477 (802)	total: 21m 52s	remaining: 5m 21s
803:	learn: 0.9916803	test: 0.4344571	best: 0.4344571 (803)	total: 21m 53s	remaining: 5m 20s
804:	learn: 0.9917005	test: 0.4344519	best: 0.4344571 (803)	total: 21m 55s	remaining: 5m 18s
805:	learn: 0.9917521	test: 0.4344443	best: 0.4344571 (803)	total: 21m 56s	remaining: 5m 16s
806:	learn: 0.9917573	test: 0.4344437	best: 0.4344571 (803)	total: 21m 58s	remaining: 5m 15s
807:	learn: 0.9917641	test: 0.4344441	best: 0.4344571 (803)	total: 21m 59s	remaining: 5m 13s
808:	learn: 0.9918118	test: 0.4344199	best: 0.4344571 (803)	total: 22m 1s	remaining: 5m 12s
809:	learn: 0.9918501	test: 0.4344023	best: 0.4344571 (803)	total: 22m 3s	remaining: 5m 10s
810:	learn: 0.9918550	test: 0.4344031	best: 0.4344571 (803)	total: 22m 4

889:	learn: 0.9935444	test: 0.4350472	best: 0.4350472 (889)	total: 24m 11s	remaining: 2m 59s
890:	learn: 0.9936057	test: 0.4350357	best: 0.4350472 (889)	total: 24m 13s	remaining: 2m 57s
891:	learn: 0.9936094	test: 0.4350345	best: 0.4350472 (889)	total: 24m 15s	remaining: 2m 56s
892:	learn: 0.9936130	test: 0.4350326	best: 0.4350472 (889)	total: 24m 17s	remaining: 2m 54s
893:	learn: 0.9936454	test: 0.4350243	best: 0.4350472 (889)	total: 24m 18s	remaining: 2m 52s
894:	learn: 0.9936509	test: 0.4350247	best: 0.4350472 (889)	total: 24m 20s	remaining: 2m 51s
895:	learn: 0.9936721	test: 0.4350490	best: 0.4350490 (895)	total: 24m 21s	remaining: 2m 49s
896:	learn: 0.9936891	test: 0.4350820	best: 0.4350820 (896)	total: 24m 23s	remaining: 2m 48s
897:	learn: 0.9937267	test: 0.4350403	best: 0.4350820 (896)	total: 24m 25s	remaining: 2m 46s
898:	learn: 0.9937553	test: 0.4350243	best: 0.4350820 (896)	total: 24m 26s	remaining: 2m 44s
899:	learn: 0.9937599	test: 0.4350238	best: 0.4350820 (896)	total: 24m

<catboost.core.CatBoostRegressor at 0x28e2a0a8700>

In [77]:
with open(f"models/cb1000-1.pkl", "rb") as pkl_file:
     model1 = pickle.load(pkl_file)

In [79]:
preds = np.c_[
    model0.predict(X_test),
    model1.predict(X_test),
    model2.predict(X_test)
]
my_metric(y_test, preds)

(0.5276241169486666,
 0.17404969864039557,
 0.24570993351030682,
 0.10786448479796427)

## Проверим на лидерборде

In [80]:
X_test = preprocess.transform(test_df, embedings[7000:])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [81]:
preds = np.c_[
    model0.predict(X_test),
    model1.predict(X_test),
    model2.predict(X_test)
]


In [82]:
write_down_predictions(preds, "local4. catboost 3 models + embedings.csv")