In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import hstack

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor, Pool


pd.options.display.float_format = '{:,.2f}'.format
sns.set_theme()
nltk.download("stopwords");

RANDOM_STATE = 44

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
train_df = pd.read_csv("data/train_dataset_train.csv", index_col=0)
test_df = pd.read_csv("data/test_dataset_test.csv", index_col=0)

train_df["is_train"] = 1
test_df["is_train"] = 0

df = pd.concat([train_df, test_df])

In [19]:
class RBKpreprocessor(BaseEstimator):
    def __init__(self):
        self._vectorizer_tags = TfidfVectorizer()
        self._vectorizer_authors = TfidfVectorizer()
        self._category_ohe = OneHotEncoder()
        self._category_from_title_ohe = OneHotEncoder()
        self._stop_words = stopwords.words("russian")
        self._stemmer = SnowballStemmer("russian")
        self._vectorizer_title = TfidfVectorizer()
        
        
    def _clean_list(self, title):
        return(title.
           replace("[", "").
           replace("]", "").
           replace(".", "").
           replace("'", "").
           replace(",", " ")
          )
    
    def _clean_title(self, title):
        if title.find("\n")>0:
            title = title[0:title.find("\n\n")].lower()
        title = " ".join([self._stemmer.stem(w) for w in title.split() if w not in self._stop_words])
        return title

    def _find_category_in_title(self, title):
        if title.find("\n")>0:
            title = title[title.find("\n\n"):].lower().strip()
        else:
            title = ""
        if "," in title:
            title = title[0:title.index(",")]
        else:
            title = ""

        return title
    
    def fit(self, df):
        self._category_ohe.fit(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        self._vectorizer_tags.fit(tags_clean)
        self._vectorizer_authors.fit(authors_clean);
        
        title_clean = df.title.apply(self._clean_title)
        category_from_title =  df.title.apply(self._find_category_in_title)
        self._category_from_title_ohe.fit(category_from_title.values.reshape(-1,1))
        self._vectorizer_title.fit(title_clean)
        
        return(self)
        
        
    def transform(self, df):
        ctr_zero = (df.ctr == 0)
        ctr_log = np.log(df.ctr)
        mean_ctr_log = np.mean(ctr_log.values, where=(ctr_log != -np.inf))
        ctr_log = np.where(df["ctr"] == 0, mean_ctr_log, ctr_log)
        
        category_sparse = self._category_ohe.transform(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        
        authors_count = authors_clean.apply(lambda x: len(x.split()))
        tags_count = tags_clean.apply(lambda x: len(x.split()))
        
        tags_sparse = self._vectorizer_tags.transform(tags_clean)
        authors_sparse = self._vectorizer_authors.transform(authors_clean)
        
        publish_date = pd.to_datetime(df.publish_date)
        publish_year = publish_date.dt.year * 100 + publish_date.dt.month
        publish_day = publish_date.dt.day
        publish_weekday = publish_date.dt.weekday
        publish_hour = publish_date.dt.hour
        
        title_clean = df.title.apply(self._clean_title)
        title_sparse  = self._vectorizer_title.transform(title_clean)
        
        category_from_title =  df.title.apply(self._find_category_in_title)
        category_from_title_sparse = self._category_from_title_ohe.transform(category_from_title.values.reshape(-1,1))
        
        return hstack([
            ctr_zero.values[:,None],
            ctr_log[:,None],
            category_sparse,
            authors_count.values[:,None],
            tags_count.values[:,None],
            tags_sparse,
            authors_sparse,
            publish_year.values[:,None],
            publish_day.values[:,None],
            publish_weekday.values[:,None],
            publish_hour.values[:,None],
            title_sparse,
            category_from_title_sparse
        ])
        
def my_metric(y_real, preds, detail=True):
    overall = (
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]) + 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]) + 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
    if not detail:
        return overall
    else:
        return (
            overall,
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]), 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]), 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
def write_down_predictions(preds, output_file = "output.csv"):
    solution = pd.read_csv("data\sample_solution.csv")
    solution.iloc[:,1:4] = preds
    solution.to_csv(output_file, index=False)

In [20]:
%%time
preprocess = RBKpreprocessor()
preprocess.fit(df)

Wall time: 3.98 s


RBKpreprocessor()

## Немного поанализируем текст

In [21]:
_stop_words = stopwords.words("russian")
_stemmer = SnowballStemmer("russian")
        
def _clean_title(title):
    if title.find("\n")>0:
        title = title[0:title.find("\n\n")].lower()
    title = " ".join([_stemmer.stem(w) for w in title.split() if w not in _stop_words])
    return title

In [22]:
titles = df.title.apply(_clean_title)

In [23]:
titles.apply(len).describe()

count   10,000.00
mean        49.98
std          9.67
min         10.00
25%         45.00
50%         51.00
75%         55.00
max        135.00
Name: title, dtype: float64

## Локальная валидация

In [26]:
valid_train_df, valid_test_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

In [27]:
%%time
X_train = preprocess.transform(valid_train_df)
y_train = valid_train_df[["views", "depth", "full_reads_percent"]]

X_test = preprocess.transform(valid_test_df)
y_test = valid_test_df[["views", "depth", "full_reads_percent"]]



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 2.77 s


## CatboostRegressor

In [19]:
model = CatBoostRegressor(iterations=20,
                          early_stopping_rounds=3,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          loss_function='MultiRMSE',
                          verbose=True)

In [20]:
%%time
model.fit(X_train, y_train, eval_set=Pool(data=X_test, label=y_test))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.Got unsafe target value = 2.5542e+06 at object #334 of dataset learn
Got unsafe target value = 2.5542e+06 at object #164 of dataset test #0


0:	learn: 85530.7791244	test: 125539.0041544	best: 125539.0041544 (0)	total: 1.79s	remaining: 33.9s
1:	learn: 85024.6884090	test: 125272.6614669	best: 125272.6614669 (1)	total: 7.08s	remaining: 1m 3s
2:	learn: 84356.0603643	test: 125010.3365896	best: 125010.3365896 (2)	total: 12.4s	remaining: 1m 10s
3:	learn: 84014.8571342	test: 124872.1330171	best: 124872.1330171 (3)	total: 17s	remaining: 1m 7s
4:	learn: 83358.3165231	test: 124596.8324311	best: 124596.8324311 (4)	total: 21.6s	remaining: 1m 4s
5:	learn: 82504.1805773	test: 124291.3635973	best: 124291.3635973 (5)	total: 26s	remaining: 1m
6:	learn: 81882.9984517	test: 124203.0536857	best: 124203.0536857 (6)	total: 30.9s	remaining: 57.4s
7:	learn: 81132.5591418	test: 123852.8156653	best: 123852.8156653 (7)	total: 35.1s	remaining: 52.7s
8:	learn: 80584.3516082	test: 123219.5671855	best: 123219.5671855 (8)	total: 39.4s	remaining: 48.1s
9:	learn: 80403.7560937	test: 122968.7477808	best: 122968.7477808 (9)	total: 43.5s	remaining: 43.5s
10:	le

<catboost.core.CatBoostRegressor at 0x239824b1f40>

In [21]:
preds = model.predict(X_test)
my_metric(y_test, preds)

(0.08884265310537756,
 0.033680461713825154,
 0.046408845992440474,
 0.008753345399111934)

In [22]:
model = CatBoostRegressor(iterations=50,
                          early_stopping_rounds=3,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          loss_function='MultiRMSE')

In [23]:
%%time
model.fit(X_train, y_train, eval_set=Pool(data=X_test, label=y_test))
preds = model.predict(X_test)
my_metric(y_test, preds)

Got unsafe target value = 2.5542e+06 at object #334 of dataset learn
Got unsafe target value = 2.5542e+06 at object #164 of dataset test #0


0:	learn: 85530.7791244	test: 125539.0041544	best: 125539.0041544 (0)	total: 1.09s	remaining: 53.3s
1:	learn: 85024.6884090	test: 125272.6614669	best: 125272.6614669 (1)	total: 2.31s	remaining: 55.3s
2:	learn: 84356.0603643	test: 125010.3365896	best: 125010.3365896 (2)	total: 7.32s	remaining: 1m 54s
3:	learn: 84014.8571342	test: 124872.1330171	best: 124872.1330171 (3)	total: 13.5s	remaining: 2m 35s
4:	learn: 83358.3165231	test: 124596.8324311	best: 124596.8324311 (4)	total: 18.3s	remaining: 2m 44s
5:	learn: 82504.1805773	test: 124291.3635973	best: 124291.3635973 (5)	total: 22.7s	remaining: 2m 46s
6:	learn: 81882.9984517	test: 124203.0536857	best: 124203.0536857 (6)	total: 26.8s	remaining: 2m 44s
7:	learn: 81132.5591418	test: 123852.8156653	best: 123852.8156653 (7)	total: 31.6s	remaining: 2m 45s
8:	learn: 80584.3516082	test: 123219.5671855	best: 123219.5671855 (8)	total: 35.8s	remaining: 2m 43s
9:	learn: 80403.7560937	test: 122968.7477808	best: 122968.7477808 (9)	total: 40.2s	remaining:

(0.17971151254219747,
 0.06627755569599025,
 0.09320289876036893,
 0.020231058085838315)

## А если сделать по одному бустинку на каждый выход, и метрику r2 оптимизировать

In [35]:
model1 = CatBoostRegressor(iterations=20,
                          early_stopping_rounds=3,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          eval_metric="R2",
                          loss_function="RMSE")
model2 = model1.copy()
model3 = model1.copy()

In [36]:
%%time
model1.fit(X_train, y_train.iloc[:,0], eval_set=Pool(data=X_test, label=y_test.iloc[:,0]))

Learning rate set to 0.5
0:	learn: 0.1457455	test: 0.0690903	best: 0.0690903 (0)	total: 450ms	remaining: 8.55s
1:	learn: 0.3192358	test: 0.1102355	best: 0.1102355 (1)	total: 867ms	remaining: 7.8s
2:	learn: 0.5451028	test: 0.2205832	best: 0.2205832 (2)	total: 1.28s	remaining: 7.27s
3:	learn: 0.6191707	test: 0.2545967	best: 0.2545967 (3)	total: 1.68s	remaining: 6.71s
4:	learn: 0.7028892	test: 0.3093535	best: 0.3093535 (4)	total: 2.1s	remaining: 6.31s
5:	learn: 0.7255360	test: 0.3272761	best: 0.3272761 (5)	total: 2.58s	remaining: 6.01s
6:	learn: 0.7407832	test: 0.3408906	best: 0.3408906 (6)	total: 3.44s	remaining: 6.4s
7:	learn: 0.7551583	test: 0.3519802	best: 0.3519802 (7)	total: 5.25s	remaining: 7.87s
8:	learn: 0.7656924	test: 0.3593509	best: 0.3593509 (8)	total: 7.54s	remaining: 9.22s
9:	learn: 0.7752481	test: 0.3665365	best: 0.3665365 (9)	total: 9.5s	remaining: 9.5s
10:	learn: 0.8156690	test: 0.4046898	best: 0.4046898 (10)	total: 11.7s	remaining: 9.57s
11:	learn: 0.8353679	test: 0.417

<catboost.core.CatBoostRegressor at 0x23982105a00>

In [37]:
%%time
model2.fit(X_train, y_train.iloc[:,1], eval_set=Pool(data=X_test, label=y_test.iloc[:,1]))

Learning rate set to 0.5
0:	learn: 0.4617745	test: 0.4599649	best: 0.4599649 (0)	total: 803ms	remaining: 15.3s
1:	learn: 0.6096927	test: 0.5611509	best: 0.5611509 (1)	total: 2.1s	remaining: 18.9s
2:	learn: 0.6751672	test: 0.6270159	best: 0.6270159 (2)	total: 3.53s	remaining: 20s
3:	learn: 0.7353214	test: 0.7108072	best: 0.7108072 (3)	total: 5.01s	remaining: 20s
4:	learn: 0.7431803	test: 0.7170892	best: 0.7170892 (4)	total: 6.92s	remaining: 20.8s
5:	learn: 0.7481745	test: 0.7170847	best: 0.7170892 (4)	total: 8.34s	remaining: 19.5s
6:	learn: 0.7532388	test: 0.7217736	best: 0.7217736 (6)	total: 9.8s	remaining: 18.2s
7:	learn: 0.7761962	test: 0.7355344	best: 0.7355344 (7)	total: 11.3s	remaining: 17s
8:	learn: 0.7800880	test: 0.7388071	best: 0.7388071 (8)	total: 12.7s	remaining: 15.5s
9:	learn: 0.7897370	test: 0.7443755	best: 0.7443755 (9)	total: 14.4s	remaining: 14.4s
10:	learn: 0.7932726	test: 0.7466024	best: 0.7466024 (10)	total: 15.7s	remaining: 12.9s
11:	learn: 0.7969714	test: 0.747523

<catboost.core.CatBoostRegressor at 0x239821059a0>

In [38]:
%%time
model3.fit(X_train, y_train.iloc[:,2], eval_set=Pool(data=X_test, label=y_test.iloc[:,2]))

Learning rate set to 0.5
0:	learn: 0.1899170	test: 0.1629200	best: 0.1629200 (0)	total: 1.81s	remaining: 34.5s
1:	learn: 0.2140387	test: 0.1630573	best: 0.1630573 (1)	total: 3.64s	remaining: 32.8s
2:	learn: 0.2328228	test: 0.1627615	best: 0.1630573 (1)	total: 5.54s	remaining: 31.4s
3:	learn: 0.3018429	test: 0.2168630	best: 0.2168630 (3)	total: 6.94s	remaining: 27.7s
4:	learn: 0.3139694	test: 0.2140174	best: 0.2168630 (3)	total: 8.49s	remaining: 25.5s
5:	learn: 0.3501655	test: 0.2402478	best: 0.2402478 (5)	total: 9.99s	remaining: 23.3s
6:	learn: 0.3574774	test: 0.2404505	best: 0.2404505 (6)	total: 11.6s	remaining: 21.5s
7:	learn: 0.3630384	test: 0.2403459	best: 0.2404505 (6)	total: 13.1s	remaining: 19.7s
8:	learn: 0.3931606	test: 0.2643137	best: 0.2643137 (8)	total: 14.5s	remaining: 17.8s
9:	learn: 0.3969076	test: 0.2588657	best: 0.2643137 (8)	total: 16.1s	remaining: 16.1s
10:	learn: 0.4003125	test: 0.2523575	best: 0.2643137 (8)	total: 17.6s	remaining: 14.4s
11:	learn: 0.4278480	test: 0

<catboost.core.CatBoostRegressor at 0x23982105d60>

In [42]:
preds = np.c_[
    model1.predict(X_test),
    model2.predict(X_test),
    model3.predict(X_test)
]
my_metric(y_test, preds)

(0.5000721823769023,
 0.17981116553200197,
 0.2285315081366026,
 0.09172950870829773)

## Выводы
Ну кажется, что оптимизация каждого параметра по отдельности более перспективна, чем считать одну модель на 3 предсказания

> Попробуем сделать обучение модели на этих параметрах, но на 2000 итерациях каждую (В разных ноутбуках)
> А потом можно будет  подобрать оптимальные гиперпараметры  для каждого признака

## Обучение дерева

In [56]:
GOAL_NUM = 0

In [57]:
model = model1 = CatBoostRegressor(iterations=1000,
                          early_stopping_rounds=30,
                          random_seed=RANDOM_STATE,
                          depth=10, 
                          eval_metric="R2",
                          loss_function="RMSE")

In [58]:
model.fit(X_train, 
          y_train.iloc[:,GOAL_NUM], 
          eval_set=Pool(
              data=X_test, 
              label=y_test.iloc[:,GOAL_NUM])
         )

Learning rate set to 0.063811
0:	learn: 0.0197464	test: 0.0088808	best: 0.0088808 (0)	total: 449ms	remaining: 7m 28s
1:	learn: 0.0483360	test: 0.0174482	best: 0.0174482 (1)	total: 862ms	remaining: 7m 10s
2:	learn: 0.0984822	test: 0.0373886	best: 0.0373886 (2)	total: 1.26s	remaining: 6m 58s
3:	learn: 0.1284547	test: 0.0461430	best: 0.0461430 (3)	total: 1.66s	remaining: 6m 52s
4:	learn: 0.1559520	test: 0.0541466	best: 0.0541466 (4)	total: 2.16s	remaining: 7m 10s
5:	learn: 0.1963612	test: 0.0698872	best: 0.0698872 (5)	total: 3.18s	remaining: 8m 46s
6:	learn: 0.2236294	test: 0.0788184	best: 0.0788184 (6)	total: 4.96s	remaining: 11m 43s
7:	learn: 0.2466260	test: 0.0860847	best: 0.0860847 (7)	total: 7.15s	remaining: 14m 46s
8:	learn: 0.2808455	test: 0.1005965	best: 0.1005965 (8)	total: 9.17s	remaining: 16m 49s
9:	learn: 0.3068980	test: 0.1091650	best: 0.1091650 (9)	total: 11.2s	remaining: 18m 26s
10:	learn: 0.3216794	test: 0.1158338	best: 0.1158338 (10)	total: 13s	remaining: 19m 24s
11:	lear

91:	learn: 0.8136352	test: 0.4108559	best: 0.4108559 (91)	total: 3m 7s	remaining: 30m 49s
92:	learn: 0.8143258	test: 0.4113634	best: 0.4113634 (92)	total: 3m 11s	remaining: 31m 7s
93:	learn: 0.8149447	test: 0.4118514	best: 0.4118514 (93)	total: 3m 14s	remaining: 31m 19s
94:	learn: 0.8157295	test: 0.4123655	best: 0.4123655 (94)	total: 3m 18s	remaining: 31m 28s
95:	learn: 0.8164833	test: 0.4130364	best: 0.4130364 (95)	total: 3m 22s	remaining: 31m 44s
96:	learn: 0.8171490	test: 0.4134718	best: 0.4134718 (96)	total: 3m 26s	remaining: 32m
97:	learn: 0.8177680	test: 0.4139413	best: 0.4139413 (97)	total: 3m 29s	remaining: 32m 8s
98:	learn: 0.8217220	test: 0.4167608	best: 0.4167608 (98)	total: 3m 34s	remaining: 32m 34s
99:	learn: 0.8250548	test: 0.4222202	best: 0.4222202 (99)	total: 3m 38s	remaining: 32m 50s
100:	learn: 0.8258059	test: 0.4237355	best: 0.4237355 (100)	total: 3m 42s	remaining: 33m 1s
101:	learn: 0.8265021	test: 0.4241579	best: 0.4241579 (101)	total: 3m 48s	remaining: 33m 32s
102

180:	learn: 0.9013181	test: 0.4831544	best: 0.4831544 (180)	total: 10m 20s	remaining: 46m 46s
181:	learn: 0.9016523	test: 0.4831110	best: 0.4831544 (180)	total: 10m 24s	remaining: 46m 46s
182:	learn: 0.9019094	test: 0.4831376	best: 0.4831544 (180)	total: 10m 29s	remaining: 46m 51s
183:	learn: 0.9021752	test: 0.4832062	best: 0.4832062 (183)	total: 10m 34s	remaining: 46m 55s
184:	learn: 0.9036155	test: 0.4836390	best: 0.4836390 (184)	total: 10m 39s	remaining: 46m 57s
185:	learn: 0.9051868	test: 0.4847144	best: 0.4847144 (185)	total: 10m 45s	remaining: 47m 3s
186:	learn: 0.9054326	test: 0.4847022	best: 0.4847144 (185)	total: 10m 50s	remaining: 47m 6s
187:	learn: 0.9066193	test: 0.4858107	best: 0.4858107 (187)	total: 10m 54s	remaining: 47m 7s
188:	learn: 0.9071732	test: 0.4858064	best: 0.4858107 (187)	total: 10m 59s	remaining: 47m 9s
189:	learn: 0.9073993	test: 0.4858061	best: 0.4858107 (187)	total: 11m 3s	remaining: 47m 9s
190:	learn: 0.9076295	test: 0.4857678	best: 0.4858107 (187)	total:

268:	learn: 0.9309480	test: 0.4948324	best: 0.4948900 (265)	total: 16m 36s	remaining: 45m 7s
269:	learn: 0.9310610	test: 0.4948706	best: 0.4948900 (265)	total: 16m 40s	remaining: 45m 3s
270:	learn: 0.9313749	test: 0.4950632	best: 0.4950632 (270)	total: 16m 44s	remaining: 45m 1s
271:	learn: 0.9314868	test: 0.4950451	best: 0.4950632 (270)	total: 16m 48s	remaining: 44m 58s
272:	learn: 0.9315929	test: 0.4950466	best: 0.4950632 (270)	total: 16m 52s	remaining: 44m 55s
273:	learn: 0.9320425	test: 0.4951222	best: 0.4951222 (273)	total: 16m 56s	remaining: 44m 53s
274:	learn: 0.9321506	test: 0.4951177	best: 0.4951222 (273)	total: 17m	remaining: 44m 50s
275:	learn: 0.9324669	test: 0.4951612	best: 0.4951612 (275)	total: 17m 4s	remaining: 44m 47s
276:	learn: 0.9325707	test: 0.4951943	best: 0.4951943 (276)	total: 17m 8s	remaining: 44m 45s
277:	learn: 0.9327052	test: 0.4951788	best: 0.4951943 (276)	total: 17m 12s	remaining: 44m 42s
278:	learn: 0.9328130	test: 0.4951839	best: 0.4951943 (276)	total: 17

<catboost.core.CatBoostRegressor at 0x2398328de80>

In [59]:
with open(f"models/cb1000-{GOAL_NUM}.pkl", "wb") as pkl_file:
     pickle.dump(model, pkl_file)

> Подгрузим модели, натренированные в других ноутбуках

In [60]:
with open(f"models/cb1000-1.pkl", "rb") as pkl_file:
     model1 = pickle.load(pkl_file)

In [61]:
with open(f"models/cb1000-2.pkl", "rb") as pkl_file:
     model2 = pickle.load(pkl_file)

In [63]:
preds = np.c_[
    model.predict(X_test),
    model1.predict(X_test),
    model2.predict(X_test)
]
my_metric(y_test, preds)

(0.5543378172027033,
 0.19812731291956354,
 0.24570993351030682,
 0.11050057077283289)

> не очень, ну проверим на лидер борде

In [65]:
X_test = preprocess.transform(test_df)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [67]:
preds = np.c_[
    model.predict(X_test),
    model1.predict(X_test),
    model2.predict(X_test)
]


In [68]:
write_down_predictions(preds, "local5. catboost 3 models.csv")

## Результат
Такая модель показала хороший результат, На лидерборде получилось 0.68  
Пока это лучшая из моих моделей  

## Попробуем комбайн
- первый таргет от мультитаргетного леса
- 2-ой и 3-ий таргет от специального кэтбуста

думается мне что это пока это лучшие модели (кетбуст оказался слаб для первого таргета)

In [6]:
X_test = preprocess.transform(test_df)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [9]:
with open("models/rf2000.pkl", "rb") as pkl_file:
    model0 = pickle.load(pkl_file)

In [7]:
with open(f"models/cb1000-1.pkl", "rb") as pkl_file:
     model1 = pickle.load(pkl_file)

In [8]:
with open(f"models/cb1000-2.pkl", "rb") as pkl_file:
     model2 = pickle.load(pkl_file)

In [18]:
preds = np.c_[
    model0.predict(X_test)[:,0],
    model1.predict(X_test),
    model2.predict(X_test)
]


In [19]:
preds.shape

(3000, 3)

In [20]:
write_down_predictions(preds, "local5. rf2000-1 + catboost2-3models.csv")

**Ожидания не оправдались, результат стал заметно хуже на ЛБ**