In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

In [None]:
# Load the precomputed feature tables.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on files produced by this project's own pipeline.
X_train = pd.read_pickle("data/features/x_train.pkl")
X_test = pd.read_pickle("data/features/x_test.pkl")

y_train = X_train["target"]
# Take an explicit copy: a "target" column is assigned to this frame later, and
# writing into a slice of X_test would raise pandas' SettingWithCopyWarning.
submission = X_test[["client_num"]].copy()

# Keep only model inputs: drop the identifier (and the label on the train side).
X_train = X_train.drop(columns=["client_num", "target"])
X_test = X_test.drop(columns="client_num")

In [None]:
# Columns with object dtype are passed to CatBoost as categorical features.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()

# One fitted model and one validation MAE per fold.
models_list = []
scores_list = []

# NOTE(review): StratifiedKFold stratifies on y_train, so this presumably
# relies on the target taking a small set of discrete values - confirm.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() returns a generator with no length, so tqdm needs an explicit total
# to render a real progress bar instead of "0it [00:00, ?it/s]".
fold_splits = splitter.split(X_train, y_train)
for train_index, test_index in tqdm(
    fold_splits, total=splitter.get_n_splits(), desc="CV folds"
):
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        # Log every 500 iterations instead of every iteration so the cell
        # output stays readable.
        verbose=500,
        eval_metric="MAE",
        early_stopping_rounds=400,
        task_type="GPU",
    )

    # NOTE(review): the held-out fold drives early stopping and is also the
    # fold scored below, so the reported MAE is likely somewhat optimistic.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and spread of the per-fold validation MAE.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5803157	test: 1.5799920	best: 1.5799920 (0)	total: 140ms	remaining: 23m 18s
1:	total: 200ms	remaining: 16m 40s
2:	total: 257ms	remaining: 14m 16s
3:	total: 312ms	remaining: 12m 58s
4:	total: 390ms	remaining: 12m 58s
5:	learn: 1.5721257	test: 1.5717210	best: 1.5717210 (5)	total: 448ms	remaining: 12m 25s
6:	total: 504ms	remaining: 11m 59s
7:	total: 559ms	remaining: 11m 38s
8:	total: 624ms	remaining: 11m 32s
9:	total: 686ms	remaining: 11m 25s
10:	learn: 1.5642058	test: 1.5635532	best: 1.5635532 (10)	total: 745ms	remaining: 11m 16s
11:	total: 806ms	remaining: 11m 10s
12:	total: 865ms	remaining: 11m 4s
13:	total: 925ms	remaining: 11m
14:	total: 987ms	remaining: 10m 57s
15:	learn: 1.5562902	test: 1.5554651	best: 1.5554651 (15)	total: 1.05s	remaining: 10m 53s
16:	total: 1.11s	remaining: 10m 49s
17:	total: 1.17s	remaining: 10m 47s
18:	total: 1.23s	remaining: 10m 45s
19:	total: 1.29s	remaining: 10m 46s
20:	learn: 1.5485204	test: 1.5475165	best: 1.5475165 (20)	total: 1.36s	remaining:

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5802116	test: 1.5802150	best: 1.5802150 (0)	total: 60.6ms	remaining: 10m 5s
1:	total: 121ms	remaining: 10m 7s
2:	total: 181ms	remaining: 10m 1s
3:	total: 240ms	remaining: 10m
4:	total: 302ms	remaining: 10m 2s
5:	learn: 1.5719272	test: 1.5720738	best: 1.5720738 (5)	total: 367ms	remaining: 10m 10s
6:	total: 423ms	remaining: 10m 3s
7:	total: 482ms	remaining: 10m 2s
8:	total: 538ms	remaining: 9m 57s
9:	total: 593ms	remaining: 9m 52s
10:	learn: 1.5639700	test: 1.5641462	best: 1.5641462 (10)	total: 647ms	remaining: 9m 47s
11:	total: 701ms	remaining: 9m 43s
12:	total: 755ms	remaining: 9m 39s
13:	total: 810ms	remaining: 9m 37s
14:	total: 865ms	remaining: 9m 35s
15:	learn: 1.5559703	test: 1.5562333	best: 1.5562333 (15)	total: 919ms	remaining: 9m 33s
16:	total: 979ms	remaining: 9m 34s
17:	total: 1.03s	remaining: 9m 32s
18:	total: 1.09s	remaining: 9m 31s
19:	total: 1.14s	remaining: 9m 29s
20:	learn: 1.5481348	test: 1.5484880	best: 1.5484880 (20)	total: 1.2s	remaining: 9m 30s
21:	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5802052	test: 1.5804993	best: 1.5804993 (0)	total: 60.7ms	remaining: 10m 6s
1:	total: 121ms	remaining: 10m 2s
2:	total: 177ms	remaining: 9m 49s
3:	total: 231ms	remaining: 9m 36s
4:	total: 286ms	remaining: 9m 30s
5:	learn: 1.5720109	test: 1.5723458	best: 1.5723458 (5)	total: 340ms	remaining: 9m 26s
6:	total: 394ms	remaining: 9m 22s
7:	total: 454ms	remaining: 9m 26s
8:	total: 509ms	remaining: 9m 24s
9:	total: 563ms	remaining: 9m 22s
10:	learn: 1.5640953	test: 1.5645431	best: 1.5645431 (10)	total: 618ms	remaining: 9m 21s
11:	total: 672ms	remaining: 9m 19s
12:	total: 726ms	remaining: 9m 18s
13:	total: 781ms	remaining: 9m 16s
14:	total: 837ms	remaining: 9m 16s
15:	learn: 1.5561232	test: 1.5566572	best: 1.5566572 (15)	total: 889ms	remaining: 9m 14s
16:	total: 949ms	remaining: 9m 17s
17:	total: 1s	remaining: 9m 17s
18:	total: 1.06s	remaining: 9m 16s
19:	total: 1.12s	remaining: 9m 17s
20:	learn: 1.5480580	test: 1.5486974	best: 1.5486974 (20)	total: 1.17s	remaining: 9m 18s
21:	total

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5801244	test: 1.5802033	best: 1.5802033 (0)	total: 57.7ms	remaining: 9m 36s
1:	total: 122ms	remaining: 10m 9s
2:	total: 177ms	remaining: 9m 49s
3:	total: 236ms	remaining: 9m 49s
4:	total: 292ms	remaining: 9m 42s
5:	learn: 1.5718075	test: 1.5721939	best: 1.5721939 (5)	total: 352ms	remaining: 9m 46s
6:	total: 407ms	remaining: 9m 40s
7:	total: 466ms	remaining: 9m 42s
8:	total: 522ms	remaining: 9m 39s
9:	total: 577ms	remaining: 9m 36s
10:	learn: 1.5637733	test: 1.5644270	best: 1.5644270 (10)	total: 630ms	remaining: 9m 32s
11:	total: 690ms	remaining: 9m 34s
12:	total: 745ms	remaining: 9m 32s
13:	total: 799ms	remaining: 9m 30s
14:	total: 853ms	remaining: 9m 27s
15:	learn: 1.5556941	test: 1.5566049	best: 1.5566049 (15)	total: 909ms	remaining: 9m 27s
16:	total: 968ms	remaining: 9m 28s
17:	total: 1.02s	remaining: 9m 26s
18:	total: 1.08s	remaining: 9m 25s
19:	total: 1.13s	remaining: 9m 24s
20:	learn: 1.5475110	test: 1.5487224	best: 1.5487224 (20)	total: 1.19s	remaining: 9m 23s
21:	to

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5802266	test: 1.5802215	best: 1.5802215 (0)	total: 60.6ms	remaining: 10m 6s
1:	total: 126ms	remaining: 10m 28s
2:	total: 183ms	remaining: 10m 8s
3:	total: 241ms	remaining: 10m 2s
4:	total: 296ms	remaining: 9m 52s
5:	learn: 1.5720667	test: 1.5721606	best: 1.5721606 (5)	total: 352ms	remaining: 9m 46s
6:	total: 352ms	remaining: 9m 46s
7:	total: 3.02s	remaining: 1h 11m 58s
8:	total: 3.08s	remaining: 1h 4m 7s
9:	total: 3.13s	remaining: 57m 59s
10:	learn: 1.5641345	test: 1.5643252	best: 1.5643252 (10)	total: 3.13s	remaining: 57m 59s
11:	total: 5.8s	remaining: 1h 36m 37s
12:	total: 5.86s	remaining: 1h 28m 42s
13:	total: 5.92s	remaining: 1h 22m 5s
14:	total: 5.97s	remaining: 1h 16m 28s
15:	learn: 1.5561037	test: 1.5564136	best: 1.5564136 (15)	total: 6.03s	remaining: 1h 11m 38s
16:	total: 6.08s	remaining: 1h 7m 28s
17:	total: 6.14s	remaining: 1h 3m 48s
18:	total: 6.19s	remaining: 1h 35s
19:	total: 6.19s	remaining: 1h 35s
20:	learn: 1.5481991	test: 1.5485826	best: 1.5485826 (20)	tota

In [None]:
# Pair each importance score of the first fold's model with its feature name
# (column 0 = importance, column 1 = feature) and show the ten largest.
first_fold_importances = models_list[0].feature_importances_
importance_pairs = zip(first_fold_importances, X_train.columns, strict=False)
importance_table = pd.DataFrame(importance_pairs)
importance_table.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
4792,4.99316,time_between_last_and_end_hours
4791,3.917853,time_between_last_and_end_days
4652,3.609428,embed_147
3751,2.807856,ts_day_max_max__number_peaks__n_1
2070,2.468145,ts_day_sum_sum__number_peaks__n_1
580,1.696215,"ts_week_sum_sum__agg_linear_trend__attr_""inter..."
4709,1.616346,embed_204
2871,1.428449,ts_day_mean_mean__number_peaks__n_1
4656,1.138719,embed_151
1765,1.018891,"ts_week_max_max__agg_linear_trend__attr_""inter..."


In [None]:
# Ensemble the fold models: add up their test-set predictions, starting from a
# zero vector with one entry per test row, then divide by the number of models.
summed_predictions = sum(
    (fold_model.predict(X_test) for fold_model in models_list),
    np.zeros(X_test.shape[0]),
)

y_pred = summed_predictions / len(models_list)

In [None]:
# Bound the averaged predictions to the closed interval [0, 7].
y_pred = y_pred.clip(0, 7)

In [None]:
# Sanity check: smallest and largest prediction after clipping.
print(y_pred.min(), y_pred.max())

0.0 6.749825182932597


In [None]:
# Attach the predictions with assign() so a new frame is built instead of
# writing into a column slice of X_test (avoids SettingWithCopyWarning).
submission = submission.assign(target=y_pred)

# to_csv does not create missing directories; make sure the output folder
# exists so results are not lost after a long training run.
submission_path = Path("submissions/full_features_catboost.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)