In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

In [None]:
# Load the precomputed feature tables.
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
X_train = pd.read_pickle("data/features/x_train.pkl")
X_test = pd.read_pickle("data/features/x_test.pkl")

y_train = X_train["target"]
# Take an explicit copy: this frame gets a "target" column added further down,
# and writing into a bare column selection of X_test raises
# SettingWithCopyWarning.
submission = X_test[["client_num"]].copy()

# Drop the identifier (and, for train, the label); the remaining columns are
# what the models are fitted on.
X_train = X_train.drop(["client_num", "target"], axis=1)
X_test = X_test.drop("client_num", axis=1)

In [None]:
# Columns with object dtype are handed to CatBoost as categorical features.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()

models_list = []  # one fitted model per fold
scores_list = []  # held-out mean absolute error per fold

# NOTE(review): StratifiedKFold stratifies on y_train, which requires a
# discrete target; presumably the target takes only a few distinct values —
# confirm.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() yields lazily, so tqdm cannot infer the number of folds on its own;
# pass `total` explicitly so the bar shows real progress.
for train_index, test_index in tqdm(
    splitter.split(X_train, y_train),
    total=splitter.get_n_splits(),
    desc="CV folds",
):
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAPE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        # Log every 500th iteration instead of every single one; with up to
        # 10000 iterations per fold the default floods the cell output.
        verbose=500,
        eval_metric="MAPE",
        early_stopping_rounds=400,
        task_type="GPU",
    )

    # The held-out fold is used as the evaluation set for early stopping.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    # NOTE(review): the model optimises MAPE but the fold score is MAE, and the
    # same fold drives both early stopping and this score, so the number is
    # likely optimistic — confirm this is intended.
    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and spread of the per-fold scores.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

0:	learn: 0.5777192	test: 0.5775597	best: 0.5775597 (0)	total: 117ms	remaining: 19m 32s
1:	learn: 0.5769365	test: 0.5767360	best: 0.5767360 (1)	total: 162ms	remaining: 13m 30s
2:	learn: 0.5768134	test: 0.5765739	best: 0.5765739 (2)	total: 209ms	remaining: 11m 36s
3:	learn: 0.5767719	test: 0.5765544	best: 0.5765544 (3)	total: 253ms	remaining: 10m 32s
4:	learn: 0.5766604	test: 0.5764465	best: 0.5764465 (4)	total: 300ms	remaining: 9m 58s
5:	learn: 0.5765763	test: 0.5763855	best: 0.5763855 (5)	total: 342ms	remaining: 9m 29s
6:	learn: 0.5765003	test: 0.5763197	best: 0.5763197 (6)	total: 386ms	remaining: 9m 11s
7:	learn: 0.5764022	test: 0.5762024	best: 0.5762024 (7)	total: 428ms	remaining: 8m 54s
8:	learn: 0.5763312	test: 0.5761312	best: 0.5761312 (8)	total: 470ms	remaining: 8m 41s
9:	learn: 0.5762675	test: 0.5760453	best: 0.5760453 (9)	total: 513ms	remaining: 8m 32s
10:	learn: 0.5761883	test: 0.5759676	best: 0.5759676 (10)	total: 559ms	remaining: 8m 27s
11:	learn: 0.5761162	test: 0.5758805	

In [None]:
# Ten highest-importance features of the first fold's model
# (column 0 = importance value, column 1 = feature name).
importance_pairs = list(
    zip(models_list[0].feature_importances_, X_train.columns, strict=False)
)
pd.DataFrame(importance_pairs).sort_values(0, ascending=False).iloc[:10]

Unnamed: 0,0,1
4792,3.388317,time_between_last_and_end_hours
4791,2.564308,time_between_last_and_end_days
2067,1.449367,ts_day_sum_sum__permutation_entropy__dimension...
2470,0.987248,ts_day_std_std__permutation_entropy__dimension...
3751,0.781654,ts_day_max_max__number_peaks__n_1
2882,0.749969,ts_day_mean_mean__permutation_entropy__dimensi...
5859,0.557629,fraction_mcc_code_count_5262
2871,0.454975,ts_day_mean_mean__number_peaks__n_1
6084,0.454288,max
5961,0.396752,fraction_mcc_code_count_6009


In [None]:
# Average the test-set predictions of all fold models: accumulate each model's
# output into a running sum, then divide by the number of models.
n_models = len(models_list)
y_pred = np.zeros(len(X_test))

for fold_model in models_list:
    y_pred += fold_model.predict(X_test)

y_pred /= n_models

In [None]:
# Force predictions into [0, 7].
# NOTE(review): bounds presumably match the valid target range — confirm.
y_pred = y_pred.clip(0, 7)

In [None]:
# Sanity check: smallest and largest prediction after clipping.
print(y_pred.min(), y_pred.max())

0.0 3.5019006481198502


In [None]:
# Attach the predictions with assign(), which returns a new frame. Writing the
# column in place on a frame that was selected out of X_test triggers pandas'
# SettingWithCopyWarning.
submission = submission.assign(target=y_pred)

submission.to_csv("submissions/full_features_catboost_MAPE.csv", index=False)