In [None]:
import json

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

In [None]:
# Load the pre-computed feature matrices for the train and test sets.
X_train = pd.read_parquet("data/features/x_train.pa")
X_test = pd.read_parquet("data/features/x_test.pa")

# The JSON file holds the list of feature names to keep under the
# "selected_features_names" key (presumably written by a separate
# feature-selection step -- confirm against that notebook).
with open(
    "data/feature_selection/selected_features_500.json", encoding="utf-8"
) as json_file:
    loaded_dict = json.load(json_file)

selected_features = loaded_dict["selected_features_names"]

# Keep only the selected features plus the bookkeeping columns.
X_train = X_train[selected_features + ["target", "client_num"]]
X_test = X_test[selected_features + ["client_num"]]

y_train = X_train["target"]
# Take an explicit copy: a "target" column is assigned to `submission` in a
# later cell, and writing into a slice of X_test would otherwise raise
# pandas' SettingWithCopyWarning.
submission = X_test[["client_num"]].copy()

# The model must not see the identifier or the label as features.
X_train = X_train.drop(["client_num", "target"], axis=1)
X_test = X_test.drop("client_num", axis=1)

In [None]:
# Columns with object dtype are handed to CatBoost as categorical features.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()

models_list = []  # one fitted model per fold; averaged over in the prediction cell
scores_list = []  # per-fold mean absolute error on the held-out part

# NOTE(review): StratifiedKFold stratifies on y_train, which only works when
# the target takes discrete values -- confirm that this is intended for a
# regression target (plain KFold is the usual choice otherwise).
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The split generator has no length, so `total` is passed explicitly;
# without it tqdm cannot draw a progress bar and only shows "0it".
for train_index, test_index in tqdm(
    splitter.split(X_train, y_train),
    total=splitter.get_n_splits(),
    desc="CV folds",
):
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAPE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        # Log every 500th iteration instead of every single one, so the cell
        # output is not flooded with up to 10000 lines per fold.
        verbose=500,
        eval_metric="MAPE",
        early_stopping_rounds=400,
        task_type="GPU",
    )

    # The held-out fold doubles as the early-stopping set, so the score below
    # is measured on data that influenced the number of trees.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    # NOTE(review): the model optimises MAPE but the fold score is MAE --
    # confirm which metric is the one that matters.
    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

0:	learn: 0.5777369	test: 0.5775848	best: 0.5775848 (0)	total: 25.7ms	remaining: 4m 16s
1:	learn: 0.5768944	test: 0.5766968	best: 0.5766968 (1)	total: 43.3ms	remaining: 3m 36s
2:	learn: 0.5767436	test: 0.5765292	best: 0.5765292 (2)	total: 61.6ms	remaining: 3m 25s
3:	learn: 0.5766350	test: 0.5764321	best: 0.5764321 (3)	total: 80ms	remaining: 3m 19s
4:	learn: 0.5764071	test: 0.5761945	best: 0.5761945 (4)	total: 98.2ms	remaining: 3m 16s
5:	learn: 0.5763806	test: 0.5761920	best: 0.5761920 (5)	total: 120ms	remaining: 3m 19s
6:	learn: 0.5763104	test: 0.5761043	best: 0.5761043 (6)	total: 133ms	remaining: 3m 9s
7:	learn: 0.5762317	test: 0.5760233	best: 0.5760233 (7)	total: 146ms	remaining: 3m 1s
8:	learn: 0.5761430	test: 0.5759419	best: 0.5759419 (8)	total: 160ms	remaining: 2m 58s
9:	learn: 0.5760619	test: 0.5758685	best: 0.5758685 (9)	total: 174ms	remaining: 2m 54s
10:	learn: 0.5759764	test: 0.5757554	best: 0.5757554 (10)	total: 188ms	remaining: 2m 50s
11:	learn: 0.5759226	test: 0.5756951	bes

In [None]:
# Top-10 features of the first fold's model: column 0 holds the importance,
# column 1 the feature name.
first_fold_importances = models_list[0].feature_importances_
importance_table = pd.DataFrame(
    list(zip(first_fold_importances, X_train.columns, strict=False))
)
importance_table.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
420,4.569786,time_between_last_and_end_hours
419,3.268507,time_between_last_and_end_days
222,3.132178,ts_day_mean_mean__permutation_entropy__dimensi...
292,1.572631,ts_day_max_max__number_peaks__n_1
139,1.154484,ts_day_sum_sum__number_peaks__n_1
498,1.074704,max
495,0.860751,fraction_mcc_code_count_6538
486,0.82733,fraction_mcc_code_count_5262
147,0.816102,ts_day_sum_sum__quantile__q_0.9
492,0.798351,fraction_mcc_code_count_6009


In [None]:
# Average the test-set predictions of all fold models.
n_models = len(models_list)
y_pred = np.zeros(len(X_test))

for fold_model in models_list:
    np.add(y_pred, fold_model.predict(X_test), out=y_pred)

y_pred = y_pred / n_models

In [None]:
# Restrict the averaged predictions to the [0, 7] interval
# (presumably the valid target range -- confirm against the task description).
y_pred = y_pred.clip(min=0, max=7)

In [None]:
# Sanity check: smallest and largest prediction after clipping.
print(y_pred.min(), y_pred.max())

0.0 4.399006143457948


In [None]:
# `submission` was created as a column selection of X_test, so in-place item
# assignment triggers pandas' SettingWithCopyWarning. `assign` builds a new
# frame instead, which also keeps the cell safe to re-run.
submission = submission.assign(target=y_pred)

submission.to_csv("submissions/500_features_catboost_MAPE.csv", index=False)