![image.png](attachment:0eec74de-f543-4807-a8f0-15878dcda490.png)

In [None]:
import json

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

In [None]:
# Directory shared by all input files referenced below.
features_dir = "/kaggle/input/alfa-challenge-features"

X_train_path = f"{features_dir}/x_train.pa"
X_test_path = f"{features_dir}/x_test.pa"
features_select_path = f"{features_dir}/selected_features_500.json"

# Output directory for the submission CSVs (note the trailing slash).
sub_path = "/kaggle/working/"

# CatBoost MAE

In [None]:
# Load the train and test tables from parquet.
X_train = pd.read_parquet(X_train_path)
X_test = pd.read_parquet(X_test_path)

# Split off the target and keep the test ids for the submission file.
y_train = X_train["target"]
# .copy() gives `submission` its own data, so adding the "target" column to it
# later does not raise pandas' SettingWithCopyWarning.
submission = X_test[["client_num"]].copy()

# The id and the target must not be fed to the model as features.
X_train = X_train.drop(columns=["client_num", "target"])
X_test = X_test.drop(columns="client_num")

In [None]:
# Every object-dtype column is passed to CatBoost as a categorical feature.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()

models_list = []  # fitted model of each fold, reused later to predict the test set
scores_list = []  # MAE of each fold on its held-out part

# NOTE(review): StratifiedKFold stratifies on y_train, which presumably holds
# discrete values — confirm.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# tqdm wraps the split generator directly and is told the fold count, so the bar
# shows progress out of the total instead of an open-ended iteration counter.
fold_indices = tqdm(splitter.split(X_train, y_train), total=splitter.get_n_splits())
for train_index, test_index in fold_indices:
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        verbose=0,
        eval_metric="MAE",
        early_stopping_rounds=400,
        task_type="GPU",
    )

    # The held-out fold doubles as the early-stopping set, so the fold score
    # below is measured on data that also determined when training stopped.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and standard deviation of the per-fold MAE.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU
Default metric period is 5 because MAE is/are not implemented for GPU


1.2066019270017905 0.006473895923643304


In [None]:
# Ten largest feature importances of the first fold's model
# (column 0: importance, column 1: feature name).
importance_pairs = zip(models_list[0].feature_importances_, X_train.columns, strict=False)
importance_table = pd.DataFrame(importance_pairs)
importance_table.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
4792,4.998593,time_between_last_and_end_hours
4791,3.91473,time_between_last_and_end_days
4652,3.614641,embed_147
3751,2.806967,ts_day_max_max__number_peaks__n_1
2070,2.468993,ts_day_sum_sum__number_peaks__n_1
580,1.699672,"ts_week_sum_sum__agg_linear_trend__attr_""inter..."
4709,1.60981,embed_204
2871,1.42703,ts_day_mean_mean__number_peaks__n_1
4656,1.157532,embed_151
1765,1.01599,"ts_week_max_max__agg_linear_trend__attr_""inter..."


In [None]:
# Average the test-set predictions of all fold models.
fold_sum = sum(
    (fitted.predict(X_test) for fitted in models_list),
    np.zeros(X_test.shape[0]),
)
y_pred = fold_sum / len(models_list)

# Clamp the averaged predictions to [0, 7] and show the resulting range.
y_pred = y_pred.clip(0, 7)
print(y_pred.min(), y_pred.max())

0.0 6.78878977528799


In [None]:
# Attach the averaged predictions to the id column and write them to CSV.
full_features_csv = f"{sub_path}/full_features_catboost.csv"

submission["target"] = y_pred
submission.to_csv(full_features_csv, index=False)

# CatBoost MAPE

In [None]:
# Reload the train and test tables from parquet.
X_train = pd.read_parquet(X_train_path)
X_test = pd.read_parquet(X_test_path)


# Read the list of feature names to keep from the JSON file.
with open(features_select_path, encoding="utf-8") as json_file:
    loaded_dict = json.load(json_file)

selected_features = loaded_dict["selected_features_names"]

# Restrict both tables to the selected features plus the id/target columns.
X_train = X_train[selected_features + ["target", "client_num"]]
X_test = X_test[selected_features + ["client_num"]]

y_train = X_train["target"]
# .copy() gives `submission` its own data, so adding the "target" column to it
# later does not raise pandas' SettingWithCopyWarning.
submission = X_test[["client_num"]].copy()

# The id and the target must not be fed to the model as features.
X_train = X_train.drop(columns=["client_num", "target"])
X_test = X_test.drop(columns="client_num")

In [None]:
# Every object-dtype column is passed to CatBoost as a categorical feature.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()

models_list = []  # fitted model of each fold, reused later to predict the test set
scores_list = []  # MAE of each fold on its held-out part

# NOTE(review): StratifiedKFold stratifies on y_train, which presumably holds
# discrete values — confirm.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# tqdm wraps the split generator directly and is told the fold count, so the bar
# shows progress out of the total instead of an open-ended iteration counter.
fold_indices = tqdm(splitter.split(X_train, y_train), total=splitter.get_n_splits())
for train_index, test_index in fold_indices:
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAPE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        verbose=0,
        eval_metric="MAPE",
        early_stopping_rounds=400,
        task_type="GPU",
    )

    # The held-out fold doubles as the early-stopping set, so the fold score
    # below is measured on data that also determined when training stopped.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    # The fold is scored with MAE even though the model optimises MAPE.
    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and standard deviation of the per-fold MAE.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

1.420796439595278 0.011441009531485608


In [None]:
# Ten largest feature importances of the first fold's model
# (column 0: importance, column 1: feature name).
importance_pairs = zip(models_list[0].feature_importances_, X_train.columns, strict=False)
importance_table = pd.DataFrame(importance_pairs)
importance_table.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
420,3.814436,time_between_last_and_end_hours
419,2.972397,time_between_last_and_end_days
222,2.556143,ts_day_mean_mean__permutation_entropy__dimensi...
292,1.264362,ts_day_max_max__number_peaks__n_1
139,0.974242,ts_day_sum_sum__number_peaks__n_1
498,0.923698,max
495,0.769313,fraction_mcc_code_count_6538
147,0.722812,ts_day_sum_sum__quantile__q_0.9
448,0.714331,fraction_month_count_9
492,0.704222,fraction_mcc_code_count_6009


In [None]:
# Average the test-set predictions of all fold models.
fold_sum = sum(
    (fitted.predict(X_test) for fitted in models_list),
    np.zeros(X_test.shape[0]),
)
y_pred = fold_sum / len(models_list)

In [None]:
# Clamp the averaged predictions to [0, 7] and show the resulting range.
y_pred = y_pred.clip(0, 7)
print(y_pred.min(), y_pred.max())

0.0 4.434838538205958


In [None]:
# Attach the averaged predictions to the id column and write them to CSV.
mape_features_csv = f"{sub_path}/500_features_catboost_MAPE.csv"

submission["target"] = y_pred
submission.to_csv(mape_features_csv, index=False)

# Blend

In [None]:
# Reload both saved prediction files: sub_1 from the MAPE run, sub_2 from the MAE run.
mape_csv = f"{sub_path}/500_features_catboost_MAPE.csv"
mae_csv = f"{sub_path}/full_features_catboost.csv"
sub_1, sub_2 = pd.read_csv(mape_csv), pd.read_csv(mae_csv)

In [None]:
# np.clip could also have been applied to the final blended result
# to reduce the error even further.
# NOTE: this cell overwrites sub_1["target"] with the blend, so running it twice
# blends again; re-run the cell that loads sub_1/sub_2 first.
# Shift the second file's predictions down by 0.5 and clamp them to [0, 7].
shifted_sub_2 = np.clip(sub_2["target"] - 0.5, 0, 7)
# Equal-weight average of the two prediction columns.
sub_1["target"] = 0.5 * sub_1["target"] + 0.5 * shifted_sub_2

In [None]:
# Write the blended predictions as the final submission file.
final_csv = f"{sub_path}/submission.csv"
sub_1.to_csv(final_csv, index=False)