In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

In [None]:
# Load the pre-computed feature tables; the training table also carries the label column.
X_train = pd.read_parquet("data/features/x_train.pa")
X_test = pd.read_parquet("data/features/x_test.pa")

y_train = X_train["target"]
# Take an explicit copy: a column subset of X_test is tracked by pandas as a
# derived frame, so adding a "target" column to it later would otherwise raise
# SettingWithCopyWarning.
submission = X_test[["client_num"]].copy()

# Remove client_num (presumably a row identifier, not a feature - TODO confirm)
# from both matrices, and the label from the training matrix.
X_train = X_train.drop(columns=["client_num", "target"])
X_test = X_test.drop(columns="client_num")

In [None]:
# Every object-dtype column is handed to CatBoost as a categorical feature.
CAT_FEATURES = X_train.select_dtypes("object").columns.to_list()

models_list = []  # one fitted model per fold
scores_list = []  # MAE of each model on its held-out fold

# NOTE(review): StratifiedKFold stratifies on y_train, which presumably holds
# discrete values here - confirm, since a continuous target is rejected.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_indices = splitter.split(X_train, y_train)

# `split` returns a generator with no length, so `total` is passed explicitly;
# without it tqdm can only show an iteration counter ("0it").
for train_index, test_index in tqdm(
    fold_indices, total=splitter.get_n_splits(), desc="CV folds"
):
    X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
    X_fold_test, y_fold_test = X_train.iloc[test_index], y_train.iloc[test_index]

    model = CatBoostRegressor(
        iterations=10000,
        loss_function="MAE",
        cat_features=CAT_FEATURES,
        learning_rate=0.03,
        depth=4,
        # Log every 500th iteration instead of every iteration, so the cell
        # output stays readable over thousands of boosting rounds.
        verbose=500,
        eval_metric="MAE",
        early_stopping_rounds=400,
        task_type="GPU",
    )

    # The held-out fold doubles as the early-stopping set, so the MAE computed
    # on it below is an optimistic estimate of the generalisation error.
    model.fit(X_fold_train, y_fold_train, eval_set=(X_fold_test, y_fold_test))
    preds = model.predict(X_fold_test)

    score = mean_absolute_error(y_fold_test, preds)

    models_list.append(model)
    scores_list.append(score)

# Mean and standard deviation of the per-fold MAE.
print(np.mean(scores_list), np.std(scores_list))

0it [00:00, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 1.5803156	test: 1.5799922	best: 1.5799922 (0)	total: 142ms	remaining: 23m 41s
1:	total: 205ms	remaining: 17m 6s
2:	total: 250ms	remaining: 13m 53s
3:	total: 296ms	remaining: 12m 20s
4:	total: 341ms	remaining: 11m 21s
5:	learn: 1.5721261	test: 1.5717208	best: 1.5717208 (5)	total: 390ms	remaining: 10m 50s
6:	total: 435ms	remaining: 10m 20s
7:	total: 480ms	remaining: 9m 59s
8:	total: 529ms	remaining: 9m 47s
9:	total: 529ms	remaining: 9m 47s
10:	learn: 1.5642059	test: 1.5635533	best: 1.5635533 (10)	total: 575ms	remaining: 9m 34s
11:	total: 617ms	remaining: 9m 20s
12:	total: 662ms	remaining: 9m 11s
13:	total: 711ms	remaining: 9m 5s
14:	total: 762ms	remaining: 9m 3s
15:	learn: 1.5562906	test: 1.5554654	best: 1.5554654 (15)	total: 806ms	remaining: 8m 56s
16:	total: 849ms	remaining: 8m 49s
17:	total: 896ms	remaining: 8m 46s
18:	total: 940ms	remaining: 8m 41s
19:	total: 986ms	remaining: 8m 37s
20:	learn: 1.5485202	test: 1.5475165	best: 1.5475165 (20)	total: 3.64s	remaining: 30m 18s
21

KeyboardInterrupt: 

Exception ignored in: '_catboost._WriteLog'
Traceback (most recent call last):
  File "/home/seara/Desktop/Github/alfa-challenge/.venv/lib/python3.10/site-packages/ipykernel/iostream.py", line 655, in write
    def write(self, string: str) -> Optional[int]:  # type:ignore[override]
KeyboardInterrupt: 


4110:	learn: 1.1638443	test: 1.2056738	best: 1.2056738 (4110)	total: 9m 46s	remaining: 14m 33s
4111:	total: 9m 46s	remaining: 14m 33s
4112:	total: 9m 49s	remaining: 14m 37s
4113:	total: 9m 49s	remaining: 14m 37s
4114:	total: 9m 49s	remaining: 14m 37s
4115:	learn: 1.1637896	test: 1.2056701	best: 1.2056701 (4115)	total: 9m 49s	remaining: 14m 36s
4116:	total: 9m 49s	remaining: 14m 36s
4117:	total: 9m 52s	remaining: 14m 39s
4118:	total: 9m 52s	remaining: 14m 39s
4119:	total: 9m 52s	remaining: 14m 39s
4120:	learn: 1.1637232	test: 1.2056638	best: 1.2056638 (4120)	total: 9m 52s	remaining: 14m 38s
4121:	total: 9m 52s	remaining: 14m 38s
4122:	total: 9m 52s	remaining: 14m 38s
4123:	total: 9m 52s	remaining: 14m 37s
4124:	total: 9m 52s	remaining: 14m 37s
4125:	learn: 1.1636572	test: 1.2056443	best: 1.2056443 (4125)	total: 9m 52s	remaining: 14m 37s
4126:	total: 9m 52s	remaining: 14m 37s
4127:	total: 9m 52s	remaining: 14m 36s
4128:	total: 9m 52s	remaining: 14m 36s
4129:	total: 9m 52s	remaining: 14m 

In [None]:
# Ten most important features according to the first fold's model.
# Column 0 holds the importance value, column 1 the feature name.
first_model = models_list[0]
importance_rows = list(
    zip(first_model.feature_importances_, X_train.columns, strict=False)
)
importance_table = pd.DataFrame(importance_rows)
importance_table.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0,1
4792,4.99316,time_between_last_and_end_hours
4791,3.917853,time_between_last_and_end_days
4652,3.609428,embed_147
3751,2.807856,ts_day_max_max__number_peaks__n_1
2070,2.468145,ts_day_sum_sum__number_peaks__n_1
580,1.696215,"ts_week_sum_sum__agg_linear_trend__attr_""inter..."
4709,1.616346,embed_204
2871,1.428449,ts_day_mean_mean__number_peaks__n_1
4656,1.138719,embed_151
1765,1.018891,"ts_week_max_max__agg_linear_trend__attr_""inter..."


In [None]:
# Average the test-set predictions of all fold models.
if not models_list:
    # Without this guard an empty list (e.g. after an interrupted training
    # cell) would divide 0 by 0 and silently fill every prediction with NaN.
    raise ValueError("models_list is empty - run the cross-validation cell first")

y_pred = np.zeros(X_test.shape[0])

for model in models_list:
    y_pred += model.predict(X_test)

y_pred /= len(models_list)

In [None]:
# Clamp predictions to [0, 7] (presumably the valid target range - TODO confirm).
y_pred = y_pred.clip(min=0, max=7)

In [None]:
# Sanity check: show the smallest and largest predicted value.
print(y_pred.min(), y_pred.max())

0.0 6.749825182932597


In [None]:
# Build a new frame with the predictions rather than assigning a column in
# place: `submission` was selected out of X_test, and mutating such a derived
# frame triggers pandas' SettingWithCopyWarning.
submission = submission.assign(target=y_pred)

# Write the submission file without the index column.
submission.to_csv("submissions/full_features_catboost.csv", index=False)