# Hack&Change 2025

## Трек: Альфа-банк
## Команда: Polar Express

**Попытка улучшить прогноз: псевдо-разметка тестовой выборки и стекинг трёх моделей (CatBoost, LightGBM, XGBoost)**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor, Pool
from itertools import product
from lightgbm import LGBMRegressor
import shap
import json
import joblib
import warnings
warnings.filterwarnings('ignore')

In [3]:
def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Return the mean of ``weights * |y_true - y_pred|``.

    The weighted absolute errors are averaged over the number of
    observations (they are NOT divided by the sum of the weights).

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same length as ``y_true``.
        weights: Array-like of per-observation weights.

    Returns:
        The weighted mean absolute error as a NumPy float.
    """
    # Coerce to plain arrays so the arithmetic is positional: lists would
    # otherwise raise TypeError, and pandas Series with different indices
    # would be aligned by label and silently produce NaN.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)
    return (weights * np.abs(y_true - y_pred)).mean()

In [4]:
# Raw competition tables; both CSV files are semicolon-delimited.
train_df, test_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('hackathon_income_train.csv', 'hackathon_income_test.csv')
)

In [28]:
# Pre-computed feature tables for the train and test sets, stored as parquet.
X_train_final, X_test_final = map(
    pd.read_parquet,
    ("X_train_final.parquet", "X_test_final.parquet"),
)

In [29]:
# Sanity check: (rows, columns) of the train and test feature tables.
X_train_final.shape, X_test_final.shape

((76786, 202), (73214, 202))

In [30]:
# Number of labelled rows, plus the regression target and the per-row
# metric weights taken from the raw training table.
# NOTE(review): assumes train_df rows are in the same order as
# X_train_final — confirm against the feature-building code.
train_size = len(train_df)
y_train, w_train = (train_df[col].values for col in ("target", "w"))


In [31]:
# Hold out 20% of the labelled rows; features, targets and weights are
# split together so they stay row-aligned.
(
    X_tr, X_val,
    y_tr, y_val,
    w_tr, w_val,
) = train_test_split(
    X_train_final,
    y_train,
    w_train,
    random_state=42,
    test_size=0.2,
)

In [10]:
# Restore the previously trained base models from disk.
# LightGBM model: stored with joblib.
lgb_final = joblib.load("lgb_final.pkl")

# CatBoost model: stored in CatBoost's own .cbm format, so it is loaded
# into a freshly constructed regressor.
cat_final = CatBoostRegressor()
cat_final.load_model("cat_final.cbm")

In [32]:
# Load the saved model bundle (the joblib.dump cell further down writes a
# dict to this path).
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# that you produced yourself.
bundle = joblib.load("final_3models.pkl")

### PSEUDO-LABELING + ENSEMBLE

In [None]:
# Generate pseudo-labels for the test set: a fixed 0.35 / 0.65 blend of the
# CatBoost and LightGBM predictions. LightGBM only gets the numeric columns.
test_lgb = lgb_final.predict(X_test_final.select_dtypes(include='number'))
test_cat = cat_final.predict(X_test_final)
test_pseudo = 0.65 * test_lgb + 0.35 * test_cat

In [None]:
# Roughly double the dataset: stack the labelled train rows on top of the
# pseudo-labelled test rows, with a fresh 0..n-1 index.
X_pseudo_full = pd.concat(
    [X_train_final, X_test_final],
    axis=0,
    ignore_index=True,
)
# Numeric-only view for the models that are fed numeric columns only.
X_pseudo_num = X_pseudo_full.select_dtypes(include='number')

# Targets: real labels followed by pseudo-labels.
y_pseudo = np.concatenate([y_train, test_pseudo])
# Weights: original weights for real rows, a constant 0.3 for pseudo rows.
w_pseudo = np.concatenate([w_train, np.full(len(test_pseudo), 0.3)])

In [None]:
# Train new models on the combined (train + pseudo-labelled test) data.
# CatBoost receives all columns, including the categorical ones.
# NOTE(review): cat_features_idx is not defined in any cell visible here —
# presumably it was created in the Kaggle session; confirm before re-running.
cat_pseudo = CatBoostRegressor(
    loss_function="MAE",
    iterations=1200,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=2,
    random_seed=42,
    verbose=200,
)
cat_pseudo.fit(
    X_pseudo_full,
    y_pseudo,
    cat_features=cat_features_idx,
    sample_weight=w_pseudo,
)

In [None]:
# LightGBM on the numeric columns of the combined data, using the same
# per-row weights (pseudo-labelled rows are down-weighted).
lgb_pseudo = LGBMRegressor(
    n_estimators=800,
    learning_rate=0.04,
    max_depth=8,
    reg_alpha=1.0,
    reg_lambda=1.0,
    random_state=42,
    verbose=-1,
)
lgb_pseudo.fit(
    X_pseudo_num,
    y_pseudo,
    sample_weight=w_pseudo,
)

In [None]:
# XGBoost on the numeric columns of the combined data, using the same
# per-row weights (pseudo-labelled rows are down-weighted).
xgb_pseudo = XGBRegressor(
    n_estimators=800,
    learning_rate=0.04,
    max_depth=8,
    reg_alpha=1.0,
    reg_lambda=1.0,
    random_state=42,
)
xgb_pseudo.fit(
    X_pseudo_num,
    y_pseudo,
    sample_weight=w_pseudo,
)

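Эти модели обучались на Kaggle: ячейки обучения выше в этом ноутбуке не запускались (`In [None]`), а ниже используется сохранённый бандл `final_3models.pkl`.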

In [None]:
# Base-model predictions on the hold-out split; these feed the meta-model.
# NOTE(review): despite the "oof_" names these are NOT out-of-fold
# predictions. The three models were fit on X_pseudo_full, which contains
# every row of X_train_final and therefore every row of X_val. Treat any
# score derived from them as optimistic; true OOF predictions need the base
# models to be refit per fold.
X_val_num = X_val.select_dtypes(include='number')

oof_cat, oof_lgb, oof_xgb = (
    fitted.predict(features)
    for fitted, features in (
        (cat_pseudo, X_val),
        (lgb_pseudo, X_val_num),
        (xgb_pseudo, X_val_num),
    )
)


In [None]:
# Stacking: one column per base model, then a weighted Ridge blender on top.
oof_stack = np.column_stack((oof_cat, oof_lgb, oof_xgb))
meta_model = Ridge(alpha=1.0).fit(oof_stack, y_val, sample_weight=w_val)

# NOTE(review): the score below is computed on the same rows the
# meta-model was just fit on, so it is a training-set score.
val_pred = meta_model.predict(oof_stack)
val_wmae = weighted_mean_absolute_error(y_val, val_pred, w_val)
print(f" pseudo + ensemble WMAE val: {val_wmae:.2f}")

In [None]:
# Persist everything needed for inference in one file: the three base
# models, the meta-model, the column bookkeeping and the reported score.
joblib.dump(
    dict(
        cat_model=cat_pseudo,
        lgb_model=lgb_pseudo,
        xgb_model=xgb_pseudo,
        meta_model=meta_model,
        cat_features_idx=cat_features_idx,
        num_features=X_pseudo_num.columns.tolist(),
        val_wmae=val_wmae,
    ),
    "final_3models.pkl",
)

In [None]:
# Reload the bundle written by the joblib.dump cell above, so the inference
# cells below work from the saved artifact.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# that you produced yourself.
bundle = joblib.load("final_3models.pkl")

Финальный сабмит

In [37]:
def predict_pseudo(X, models=None):
    """Predict with the stacked CatBoost + LightGBM + XGBoost ensemble.

    Args:
        X: Feature DataFrame to predict on. The CatBoost model receives all
            columns; the LightGBM and XGBoost models receive only the
            numeric columns.
        models: Optional mapping with the keys ``'cat_model'``,
            ``'lgb_model'``, ``'xgb_model'`` and ``'meta_model'``. Defaults
            to the notebook-level ``bundle``.

    Returns:
        The meta-model's predictions for the rows of ``X``.
    """
    # Existing callers pass only X and rely on the global bundle.
    if models is None:
        models = bundle

    # Use the argument: the previous version ignored X and always predicted
    # on the global X_test_final.
    X_num = X.select_dtypes('number')

    # Column order must match the order used when the meta-model was fit:
    # CatBoost, LightGBM, XGBoost.
    base_preds = np.column_stack([
        models['cat_model'].predict(X),
        models['lgb_model'].predict(X_num),
        models['xgb_model'].predict(X_num),
    ])
    return models['meta_model'].predict(base_preds)

In [38]:
# Predict the test set with the stacked bundle, then print the validation
# WMAE that was stored in the bundle when it was saved.
# NOTE(review): presumably this stored value was measured on rows the base
# models had also been trained on (see the stacking cells above) — treat it
# as optimistic; confirm.
final_pred = predict_pseudo(X_test_final)
print(f"Val WMAE при обучении: {bundle['val_wmae']:.0f}")

Val WMAE при обучении: 15187


In [52]:
# Submission file: test ids paired with the ensemble predictions.
# NOTE(review): assumes test_df rows are in the same order as X_test_final —
# confirm against the feature-building code.
sub3 = test_df[["id"]].assign(target=final_pred)
sub3.to_csv("submission_final_3models.csv", index=False)

In [39]:
# SHAP explainer for the CatBoost base model; SHAP values are computed for
# the first 100 test rows only.
explainer = shap.TreeExplainer(bundle['cat_model'])
shap_values = explainer.shap_values(X_test_final.iloc[:100])

# Add the explainer to the bundle and save it under a separate file name.
bundle.update(explainer=explainer)
joblib.dump(bundle, "final_3models_with_shap.pkl")