# Хакатон: Прогноз доходов (5-Fold Ensemble)

**Стратегия победы:**
1. **K-Fold Cross-Validation:** Обучаем 5 моделей на разных разбиениях обучающей выборки, чтобы снизить дисперсию предсказаний.
2. **Ensembling:** Усредняем предсказания 5 моделей. Это должно сократить разрыв между локальной валидацией и лидербордом.
3. **High Regularization:** `l2_leaf_reg=15` (L2-регуляризация CatBoost) для борьбы с переобучением.

In [None]:
!pip install catboost shap pandas scikit-learn numpy

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import json
import gc

# WMAE metric
def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Return the mean of the per-sample absolute errors scaled by ``weights``.

    The result is the plain mean of ``weights * |y_true - y_pred|``; it is
    NOT divided by the sum of the weights.

    Args:
        y_true: Ground-truth values (array-like supporting arithmetic).
        y_pred: Predicted values, aligned with ``y_true``.
        weights: Per-sample weights, aligned with ``y_true``.

    Returns:
        The weighted mean absolute error as a scalar.
    """
    weighted_abs_err = np.abs(y_true - y_pred) * weights
    return weighted_abs_err.mean()

In [None]:
print("Загрузка данных...")
# Both hackathon files are ';'-separated and use ',' as the decimal mark.
try:
    train_df = pd.read_csv('hackathon_income_train.csv', decimal=',', sep=';', low_memory=False)
    test_df = pd.read_csv('hackathon_income_test.csv', decimal=',', sep=';', low_memory=False)
except FileNotFoundError:
    print("ОШИБКА: Загрузите файлы csv!")
    # Re-raise so execution stops here: without both frames every later
    # cell would fail with a misleading NameError on train_df / test_df.
    raise

Загрузка данных...


In [None]:
# Features dropped before modelling (kept as-is; this selection is known to work)
USELESS_FEATURES = [
    'addrref',
    'city_smart_name',
    'dp_ewb_last_employment_position',
    'client_active_flag',
    'vert_has_app_ru_tinkoff_investing',
    'dp_ewb_dismissal_due_contract_violation_by_lb_cnt',
    'period_last_act_ad',
    'ovrd_sum',
    'businessTelSubs',
    'dp_ils_days_ip_share_5y',
    'nonresident_flag',
    'vert_has_app_ru_vtb_invest',
    'hdb_bki_total_pil_cnt',
    'accountsalary_out_flag',
    # Row identifier and date columns
    'id',
    'dt',
]

In [None]:
# Preprocessing function (the same one is applied to train and test)
def preprocess_data(df, is_train=True):
    """Clean a raw frame and prepare it for CatBoost.

    Steps, in order:
      1. Drop every column listed in ``USELESS_FEATURES`` that is present.
      2. Convert object columns with more than 50 unique values to numbers
         (spaces removed, ',' replaced by '.'); unparseable values become NaN.
      3. Add ``<col>_is_missing`` 0/1 indicators for selected columns.
      4. (train only) Coerce ``target`` and ``w`` to numeric, drop rows with
         a missing target, clip target at 0, fill missing weights with 0 and
         clip them at 0, then split both off from the features.
      5. Fill NaN with 0 in non-object columns whose name contains one of
         the amount/count-like keywords.
      6. Fill NaN with "MISSING" in the remaining object columns.

    Args:
        df: Raw input frame; it is copied and never modified in place.
        is_train: When True the frame must contain ``target`` and ``w``.

    Returns:
        ``(features, y, w)`` when ``is_train`` is True, otherwise only the
        feature frame (with ``target`` / ``w`` removed if they were present).
    """
    df_proc = df.copy()

    # 1. Drop the junk columns
    cols_to_drop = [c for c in USELESS_FEATURES if c in df_proc.columns]
    df_proc = df_proc.drop(columns=cols_to_drop, errors='ignore')

    # 2. Numbers stored as text
    # NOTE(review): the >50-unique rule is evaluated on the frame passed in,
    # so separate calls for train and test could type the same column
    # differently — confirm on the real data.
    object_cols = df_proc.select_dtypes(include='object').columns
    for col in object_cols:
        if df_proc[col].nunique() > 50:
            try:
                temp_col = (
                    df_proc[col]
                    .astype(str)
                    .str.replace(' ', '', regex=False)
                    .str.replace(',', '.', regex=False)
                )
                df_proc[col] = pd.to_numeric(temp_col, errors='coerce')
            except (TypeError, ValueError):
                # Best effort: leave the column untouched if it cannot be
                # converted, but never swallow KeyboardInterrupt/SystemExit.
                continue

    # 3. Missing-value indicators for the key salary features
    important_nans = ['salary_6to12m_avg', 'first_salary_income']
    for col in important_nans:
        if col in df_proc.columns:
            df_proc[f'{col}_is_missing'] = df_proc[col].isna().astype(int)

    if is_train:
        # Clean the target and the sample weights
        df_proc['target'] = pd.to_numeric(df_proc['target'], errors='coerce')
        df_proc['w'] = pd.to_numeric(df_proc['w'], errors='coerce')

        df_proc = df_proc.dropna(subset=['target'])
        df_proc['target'] = df_proc['target'].clip(lower=0)
        df_proc['w'] = df_proc['w'].fillna(0).clip(lower=0)

        y = df_proc['target']
        w = df_proc['w']
        df_proc = df_proc.drop(columns=['target', 'w'], errors='ignore')

    # 4. Zero-fill amount/count-like numeric columns
    zero_fill_keywords = ['sum', 'count', 'cnt', 'amount', 'turn', 'limit', 'outstanding', 'balance']
    cols_to_zero = [
        c for c in df_proc.columns
        if df_proc[c].dtype != 'object' and any(k in c.lower() for k in zero_fill_keywords)
    ]
    if cols_to_zero:
        df_proc[cols_to_zero] = df_proc[cols_to_zero].fillna(0)

    # 5. Categorical columns get an explicit missing label
    cat_cols = df_proc.select_dtypes(include=['object']).columns
    if len(cat_cols) > 0:
        df_proc[cat_cols] = df_proc[cat_cols].fillna("MISSING")

    if is_train:
        return df_proc, y, w

    # A test frame should not carry these columns, but drop them if it does
    df_proc = df_proc.drop(columns=['target', 'w'], errors='ignore')
    return df_proc

print("Препроцессинг...")
# The train frame yields features, target and weights; the test frame yields features only.
X, y, w = preprocess_data(train_df, is_train=True)
X_submit = preprocess_data(test_df, is_train=False)

# Column order of the training matrix, and the columns still typed as object
# after preprocessing (these are handed to CatBoost as categorical features).
feature_names = X.columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

Препроцессинг...


In [None]:
# --- 5. K-FOLD TRAINING (MAIN LOGIC) ---

N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Running sum of the test-set predictions produced by each fold's model
test_preds_accum = np.zeros(len(X_submit))

# Out-of-fold predictions, used for the local metric
oof_preds = np.zeros(len(X))

# Hyper-parameters shared by every fold's model
CATBOOST_PARAMS = {
    "iterations": 4000,            # give the model more time
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 15,             # strong regularization against overfitting
    "loss_function": 'RMSE',
    "eval_metric": 'MAE',
    "random_seed": 42,
    "verbose": 500,
    "early_stopping_rounds": 300,
    "allow_writing_files": False,
    "task_type": "CPU",
}


def _from_log_scale(log_pred):
    """Invert the log1p target transform and floor the result at zero."""
    return np.maximum(np.expm1(log_pred), 0)


print(f"Запуск {N_FOLDS}-Fold кросс-валидации...")

for fold_no, (tr_idx, va_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\n--- Fold {fold_no}/{N_FOLDS} ---")

    # Slice features, target and weights for this fold
    X_tr, y_tr, w_tr = X.iloc[tr_idx], y.iloc[tr_idx], w.iloc[tr_idx]
    X_va, y_va, w_va = X.iloc[va_idx], y.iloc[va_idx], w.iloc[va_idx]

    # The model is fitted on log1p(target); pools carry the sample weights
    fit_pool = Pool(X_tr, np.log1p(y_tr), cat_features=cat_features, weight=w_tr)
    eval_pool = Pool(X_va, np.log1p(y_va), cat_features=cat_features, weight=w_va)

    model = CatBoostRegressor(**CATBOOST_PARAMS)
    model.fit(fit_pool, eval_set=eval_pool)

    # 1. Validation predictions (sanity check), back on the original scale
    va_pred = _from_log_scale(model.predict(X_va))
    oof_preds[va_idx] = va_pred

    fold_score = weighted_mean_absolute_error(y_va, va_pred, w_va)
    print(f"Fold {fold_no} WMAE: {fold_score:.2f}")

    # 2. TEST predictions (for the submission), added to the running sum
    test_preds_accum += _from_log_scale(model.predict(X_submit))

    # Free memory before the next fold
    del fit_pool, eval_pool, X_tr, X_va
    gc.collect()

# --- SUMMARY ---
overall_score = weighted_mean_absolute_error(y, oof_preds, w)
print("\n========================================")
print(f"OOF (Average) WMAE: {overall_score:.2f}")
print("========================================")

Запуск 5-Fold кросс-валидации...

--- Fold 1/5 ---
0:	learn: 0.9454956	test: 0.9562651	best: 0.9562651 (0)	total: 242ms	remaining: 16m 8s
500:	learn: 0.4426174	test: 0.4631726	best: 0.4631726 (500)	total: 1m 43s	remaining: 12m 1s
1000:	learn: 0.4092418	test: 0.4456534	best: 0.4456534 (1000)	total: 3m 21s	remaining: 10m 3s
1500:	learn: 0.3908341	test: 0.4390584	best: 0.4390584 (1500)	total: 4m 59s	remaining: 8m 18s
2000:	learn: 0.3769466	test: 0.4354148	best: 0.4354148 (2000)	total: 6m 34s	remaining: 6m 34s
2500:	learn: 0.3653744	test: 0.4336089	best: 0.4336089 (2500)	total: 8m 16s	remaining: 4m 57s
3000:	learn: 0.3551331	test: 0.4324558	best: 0.4324349 (2992)	total: 9m 51s	remaining: 3m 16s
3500:	learn: 0.3463288	test: 0.4316848	best: 0.4316665 (3470)	total: 11m 27s	remaining: 1m 37s
3999:	learn: 0.3384983	test: 0.4310230	best: 0.4310144 (3997)	total: 13m 2s	remaining: 0us

bestTest = 0.4310144166
bestIteration = 3997

Shrink model to first 3998 iterations.
Fold 1 WMAE: 38834.07

--- F

In [None]:
# 6. Build the submission
# Arithmetic mean over folds: the accumulated sum divided by the fold count
final_preds = test_preds_accum / N_FOLDS

# assign() returns a new frame, so test_df itself is left unchanged
submission = test_df[['id']].assign(target=final_preds)
submission.to_csv('submission_ensemble.csv', index=False)
print("Файл submission_ensemble.csv готов к отправке!")

# 7. Save the LAST trained model and the feature lists for the service
# A single good model is enough for the service, so it does not have to
# load all five into memory
model.save_model("model.cbm")

metadata = dict(feature_names=feature_names, cat_features=cat_features)
with open("features.json", "w") as f:
    f.write(json.dumps(metadata))

print("Артефакты для Docker-сервиса (model.cbm, features.json) сохранены.")

Файл submission_ensemble.csv готов к отправке!
Артефакты для Docker-сервиса (model.cbm, features.json) сохранены.
