In [1]:
# Загрузка данных с kaggle. Нужно загрузить kaggle.json с профиля в Kaggle.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c school-of-quants-hackathon-2025-finals
!unzip school-of-quants-hackathon-2025-finals.zip

Downloading school-of-quants-hackathon-2025-finals.zip to /content
  0% 0.00/43.2M [00:00<?, ?B/s]
100% 43.2M/43.2M [00:00<00:00, 818MB/s]
Archive:  school-of-quants-hackathon-2025-finals.zip
  inflating: X_test.csv              
  inflating: X_train.csv             
  inflating: y_train.csv             


In [None]:
# improved_pipeline_bagging.py
import os
import sys
import gc
import math
import random
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score
import lightgbm as lgb

# ----------------- Конфигурация -----------------
DATA_DIR = "."
TRAIN_FILE = "X_train.csv"
TEST_FILE = "X_test.csv"
Y_FILE = "y_train.csv"
SUBMISSION_FILE = "submission.csv"

N_BAGS = 5                  # сколько разных undersample-багов
RATIO_NEG_PER_POS = 4       # отношение neg:pos в каждом баге (пример: 4 -> 4 negatives на 1 positive)
FOLDS = 4                   # Stratified KFold внутри каждого бага для OOF и ранней остановки
RANDOM_SEED = 42
LGB_PARAMS = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'learning_rate': 0.03,
    'num_leaves': 64,
    'max_depth': -1,
    'n_estimators': 4000,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'min_child_samples': 100,
    'reg_alpha': 1.0,
    'reg_lambda': 1.0,
    'verbosity': -1,
    'n_jobs': -1,
    'seed': RANDOM_SEED
}
EARLY_STOPPING_ROUNDS = 100
VERBOSE = 100

# ----------------- Вспомогательные -----------------
def seed_everything(seed=RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)

seed_everything()

def reduce_mem_usage(df, verbose=True):
    """Понизить dtypes для экономии памяти."""
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            continue
        if str(col_type).startswith('int') or str(col_type).startswith('uint'):
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= 0:
                if c_max < 255:
                    df[col] = df[col].astype(np.uint8)
                elif c_max < 65535:
                    df[col] = df[col].astype(np.uint16)
                else:
                    df[col] = df[col].astype(np.uint32)
            else:
                if np.iinfo(np.int8).min < c_min < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif np.iinfo(np.int16).min < c_min < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                else:
                    df[col] = df[col].astype(np.int32)
        else:
            # float
            df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f"Mem usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB")
    return df

# ----------------- Feature engineering -----------------
def eng_enc_paym_stats(df, prefix='enc_paym_'):
    cols = [c for c in df.columns if c.startswith(prefix)]
    if not cols:
        return pd.DataFrame(index=df.index)
    arr = df[cols].fillna(-999).astype(int).values  # -999 for missing
    # last non-missing
    last = np.full(arr.shape[0], -999, dtype=np.int32)
    first = np.full(arr.shape[0], -999, dtype=np.int32)
    nuniq = np.zeros(arr.shape[0], dtype=np.int32)
    miss_frac = (arr == -999).mean(axis=1).astype(np.float32)
    # counts of specific statuses (we don't know cardinality, so count top-k statuses)
    # compute last and first efficiently:
    valid_mask = (arr != -999)
    any_valid = valid_mask.any(axis=1)
    # first
    idx_first = valid_mask.argmax(axis=1)
    first[any_valid] = arr[np.arange(arr.shape[0])[any_valid], idx_first[any_valid]]
    # last
    rev_idx = valid_mask[:, ::-1].argmax(axis=1)
    pos_last = arr.shape[1] - 1 - rev_idx
    last[any_valid] = arr[np.arange(arr.shape[0])[any_valid], pos_last[any_valid]]
    # unique counts
    for i in range(arr.shape[0]):
        row = arr[i]
        vals = row[row != -999]
        if vals.size == 0:
            nuniq[i] = 0
        else:
            nuniq[i] = np.unique(vals).size
    # mean/std ignoring -999
    arr_float = np.where(arr == -999, np.nan, arr).astype(np.float32)
    mean = np.nanmean(arr_float, axis=1)
    std = np.nanstd(arr_float, axis=1)
    # compute longest consecutive streak of same status (simple)
    def longest_streak(row):
        prev = None
        cur = 0
        best = 0
        for v in row:
            if v == -999:
                continue
            if prev is None or v != prev:
                cur = 1
                prev = v
            else:
                cur += 1
            if cur > best:
                best = cur
        return best
    longest = np.array([longest_streak(r) for r in arr], dtype=np.int32)
    out = pd.DataFrame({
        'enc_first': first,
        'enc_last': last,
        'enc_n_unique': nuniq,
        'enc_missing_frac': miss_frac,
        'enc_mean': mean,
        'enc_std': std,
        'enc_longest_streak': longest
    }, index=df.index)
    return out

def eng_overdues_stats(df, prefix='overdues_'):
    cols = [c for c in df.columns if c.startswith(prefix)]
    if not cols:
        return pd.DataFrame(index=df.index)
    arr = df[cols].fillna(0).astype(np.float32).values
    total = arr.sum(axis=1)
    n_nonzero = (arr > 0).sum(axis=1)
    # weighted sum by bucket index (to capture severity)
    weights = np.arange(1, arr.shape[1] + 1).astype(np.float32)
    weighted = (arr * weights).sum(axis=1)
    max_bucket = np.argmax(arr, axis=1)
    out = pd.DataFrame({
        'over_total': total,
        'over_nonzero_cnt': n_nonzero,
        'over_weighted': weighted,
        'over_max_bucket': max_bucket
    }, index=df.index)
    return out

def eng_no_overdues_stats(df, prefix='no_overdues_'):
    cols = [c for c in df.columns if c.startswith(prefix)]
    if not cols:
        return pd.DataFrame(index=df.index)
    arr = df[cols].fillna(0).astype(np.float32).values
    total = arr.sum(axis=1)
    mean = arr.mean(axis=1)
    out = pd.DataFrame({
        'no_over_total': total,
        'no_over_mean': mean
    }, index=df.index)
    return out

def numeric_interactions(df):
    df = df.copy()
    EPS = 1e-9
    # maturity diff and ratio
    if 'maturity_plan' in df.columns and 'maturity_fact' in df.columns:
        df['maturity_diff'] = df['maturity_fact'] - df['maturity_plan']
        df['maturity_ratio'] = df['maturity_fact'] / (df['maturity_plan'] + EPS)
        df['closed_early'] = (df['maturity_fact'] < df['maturity_plan']).astype(np.int8)
    if 'sum_left_to_pay' in df.columns and 'credit_limit' in df.columns:
        df['utilization'] = df['sum_left_to_pay'] / (df['credit_limit'] + EPS)
    if 'next_payment_sum' in df.columns and 'credit_limit' in df.columns:
        df['next_payment_ratio'] = df['next_payment_sum'] / (df['credit_limit'] + EPS)
    if 'current_overdue_debt' in df.columns:
        df['has_current_overdue'] = (df['current_overdue_debt'] > 0).astype(np.int8)
    money_cols = ['credit_limit','next_payment_sum','sum_left_to_pay','current_overdue_debt','max_overdue_debt','full_credit_cost']
    for c in money_cols:
        if c in df.columns:
            df[c + '_log1p'] = np.log1p(df[c].fillna(0.0).astype(float))
    if 'maturity_plan' in df.columns and 'days_since_confirmed' in df.columns:
        df['maturity_remaining'] = df['maturity_plan'] - df['days_since_confirmed']
    return df

# ----------------- Target encoding -----------------
def target_encode_kfold(train_df, target, col, n_splits=5, seed=RANDOM_SEED, min_samples_leaf=100, smoothing=10.0):
    """
    K-fold target encoding with smoothing.
    Returns encoded column for train (oof) and mapping to apply on test.
    """
    oof = pd.Series(index=train_df.index, dtype=np.float32)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    prior = target.mean()
    for tr_idx, val_idx in kf.split(train_df, target):
        X_tr = train_df.iloc[tr_idx]
        y_tr = target.iloc[tr_idx]
        stats = y_tr.groupby(X_tr[col]).agg(['mean','count'])
        means = stats['mean']
        counts = stats['count']
        # smoothing
        smooth = (counts * means + smoothing * prior) / (counts + smoothing)
        mapping = smooth.to_dict()
        # map validation
        oof.iloc[val_idx] = train_df.iloc[val_idx][col].map(mapping).fillna(prior).astype(np.float32)
    # full mapping from full train for test transformation
    full_stats = target.groupby(train_df[col]).agg(['mean','count'])
    full_smooth = (full_stats['count'] * full_stats['mean'] + smoothing * prior) / (full_stats['count'] + smoothing)
    full_map = full_smooth.to_dict()
    return oof, full_map, prior

# ----------------- Balancing helpers -----------------
def sample_negatives_indices(y_series, desired_neg_ratio=RATIO_NEG_PER_POS, random_state=None):
    rng = np.random.RandomState(random_state)
    idx = y_series.index.values
    pos_mask = (y_series.values == 1)
    pos_idx = idx[pos_mask]
    neg_idx = idx[~pos_mask]
    n_pos = pos_idx.shape[0]
    n_neg_req = int(n_pos * desired_neg_ratio)
    n_neg_req = min(n_neg_req, neg_idx.shape[0])
    sampled_neg = rng.choice(neg_idx, size=n_neg_req, replace=False)
    selected = np.concatenate([pos_idx, sampled_neg])
    rng.shuffle(selected)
    return selected

# ----------------- Training / OOF / Predict -----------------
def find_best_threshold(y_true, probs):
    precision, recall, thresholds = precision_recall_curve(y_true, probs)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-12)
    best_idx = np.nanargmax(f1_scores)
    best_thr = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    return float(best_thr), float(f1_scores[best_idx])

def train_bagging_ensemble(X, y, X_test, cat_feats=None,
                           n_bags=N_BAGS, ratio_neg=RATIO_NEG_PER_POS,
                           folds=FOLDS, seed=RANDOM_SEED):
    """
    Обучаем ансамбль из n_bags: в каждом баге делаем undersample негативов, затем внутри StratifiedKFold.
    Возвращаем список моделей, ооф (усредненный по багам), и средние прогнозы на тест.
    """
    n_train = X.shape[0]
    oof_preds_accum = np.zeros(n_train, dtype=np.float32)
    test_preds_accum = np.zeros(X_test.shape[0], dtype=np.float32)
    total_models = 0
    models = []

    for b in range(n_bags):
        print(f"\n=== BAG {b+1}/{n_bags} ===")
        sel_idx = sample_negatives_indices(y, desired_neg_ratio=ratio_neg, random_state=seed + b)
        X_b = X.loc[sel_idx]
        y_b = y.loc[sel_idx]
        print("Bag train shape:", X_b.shape, "Pos:", int(y_b.sum()), "Neg:", int((y_b==0).sum()))

        # inside bag CV
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed + b)
        oof_b = np.zeros(n_train, dtype=np.float32)  # will write to positions for validation folds only
        for fold, (tr_idx_local, val_idx_local) in enumerate(skf.split(X_b, y_b)):
            tr_idx = sel_idx[tr_idx_local]  # original indices
            val_idx = sel_idx[val_idx_local]
            X_tr, X_val = X.loc[tr_idx], X.loc[val_idx]
            y_tr, y_val = y.loc[tr_idx], y.loc[val_idx]

            lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_feats if cat_feats else 'auto', free_raw_data=False)
            lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, categorical_feature=cat_feats if cat_feats else 'auto', free_raw_data=False)

            params = LGB_PARAMS.copy()
            # scale_pos_weight might help when within-bag still imbalanced
            pos = int(y_tr.sum())
            neg = int(y_tr.shape[0] - pos)
            if pos > 0:
                spw = max(1.0, neg / (pos + 1e-9))
                params['scale_pos_weight'] = spw

            model = lgb.train(
                params=params,
                train_set=lgb_train,
                valid_sets=[lgb_val],
                num_boost_round=1000,
                callbacks=[lgb.early_stopping(stopping_rounds=100)]
            )
            # val predictions (on original train index positions)
            val_pred = model.predict(X.loc[val_idx], num_iteration=model.best_iteration)
            oof_b[val_idx] = val_pred
            # add test predictions
            test_pred = model.predict(X_test, num_iteration=model.best_iteration)
            test_preds_accum += test_pred
            total_models += 1
            models.append(model)
            print(f"Bag {b+1} Fold {fold+1}: val ROC AUC {roc_auc_score(y.loc[val_idx], val_pred):.4f}")
            gc.collect()

        # accumulate oof from this bag (we average across bags later)
        oof_preds_accum += oof_b

    # average oof and test
    oof_preds = oof_preds_accum / n_bags
    test_preds = test_preds_accum / total_models  # because we added test prediction per model
    print("\nTotal models trained:", total_models)
    return models, oof_preds, test_preds

# ----------------- Main pipeline -----------------
def main():
    required = [os.path.join(DATA_DIR, TRAIN_FILE), os.path.join(DATA_DIR, TEST_FILE), os.path.join(DATA_DIR, Y_FILE)]
    if not all(os.path.exists(p) for p in required):
        print("Не найдены входные файлы. Поместите X_train.csv, X_test.csv, y_train.csv в текущую папку.")
        sys.exit(1)

    print("Loading...")
    X_train = pd.read_csv(os.path.join(DATA_DIR, TRAIN_FILE))
    X_test = pd.read_csv(os.path.join(DATA_DIR, TEST_FILE))
    y_df = pd.read_csv(os.path.join(DATA_DIR, Y_FILE))

    # Находим столбец флага
    if 'flag' in y_df.columns:
        y = y_df['flag'].astype(int)
    else:
        y = y_df.iloc[:,0].astype(int)
    print("Train shape:", X_train.shape, "Test shape:", X_test.shape, "Y shape:", y.shape)
    print("Target distribution:", y.value_counts(normalize=True).to_dict())

    # Сохраним id, если есть
    id_col = 'id' if 'id' in X_test.columns else None

    # Feature engineering
    print("Feature engineering...")
    # numeric interactions
    Xtr_num = numeric_interactions(X_train)
    Xte_num = numeric_interactions(X_test)
    # enc_paym stats
    enc_tr = eng_enc_paym_stats(Xtr_num, prefix='enc_paym_')
    enc_te = eng_enc_paym_stats(Xte_num, prefix='enc_paym_')
    # overdues
    over_tr = eng_overdues_stats(Xtr_num, prefix='overdues_')
    over_te = eng_overdues_stats(Xte_num, prefix='overdues_')
    # no overdues
    noover_tr = eng_no_overdues_stats(Xtr_num, prefix='no_overdues_')
    noover_te = eng_no_overdues_stats(Xte_num, prefix='no_overdues_')

    # Drop raw big blocks to save memory
    drop_prefixes = ['enc_paym_', 'overdues_', 'no_overdues_']
    to_drop_tr = [c for c in Xtr_num.columns if any(c.startswith(p) for p in drop_prefixes)]
    to_drop_te = [c for c in Xte_num.columns if any(c.startswith(p) for p in drop_prefixes)]
    Xtr_small = Xtr_num.drop(columns=to_drop_tr, errors='ignore').copy()
    Xte_small = Xte_num.drop(columns=to_drop_te, errors='ignore').copy()

    # concat engineered features
    Xtr_small = pd.concat([Xtr_small, enc_tr, over_tr, noover_tr], axis=1)
    Xte_small = pd.concat([Xte_small, enc_te, over_te, noover_te], axis=1)

    # Label encode small categoricals then target encode with kfold smoothing
    cat_cols = [c for c in ['credit_type','credit_currency'] if c in Xtr_small.columns]
    for c in cat_cols:
        Xtr_small[c] = Xtr_small[c].fillna("nan").astype(str)
        Xte_small[c] = Xte_small[c].fillna("nan").astype(str)
        le = LabelEncoder()
        le.fit(list(Xtr_small[c].values) + list(Xte_small[c].values))
        Xtr_small[c + '_le'] = le.transform(Xtr_small[c])
        Xte_small[c + '_le'] = le.transform(Xte_small[c])
        # target encoding (KFold)
        oof_te, map_full, prior = target_encode_kfold(Xtr_small, y, c, n_splits=5, seed=RANDOM_SEED, smoothing=20.0)
        Xtr_small[c + '_te'] = oof_te
        Xte_small[c + '_te'] = Xte_small[c].map(map_full).fillna(prior).astype(np.float32)
    # drop original text categorical columns
    Xtr_small.drop(columns=cat_cols, inplace=True, errors='ignore')
    Xte_small.drop(columns=cat_cols, inplace=True, errors='ignore')

    # Impute missing values (median)
    imputer = SimpleImputer(strategy='median')
    Xtr_small.iloc[:,:] = imputer.fit_transform(Xtr_small)
    Xte_small.iloc[:,:] = imputer.transform(Xte_small)

    # Reduce memory
    Xtr_small = reduce_mem_usage(Xtr_small)
    Xte_small = reduce_mem_usage(Xte_small)

    # Align indices with y
    Xtr_small = Xtr_small.loc[y.index]

    # Select categorical features for LightGBM (use _le columns if any)
    cat_feats = [c for c in Xtr_small.columns if c.endswith('_le')]
    print("Final feature shape:", Xtr_small.shape, "Categorical features:", cat_feats)

    # Train bagging ensemble
    print("Training bagging ensemble...")
    t0 = time.time()
    models, oof_preds, test_preds = train_bagging_ensemble(Xtr_small, y, Xte_small, cat_feats,
                                                           n_bags=N_BAGS, ratio_neg=RATIO_NEG_PER_POS,
                                                           folds=FOLDS, seed=RANDOM_SEED)
    t1 = time.time()
    print(f"Training finished in {(t1-t0)/60:.2f} min")

    # threshold tuning on oof
    best_thr, best_f1 = find_best_threshold(y.values, oof_preds)
    print("Best threshold on OOF:", best_thr, "Best OOF F1:", best_f1)
    # final predicted labels
    test_labels = (test_preds >= best_thr).astype(int)

    # Build submission
    if id_col and id_col in X_test.columns:
        ids = X_test[id_col].values
    else:
        ids = np.arange(len(test_labels))

    submission = pd.DataFrame({'id': ids, 'flag': test_labels})
    submission.to_csv(SUBMISSION_FILE, index=False)
    print("Saved", SUBMISSION_FILE, "shape:", submission.shape)
    # report OOF metrics
    print("OOF ROC AUC:", roc_auc_score(y, oof_preds))
    print("OOF F1 at best_thr:", f1_score(y, (oof_preds >= best_thr).astype(int)))

if __name__ == "__main__":
    main()

Loading...
Train shape: (1827404, 48) Test shape: (456852, 48) Y shape: (1827404,)
Target distribution: {0: 0.9671862379638, 1: 0.032813762036199984}
Feature engineering...
Mem usage reduced from 456.60 MB to 170.79 MB
Mem usage reduced from 114.15 MB to 42.70 MB
Final feature shape: (1827404, 41) Categorical features: ['credit_type_le', 'credit_currency_le']
Training bagging ensemble...

=== BAG 1/5 ===
Bag train shape: (299820, 41) Pos: 59964 Neg: 239856
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[676]	valid_0's auc: 0.646679
Bag 1 Fold 1: val ROC AUC 0.6467
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1024]	valid_0's auc: 0.648741
Bag 1 Fold 2: val ROC AUC 0.6487
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[731]	valid_0's auc: 0.644478
Bag 1 Fold 3: val ROC AUC 0.6445
Training until validation scores don't improve for 100 rou