In [1]:
# Загрузка данных с kaggle. Нужно загрузить kaggle.json с профиля в Kaggle.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c school-of-quants-hackathon-2025-finals
!unzip school-of-quants-hackathon-2025-finals.zip

Downloading school-of-quants-hackathon-2025-finals.zip to /content
  0% 0.00/43.2M [00:00<?, ?B/s]
100% 43.2M/43.2M [00:00<00:00, 818MB/s]
Archive:  school-of-quants-hackathon-2025-finals.zip
  inflating: X_test.csv              
  inflating: X_train.csv             
  inflating: y_train.csv             


In [1]:
"""
Improved training pipeline for default prediction with robust class balancing.

Features:
- Vectorized feature engineering for enc_paym_* and overdues_ blocks
- LightGBM with class imbalance handling: undersample / scale_pos_weight / both
- Stratified K-Fold CV + out-of-fold predictions
- Threshold tuning to maximize F1
- Avoid expensive OneHotEncoder; use LabelEncoder for small categoricals
- Memory-friendly transforms where possible

Usage:
- Place X_train.csv, X_test.csv, y_train.csv in the same folder
- Run: python improved_solution_balanced.py
Produces: submission.csv with columns ['id', 'flag']
"""

import os
import sys
import gc
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score
import lightgbm as lgb

DATA_DIR = "."

def load_data(data_dir=DATA_DIR):
    X_train = pd.read_csv(os.path.join(data_dir, "X_train.csv"))
    X_test = pd.read_csv(os.path.join(data_dir, "X_test.csv"))
    y_train = pd.read_csv(os.path.join(data_dir, "y_train.csv"))
    return X_train, X_test, y_train

def basic_checks(X_train, X_test, y_train):
    print("Shapes:", X_train.shape, X_test.shape, y_train.shape)
    # if y_train contains more than one column, try to find a 'flag' column, else take first
    if y_train.shape[1] > 1:
        if 'flag' in y_train.columns:
            y = y_train['flag'].astype(int)
        else:
            # fallback: take first column
            y = y_train.iloc[:, 0].astype(int)
    else:
        y = y_train.iloc[:, 0].astype(int)
    # print distribution
    vc = y.value_counts(normalize=True)
    print("Target distribution:\n", vc.rename("proportion"))
    return y

# ---------------- Feature engineering ----------------

def eng_enc_paym_features(df, enc_cols):
    if len(enc_cols) == 0:
        return pd.DataFrame(index=df.index)
    arr = df[enc_cols].fillna(-99).values  # sentinel for missing
    valid_mask = (arr != -99)
    # first value (not index)
    def first_val(a, vm):
        out = np.full(a.shape[0], np.nan, dtype=float)
        idx = vm.argmax(axis=1)
        any_valid = vm.any(axis=1)
        out[any_valid] = a[np.arange(a.shape[0])[any_valid], idx[any_valid]]
        return out
    first_v = first_val(arr, valid_mask)
    # last value
    def last_val(a, vm):
        out = np.full(a.shape[0], np.nan, dtype=float)
        rev_idx = vm[:, ::-1].argmax(axis=1)
        any_valid = vm.any(axis=1)
        rev_pos = a.shape[1] - 1 - rev_idx
        out[any_valid] = a[np.arange(a.shape[0])[any_valid], rev_pos[any_valid]]
        return out
    last_v = last_val(arr, valid_mask)
    mean_v = np.where(valid_mask, arr, np.nan).mean(axis=1)
    std_v = np.nanstd(np.where(valid_mask, arr, np.nan), axis=1)
    # number of unique statuses (excluding missing)
    n_unique = np.apply_along_axis(lambda r: len(np.unique(r[r != -99])), 1, arr)
    missing_frac = (arr == -99).mean(axis=1)
    diffs = np.abs(np.diff(np.where(arr == -99, np.nan, arr), axis=1))
    num_changes = np.nansum((~np.isnan(diffs)) & (diffs > 0), axis=1)
    out = pd.DataFrame({
        "enc_first": first_v,
        "enc_last": last_v,
        "enc_mean": mean_v,
        "enc_std": std_v,
        "enc_n_unique": n_unique,
        "enc_missing_frac": missing_frac,
        "enc_num_changes": num_changes
    }, index=df.index)
    return out

def eng_overdues_features(df, over_cols, no_over_cols):
    out = pd.DataFrame(index=df.index)
    if len(over_cols) > 0:
        arr = df[over_cols].fillna(0).values.astype(float)
        out['over_total'] = arr.sum(axis=1)
        out['over_num_nonzero'] = (arr > 0).sum(axis=1)
        out['over_max_bucket'] = np.argmax(arr, axis=1).astype(int)
    else:
        out['over_total'] = 0.0
        out['over_num_nonzero'] = 0
        out['over_max_bucket'] = -1
    if len(no_over_cols) > 0:
        arr2 = df[no_over_cols].fillna(0).values.astype(float)
        out['no_over_total'] = arr2.sum(axis=1)
        out['no_over_mean'] = arr2.mean(axis=1)
    else:
        out['no_over_total'] = 0.0
        out['no_over_mean'] = 0.0
    return out

def num_features_engineer(df):
    df = df.copy()
    EPS = 1e-9
    if 'maturity_plan' in df.columns and 'maturity_fact' in df.columns:
        df['maturity_diff'] = df['maturity_fact'] - df['maturity_plan']
        df['maturity_ratio'] = df['maturity_fact'] / (df['maturity_plan'] + EPS)
        df['closed_early'] = (df['maturity_fact'] < df['maturity_plan']).astype(int)
    if 'sum_left_to_pay' in df.columns and 'credit_limit' in df.columns:
        df['utilization'] = df['sum_left_to_pay'] / (df['credit_limit'] + EPS)
    if 'next_payment_sum' in df.columns and 'credit_limit' in df.columns:
        df['next_payment_ratio'] = df['next_payment_sum'] / (df['credit_limit'] + EPS)
    if 'current_overdue_debt' in df.columns:
        df['has_current_overdue'] = (df['current_overdue_debt'] > 0).astype(int)
    money_cols = ['credit_limit','next_payment_sum','sum_left_to_pay','current_overdue_debt','max_overdue_debt','full_credit_cost']
    for c in money_cols:
        if c in df.columns:
            df[c + '_log1p'] = np.log1p(df[c].fillna(0.0).astype(float))
    if 'maturity_plan' in df.columns and 'days_since_confirmed' in df.columns:
        df['maturity_remaining'] = df['maturity_plan'] - df['days_since_confirmed']
    return df

def prepare_features(X_train, X_test):
    all_cols = X_train.columns.tolist()
    enc_paym_cols = [c for c in all_cols if c.startswith('enc_paym_')]
    over_cols = [c for c in all_cols if c.startswith('overdues_')]
    no_over_cols = [c for c in all_cols if c.startswith('no_overdues_')]
    cat_cols = [c for c in ['credit_type','credit_currency'] if c in all_cols]
    id_col = 'id' if 'id' in all_cols else None

    # numeric engineering
    X_train_num = num_features_engineer(X_train)
    X_test_num  = num_features_engineer(X_test)

    # enc_paym features
    enc_tr = eng_enc_paym_features(X_train_num, enc_paym_cols)
    enc_te = eng_enc_paym_features(X_test_num, enc_paym_cols)

    # overdues features
    ov_tr = eng_overdues_features(X_train_num, over_cols, no_over_cols)
    ov_te = eng_overdues_features(X_test_num, over_cols, no_over_cols)

    # concat and drop original large blocks (to save memory)
    drop_cols = enc_paym_cols + over_cols + no_over_cols
    X_train_small = pd.concat([X_train_num.drop(columns=[c for c in drop_cols if c in X_train_num.columns]), enc_tr, ov_tr], axis=1)
    X_test_small  = pd.concat([X_test_num.drop(columns=[c for c in drop_cols if c in X_test_num.columns]), enc_te, ov_te], axis=1)

    # label-encode small categorical columns (inplace)
    for c in cat_cols:
        le = LabelEncoder()
        X_train_small[c] = X_train_small[c].fillna(-999)
        X_test_small[c]  = X_test_small[c].fillna(-999)
        le.fit(list(X_train_small[c].astype(str).values) + list(X_test_small[c].astype(str).values))
        X_train_small[c] = le.transform(X_train_small[c].astype(str))
        X_test_small[c]  = le.transform(X_test_small[c].astype(str))

    # impute remaining nan with median and cast to float32 for memory
    imputer = SimpleImputer(strategy='median')
    X_train_small[:] = imputer.fit_transform(X_train_small)
    X_test_small[:]  = imputer.transform(X_test_small)

    # convert numeric columns to float32 to save memory
    for c in X_train_small.select_dtypes(include=[np.number]).columns:
        # categorical columns likely ints; keep them as ints where appropriate
        if c in cat_cols:
            X_train_small[c] = X_train_small[c].astype('int32')
            X_test_small[c] = X_test_small[c].astype('int32')
        else:
            X_train_small[c] = X_train_small[c].astype('float32')
            X_test_small[c] = X_test_small[c].astype('float32')

    return X_train_small, X_test_small, id_col, cat_cols

# ---------------- Balancing helpers and training ----------------

def undersample_negatives_idx(y, desired_neg_ratio=4, random_state=None):
    """
    Return indices to keep: all positives + sample of negatives such that neg:pos ~ desired_neg_ratio.
    y is a pd.Series indexed by original indices.
    """
    rng = np.random.RandomState(random_state)
    idx_all = y.index.values
    pos_mask = (y.values == 1)
    neg_mask = ~pos_mask
    pos_idx = idx_all[pos_mask]
    neg_idx = idx_all[neg_mask]

    n_pos = pos_idx.shape[0]
    n_neg_req = int(n_pos * desired_neg_ratio)
    n_neg_req = min(n_neg_req, neg_idx.shape[0])

    if n_neg_req <= 0:
        return idx_all

    sampled_neg = rng.choice(neg_idx, size=n_neg_req, replace=False)
    selected = np.concatenate([pos_idx, sampled_neg])
    rng.shuffle(selected)
    return selected

def train_lgb_oof(X, y, cat_cols, n_splits=5, seed=42,
                  balance_method='undersample', desired_neg_ratio=4):
    """
    balance_method: 'scale' | 'undersample' | 'both' | 'none'
    desired_neg_ratio: negatives per positive when undersampling (e.g. 4)
    Returns: (models list, oof_preds)
    """
    base_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_leaves': 127,
        'n_estimators': 2000,
        'random_state': seed,
        'n_jobs': -1,
        'verbosity': -1
    }

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds = np.zeros(X.shape[0], dtype=float)
    models = []

    from lightgbm import early_stopping, log_evaluation

    for fold, (tr_idx, val_idx) in enumerate(folds.split(X, y)):
        print("Fold", fold + 1)
        X_tr_full, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr_full, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        # undersample only training part if requested
        if balance_method in ('undersample', 'both'):
            sel_idx = undersample_negatives_idx(y_tr_full, desired_neg_ratio=desired_neg_ratio, random_state=seed + fold)
            # sel_idx are original indices from y_tr_full.index
            X_tr = X_tr_full.loc[sel_idx]
            y_tr = y_tr_full.loc[sel_idx]
            print(f"  Undersampled train: pos={int((y_tr==1).sum())}, neg={int((y_tr==0).sum())} (ratio ~ {desired_neg_ratio}:1)")
        else:
            X_tr, y_tr = X_tr_full, y_tr_full
            print(f"  Full train used: pos={int((y_tr==1).sum())}, neg={int((y_tr==0).sum())}")

        params = base_params.copy()

        # scale_pos_weight if requested
        if balance_method in ('scale', 'both'):
            pos = int(y_tr.sum())
            neg = int(y_tr.shape[0] - pos)
            spw = max(1.0, neg / (pos + 1e-9))
            params['scale_pos_weight'] = spw
            print(f"  Using scale_pos_weight = {spw:.3f}")

        lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, categorical_feature=cat_cols, free_raw_data=False)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_val],
            num_boost_round=params.get('n_estimators', 2000),
            callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)]
        )

        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        oof_preds[val_idx] = val_pred
        models.append(model)
        print("Fold ROC AUC:", roc_auc_score(y_val, val_pred))
        gc.collect()

    return models, oof_preds

def find_best_threshold(y_true, probs):
    precision, recall, thresholds = precision_recall_curve(y_true, probs)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-12)
    best_idx = np.nanargmax(f1_scores)
    best_thr = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    return best_thr, f1_scores[best_idx]

def predict_ensemble(models, X, average=True):
    preds = np.column_stack([m.predict(X, num_iteration=m.best_iteration) for m in models])
    return np.mean(preds, axis=1) if average else preds.mean(axis=1)

# ---------------- Main ----------------

def main():
    required = ["X_train.csv", "X_test.csv", "y_train.csv"]
    ok = all(os.path.exists(f) for f in required)
    if not ok:
        print("Не найден один из файлов:", required)
        print("Поместите CSV-файлы в текущую папку и запустите снова.")
        sys.exit(1)

    print("Loading data...")
    X_train, X_test, y_df = load_data(DATA_DIR)
    y = basic_checks(X_train, X_test, y_df)

    print("Preparing features...")
    X_tr, X_te, id_col, cat_cols = prepare_features(X_train, X_test)
    print("Prepared feature shapes:", X_tr.shape, X_te.shape)
    # Ensure indexing aligns with y
    X_tr = X_tr.loc[y.index]

    # TRAIN: try undersample by default; change `balance_method` if you prefer only scale_pos_weight
    balance_method = 'undersample'  # options: 'undersample', 'scale', 'both', 'none'
    desired_neg_ratio = 4  # negatives per positive in undersampled train set

    print("Training with balance_method =", balance_method)
    models, oof = train_lgb_oof(X_tr, y, cat_cols, n_splits=5, seed=42,
                                balance_method=balance_method, desired_neg_ratio=desired_neg_ratio)

    best_thr, best_f1 = find_best_threshold(y, oof)
    print("Best threshold on OOF:", best_thr, "Best OOF F1:", best_f1)

    # Final predictions for test
    print("Predicting test set...")
    test_probs = predict_ensemble(models, X_te)
    test_pred = (test_probs >= best_thr).astype(int)

    if id_col is None or id_col not in X_test.columns:
        sub_ids = np.arange(len(test_pred))
    else:
        sub_ids = X_test[id_col].astype(int).values

    submission = pd.DataFrame({"id": sub_ids, "flag": test_pred})
    submission.to_csv("submission.csv", index=False)
    print("Saved submission.csv, shape:", submission.shape)

if __name__ == "__main__":
    main()

Loading data...
Shapes: (1827404, 48) (456852, 48) (1827404, 2)
Target distribution:
 flag
0    0.967186
1    0.032814
Name: proportion, dtype: float64
Preparing features...
Prepared feature shapes: (1827404, 38) (456852, 38)
Training with balance_method = undersample
Fold 1
  Undersampled train: pos=47971, neg=191884 (ratio ~ 4:1)
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.641402
[200]	valid_0's auc: 0.643712
Early stopping, best iteration is:
[212]	valid_0's auc: 0.643956
Fold ROC AUC: 0.6439562779164065
Fold 2
  Undersampled train: pos=47971, neg=191884 (ratio ~ 4:1)
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.634949
[200]	valid_0's auc: 0.637715
[300]	valid_0's auc: 0.638687
[400]	valid_0's auc: 0.63924
Early stopping, best iteration is:
[428]	valid_0's auc: 0.639599
Fold ROC AUC: 0.6395986996626063
Fold 3
  Undersampled train: pos=47971, neg=191884 (ratio ~ 4:1)
Training until validation scores don't 