# Loan Payback Prediction - Optimized Version

Clean implementation with hyperparameter tuning using Optuna.
- **Models**: LightGBM + XGBoost Ensemble
- **Optimization**: Automated hyperparameter search
- **Features**: 13 engineered features

## 1. Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# Run configuration
N_TRIALS = 100  # Optuna trials to run for each model
SKIP_TUNING = False  # True bypasses the search and uses the manual parameters (faster)

# Single seed reused wherever the notebook needs reproducible randomness
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 2. Load Data

In [None]:
# Read the training and test tables from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e11/train.csv',
        '/kaggle/input/playground-series-s5e11/test.csv',
    )
)

## 3. Feature Engineering

In [None]:
def create_features(df, term_months=60):
    """Return a copy of ``df`` with the engineered loan features added.

    The input frame is left untouched. Columns read: ``loan_amount``,
    ``annual_income``, ``interest_rate``, ``debt_to_income_ratio``,
    ``credit_score`` and ``grade_subgrade``.

    Args:
        df: Loan table containing the columns listed above.
        term_months: Number of monthly instalments assumed when estimating
            the monthly payment. Defaults to 60.

    Returns:
        A new DataFrame with the original columns plus the financial ratios,
        the binned categories, the split grade/subgrade and the interaction
        features computed below.
    """
    df = df.copy()

    # The +1 keeps the income-based ratios finite when annual_income is 0.
    income_plus_one = df['annual_income'] + 1

    # Financial ratios
    df['loan_to_income_ratio'] = df['loan_amount'] / income_plus_one

    # interest_rate is used as an annual percentage: /100 -> fraction, /12 -> monthly.
    monthly_rate = df['interest_rate'] / 100 / 12
    amortized_payment = (df['loan_amount'] * (df['interest_rate'] / 100) / 12) / (1 - (1 + monthly_rate) ** -term_months)
    # With a 0% rate the annuity formula evaluates 0/0 (NaN); an interest-free
    # loan is simply repaid in equal instalments of the principal.
    df['estimated_monthly_payment'] = amortized_payment.mask(monthly_rate == 0, df['loan_amount'] / term_months)

    df['payment_to_income_ratio'] = (df['estimated_monthly_payment'] * 12) / income_plus_one
    df['total_debt'] = df['annual_income'] * df['debt_to_income_ratio']
    df['total_debt_with_loan'] = df['total_debt'] + df['loan_amount']
    df['new_debt_to_income_ratio'] = df['total_debt_with_loan'] / income_plus_one

    # Categorical features. pd.cut uses right-closed intervals by default, so a
    # value at or below the first edge, or above the last, becomes NaN.
    df['credit_score_category'] = pd.cut(df['credit_score'], bins=[0, 580, 670, 740, 800, 850],
                                          labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'])
    df['income_category'] = pd.cut(df['annual_income'], bins=[0, 30000, 50000, 75000, 100000, np.inf],
                                   labels=['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High'])
    df['interest_rate_category'] = pd.cut(df['interest_rate'], bins=[0, 7, 11, 15, np.inf],
                                          labels=['Low', 'Medium', 'High', 'Very High'])

    # Grade decomposition: first character is the grade, the remainder the subgrade
    df['grade'] = df['grade_subgrade'].str[0]
    df['subgrade'] = df['grade_subgrade'].str[1:]

    # Interaction features
    df['income_credit_interaction'] = df['annual_income'] * df['credit_score']
    # NOTE(review): the three terms are on different scales; confirm the weights are intended.
    df['risk_score'] = (df['debt_to_income_ratio'] * 0.3 + df['interest_rate'] * 0.2 - df['credit_score'] / 1000 * 0.5)

    return df

# Apply the same feature engineering to the training table, then the test table
train_fe, test_fe = (create_features(frame) for frame in (train, test))

## 4. Data Preprocessing

In [None]:
def preprocess_data(train_df, test_df):
    """Convert the engineered train/test tables into numeric model matrices.

    Steps, in order: drop ``id`` (and the ``loan_paid_back`` target from the
    training table), label-encode object/category columns, impute missing
    values with medians, and standardize the int64/float64 columns.

    Args:
        train_df: Training table containing ``id`` and ``loan_paid_back``.
        test_df: Test table containing ``id``.

    Returns:
        Tuple ``(X_train, X_test, y_train)``; ``y_train`` is the
        ``loan_paid_back`` column of ``train_df``.
    """
    X_train = train_df.drop(['id', 'loan_paid_back'], axis=1)
    y_train = train_df['loan_paid_back']
    X_test = test_df.drop(['id'], axis=1)

    # Column groups are fixed before encoding, so the label-encoded columns
    # are not picked up by the scaler further down.
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Label encode categoricals. The encoder is fitted on train and test
    # together so every label seen at transform time is known; values are
    # cast to str first, which also turns missing entries into a label.
    for col in categorical_cols:
        le = LabelEncoder()
        combined = pd.concat([X_train[col], X_test[col]], axis=0)
        le.fit(combined.astype(str))
        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))

    # Handle missing values. Both tables are imputed with the TRAINING
    # medians so test rows receive the same transformation the models were
    # fitted on (the scaler below is likewise fitted on train only).
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Scale numerical features with statistics learned from the training table
    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    return X_train, X_test, y_train

# Build the model matrices from the engineered tables
X_train, X_test, y_train = preprocess_data(train_fe, test_fe)

# Hold out a stratified 20% of the training rows for validation
holdout_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_train,
)
X_train_split, X_val_split, y_train_split, y_val_split = holdout_parts

## 5. Hyperparameter Optimization

In [None]:
def optimize_lgb(trial):
    """Optuna objective: validation AUC of a LightGBM model for one trial.

    Trains on the training split with early stopping against the validation
    split and returns the ROC AUC of the best iteration's predictions.
    """
    fixed = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'random_state': RANDOM_STATE,
    }
    # Suggestions are requested in a fixed order so a seeded sampler
    # reproduces the same trial sequence.
    searched = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }
    trial_params = {**fixed, **searched}

    train_set = lgb.Dataset(X_train_split, y_train_split)
    valid_set = lgb.Dataset(X_val_split, y_val_split, reference=train_set)
    callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=0)]

    booster = lgb.train(
        trial_params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        callbacks=callbacks,
    )

    val_scores = booster.predict(X_val_split, num_iteration=booster.best_iteration)
    return roc_auc_score(y_val_split, val_scores)

if SKIP_TUNING:
    # Manual LightGBM parameters used when the search is skipped
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'max_depth': -1,
        'min_child_samples': 20,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'verbose': -1,
    }
else:
    # Search the LightGBM space with a seeded TPE sampler, maximizing validation AUC
    print(f"Optimizing LightGBM ({N_TRIALS} trials)...")
    study_lgb = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=RANDOM_STATE),
    )
    study_lgb.optimize(optimize_lgb, n_trials=N_TRIALS, show_progress_bar=True)
    print(f"Best LightGBM AUC: {study_lgb.best_value:.5f}\n")

    # Fixed settings first, then the best values found by the study
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'verbose': -1,
        **study_lgb.best_params,
    }

In [None]:
def optimize_xgb(trial):
    """Optuna objective: validation AUC of an XGBoost model for one trial.

    Trains on the training split with early stopping against the validation
    split and returns the ROC AUC of the predictions made with the trees up
    to and including the best iteration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC on the validation split.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': RANDOM_STATE,
        'tree_method': 'hist',
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
    }

    xgb_train = xgb.DMatrix(X_train_split, label=y_train_split)
    xgb_valid = xgb.DMatrix(X_val_split, label=y_val_split)

    model = xgb.train(params, xgb_train, num_boost_round=1000, evals=[(xgb_valid, 'valid')],
                     early_stopping_rounds=50, verbose_eval=0)

    # xgb.train returns the booster from the last round, not the best one.
    # Pin the prediction to the best iteration explicitly (as optimize_lgb
    # does) instead of relying on the installed version's default.
    # best_iteration is zero-based, hence the +1 for the exclusive upper bound.
    # The existing validation DMatrix is reused rather than rebuilt.
    y_pred = model.predict(xgb_valid, iteration_range=(0, model.best_iteration + 1))
    return roc_auc_score(y_val_split, y_pred)

if SKIP_TUNING:
    # Manual XGBoost parameters used when the search is skipped
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
    }
else:
    # Search the XGBoost space with a seeded TPE sampler, maximizing validation AUC
    print(f"Optimizing XGBoost ({N_TRIALS} trials)...")
    study_xgb = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=RANDOM_STATE),
    )
    study_xgb.optimize(optimize_xgb, n_trials=N_TRIALS, show_progress_bar=True)
    print(f"Best XGBoost AUC: {study_xgb.best_value:.5f}\n")

    # Fixed settings first, then the best values found by the study
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'tree_method': 'hist',
        **study_xgb.best_params,
    }

## 6. Train Final Models

In [None]:
# Train LightGBM on the training split, early-stopping on the validation split
lgb_train = lgb.Dataset(X_train_split, y_train_split)
lgb_val = lgb.Dataset(X_val_split, y_val_split, reference=lgb_train)
lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=1000, valid_sets=[lgb_val],
                     callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=0)])

# Train XGBoost the same way
xgb_train = xgb.DMatrix(X_train_split, label=y_train_split)
xgb_valid = xgb.DMatrix(X_val_split, label=y_val_split)
xgb_model = xgb.train(xgb_params, xgb_train, num_boost_round=1000, evals=[(xgb_valid, 'valid')],
                     early_stopping_rounds=50, verbose_eval=0)

# Validation scores. Both models are scored at their best iteration: the
# XGBoost booster returned by xgb.train is the last-round model, so the
# iteration range is pinned explicitly (best_iteration is zero-based, so +1
# gives the exclusive upper bound), matching the LightGBM call.
y_val_pred_lgb = lgb_model.predict(X_val_split, num_iteration=lgb_model.best_iteration)
y_val_pred_xgb = xgb_model.predict(xgb_valid, iteration_range=(0, xgb_model.best_iteration + 1))
y_val_pred_ensemble = (y_val_pred_lgb + y_val_pred_xgb) / 2  # unweighted average of the two models

print("Validation Results:")
print(f"LightGBM AUC: {roc_auc_score(y_val_split, y_val_pred_lgb):.5f}")
print(f"XGBoost AUC: {roc_auc_score(y_val_split, y_val_pred_xgb):.5f}")
print(f"Ensemble AUC: {roc_auc_score(y_val_split, y_val_pred_ensemble):.5f}")

In [None]:
# Retrain on full data, using the round counts found by early stopping on the
# validation split (there is no held-out set left to stop on here).
lgb_train_full = lgb.Dataset(X_train, y_train)
# LightGBM's best_iteration is used here as a round count, the same way the
# validation code passes it as num_iteration.
lgb_model_final = lgb.train(lgb_params, lgb_train_full, num_boost_round=lgb_model.best_iteration,
                           callbacks=[lgb.log_evaluation(period=0)])

xgb_train_full = xgb.DMatrix(X_train, label=y_train)
# XGBoost's best_iteration is a zero-based index, so the number of rounds to
# train is best_iteration + 1. Without the +1 the final model has one tree
# round too few, and none at all when the best iteration is 0.
xgb_model_final = xgb.train(xgb_params, xgb_train_full, num_boost_round=xgb_model.best_iteration + 1,
                           verbose_eval=0)

print("Final models trained on full dataset")

## 7. Generate Predictions

In [None]:
# Score the test matrix with both full-data models
test_dmatrix = xgb.DMatrix(X_test)
test_pred_lgb = lgb_model_final.predict(
    X_test,
    num_iteration=lgb_model_final.best_iteration,
)
test_pred_xgb = xgb_model_final.predict(test_dmatrix)

# Blend: unweighted average of the two models' predictions
test_pred_ensemble = (test_pred_lgb + test_pred_xgb) / 2

# Assemble the submission table (test ids alongside the blended predictions)
submission_columns = {
    'id': test_fe['id'],
    'loan_paid_back': test_pred_ensemble,
}
submission = pd.DataFrame(submission_columns)

# Write the file and report its shape and prediction range
submission.to_csv('submission.csv', index=False)
print(f"Submission saved: {submission.shape}")
print(f"Prediction range: [{submission['loan_paid_back'].min():.4f}, {submission['loan_paid_back'].max():.4f}]")