In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load the data
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
sample_sub = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("\nTrain columns:", train.columns.tolist())

Train shape: (593994, 13)
Test shape: (254569, 12)

Train columns: ['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade', 'loan_paid_back']


In [4]:
test.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
# Check target distribution
print("=== TARGET DISTRIBUTION ===")
print(train['loan_paid_back'].value_counts(normalize=True))

# Check data types
print("\n=== DATA TYPES ===")
print(train.dtypes)

# Check for missing values
print("\n=== MISSING VALUES ===")
print("Train missing:", train.isnull().sum().sum())
print("Test missing:", test.isnull().sum().sum())

=== TARGET DISTRIBUTION ===
loan_paid_back
1.0    0.79882
0.0    0.20118
Name: proportion, dtype: float64

=== DATA TYPES ===
id                        int64
annual_income           float64
debt_to_income_ratio    float64
credit_score              int64
loan_amount             float64
interest_rate           float64
gender                   object
marital_status           object
education_level          object
employment_status        object
loan_purpose             object
grade_subgrade           object
loan_paid_back          float64
dtype: object

=== MISSING VALUES ===
Train missing: 0
Test missing: 0


In [7]:
# Create powerful financial ratios and features
def create_advanced_features(df):
    df = df.copy()
    
    # Financial ratios
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['monthly_debt_burden'] = df['annual_income'] * df['debt_to_income_ratio'] / 12
    df['affordability_score'] = df['annual_income'] / (df['loan_amount'] * df['interest_rate'] + 1)
    df['risk_score'] = df['debt_to_income_ratio'] * df['interest_rate']
    
    # Credit score bins
    df['credit_score_category'] = pd.cut(df['credit_score'], 
                                       bins=[0, 580, 670, 740, 800, 850],
                                       labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'])
    
    # Loan amount relative to income
    df['loan_to_income_ratio'] = df['loan_amount'] / (df['annual_income'] + 1)
    
    # Interest rate risk categories
    df['interest_rate_category'] = pd.cut(df['interest_rate'],
                                        bins=[0, 5, 10, 15, 20, 100],
                                        labels=['Low', 'Medium', 'High', 'Very High', 'Extreme'])
    
    return df

# Apply feature engineering
train_enhanced = create_advanced_features(train)
test_enhanced = create_advanced_features(test)

print("New feature names:", [col for col in train_enhanced.columns if col not in train.columns])

New feature names: ['income_to_loan_ratio', 'monthly_debt_burden', 'affordability_score', 'risk_score', 'credit_score_category', 'loan_to_income_ratio', 'interest_rate_category']


In [8]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Identify categorical and numerical columns
categorical_cols = ['gender', 'marital_status', 'education_level', 'employment_status', 
                   'loan_purpose', 'grade_subgrade', 'credit_score_category', 'interest_rate_category']
numerical_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
                 'interest_rate', 'income_to_loan_ratio', 'monthly_debt_burden', 
                 'affordability_score', 'risk_score', 'loan_to_income_ratio']

# Prepare features
X = train_enhanced.drop(['id', 'loan_paid_back'], axis=1)
y = train_enhanced['loan_paid_back']
X_test = test_enhanced.drop('id', axis=1)

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    if col in X.columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
        label_encoders[col] = le

print("Preprocessing completed!")
print(f"Final training shape: {X.shape}")

Preprocessing completed!
Final training shape: (593994, 18)


In [9]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# Initialize advanced models with optimized parameters
models = {
    'XGBoost': XGBClassifier(
        n_estimators=1000,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        tree_method='hist'
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    ),
    'CatBoost': CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        random_state=42,
        verbose=False
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
}

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("=== ADVANCED MODEL CV RESULTS ===")
cv_results = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    cv_results[name] = scores.mean()
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

=== ADVANCED MODEL CV RESULTS ===
XGBoost: 0.9043 (+/- 0.0010)
LightGBM: 0.9058 (+/- 0.0012)
CatBoost: 0.9063 (+/- 0.0015)
RandomForest: 0.9025 (+/- 0.0013)


In [10]:
# Create weighted ensemble based on CV performance
best_models = [
    ('xgb', models['XGBoost']),
    ('lgb', models['LightGBM']),
    ('cat', models['CatBoost'])
]

# Voting Classifier with weights
ensemble = VotingClassifier(estimators=best_models, voting='soft')

# Train and validate ensemble
ensemble_scores = cross_val_score(ensemble, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"\n=== ENSEMBLE PERFORMANCE ===")
print(f"Ensemble CV: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std() * 2:.4f})")


=== ENSEMBLE PERFORMANCE ===
Ensemble CV: 0.9063 (+/- 0.0011)


In [11]:
# Train final ensemble on all data
print("Training final ensemble model...")
ensemble.fit(X, y)

# Predict probabilities for test set
test_predictions = ensemble.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': test_predictions
})

# Save submission
submission.to_csv('submission.csv', index=False)
print("‚úÖ Submission file created!")

# Analyze predictions
print(f"\n=== PREDICTION DISTRIBUTION ===")
print(submission['loan_paid_back'].value_counts(normalize=True))

Training final ensemble model...
‚úÖ Submission file created!

=== PREDICTION DISTRIBUTION ===
loan_paid_back
1.0    0.859991
0.0    0.140009
Name: proportion, dtype: float64


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# GPU-OPTIMIZED FEATURE ENGINEERING
def create_gpu_features(df):
    df = df.copy()
    
    # Vectorized operations (GPU friendly)
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['debt_service_ratio'] = (df['annual_income'] * df['debt_to_income_ratio']) / (df['loan_amount'] * df['interest_rate'] + 1)
    df['credit_utilization'] = df['loan_amount'] / (df['annual_income'] + 1)
    
    # Risk scores
    df['risk_score'] = df['debt_to_income_ratio'] * df['interest_rate']
    df['composite_score'] = (df['credit_score'] / 850) * (1 - df['debt_to_income_ratio'])
    
    # Binning
    df['credit_bin'] = pd.cut(df['credit_score'], bins=[0, 580, 670, 740, 800, 850], labels=False)
    df['income_bin'] = pd.cut(df['annual_income'], bins=5, labels=False)
    
    return df

print("üöÄ Creating GPU-optimized features...")
train_gpu = create_gpu_features(train)
test_gpu = create_gpu_features(test)

# SELECT OPTIMAL FEATURES
feature_columns = [
    'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
    'income_to_loan_ratio', 'debt_service_ratio', 'credit_utilization', 'risk_score', 'composite_score',
    'credit_bin', 'income_bin', 'employment_status', 'grade_subgrade'
]

# PREPROCESSING
X = train_gpu[feature_columns].copy()
y = train_gpu['loan_paid_back']
X_test = test_gpu[feature_columns].copy()

# Encode categoricals
categorical_cols = ['employment_status', 'grade_subgrade']
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

print(f"üìä Training shape: {X.shape}")

# GPU-OPTIMIZED MODELS
print("\n=== TRAINING GPU-ACCELERATED MODELS ===")

# Model 1: XGBoost with GPU
xgb_gpu = XGBClassifier(
    n_estimators=3000,           # More trees since GPU is faster
    max_depth=8,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    tree_method='gpu_hist',      # GPU ACCELERATION
    predictor='gpu_predictor',   # GPU prediction
    eval_metric='logloss',
    verbosity=0
)

# Model 2: LightGBM with GPU
lgb_gpu = LGBMClassifier(
    n_estimators=3000,
    max_depth=8,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    device='gpu',               # GPU ACCELERATION
    gpu_platform_id=0,
    gpu_device_id=0,
    verbose=-1
)

# FAST GPU VALIDATION
def gpu_temporal_validation(X, y, model, n_splits=3):
    """Fast GPU validation with fewer splits"""
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        print(f"üîÑ GPU Fold {fold+1}/{n_splits}...")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = np.mean(preds == y_val)
        scores.append(score)
        print(f"   Fold {fold+1} Accuracy: {score:.4f}")
    
    return np.mean(scores), np.std(scores)

# QUICK GPU VALIDATION
print("‚ö° Running fast GPU validation...")
xgb_score, xgb_std = gpu_temporal_validation(X, y, xgb_gpu, n_splits=3)
lgb_score, lgb_std = gpu_temporal_validation(X, y, lgb_gpu, n_splits=3)

print(f"\nüìà XGBoost GPU CV: {xgb_score:.4f} (+/- {xgb_std:.4f})")
print(f"üìà LightGBM GPU CV: {lgb_score:.4f} (+/- {lgb_std:.4f})")

# SELECT BEST GPU MODEL
if xgb_score >= lgb_score:
    best_gpu_model = xgb_gpu
    print("üéØ Selected: XGBoost (GPU)")
else:
    best_gpu_model = lgb_gpu
    print("üéØ Selected: LightGBM (GPU)")

# FULL TRAINING ON GPU
print("\nüî• Training final model on GPU...")
best_gpu_model.fit(X, y)

# PREDICT WITH GPU
print("‚ö° Making GPU-accelerated predictions...")
test_probs = best_gpu_model.predict_proba(X_test)[:, 1]

# SMART THRESHOLD TUNING
threshold = 0.75  # More conservative
test_predictions = (test_probs > threshold).astype(int)

# CREATE SUBMISSION
submission_gpu = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': test_predictions
})

# ANALYZE PREDICTIONS
print(f"\n=== GPU PREDICTION DISTRIBUTION ===")
pred_dist = submission_gpu['loan_paid_back'].value_counts(normalize=True)
print(pred_dist)

# Ensure we're not too far from training distribution
train_dist = train['loan_paid_back'].value_counts(normalize=True)
print(f"Training distribution: {train_dist[1.0]:.3f} paid, {train_dist[0.0]:.3f} default")
print(f"Test predictions: {pred_dist[1]:.3f} paid, {pred_dist[0]:.3f} default")



üöÄ Creating GPU-optimized features...
üìä Training shape: (593994, 14)

=== TRAINING GPU-ACCELERATED MODELS ===
‚ö° Running fast GPU validation...
üîÑ GPU Fold 1/3...
   Fold 1 Accuracy: 0.9036
üîÑ GPU Fold 2/3...
   Fold 2 Accuracy: 0.9045
üîÑ GPU Fold 3/3...
   Fold 3 Accuracy: 0.9043
üîÑ GPU Fold 1/3...




   Fold 1 Accuracy: 0.9062
üîÑ GPU Fold 2/3...
   Fold 2 Accuracy: 0.9060
üîÑ GPU Fold 3/3...
   Fold 3 Accuracy: 0.9054

üìà XGBoost GPU CV: 0.9041 (+/- 0.0004)
üìà LightGBM GPU CV: 0.9059 (+/- 0.0003)
üéØ Selected: LightGBM (GPU)

üî• Training final model on GPU...
‚ö° Making GPU-accelerated predictions...

=== GPU PREDICTION DISTRIBUTION ===
loan_paid_back
1    0.782216
0    0.217784
Name: proportion, dtype: float64
Training distribution: 0.799 paid, 0.201 default
Test predictions: 0.782 paid, 0.218 default


In [6]:
# SAVE SUBMISSION
submission_gpu.to_csv('submission_gpu.csv', index=False)
print("‚úÖ GPU-accelerated submission created!")

# FEATURE IMPORTANCE (GPU)
if hasattr(best_gpu_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': feature_columns,
        'importance': best_gpu_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nüéØ TOP FEATURES (GPU):")
    print(importance_df.head(8))



‚úÖ GPU-accelerated submission created!

üéØ TOP FEATURES (GPU):
                feature  importance
1  debt_to_income_ratio       23226
2          credit_score       10862
3           loan_amount        9627
0         annual_income        8412
4         interest_rate        8313
9       composite_score        7542
8            risk_score        4816
6    debt_service_ratio        4623


In [7]:
# ENSEMBLE OPTION (If you want to try both)
print("\nü§ñ Creating GPU Ensemble...")
ensemble_probs = (xgb_gpu.predict_proba(X_test)[:, 1] + 
                  lgb_gpu.predict_proba(X_test)[:, 1]) / 2
ensemble_preds = (ensemble_probs > 0.75).astype(int)

submission_ensemble = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': ensemble_preds
})

print(f"Ensemble distribution:")
print(submission_ensemble['loan_paid_back'].value_counts(normalize=True))
submission_ensemble.to_csv('submission_gpu_ensemble.csv', index=False)
print("‚úÖ GPU Ensemble submission created!")


ü§ñ Creating GPU Ensemble...
Ensemble distribution:
loan_paid_back
1    0.781163
0    0.218837
Name: proportion, dtype: float64
‚úÖ GPU Ensemble submission created!


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# ADVANCED FEATURE ENGINEERING
def create_advanced_features(df):
    df = df.copy()
    
    # Financial ratios
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['debt_service_ratio'] = (df['annual_income'] * df['debt_to_income_ratio']) / (df['loan_amount'] * df['interest_rate'] + 1)
    df['credit_utilization'] = df['loan_amount'] / (df['annual_income'] + 1)
    
    # Advanced risk scores
    df['risk_score_1'] = df['debt_to_income_ratio'] * df['interest_rate'] * (1 - df['credit_score']/850)
    df['risk_score_2'] = (df['loan_amount'] / df['annual_income']) * df['debt_to_income_ratio'] * 100
    
    # Credit score transformations
    df['credit_score_squared'] = df['credit_score'] ** 2
    df['credit_score_log'] = np.log1p(df['credit_score'])
    
    # Income transformations
    df['income_log'] = np.log1p(df['annual_income'])
    df['loan_amount_log'] = np.log1p(df['loan_amount'])
    
    # Interaction features
    df['credit_income_interaction'] = df['credit_score'] * df['annual_income'] / 100000
    df['debt_interest_interaction'] = df['debt_to_income_ratio'] * df['interest_rate']
    
    # Binning with more granular categories
    df['credit_bin'] = pd.cut(df['credit_score'], bins=[0, 500, 600, 700, 750, 800, 850], labels=False)
    df['income_bin'] = pd.cut(df['annual_income'], bins=10, labels=False)
    df['dti_bin'] = pd.cut(df['debt_to_income_ratio'], bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], labels=False)
    
    return df

print("üöÄ Creating advanced features...")
train_adv = create_advanced_features(train)
test_adv = create_advanced_features(test)

# EXPANDED FEATURE SET
feature_columns = [
    # Original features
    'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
    
    # Engineered features
    'income_to_loan_ratio', 'debt_service_ratio', 'credit_utilization',
    'risk_score_1', 'risk_score_2', 
    'credit_score_squared', 'credit_score_log',
    'income_log', 'loan_amount_log',
    'credit_income_interaction', 'debt_interest_interaction',
    'credit_bin', 'income_bin', 'dti_bin',
    
    # Categorical features
    'employment_status', 'grade_subgrade', 'loan_purpose', 'education_level'
]

# PREPROCESSING
X = train_adv[feature_columns].copy()
y = train_adv['loan_paid_back']
X_test = test_adv[feature_columns].copy()

# Encode categoricals
categorical_cols = ['employment_status', 'grade_subgrade', 'loan_purpose', 'education_level']
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

print(f"üìä Final training shape: {X.shape}")

# ADVANCED GPU MODELS WITH TUNING
models = {
    'xgb': XGBClassifier(
        n_estimators=3000,
        max_depth=7,
        learning_rate=0.02,  # Lower learning rate
        subsample=0.75,
        colsample_bytree=0.75,
        reg_alpha=3,         # Stronger regularization
        reg_lambda=3,
        gamma=0.1,
        random_state=42,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        eval_metric='auc',   # Use AUC for imbalance
        scale_pos_weight=0.25,  # Handle class imbalance
        verbosity=0
    ),
    
    'lgb': LGBMClassifier(
        n_estimators=3000,
        max_depth=7,
        learning_rate=0.02,
        subsample=0.75,
        colsample_bytree=0.75,
        reg_alpha=3,
        reg_lambda=3,
        min_child_samples=20,
        random_state=42,
        device='gpu',
        class_weight='balanced',  # Handle imbalance
        verbose=-1
    ),
    
    'cat': CatBoostClassifier(
        iterations=3000,
        depth=7,
        learning_rate=0.02,
        random_state=42,
        verbose=False,
        task_type='GPU',  # GPU acceleration
        class_weights=[0.8, 1.2]  # Handle imbalance
    )
}

# STACKING APPROACH
print("\n=== TRAINING ADVANCED STACKING MODELS ===")

# Train all models and create meta-features
from sklearn.model_selection import KFold

def create_stacking_features(X, X_test, y, models, n_folds=5):
    """Create stacking features using out-of-fold predictions"""
    stacking_train = np.zeros((X.shape[0], len(models)))
    stacking_test = np.zeros((X_test.shape[0], len(models)))
    
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    for i, (name, model) in enumerate(models.items()):
        print(f"üîÑ Creating stacking features with {name}...")
        test_fold_preds = []
        
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model.fit(X_train, y_train)
            val_preds = model.predict_proba(X_val)[:, 1]
            stacking_train[val_idx, i] = val_preds
            
            # Get test predictions for this fold
            test_preds = model.predict_proba(X_test)[:, 1]
            test_fold_preds.append(test_preds)
        
        # Average test predictions across folds
        stacking_test[:, i] = np.mean(test_fold_preds, axis=0)
    
    return stacking_train, stacking_test

# Create stacking features
stacking_train, stacking_test = create_stacking_features(X, X_test, y, models)

# Combine original features with stacking features
X_stacked = np.hstack([X.values, stacking_train])
X_test_stacked = np.hstack([X_test.values, stacking_test])

print(f"üìä Stacked features shape: {X_stacked.shape}")

# META-LEARNER (XGBoost on stacked features)
meta_learner = XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.05,
    reg_alpha=2,
    reg_lambda=2,
    random_state=42,
    tree_method='gpu_hist',
    eval_metric='auc',
    verbosity=0
)

# VALIDATE STACKING APPROACH
print("‚ö° Validating stacking approach...")
tscv = TimeSeriesSplit(n_splits=3)
stacking_scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_stacked)):
    print(f"   Fold {fold+1}/3...")
    X_tr, X_val = X_stacked[train_idx], X_stacked[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    meta_learner.fit(X_tr, y_tr)
    val_preds = meta_learner.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, val_preds)
    stacking_scores.append(score)
    print(f"   Fold {fold+1} AUC: {score:.4f}")

print(f"üìà Stacking CV AUC: {np.mean(stacking_scores):.4f} (+/- {np.std(stacking_scores):.4f})")

# TRAIN FINAL STACKING MODEL
print("\nüî• Training final stacking model...")
meta_learner.fit(X_stacked, y)

# PREDICT WITH OPTIMAL THRESHOLD
final_probs = meta_learner.predict_proba(X_test_stacked)[:, 1]

# Find optimal threshold (you can tune this)
optimal_threshold = 0.70  # More conservative for default prediction
final_predictions = (final_probs > optimal_threshold).astype(int)

# CREATE SUBMISSION
submission_stacked = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': final_predictions
})

print(f"\n=== FINAL PREDICTION DISTRIBUTION ===")
print(submission_stacked['loan_paid_back'].value_counts(normalize=True))

# SAVE
submission_stacked.to_csv('submission_stacked.csv', index=False)
print("‚úÖ Advanced stacking submission created!")

# ALSO CREATE SIMPLE ENSEMBLE AS BACKUP
print("\nü§ñ Creating simple ensemble backup...")
simple_probs = (
    models['xgb'].predict_proba(X_test)[:, 1] + 
    models['lgb'].predict_proba(X_test)[:, 1] + 
    models['cat'].predict_proba(X_test)[:, 1]
) / 3

simple_predictions = (simple_probs > 0.72).astype(int)

submission_simple = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': simple_predictions
})

submission_simple.to_csv('submission_simple_ensemble.csv', index=False)
print("‚úÖ Simple ensemble submission created!")

print("\nüéØ Try both submissions and see which works better!")
print("   - submission_stacked.csv (Advanced stacking)")
print("   - submission_simple_ensemble.csv (Simple ensemble)")

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# ADVANCED FEATURE ENGINEERING (same as before)
def create_advanced_features(df):
    df = df.copy()
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['debt_service_ratio'] = (df['annual_income'] * df['debt_to_income_ratio']) / (df['loan_amount'] * df['interest_rate'] + 1)
    df['credit_utilization'] = df['loan_amount'] / (df['annual_income'] + 1)
    df['risk_score_1'] = df['debt_to_income_ratio'] * df['interest_rate'] * (1 - df['credit_score']/850)
    df['risk_score_2'] = (df['loan_amount'] / df['annual_income']) * df['debt_to_income_ratio'] * 100
    df['credit_score_squared'] = df['credit_score'] ** 2
    df['credit_score_log'] = np.log1p(df['credit_score'])
    df['income_log'] = np.log1p(df['annual_income'])
    df['loan_amount_log'] = np.log1p(df['loan_amount'])
    df['credit_income_interaction'] = df['credit_score'] * df['annual_income'] / 100000
    df['debt_interest_interaction'] = df['debt_to_income_ratio'] * df['interest_rate']
    df['credit_bin'] = pd.cut(df['credit_score'], bins=[0, 500, 600, 700, 750, 800, 850], labels=False)
    df['income_bin'] = pd.cut(df['annual_income'], bins=10, labels=False)
    df['dti_bin'] = pd.cut(df['debt_to_income_ratio'], bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], labels=False)
    return df

print("üöÄ Creating advanced features...")
train_adv = create_advanced_features(train)
test_adv = create_advanced_features(test)

# FEATURE SET
feature_columns = [
    'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
    'income_to_loan_ratio', 'debt_service_ratio', 'credit_utilization', 'risk_score_1', 'risk_score_2',
    'credit_score_squared', 'credit_score_log', 'income_log', 'loan_amount_log',
    'credit_income_interaction', 'debt_interest_interaction', 'credit_bin', 'income_bin', 'dti_bin',
    'employment_status', 'grade_subgrade', 'loan_purpose', 'education_level'
]

# PREPROCESSING
X = train_adv[feature_columns].copy()
y = train_adv['loan_paid_back']
X_test = test_adv[feature_columns].copy()

categorical_cols = ['employment_status', 'grade_subgrade', 'loan_purpose', 'education_level']
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

print(f"üìä Final training shape: {X.shape}")

# OPTIMIZED MODELS WITH LR=0.01
print("\n=== TRAINING WITH LEARNING RATE 0.01 ===")

models = {
    'xgb': XGBClassifier(
        n_estimators=5000,      # More trees for lower LR
        max_depth=7,
        learning_rate=0.01,     # LOWER LEARNING RATE
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=2,
        reg_lambda=2,
        gamma=0.1,
        random_state=42,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        eval_metric='auc',
        scale_pos_weight=0.25,
        verbosity=0
    ),
    
    'lgb': LGBMClassifier(
        n_estimators=5000,      # More trees
        max_depth=7,
        learning_rate=0.01,     # LOWER LEARNING RATE
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=2,
        reg_lambda=2,
        min_child_samples=20,
        random_state=42,
        device='gpu',
        class_weight='balanced',
        verbose=-1
    ),
    
    'cat': CatBoostClassifier(
        iterations=5000,        # More iterations
        depth=7,
        learning_rate=0.01,     # LOWER LEARNING RATE
        random_state=42,
        verbose=False,
        task_type='GPU',
        class_weights=[0.8, 1.2]
    )
}

# QUICK VALIDATION WITH LOW LR
def quick_validate_low_lr(X, y, models, n_splits=3):
    """Quick validation to check if low LR helps"""
    tscv = TimeSeriesSplit(n_splits=n_splits)
    results = {}
    
    for name, model in models.items():
        print(f"üîç Validating {name} with LR=0.01...")
        scores = []
        
        for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Use early stopping with low LR
            if name == 'xgb':
                model.fit(X_train, y_train, 
                         eval_set=[(X_val, y_val)],
                         early_stopping_rounds=100,
                         verbose=False)
            elif name == 'lgb':
                model.fit(X_train, y_train,
                         eval_set=[(X_val, y_val)],
                         early_stopping_rounds=100,
                         verbose=False)
            else:  # catboost
                model.fit(X_train, y_train,
                         eval_set=[(X_val, y_val)],
                         early_stopping_rounds=100,
                         verbose=False)
            
            val_preds = model.predict_proba(X_val)[:, 1]
            score = np.mean((val_preds > 0.5) == y_val)
            scores.append(score)
        
        results[name] = (np.mean(scores), np.std(scores))
        print(f"   {name}: {results[name][0]:.4f} (+/- {results[name][1]:.4f})")
    
    return results

# Run validation
print("‚ö° Quick validation with LR=0.01...")
validation_results = quick_validate_low_lr(X, y, models)

# TRAIN FINAL MODELS WITH FULL DATA
print("\nüî• Training final models with LR=0.01 on full data...")

# Train all models without early stopping (use full iterations)
for name, model in models.items():
    print(f"   Training {name}...")
    model.fit(X, y)

# CREATE ENSEMBLE PREDICTIONS
print("ü§ñ Creating ensemble predictions...")

# Weighted average based on validation performance
weights = {
    'xgb': validation_results['xgb'][0],
    'lgb': validation_results['lgb'][0], 
    'cat': validation_results['cat'][0]
}

# Normalize weights
total_weight = sum(weights.values())
for key in weights:
    weights[key] /= total_weight

print(f"üéØ Model weights: {weights}")

# Weighted ensemble predictions
ensemble_probs = (
    weights['xgb'] * models['xgb'].predict_proba(X_test)[:, 1] +
    weights['lgb'] * models['lgb'].predict_proba(X_test)[:, 1] +
    weights['cat'] * models['cat'].predict_proba(X_test)[:, 1]
)

# Optimized threshold
optimal_threshold = 0.68  # Slightly more conservative
final_predictions = (ensemble_probs > optimal_threshold).astype(int)

# CREATE SUBMISSION
submission_low_lr = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': final_predictions
})

print(f"\n=== PREDICTION DISTRIBUTION (LR=0.01) ===")
print(submission_low_lr['loan_paid_back'].value_counts(normalize=True))

submission_low_lr.to_csv('submission_low_lr.csv', index=False)
print("‚úÖ Low learning rate submission created!")

# ALSO TRY BEST SINGLE MODEL
best_single_name = max(validation_results, key=lambda x: validation_results[x][0])
best_single_model = models[best_single_name]
print(f"\nüèÜ Best single model: {best_single_name}")

single_probs = best_single_model.predict_proba(X_test)[:, 1]
single_predictions = (single_probs > 0.70).astype(int)

submission_single = pd.DataFrame({
    'id': test['id'], 
    'loan_paid_back': single_predictions
})

submission_single.to_csv('submission_best_single.csv', index=False)
print("‚úÖ Best single model submission created!")

print("\nüéØ Submit both and compare:")
print("   - submission_low_lr.csv (Weighted ensemble)")
print("   - submission_best_single.csv (Best single model)")

üöÄ Creating advanced features...
üìä Final training shape: (593994, 23)

=== TRAINING WITH LEARNING RATE 0.01 ===
‚ö° Quick validation with LR=0.01...
üîç Validating xgb with LR=0.01...
   xgb: 0.8692 (+/- 0.0010)
üîç Validating lgb with LR=0.01...


TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# ADVANCED FEATURE ENGINEERING
def create_advanced_features(df):
    df = df.copy()
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['debt_service_ratio'] = (df['annual_income'] * df['debt_to_income_ratio']) / (df['loan_amount'] * df['interest_rate'] + 1)
    df['credit_utilization'] = df['loan_amount'] / (df['annual_income'] + 1)
    df['risk_score_1'] = df['debt_to_income_ratio'] * df['interest_rate'] * (1 - df['credit_score']/850)
    df['risk_score_2'] = (df['loan_amount'] / df['annual_income']) * df['debt_to_income_ratio'] * 100
    df['credit_score_squared'] = df['credit_score'] ** 2
    df['credit_score_log'] = np.log1p(df['credit_score'])
    df['income_log'] = np.log1p(df['annual_income'])
    df['loan_amount_log'] = np.log1p(df['loan_amount'])
    df['credit_income_interaction'] = df['credit_score'] * df['annual_income'] / 100000
    df['debt_interest_interaction'] = df['debt_to_income_ratio'] * df['interest_rate']
    df['credit_bin'] = pd.cut(df['credit_score'], bins=[0, 500, 600, 700, 750, 800, 850], labels=False)
    df['income_bin'] = pd.cut(df['annual_income'], bins=10, labels=False)
    df['dti_bin'] = pd.cut(df['debt_to_income_ratio'], bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0], labels=False)
    return df

print("üöÄ Creating advanced features...")
train_adv = create_advanced_features(train)
test_adv = create_advanced_features(test)

# FEATURE SET
feature_columns = [
    'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate',
    'income_to_loan_ratio', 'debt_service_ratio', 'credit_utilization', 'risk_score_1', 'risk_score_2',
    'credit_score_squared', 'credit_score_log', 'income_log', 'loan_amount_log',
    'credit_income_interaction', 'debt_interest_interaction', 'credit_bin', 'income_bin', 'dti_bin',
    'employment_status', 'grade_subgrade', 'loan_purpose', 'education_level'
]

# PREPROCESSING
X = train_adv[feature_columns].copy()
y = train_adv['loan_paid_back']
X_test = test_adv[feature_columns].copy()

categorical_cols = ['employment_status', 'grade_subgrade', 'loan_purpose', 'education_level']
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

print(f"üìä Final training shape: {X.shape}")

# OPTIMIZED MODELS WITH LR=0.01 (FIXED)
print("\n=== TRAINING WITH LEARNING RATE 0.01 ===")

# Initialize models with early stopping built-in
xgb_model = XGBClassifier(
    n_estimators=5000,
    max_depth=7,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=2,
    reg_lambda=2,
    gamma=0.1,
    random_state=42,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    eval_metric='auc',
    scale_pos_weight=0.25,
    verbosity=0
)

lgb_model = LGBMClassifier(
    n_estimators=5000,
    max_depth=7,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=2,
    reg_lambda=2,
    min_child_samples=20,
    random_state=42,
    device='gpu',
    class_weight='balanced',
    verbose=-1
)

cat_model = CatBoostClassifier(
    iterations=5000,
    depth=7,
    learning_rate=0.01,
    random_state=42,
    verbose=False,
    task_type='GPU',
    class_weights=[0.8, 1.2]
)

models = {
    'xgb': xgb_model,
    'lgb': lgb_model, 
    'cat': cat_model
}

# SIMPLIFIED VALIDATION (NO EARLY STOPPING ISSUES)
def simple_validate(X, y, models, n_splits=3):
    """Simple validation without early stopping complications"""
    tscv = TimeSeriesSplit(n_splits=n_splits)
    results = {}
    
    for name, model in models.items():
        print(f"üîç Validating {name} with LR=0.01...")
        scores = []
        
        for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Train without early stopping for simplicity
            if name == 'xgb':
                # For XGBoost, use smaller version for validation speed
                temp_model = XGBClassifier(
                    n_estimators=5000,
                    max_depth=7,
                    learning_rate=0.01,
                    random_state=42,
                    tree_method='gpu_hist',
                    verbosity=0
                )
                temp_model.fit(X_train, y_train)
                val_preds = temp_model.predict_proba(X_val)[:, 1]
            else:
                # For others, use the full model but with fewer iterations for validation
                temp_model = model.__class__(**{**model.get_params(), 'n_estimators': 1000, 'iterations': 1000})
                temp_model.fit(X_train, y_train)
                val_preds = temp_model.predict_proba(X_val)[:, 1]
            
            score = np.mean((val_preds > 0.5) == y_val)
            scores.append(score)
            print(f"      Fold {fold+1}: {score:.4f}")
        
        results[name] = (np.mean(scores), np.std(scores))
        print(f"   {name} Average: {results[name][0]:.4f} (+/- {results[name][1]:.4f})\n")
    
    return results

# Run validation
print("‚ö° Quick validation with LR=0.01...")
validation_results = simple_validate(X, y, models)

# TRAIN FINAL MODELS WITH FULL DATA AND LR=0.01
print("\nüî• Training final models with LR=0.01 on full data...")

final_models = {}
for name, model in models.items():
    print(f"   Training {name}...")
    # Train on full data with all iterations
    model.fit(X, y)
    final_models[name] = model
    print(f"   ‚úÖ {name} training complete")

# CREATE ENSEMBLE PREDICTIONS
print("ü§ñ Creating ensemble predictions...")

# Get predictions from all models
xgb_probs = final_models['xgb'].predict_proba(X_test)[:, 1]
lgb_probs = final_models['lgb'].predict_proba(X_test)[:, 1] 
cat_probs = final_models['cat'].predict_proba(X_test)[:, 1]

# Simple average (more robust than weighted)
ensemble_probs = (xgb_probs + lgb_probs + cat_probs) / 3

# Optimized threshold
optimal_threshold = 0.70
final_predictions = (ensemble_probs > optimal_threshold).astype(int)

# CREATE SUBMISSION
submission_low_lr = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': final_predictions
})

print(f"\n=== PREDICTION DISTRIBUTION (LR=0.01) ===")
pred_dist = submission_low_lr['loan_paid_back'].value_counts(normalize=True)
print(pred_dist)

submission_low_lr.to_csv('submission_low_lr.csv', index=False)
print("‚úÖ Low learning rate ensemble submission created!")

# ALSO CREATE INDIVIDUAL MODEL SUBMISSIONS
print("\nüìä Creating individual model submissions...")

for name, model in final_models.items():
    probs = model.predict_proba(X_test)[:, 1]
    predictions = (probs > 0.70).astype(int)
    
    submission_individual = pd.DataFrame({
        'id': test['id'],
        'loan_paid_back': predictions
    })
    
    filename = f'submission_{name}.csv'
    submission_individual.to_csv(filename, index=False)
    print(f"‚úÖ {name} submission created: {filename}")

print(f"\nüéØ All submissions created with LR=0.01!")
print("   Try these in order:")
print("   1. submission_low_lr.csv (Ensemble)")
print("   2. submission_xgb.csv (XGBoost only)")
print("   3. submission_lgb.csv (LightGBM only)")
print("   4. submission_cat.csv (CatBoost only)")

üöÄ Creating advanced features...
üìä Final training shape: (593994, 23)

=== TRAINING WITH LEARNING RATE 0.01 ===
‚ö° Quick validation with LR=0.01...
üîç Validating xgb with LR=0.01...
      Fold 1: 0.9035
      Fold 2: 0.9043
      Fold 3: 0.9045
   xgb Average: 0.9041 (+/- 0.0004)

üîç Validating lgb with LR=0.01...
      Fold 1: 0.8671
      Fold 2: 0.8674
      Fold 3: 0.8674
   lgb Average: 0.8673 (+/- 0.0001)

üîç Validating cat with LR=0.01...


CatBoostError: only one of the parameters iterations, n_estimators, num_boost_round, num_trees should be initialized.

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import time
import warnings
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output

# --- 1. CONFIGURATION & SETUP ---
# **UPDATED based on your data schema and Kaggle path**
# Using your provided path for the data
DATA_PATH_TRAIN = '/kaggle/input/playground-series-s5e11/train.csv'
DATA_PATH_TEST = '/kaggle/input/playground-series-s5e11/test.csv'
TARGET_COLUMN = 'loan_paid_back' # Target column confirmed
ID_COLUMN = 'id'

# Define feature groups based on your provided columns
NUMERIC_FEATURES = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']
CATEGORICAL_FEATURES = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
DATE_FEATURES = [] # No date features provided in the schema

# Feature selected for leak-free Target Encoding in the CV loop
HIGH_CARD_FEATURE = 'loan_purpose'

# --- 2. DATA LOADING ---
print("1. Loading Data...")
try:
    # Load training and test data using the provided Kaggle paths
    df_train = pd.read_csv(DATA_PATH_TRAIN)
    df_test = pd.read_csv(DATA_PATH_TEST)
    
    # Combine data for consistent preprocessing (excluding the target column from test)
    # The 'Source' column is essential to split back later
    df_train['Source'] = 'Train'
    df_test['Source'] = 'Test'
    
    # Get the target variable before concatenating
    y = df_train[TARGET_COLUMN]
    
    # Concatenate the main dataframes for unified feature engineering
    df_combined = pd.concat([df_train.drop(TARGET_COLUMN, axis=1), df_test], ignore_index=True)
    
    print(f"Train data loaded. Shape: {df_train.shape}")
    print(f"Test data loaded. Shape: {df_test.shape}")
    print(f"Combined data shape: {df_combined.shape}")
    
except FileNotFoundError:
    print(f"Error: One or both files not found at the specified paths.")
    print("Using large dummy data for demonstration. **Please replace with real data.**")
    
    # --- Fallback to Dummy Data (for robustness) ---
    data = {
        ID_COLUMN: range(593994),
        TARGET_COLUMN: np.random.randint(0, 2, 593994),
        'annual_income': np.random.rand(593994) * 120000,
        'debt_to_income_ratio': np.random.rand(593994) * 40,
        'credit_score': np.random.randint(580, 850, 593994),
        'loan_amount': np.random.rand(593994) * 35000,
        'interest_rate': np.random.rand(593994) * 25,
        'gender': np.random.choice(['M', 'F', np.nan], 593994),
        'marital_status': np.random.choice(['Single', 'Married'], 593994),
        'education_level': np.random.choice(['High School', 'Bachelor', 'Master'], 593994),
        'employment_status': np.random.choice(['Employed', 'Self-Employed', 'Unemployed'], 593994),
        'loan_purpose': np.random.choice(['Debt Consolidation', 'Credit Card', 'Home Improvement', 'Other'], 593994),
        'grade_subgrade': np.random.choice([f'A{i}' for i in range(1, 6)], 593994),
    }
    df_train = pd.DataFrame(data)
    y = df_train[TARGET_COLUMN]
    df_combined = df_train.drop(TARGET_COLUMN, axis=1)
    df_combined['Source'] = 'Train'
    
    # Introduce NaNs for the dummy data path
    for col in NUMERIC_FEATURES:
         df_combined.loc[np.random.choice(df_combined.index, 5000, replace=False), col] = np.nan
    for col in CATEGORICAL_FEATURES:
         df_combined.loc[np.random.choice(df_combined.index, 2000, replace=False), col] = np.nan

# Store the original test IDs for submission later
test_ids = df_test[ID_COLUMN]

# Drop the ID column from the combined features
if ID_COLUMN in df_combined.columns:
    df_combined = df_combined.drop(columns=[ID_COLUMN])

# --- 3. MISSING VALUE IMPUTATION & CLEANING ---

print("\n2. Handling Missing Values and Cleaning...")

# 3.1 Create Missing Value Indicator Features (CRITICAL for Loan data)
print("¬† ¬†- Creating missing value indicator flags.")
missing_flag_features = []

# Check all relevant columns for NaNs
cols_to_check_missing = list(set(NUMERIC_FEATURES + CATEGORICAL_FEATURES))

for col in cols_to_check_missing:
    if col in df_combined.columns and df_combined[col].isnull().any():
        # Only create flag if column has missing data
        flag_name = f'{col}_is_missing'
        df_combined[flag_name] = df_combined[col].isnull().astype(int)
        missing_flag_features.append(flag_name)

# Update NUMERIC_FEATURES with the new flags
NUMERIC_FEATURES.extend(missing_flag_features)
NUMERIC_FEATURES = list(set(NUMERIC_FEATURES)) # Ensure uniqueness and update the list


# Imputation Strategy: Numerical Imputation: Fill NaNs with the median
for col in NUMERIC_FEATURES:
    if col in df_combined.columns and df_combined[col].dtype != 'object' and df_combined[col].isnull().any():
        # Use the median calculated from the combined (train+test) dataset for stability
        df_combined[col] = df_combined[col].fillna(df_combined[col].median())

# Categorical Imputation: Fill NaNs with a special 'Missing' category
for col in CATEGORICAL_FEATURES:
    if col in df_combined.columns and df_combined[col].dtype == 'object' and df_combined[col].isnull().any():
        df_combined[col] = df_combined[col].fillna('Missing')

# --- 4. FEATURE ENGINEERING ---

print("\n3. Feature Engineering...")

# 4.1. Ordinal Encoding for grade_subgrade (High Impact Feature)
# This assumes higher subgrades (A1 < A2 < ... < E5) are worse or better.
if 'grade_subgrade' in df_combined.columns:
    # Create an ordinal mapping: A=1, B=2, ..., E=5. And subgrade 1=1, 2=0.2, etc.
    def encode_grade_subgrade(s):
        if pd.isna(s) or s == 'Missing': return 0
        grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
        grade = s[0]
        # Handle cases where subgrade might not be a digit (e.g., 'A') by defaulting to 1
        try:
            subgrade = int(s[1:])
        except:
            subgrade = 1
        # Combine: Example A1 = 1.1, G5 = 7.5
        return grade_map.get(grade, 0) + (subgrade / 10)
        
    df_combined['grade_subgrade_numeric'] = df_combined['grade_subgrade'].apply(encode_grade_subgrade)
    NUMERIC_FEATURES.append('grade_subgrade_numeric')
    
    # --- FIX: DROP THE ORIGINAL CATEGORICAL COLUMN ---
    df_combined = df_combined.drop(columns=['grade_subgrade'])
    # Remove original feature from categorical list to avoid OHE
    CATEGORICAL_FEATURES.remove('grade_subgrade')

# Remove the target-encoded feature from the list to be OHE later
if HIGH_CARD_FEATURE in CATEGORICAL_FEATURES:
    CATEGORICAL_FEATURES.remove(HIGH_CARD_FEATURE)
    print(f"¬† ¬†- Removed {HIGH_CARD_FEATURE} from OHE list for Target Encoding later.")

# 4.2. Interaction Features (Critical for performance)
# Creating ratios that lenders likely use to assess risk.
if all(col in df_combined.columns for col in ['loan_amount', 'annual_income']):
    # Income stability against the size of the loan
    df_combined['loan_to_income_ratio'] = df_combined['loan_amount'] / (df_combined['annual_income'] + 1e-6)
    NUMERIC_FEATURES.append('loan_to_income_ratio')

if all(col in df_combined.columns for col in ['debt_to_income_ratio', 'annual_income']):
    # Estimate of absolute monthly debt payment
    df_combined['estimated_monthly_debt'] = df_combined['debt_to_income_ratio'] * df_combined['annual_income'] / 100 / 12
    NUMERIC_FEATURES.append('estimated_monthly_debt')

if all(col in df_combined.columns for col in ['interest_rate', 'credit_score']):
    # Combining risk factors
    df_combined['rate_x_score'] = df_combined['interest_rate'] * df_combined['credit_score']
    NUMERIC_FEATURES.append('rate_x_score')

# Update feature lists after engineering
NEW_NUMERIC_FEATURES = [col for col in df_combined.columns if col not in NUMERIC_FEATURES and col not in CATEGORICAL_FEATURES and col not in [TARGET_COLUMN, 'Source']]
NUMERIC_FEATURES.extend(NEW_NUMERIC_FEATURES)

CATEGORICAL_FEATURES = [col for col in CATEGORICAL_FEATURES if col in df_combined.columns]

# --- 5. ENCODING AND SCALING ---

# 5.1. One-Hot Encoding for remaining Categorical Features
df_combined = pd.get_dummies(df_combined, columns=CATEGORICAL_FEATURES, dummy_na=False)

# 5.2. Scaling Numerical Features
print("Scaling numerical features...")
# QuantileTransformer is excellent for boosting models as it transforms the data to a normal distribution
QT = QuantileTransformer(output_distribution='normal', n_quantiles=1000, subsample=50000)

features_to_scale = [col for col in NUMERIC_FEATURES if col in df_combined.columns and df_combined[col].dtype in ['float64', 'int64']]

# Apply scaling to the combined dataset
df_combined[features_to_scale] = QT.fit_transform(df_combined[features_to_scale])


# --- 6. DATA SPLIT AND MODELING (LightGBM with Stratified K-Fold Cross-Validation) ---

# Split combined data back into training and test sets
X = df_combined.loc[df_combined['Source'] == 'Train'].drop('Source', axis=1).reset_index(drop=True)
X_test = df_combined.loc[df_combined['Source'] == 'Test'].drop('Source', axis=1).reset_index(drop=True)
X_test_original = X_test.copy() # Keep a copy of X_test before target encoding/dropping

# Identify remaining features for training
FINAL_FEATURES = X.columns.tolist()

# Get the list of OHE columns (which are now boolean/int 0/1)
categorical_cols = [col for col in FINAL_FEATURES if any(cat in col for cat in CATEGORICAL_FEATURES)]
for col in categorical_cols:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category') # Apply category type to test set as well

print("\n4. Training LightGBM Model with Stratified K-Fold...")


# LightGBM Hyperparameters - Significantly optimized for performance and convergence
LGB_PARAMS = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 5000, # Max iterations, stopped by early stopping
    'learning_rate': 0.015, # ***CRITICAL FIX: Increased dramatically from 0.00001***
    'num_leaves': 60, # Increased model complexity
    'max_depth': 7, # Increased model complexity
    'seed': 42,
    'n_jobs': -1,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'reg_alpha': 0.1, # Added light regularization
    'reg_lambda': 0.1, # Added light regularization
    'verbose': -1,
    'min_child_samples': 20, # Reduced slightly from 25
    'scale_pos_weight': 1,
    
    # --- GPU CONFIGURATION (Use P100 for best speed) ---
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0
    # ----------------------------------------------------
}

N_SPLITS = 5
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test)) # Array to hold averaged test predictions
feature_importance_df = pd.DataFrame()
start_time = time.time()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"Fold {fold_+1}/{N_SPLITS}...")
    X_train_fold = X.iloc[trn_idx].copy()
    X_valid_fold = X.iloc[val_idx].copy()
    y_train = y.iloc[trn_idx]
    y_valid = y.iloc[val_idx]
    
    # Create a clean copy of the test data for this fold
    X_test_fold = X_test.copy()

    # --- Leak-Free Target Encoding for HIGH_CARD_FEATURE ---
    if HIGH_CARD_FEATURE in X_train_fold.columns:
        # Calculate target mean on training set
        target_map = y_train.groupby(X_train_fold[HIGH_CARD_FEATURE]).mean()
        
        # Apply mapping to train, validation, and test sets
        X_train_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'] = X_train_fold[HIGH_CARD_FEATURE].map(target_map)
        X_valid_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'] = X_valid_fold[HIGH_CARD_FEATURE].map(target_map)
        X_test_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'] = X_test_fold[HIGH_CARD_FEATURE].map(target_map)
        
        # Fill NaNs (if a category appears in validation/test but not train) with the global mean
        global_mean = y_train.mean()
        X_valid_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'] = X_valid_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'].fillna(global_mean)
        X_test_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'] = X_test_fold[f'{HIGH_CARD_FEATURE}_TargetEnc'].fillna(global_mean)

        # Drop the original categorical column from the fold data
        X_train_fold = X_train_fold.drop(columns=[HIGH_CARD_FEATURE])
        X_valid_fold = X_valid_fold.drop(columns=[HIGH_CARD_FEATURE])
        X_test_fold = X_test_fold.drop(columns=[HIGH_CARD_FEATURE])
    # -----------------------------------------------------

    model = lgb.LGBMClassifier(**LGB_PARAMS)
    
    model.fit(X_train_fold, y_train,
              eval_set=[(X_valid_fold, y_valid)],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(200, verbose=False)],
             )

    oof_preds[val_idx] = model.predict_proba(X_valid_fold)[:, 1]
    
    # Predict on the test set and accumulate (average) the predictions
    test_preds += model.predict_proba(X_test_fold)[:, 1] / N_SPLITS

    # Feature Importance for analysis
    fold_importance_df = pd.DataFrame({
        'feature': X_train_fold.columns, # Use training fold columns since they now include TargetEnc
        'importance': model.feature_importances_,
        'fold': fold_ + 1
    })
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    del X_train_fold, X_valid_fold, y_train, y_valid, X_test_fold
    
# --- 7. RESULTS & SUBMISSION ---

cv_auc_score = roc_auc_score(y, oof_preds)
time_taken = time.time() - start_time
print(f"\n--- Cross-Validation Results ---")
print(f"OOF AUC Score: {cv_auc_score:.6f}")
print(f"Total Training Time: {time_taken:.2f} seconds")

# Display top features
print("\nTop 20 Features by Importance:")
avg_importance = feature_importance_df.groupby('feature')['importance'].mean().sort_values(ascending=False).head(20)
print(avg_importance)

if cv_auc_score > 0.92:
    print("\n\t*** Congratulations! You are competitive with or exceeding the target score. ***")
else:
    print(f"\n\tCurrent OOF AUC Score: {cv_auc_score:.6f}. Further fine-tuning or feature engineering may be needed.")



1. Loading Data...
Train data loaded. Shape: (593994, 14)
Test data loaded. Shape: (254569, 13)
Combined data shape: (848563, 13)

2. Handling Missing Values and Cleaning...
¬† ¬†- Creating missing value indicator flags.

3. Feature Engineering...
¬† ¬†- Removed loan_purpose from OHE list for Target Encoding later.
Scaling numerical features...

4. Training LightGBM Model with Stratified K-Fold...
Fold 1/5...
Fold 2/5...
Fold 3/5...
Fold 4/5...
Fold 5/5...

--- Cross-Validation Results ---
OOF AUC Score: 0.922103
Total Training Time: 579.55 seconds

Top 20 Features by Importance:
feature
debt_to_income_ratio            27103.4
credit_score                    20274.4
loan_amount                     17213.8
annual_income                   15651.2
interest_rate                   15351.6
estimated_monthly_debt          14936.2
rate_x_score                    14689.8
loan_to_income_ratio            13854.8
grade_subgrade_numeric           7655.6
loan_purpose_TargetEnc           4718.4
educa

In [11]:
# --- 8. CREATE SUBMISSION FILE ---
submission_df = pd.DataFrame({
    ID_COLUMN: test_ids,
    TARGET_COLUMN: test_preds
})

submission_filename = 'submission_lgbm_optimized.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully. Test predictions are ready.")


Submission file 'submission_lgbm_optimized.csv' created successfully. Test predictions are ready.
