<a href="https://www.kaggle.com/code/samithsachidanandan/predicting-loan-payback-xgboost?scriptVersionId=273133384" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb




### Loading the Data 

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

In [3]:
train_ids = train['id']
test_ids = test['id']

In [4]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

### Basic Statistics

In [5]:
train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [6]:
test.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


In [7]:
train.shape

(593994, 12)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   annual_income         593994 non-null  float64
 1   debt_to_income_ratio  593994 non-null  float64
 2   credit_score          593994 non-null  int64  
 3   loan_amount           593994 non-null  float64
 4   interest_rate         593994 non-null  float64
 5   gender                593994 non-null  object 
 6   marital_status        593994 non-null  object 
 7   education_level       593994 non-null  object 
 8   employment_status     593994 non-null  object 
 9   loan_purpose          593994 non-null  object 
 10  grade_subgrade        593994 non-null  object 
 11  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 54.4+ MB


In [9]:
train.dtypes

annual_income           float64
debt_to_income_ratio    float64
credit_score              int64
loan_amount             float64
interest_rate           float64
gender                   object
marital_status           object
education_level          object
employment_status        object
loan_purpose             object
grade_subgrade           object
loan_paid_back          float64
dtype: object

In [10]:
print("Target column statistics (loan_paid_back):")

train['loan_paid_back'].describe()

Target column statistics (loan_paid_back):


count    593994.000000
mean          0.798820
std           0.400883
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: loan_paid_back, dtype: float64

In [11]:
train.isnull().sum()

annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [12]:
print("Duplicated Rows:",train.duplicated().sum())

Duplicated Rows: 0


In [13]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annual_income,593994.0,48212.202976,26711.942078,6002.43,27934.4,46557.68,60981.32,393381.74
debt_to_income_ratio,593994.0,0.120696,0.068573,0.011,0.072,0.096,0.156,0.627
credit_score,593994.0,680.916009,55.424956,395.0,646.0,682.0,719.0,849.0
loan_amount,593994.0,15020.297629,6926.530568,500.09,10279.62,15000.22,18858.58,48959.95
interest_rate,593994.0,12.356345,2.008959,3.2,10.99,12.37,13.68,20.99
loan_paid_back,593994.0,0.79882,0.400883,0.0,1.0,1.0,1.0,1.0


### Removing the outliers

In [14]:
def remove_outliers(train_df, test_df=None):
  
    train_df = train_df.copy()
    
   
    credit_upper = train_df['credit_score'].mean() + 3 * train_df['credit_score'].std()
    credit_lower = train_df['credit_score'].mean() - 3 * train_df['credit_score'].std()
    rate_upper = train_df['interest_rate'].mean() + 3 * train_df['interest_rate'].std()
    rate_lower = train_df['interest_rate'].mean() - 3 * train_df['interest_rate'].std()
    
  
    train_df['credit_score'] = np.clip(train_df['credit_score'], credit_lower, credit_upper)
    train_df['interest_rate'] = np.clip(train_df['interest_rate'], rate_lower, rate_upper)
    

    features = ['annual_income', 'debt_to_income_ratio', 'loan_amount']
    limits = {}
    
    for feature in features:
        Q1 = train_df[feature].quantile(0.25)
        Q3 = train_df[feature].quantile(0.75)
        IQR = Q3 - Q1
        limits[feature] = {
            'lower': Q1 - 1.5 * IQR,
            'upper': Q3 + 1.5 * IQR
        }
        train_df[feature] = np.clip(train_df[feature], limits[feature]['lower'], limits[feature]['upper'])
    

    if test_df is not None:
        test_df = test_df.copy()
        test_df['credit_score'] = np.clip(test_df['credit_score'], credit_lower, credit_upper)
        test_df['interest_rate'] = np.clip(test_df['interest_rate'], rate_lower, rate_upper)
        
        for feature in features:
            test_df[feature] = np.clip(test_df[feature], limits[feature]['lower'], limits[feature]['upper'])
        
        return train_df, test_df
    
    return train_df







### Feature Engineering 

In [15]:
def engineer_features(df):
   
    df = df.copy()
    

    df['monthly_income'] = df['annual_income'] / 12
    df['monthly_payment'] = (df['loan_amount'] * df['interest_rate'] / 100) / 12
    df['payment_to_income_ratio'] = df['monthly_payment'] / df['monthly_income']
    
    df['total_debt'] = df['loan_amount'] * df['debt_to_income_ratio']
    df['monthly_debt'] = df['total_debt'] / 12
    df['remaining_income'] = df['monthly_income'] - df['monthly_debt']
    
    df['credit_efficiency'] = df['credit_score'] / (df['debt_to_income_ratio'] + 0.001)
    df['loan_to_income_ratio'] = df['loan_amount'] / df['annual_income']
    
  
    df['risk_score'] = (df['debt_to_income_ratio'] * df['interest_rate']) / (df['credit_score'] + 1)
    
 
    df['income_credit_interaction'] = df['annual_income'] * df['credit_score']
    df['debt_credit_interaction'] = df['debt_to_income_ratio'] * df['credit_score']
    
   
    df['credit_score_squared'] = df['credit_score'] ** 2
    df['debt_ratio_squared'] = df['debt_to_income_ratio'] ** 2
    df['income_log'] = np.log1p(df['annual_income'])
    df['loan_amount_log'] = np.log1p(df['loan_amount'])
    

    df['gender_marital'] = df['gender'] + '_' + df['marital_status']
    df['education_employment'] = df['education_level'] + '_' + df['employment_status']
    

    df['high_risk_flag'] = ((df['debt_to_income_ratio'] > 0.4) | 
                            (df['credit_score'] < 650) | 
                            (df['interest_rate'] > 15)).astype(int)
    
    df['excellent_credit_flag'] = (df['credit_score'] >= 750).astype(int)
    df['high_income_flag'] = (df['annual_income'] >= 50000).astype(int)
    df['has_advanced_degree'] = (df['education_level'].isin(["Master's", "PhD"])).astype(int)
    
    return df


In [16]:
train, test = remove_outliers(train, test)

In [17]:
train.columns

Index(['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
       'interest_rate', 'gender', 'marital_status', 'education_level',
       'employment_status', 'loan_purpose', 'grade_subgrade',
       'loan_paid_back'],
      dtype='object')

In [18]:
train_df = engineer_features(train)
test_df = engineer_features(test)

### Preprocesing and Scaling Pipeline

In [19]:
y_train = train_df['loan_paid_back']
X_train = train_df.drop('loan_paid_back', axis=1)

X_test = test_df.copy()

#### Dropping the temporary columns

In [20]:
cols_to_drop = [col for col in X_train.columns if col.startswith('_')]
if cols_to_drop:
    X_train = X_train.drop(columns=cols_to_drop)
    X_test = X_test.drop(columns=cols_to_drop)
print(f"Dropped temporary columns: {cols_to_drop}")

Dropped temporary columns: []


In [21]:
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()


categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()



print("*"*180)
print("Numeric:", numeric_cols)

print("*"*180)

print("Categorical:", categorical_cols)
print("*"*180)



************************************************************************************************************************************************************************************
Numeric: ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'monthly_income', 'monthly_payment', 'payment_to_income_ratio', 'total_debt', 'monthly_debt', 'remaining_income', 'credit_efficiency', 'loan_to_income_ratio', 'risk_score', 'income_credit_interaction', 'debt_credit_interaction', 'credit_score_squared', 'debt_ratio_squared', 'income_log', 'loan_amount_log', 'high_risk_flag', 'excellent_credit_flag', 'high_income_flag', 'has_advanced_degree']
************************************************************************************************************************************************************************************
Categorical: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade', 'gender_marital', 'education_

In [22]:
preprocessor = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols),
    ('scale', MinMaxScaler(), numeric_cols)
])

### Model Training 

#### XGBoost parameters

In [23]:
# xgb_params = {
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
#     'tree_method': 'hist',
#     'device': 'cuda',
#     'random_state': 42,
#     'learning_rate': 0.010433357477511243,
#     'n_estimators': 20000,
#     'max_depth': 4,
#     'min_child_weight': 20,
#     'gamma': 0.6845363006652688,
#     'subsample': 0.8879829126651821,
#     'colsample_bytree': 0.5543148418738543,
#     'reg_alpha': 0.2399421158144976,
#     'reg_lambda': 0.28254661049782354,
#     'enable_categorical': False,  
# }



xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'device': 'cuda',
    'random_state': 42,
    'learning_rate': 0.010433357477511243,
    'n_estimators': 20000,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_jobs': -1,
    'enable_categorical': False,
    'scale_pos_weight': 0.8,
    "min_samples_split": 5,
    'lambda': 5.0, 
    'alpha': 2.5,
    'max_bin': 512
}

In [24]:
# xgb_params = {
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
#     'tree_method': 'hist',
#     'device': 'cuda',
#     'random_state': 42,  
#     'learning_rate': 0.01,
#     'n_estimators': 10000,                 
#     'max_depth': 3,                        
#     'min_child_weight': 8,                 
#     'gamma': 0.2,                          
#     'subsample': 0.95,
#     'colsample_bytree': 0.8,
#     'colsample_bylevel': 0.8,
#     'colsample_bynode': 0.8,
#     'reg_alpha': 1.0,
#     'reg_lambda': 3.0,
#     'max_leaves': 32,
#     'enable_categorical': False,
    
# }





#### Cross-validation

In [25]:

print("Performing 5-fold cross-validation")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
best_iterations = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_tr_raw, X_val_raw = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]


    X_tr = preprocessor.fit_transform(X_tr_raw)
    X_val = preprocessor.transform(X_val_raw)


    model = xgb.XGBClassifier(**xgb_params)


    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=300,
        verbose=False
    )

    best_iter = model.best_iteration
    best_iterations.append(best_iter)

    y_pred = model.predict_proba(X_val,iteration_range=(0, best_iter + 1))[:, 1]
    score = roc_auc_score(y_val, y_pred)
    cv_scores.append(score)

    print(f"Fold {fold} | Best Trees: {best_iter} | AUC: {score:.5f}")

print("\nMean CV ROC AUC:", np.mean(cv_scores))
print("Std Dev:", np.std(cv_scores))
print("Avg Best Trees:", np.mean(best_iterations))


Performing 5-fold cross-validation
Fold 1 | Best Trees: 3845 | AUC: 0.92375
Fold 2 | Best Trees: 4946 | AUC: 0.92358
Fold 3 | Best Trees: 4407 | AUC: 0.92196
Fold 4 | Best Trees: 4235 | AUC: 0.92270
Fold 5 | Best Trees: 4033 | AUC: 0.92217

Mean CV ROC AUC: 0.9228331528482296
Std Dev: 0.0007232279890443546
Avg Best Trees: 4293.2


### Full data training for final predictions

In [26]:
print(" Training final model on full training set")

optimal_trees = int(np.mean(best_iterations))


final_params = xgb_params.copy()
final_params['n_estimators'] = optimal_trees

final_model = xgb.XGBClassifier(**final_params)


final_pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', final_model)
])

final_pipeline.fit(X_train, y_train)

print("Final model trained with", optimal_trees, "trees")

 Training final model on full training set
Final model trained with 4293 trees


###  Make predictions

In [27]:

y_pred_proba = final_pipeline.predict_proba(X_test)[:, 1]

### Submission 

In [28]:
submission['loan_paid_back'] = y_pred_proba
submission.to_csv('submission.csv', index=False)

print("Submission saved to 'submission.csv'")

print(f"Prediction range: [{y_pred_proba.min():.4f}, {y_pred_proba.max():.4f}]")

Submission saved to 'submission.csv'
Prediction range: [0.0003, 0.9998]
