<a href="https://www.kaggle.com/code/samithsachidanandan/predicting-loan-payback-xgboost?scriptVersionId=273087149" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Import Libraries 

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from pandas.plotting import scatter_matrix
from sklearn import set_config
set_config(display='diagram')


from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.model_selection import train_test_split




### Loading the Data 

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

### Basic Statistics

In [3]:
train.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
test.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


In [5]:
train.shape

(593994, 13)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [7]:
train.dtypes

id                        int64
annual_income           float64
debt_to_income_ratio    float64
credit_score              int64
loan_amount             float64
interest_rate           float64
gender                   object
marital_status           object
education_level          object
employment_status        object
loan_purpose             object
grade_subgrade           object
loan_paid_back          float64
dtype: object

In [8]:
print("Target column statistics (loan_paid_back):")

train['loan_paid_back'].describe()

Target column statistics (loan_paid_back):


count    593994.000000
mean          0.798820
std           0.400883
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: loan_paid_back, dtype: float64

In [9]:
train.isnull().sum()

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [10]:
print("Duplicated Rows:",train.duplicated().sum())

Duplicated Rows: 0


In [11]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,593994.0,296996.5,171471.442235,0.0,148498.25,296996.5,445494.75,593993.0
annual_income,593994.0,48212.202976,26711.942078,6002.43,27934.4,46557.68,60981.32,393381.74
debt_to_income_ratio,593994.0,0.120696,0.068573,0.011,0.072,0.096,0.156,0.627
credit_score,593994.0,680.916009,55.424956,395.0,646.0,682.0,719.0,849.0
loan_amount,593994.0,15020.297629,6926.530568,500.09,10279.62,15000.22,18858.58,48959.95
interest_rate,593994.0,12.356345,2.008959,3.2,10.99,12.37,13.68,20.99
loan_paid_back,593994.0,0.79882,0.400883,0.0,1.0,1.0,1.0,1.0


### Removing the outliers

#### Z-score Method

In [12]:
upper_limit_credit_score = train['credit_score'].mean() + 3 *train['credit_score'].std()
lower_limit_credit_score = train['credit_score'].mean() - 3 *train['credit_score'].std()
upper_limit_interest_rate = train['interest_rate'].mean() + 3 *train['interest_rate'].std()
lower_limit_interest_rate = train['interest_rate'].mean() - 3 *train['interest_rate'].std()

In [13]:
train['credit_score'] = np.where(train['credit_score'] > upper_limit_credit_score,upper_limit_credit_score,np.where(train['credit_score']<lower_limit_credit_score, lower_limit_credit_score,train['credit_score']))

In [14]:
train['interest_rate'] = np.where(train['interest_rate'] > upper_limit_interest_rate,upper_limit_interest_rate,np.where(train['interest_rate']<lower_limit_interest_rate, lower_limit_interest_rate,train['interest_rate']))

#### IQR Method 

In [15]:
train['annual_income'].skew()

1.719508683164502

In [16]:
train['debt_to_income_ratio'].skew()

1.4066799385489641

In [17]:
train['loan_amount'].skew()

0.2073598200202166

In [18]:
features = [ 'annual_income','debt_to_income_ratio','loan_amount' ]



for feature in features:
    Q1 = train[feature].quantile(0.25)
    Q3 = train[feature].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    
    train[feature] = np.where(train[feature] >upper, upper, np.where(train[feature]<lower,lower, train[feature]) )

### Feature Engineering 

In [19]:
def engineer_features(df):
    
    df = df.copy()
    
    df['monthly_income'] = df['annual_income'] / 12
    df['monthly_payment'] = (df['loan_amount'] * df['interest_rate'] / 100) / 12
    df['payment_to_income_ratio'] = df['monthly_payment'] / df['monthly_income']
    
    
    df['total_debt'] = df['loan_amount'] * df['debt_to_income_ratio']
    df['monthly_debt'] = df['total_debt'] / 12
    df['remaining_income'] = df['monthly_income'] - df['monthly_debt']
    
    
    df['credit_efficiency'] = df['credit_score'] / df['debt_to_income_ratio']
    df['loan_to_income_ratio'] = df['loan_amount'] / df['annual_income']
    
    
    df['income_bracket'] = pd.cut(df['annual_income'], 
                                   bins=[0, 25000, 50000, 75000, 100000, np.inf],
                                   labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
    
    
    df['credit_category'] = pd.cut(df['credit_score'],
                                    bins=[0, 580, 670, 740, 800, np.inf],
                                    labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'])
    
    
    df['loan_size'] = pd.cut(df['loan_amount'],
                              bins=[0, 5000, 10000, 20000, np.inf],
                              labels=['Small', 'Medium', 'Large', 'Very Large'])
    
    
    df['rate_category'] = pd.cut(df['interest_rate'],
                                  bins=[0, 10, 15, 20, np.inf],
                                  labels=['Low', 'Medium', 'High', 'Very High'])
    
    
    df['risk_score'] = (df['debt_to_income_ratio'] * df['interest_rate']) / df['credit_score']
    df['income_credit_interaction'] = df['annual_income'] * df['credit_score']
    df['debt_credit_interaction'] = df['debt_to_income_ratio'] * df['credit_score']
    

    df['credit_score_squared'] = df['credit_score'] ** 2
    df['debt_ratio_squared'] = df['debt_to_income_ratio'] ** 2
    df['income_log'] = np.log1p(df['annual_income'])
    df['loan_amount_log'] = np.log1p(df['loan_amount'])
    
   
    df['gender_marital'] = df['gender'] + '_' + df['marital_status']
    df['education_employment'] = df['education_level'] + '_' + df['employment_status']
    
   
    df['high_risk_flag'] = ((df['debt_to_income_ratio'] > 0.4) | 
                            (df['credit_score'] < 650) | 
                            (df['interest_rate'] > 15)).astype(int)
    
    df['excellent_credit_flag'] = (df['credit_score'] >= 750).astype(int)
    df['high_income_flag'] = (df['annual_income'] >= 50000).astype(int)
    df['has_advanced_degree'] = (df['education_level'] == "Master's").astype(int)
    
 
    gender_income_mean = df.groupby('gender')['annual_income'].transform('mean')
    df['income_vs_gender_avg'] = df['annual_income'] / gender_income_mean
    

    edu_income_mean = df.groupby('education_level')['annual_income'].transform('mean')
    df['income_vs_edu_avg'] = df['annual_income'] / edu_income_mean
    
  
        
    return df


In [20]:
train_df = engineer_features(train)



In [21]:
test_df = engineer_features(test)

In [22]:
numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_cols = [col for col in numeric_cols if col != 'loan_paid_back']


categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()



print("*"*180)
print("Numeric:", numeric_cols)

print("*"*180)

print("Categorical:", categorical_cols)
print("*"*180)



************************************************************************************************************************************************************************************
Numeric: ['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'monthly_income', 'monthly_payment', 'payment_to_income_ratio', 'total_debt', 'monthly_debt', 'remaining_income', 'credit_efficiency', 'loan_to_income_ratio', 'risk_score', 'income_credit_interaction', 'debt_credit_interaction', 'credit_score_squared', 'debt_ratio_squared', 'income_log', 'loan_amount_log', 'high_risk_flag', 'excellent_credit_flag', 'high_income_flag', 'has_advanced_degree', 'income_vs_gender_avg', 'income_vs_edu_avg']
************************************************************************************************************************************************************************************
Categorical: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpos

### Preprocesing and Scaling Pipeline

In [23]:
train_df['education_employment'].value_counts()

education_employment
Bachelor's_Employed          210779
High School_Employed         140074
Master's_Employed             70970
Bachelor's_Unemployed         31178
Bachelor's_Self-employed      24441
Other_Employed                20361
High School_Unemployed        18148
High School_Self-employed     16238
Master's_Unemployed            9513
PhD_Employed                   8461
Master's_Self-employed         8410
Bachelor's_Retired             7461
Bachelor's_Student             5747
High School_Retired            5494
High School_Student            3638
Other_Unemployed               2678
Other_Self-employed            2400
Master's_Retired               2319
Master's_Student               1885
PhD_Self-employed               991
PhD_Unemployed                  968
Other_Retired                   757
Other_Student                   481
PhD_Retired                     422
PhD_Student                     180
Name: count, dtype: int64

In [24]:
y_train = train_df['loan_paid_back']
X_train = train_df.drop('loan_paid_back', axis=1)

X_test = test_df.copy() 

In [25]:
preprocessor = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols),
    ('scale', MinMaxScaler(), numeric_cols)
])

### Model Training 

In [26]:
xgb_params = {
    'n_estimators': 10000,
    'max_depth': 4,
    'learning_rate': 0.010433357477511243,
    'tree_method': 'hist',
    'device': 'cuda',
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'random_state': 42,
    'min_child_weight': 20,
    'subsample': 0.8879829126651821,
    'colsample_bytree': 0.5543148418738543,
    'gamma': 0.6845363006652688,
    'reg_alpha': 0.2399421158144976,
    'reg_lambda': 0.28254661049782354,
    'enable_categorical': True,
    
}

In [27]:


X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

xgb_pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', xgb.XGBClassifier(**xgb_params))
])
xgb_pipeline.fit(X_train_split, y_train_split)



### Evaluate on validation set

In [28]:

y_pred_proba_val = xgb_pipeline.predict_proba(X_val)
roc_auc = roc_auc_score(y_val, y_pred_proba_val[:, 1])
print(f"Validation ROC AUC Score: {roc_auc:.4f}")



Validation ROC AUC Score: 0.9205


### Full data training for final predictions

In [29]:

xgb_pipeline.fit(X_train, y_train)
y_pred_xgb_final = xgb_pipeline.predict(X_test)
y_pred_proba_xgb = xgb_pipeline.predict_proba(X_test)

In [30]:
submission = submission.copy()
submission['loan_paid_back'] = y_pred_xgb_final

submission.to_csv('submission.csv', index=False)
print("\n Submission saved to 'submission.csv'")


 Submission saved to 'submission.csv'
