<a href="https://colab.research.google.com/github/sujansuprajn/Hackathons/blob/main/Loan_approval_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
from google.colab import drive

# Check if Google Drive is already mounted
if not os.path.exists('/content/drive/MyDrive/kaggle.json'):
    drive.mount('/content/drive')

# Setup Kaggle API key configuration
kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
if not os.path.exists(kaggle_path):
    os.makedirs(os.path.dirname(kaggle_path), exist_ok=True)
    !cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
else:
    print("Kaggle configuration already set up.")

Mounted at /content/drive


In [3]:
#dataset download
!kaggle competitions download -c playground-series-s4e10

#unzip
!unzip playground-series-s4e10.zip
!kaggle datasets download -d chilledwanker/loan-approval-prediction
!unzip loan-approval-prediction.zip

Downloading playground-series-s4e10.zip to /content
  0% 0.00/1.45M [00:00<?, ?B/s]
100% 1.45M/1.45M [00:00<00:00, 165MB/s]
Archive:  playground-series-s4e10.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
Dataset URL: https://www.kaggle.com/datasets/chilledwanker/loan-approval-prediction
License(s): apache-2.0
Downloading loan-approval-prediction.zip to /content
  0% 0.00/368k [00:00<?, ?B/s]
100% 368k/368k [00:00<00:00, 82.1MB/s]
Archive:  loan-approval-prediction.zip
  inflating: credit_risk_dataset.csv  


In [5]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler

In [6]:
df = pd.read_csv('train.csv', index_col='id')
X_test_full = pd.read_csv('test.csv', index_col='id')

In [7]:
original = pd.read_csv('credit_risk_dataset.csv')
X= pd.concat([df, original],ignore_index=True)
print("shape of the data :",X.shape)

shape of the data : (91226, 12)


In [8]:
#removing rows with missing taret
X.dropna(axis=0, subset=['loan_status'], inplace=True)
y = X.loan_status

X.drop(['loan_status','loan_amnt','cb_person_default_on_file','person_age'],axis=1, inplace=True)
X['loan_percent_income'] = X['loan_percent_income'] * 100

X_test_full.drop(['loan_amnt','cb_person_default_on_file','person_age'],axis=1, inplace=True)
X_test_full['loan_percent_income'] = X_test_full['loan_percent_income'] * 100

In [9]:
#break off validation data from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X,y, train_size = 0.8, test_size = 0.2, random_state=0)

# categorical cols with low cardinality
cat_cols = [cname for cname in X_train_full.columns
            if X_train_full[cname].dtype == 'object' and
            X_train_full[cname].nunique() < 10]

# numerical cols
num_cols = [cname for cname in X_train_full.columns
            if X_train_full[cname].dtype in ['int64','float64']]


# keeping only selected cols
my_cols = cat_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    X_valid[col] = X_valid[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [10]:
my_cols

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'person_income',
 'person_emp_length',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length']

PreProcessing With Pipeline

In [16]:
# preprocessing numerical data
numerical_transformer = SimpleImputer(strategy='median')

# preprocessing categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value = 'missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
   ])


model = XGBClassifier(
    max_depth=3,
    learning_rate=0.08683740317846539,
    n_estimators=961,
    min_child_weight=3,
    random_state=0,
    enable_categorical=True,
    gamma=0.5,
    subsample=0.9292131415430409,
    scale_pos_weight=6.971333166633001,
    early_stopping_rounds=10,
    colsample_bytree= 0.90987893726981,
    reg_alpha=0.8570404246557465,
    reg_lambda=5.995860925165919e-05)

#Best parameters: {'max_depth': 3, 'learning_rate': 0.08683740317846539, 'n_estimators': 961, 'subsample': 0.9292131415430409, 'colsample_bytree': 0.90987893726981, 'scale_pos_weight': 6.971333166633001, 'lambda': 5.995860925165919e-05, 'alpha': 0.8570404246557465}
#Best ROC AUC score: 0.9561823463359602
#earlier depth6, learning_rate 0.01, estimators 1000, scaloe pos wieth 5, alpha= 0.5

Predicting the Target Variable

In [17]:
# Bundling preprocessing and modeling in Pipeline
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
  ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train, model__verbose=False, model__eval_set=[(preprocessor.fit_transform(X_train), y_train)])


preds = my_pipeline.predict(X_valid)
preds_proba = my_pipeline.predict_proba(X_valid)[:, 1]

xgb_auc = roc_auc_score(y_valid, preds_proba)
print(f"AUC: {xgb_auc:.4f}")

print("Confusion Matrix:")
print(confusion_matrix(y_valid, preds))
print("\nClassification Report:")
print(classification_report(y_valid, preds))

AUC: 0.9536
Confusion Matrix:
[[13685  1483]
 [  411  2667]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.90      0.94     15168
           1       0.64      0.87      0.74      3078

    accuracy                           0.90     18246
   macro avg       0.81      0.88      0.84     18246
weighted avg       0.92      0.90      0.90     18246



In [18]:
# preprocessing test data and predicting
preds_test = my_pipeline.predict(X_test)
output = pd.DataFrame({'id':X_test.index,
                       'loan_status':preds_test})
output.to_csv('submission.csv', index = False)

!kaggle competitions submit -c playground-series-s4e10 -f submission.csv -m "Message"

100% 305k/305k [00:00<00:00, 711kB/s]
400 - Bad Request - Submission not allowed:  Your team has used its daily Submission allowance (5) today, please try again tomorrow UTC (7.4 hours from now).
