# Introduction

This pipeline includes data preprocessing with OneHotEncoder for categorical variables, OrdinalEncoder for ordinal features, and MinMaxScaler for feature scaling. 

The classifier used is XGBoost, configured for multi-class classification with hyperparameters tuned for optimal performance.

### Importing required libraries

In [10]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import (
    OrdinalEncoder,
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import time

In [2]:
# Loading data
data = pd.read_csv("credit_data.csv")
# data.head(5)
data.drop(columns=["Unnamed: 0"])
# data.shape

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2,Approved_Flag
0,0.000,0.00,0,0.000,0,0,0,4,1,4,...,0.000,0.0,1,0,Married,2,M,PL,PL,P2
1,0.000,0.00,0,0.000,0,0,0,0,0,1,...,0.000,0.0,0,0,Single,3,F,ConsumerLoan,ConsumerLoan,P2
2,0.125,0.00,0,0.000,1,0,0,0,2,6,...,0.000,0.0,1,0,Married,1,M,ConsumerLoan,others,P2
3,0.000,0.00,0,0.000,0,0,0,0,3,0,...,0.000,0.0,0,0,Married,4,M,AL,AL,P1
4,0.000,0.00,1,0.167,0,0,0,0,6,0,...,0.429,0.0,1,0,Married,2,M,ConsumerLoan,PL,P3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42059,0.333,0.00,0,0.000,0,0,0,0,0,3,...,0.000,0.0,0,0,Married,2,M,ConsumerLoan,ConsumerLoan,P4
42060,0.000,0.25,1,0.250,0,0,0,0,2,2,...,0.000,0.0,0,0,Married,1,M,others,others,P1
42061,0.500,0.50,1,0.500,0,0,0,0,0,2,...,1.000,0.0,0,0,Married,1,M,ConsumerLoan,others,P3
42062,0.000,0.00,1,0.500,0,0,0,0,0,2,...,0.000,0.0,0,0,Single,3,F,ConsumerLoan,others,P2


In [4]:
train_input, test_input, train_target, test_target = train_test_split(
    data.drop(columns=["Approved_Flag"]),
    data["Approved_Flag"],
    test_size=0.2,
    random_state=42)

### Model Pipeline

In [11]:
# Define transformers for each feature type
ohe_transformers = [
    (
        "ohe_martial_status",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"),
        ["MARITALSTATUS"],
    ),
    (
        "ohe_gender",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"),
        ["GENDER"],
    ),
    (
        "ohe_last_prod_enq2",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"),
        ["last_prod_enq2"],
    ),
    (
        "ohe_first_prod_enq2",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"),
        ["first_prod_enq2"],
    ),
]

education_encoder = LabelEncoder()

ordinal_transformers = [
    (
        "oe_education",
        OrdinalEncoder(
            # Now, categories are expected to be numerical
            categories=[np.arange(7)]
        ),
        ["EDUCATION"],
    ),
]


# Create ColumnTransformer
trf1 = ColumnTransformer(
    transformers=ohe_transformers + ordinal_transformers, remainder="passthrough"
)

trf1.get_params

In [12]:
# Scaling
trf2 = ColumnTransformer([("scale", MinMaxScaler(), slice(0, 51))])

In [13]:
# Label Encoder (for Categorical Targets Only)
le = LabelEncoder()
le.fit(train_target)

train_target = le.transform(train_target)
test_target = le.transform(test_target)

In [14]:
# Creating classifier using XGBOOST Algorithm
clf = xgb.XGBClassifier(
    objective="multi:softmax", num_class=4, eta=0.15, gamma=0.2, max_depth=5
)

pipe = Pipeline([("trf1", trf1), ("trf2", trf2), ("clf", clf)])
pipe.fit(train_input, train_target)


pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_martial_status',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['MARITALSTATUS']),
                                 ('ohe_gender',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['GENDER']),
                                 ('ohe_last_prod_enq2',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['last_prod_enq2']),
                 

### To check Underfitting

In [16]:
train_pred = pipe.predict(train_input)

In [17]:
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

In [18]:
accuracy_score(train_target,train_pred)

0.8228581617188196

### Model Accuracy

In [20]:
test_pred = pipe.predict(test_input)
test_pred

array([1, 3, 1, ..., 2, 1, 3], dtype=int32)

In [21]:
accuracy_score(test_target,test_pred)

0.7817663140377986

In [22]:
overall_f1_score = precision_recall_fscore_support(test_target,test_pred, average='weighted')[2]
print(f"Overall F1 Score: {overall_f1_score:.2f}")

Overall F1 Score: 0.76


# Conclusion

* The model achieved an accuracy of 78.18% on the test set, with an overall F1 score of 0.76, indicating moderate success in predicting loan approval.
