# Model Training

### 1.1 Import Data and Required Packages
Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import datetime as  datetime
import time
import warnings



Import the CSV Data as Pandas DataFrame

In [3]:
df=pd.read_csv('Data/raw.csv')

### 1.2 Show Top 5 Records

In [4]:
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


### 1.3 Dataset information


- ID: ID of each client
- LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
- SEX: Gender (1=male, 2=female)
- EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
- MARRIAGE: Marital status (1=married, 2=single, 3=others)
- AGE: Age in years
- PAY_0: Repayment status in September, 2005 (-2= no credit to pay , -1=pay - - - duly , 0= minimum payment is met , 1=payment delay for one month , 2=payment delay for two months , … 8=payment delay for eight months , 9=payment delay for nine months and above)
- PAY_2: Repayment status in August, 2005 (scale same as above)
- PAY_3: Repayment status in July, 2005 (scale same as above)
- PAY_4: Repayment status in June, 2005 (scale same as above)
- PAY_5: Repayment status in May, 2005 (scale same as above)
- PAY_6: Repayment status in April, 2005 (scale same as above)
- BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
- BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
- BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
- BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
- BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
- BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
- PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
- PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
- PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
- PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
- PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
- PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
- default.payment.next.month: Default payment (1=yes, 0=no)

### 1.4 Data Cleaning

In [5]:
df.drop('ID',axis=1,inplace=True)

In [6]:
df.rename(columns={'PAY_0':'PAY_1'}, inplace=True)
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [7]:
df['EDUCATION'] = df['EDUCATION'].replace({0: df['EDUCATION'].mode()[0], 6: df['EDUCATION'].mode()[0]})
df['MARRIAGE'] = df['MARRIAGE'].replace({0: df['MARRIAGE'].mode()[0]})



### 2. Preparing X and Y variables

In [8]:
X = df.drop(columns=['default_payment_next_month'],axis=1)

In [9]:
X.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0
2,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0
4,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0


In [10]:
# Get all column names
col_names = df.columns

# Loop through each column and display its unique values and value counts
for col in col_names:
    print(f"Column: {col}")
    print(f"Unique Values: {df[col].unique()}")
    print("-" * 50)  # Add a separator for better readability

Column: LIMIT_BAL
Unique Values: [ 20000.       120000.        90000.        50000.       500000.
 100000.       140000.       200000.       260000.       630000.
  70000.       250000.       320000.       360000.       180000.
 130000.       450000.        60000.       230000.       160000.
 280000.        10000.        40000.       210000.       150000.
 380000.       310000.       400000.        80000.       290000.
 340000.       300000.        30000.       240000.       470000.
 480000.       350000.       330000.       110000.       420000.
 170000.       370000.       270000.       220000.       190000.
 510000.       460000.       440000.       410000.       490000.
 390000.       580000.       600000.       620000.       610000.
 700000.       670000.       680000.       430000.       550000.
 540000.       999999.999999 530000.       710000.       560000.
 520000.       750000.       640000.        16000.       570000.
 590000.       660000.       720000.       327680.       

In [11]:
y = df['default_payment_next_month']

In [12]:
y

0        1
1        1
2        0
3        0
4        0
        ..
29995    0
29996    0
29997    1
29998    1
29999    1
Name: default_payment_next_month, Length: 30000, dtype: int64

In [13]:
##Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((21000, 23), (9000, 23))

In [14]:
X_train

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
28465,240000.0,2,1,1,40,-2,-2,-2,-2,-2,-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27622,50000.0,2,1,2,23,-1,-1,-1,-1,-1,-1,3430.0,2478.0,2299.0,4800.0,9810.0,660.0,2548.0,2321.0,4800.0,9810.0,660.0,2980.0
28376,50000.0,2,2,1,36,2,2,2,2,0,0,46203.0,45159.0,49125.0,47956.0,43578.0,35126.0,0.0,4700.0,0.0,2004.0,3500.0,0.0
10917,200000.0,2,3,1,54,6,5,4,3,2,2,110185.0,107665.0,104686.0,102549.0,101400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27234,240000.0,1,1,1,35,-1,-1,-1,0,-1,-1,2024.0,2007.0,21790.0,17102.0,13367.0,22659.0,2017.0,21817.0,1120.0,13434.0,22772.0,22820.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,50000.0,1,2,2,32,0,0,0,0,0,0,52475.0,53600.0,55739.0,55957.0,29238.0,6119.0,2000.0,3000.0,1591.0,72.0,1134.0,73421.0
5390,200000.0,1,1,2,37,2,2,2,2,2,2,157131.0,166590.0,168386.0,164182.0,169029.0,172084.0,13500.0,6000.0,0.0,7500.0,6000.0,4000.0
860,50000.0,1,1,2,26,-2,-2,-2,-2,-2,-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15795,70000.0,2,2,2,25,0,0,0,0,2,2,73939.0,70488.0,51152.0,35122.0,28633.0,28039.0,3000.0,2000.0,4500.0,1200.0,0.0,1200.0


In [15]:
X_test

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
2308,30000.0,1,2,2,25,0,0,0,0,0,0,8864.0,10062.0,11581.0,12580.0,13716.0,14828.0,1500.0,2000.0,1500.0,1500.0,1500.0,2000.0
22404,150000.0,2,1,2,26,0,0,0,0,0,0,136736.0,125651.0,116684.0,101581.0,77741.0,77264.0,4486.0,4235.0,3161.0,2647.0,2669.0,2669.0
23397,70000.0,2,3,1,32,0,0,0,0,0,0,70122.0,69080.0,68530.0,69753.0,70111.0,70212.0,2431.0,3112.0,3000.0,2438.0,2500.0,2554.0
25058,130000.0,1,3,2,49,0,0,0,0,0,-1,20678.0,18956.0,16172.0,16898.0,11236.0,6944.0,1610.0,1808.0,7014.0,27.0,7011.0,4408.0
2664,50000.0,2,2,2,36,0,0,0,0,0,2,94228.0,47635.0,42361.0,19574.0,20295.0,19439.0,2000.0,1500.0,1000.0,1800.0,0.0,1000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3941,410000.0,2,1,2,34,1,-1,-1,-2,-2,-1,0.0,13621.0,0.0,0.0,0.0,666.0,13621.0,0.0,0.0,0.0,666.0,0.0
17854,210000.0,1,1,2,27,0,0,0,0,0,0,42585.0,43304.0,45622.0,47232.0,47583.0,53032.0,8000.0,5000.0,4000.0,3000.0,8000.0,3000.0
95,90000.0,1,2,2,35,0,0,0,0,0,0,83725.0,85996.0,87653.0,35565.0,30942.0,30835.0,3621.0,3597.0,1179.0,1112.0,1104.0,1143.0
6279,220000.0,2,2,1,36,0,0,0,0,0,0,136556.0,139453.0,142295.0,145127.0,148159.0,151462.0,5100.0,5163.0,5196.0,5372.0,5761.0,5396.0


In [16]:

## Standardize the dataset
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [17]:
X_train=scaler.fit_transform(X_train)

In [18]:
X_test=scaler.transform(X_test)

In [19]:
X_train

array([[ 0.56391448,  0.80648893, -1.0976851 , ..., -0.30370385,
        -0.30669703, -0.29550752],
       [-0.90610925,  0.80648893, -1.0976851 , ...,  0.3050678 ,
        -0.26502896, -0.12727157],
       [-0.90610925,  0.80648893,  0.19981502, ..., -0.17934316,
        -0.08573001, -0.29550752],
       ...,
       [-0.90610925, -1.23994262, -1.0976851 , ..., -0.30370385,
        -0.30669703, -0.29550752],
       [-0.75136991,  0.80648893,  0.19981502, ..., -0.22923637,
        -0.30669703, -0.2277615 ],
       [-0.05504288,  0.80648893,  0.19981502, ..., -0.20131106,
         0.58979774, -0.210825  ]])

In [20]:
X_test

array([[-1.06084859, -1.23994262,  0.19981502, ..., -0.2106195 ,
        -0.21199688, -0.18259749],
       [-0.13241255,  0.80648893, -1.0976851 , ..., -0.139441  ,
        -0.13819389, -0.14482908],
       [-0.75136991,  0.80648893,  1.49731513, ..., -0.15241075,
        -0.14886344, -0.15132141],
       ...,
       [-0.59663057, -1.23994262,  0.19981502, ..., -0.23469732,
        -0.23699772, -0.23097944],
       [ 0.40917514,  0.80648893,  0.19981502, ...,  0.02966224,
         0.05701469,  0.00912376],
       [ 0.64128415,  0.80648893, -1.0976851 , ..., -0.03617943,
        -0.25518015,  0.22190272]])

### 3. Create an Evaluate Function to give all metrics after model Training

In [22]:
models = {
    "SVM": SVC(probability=True),  # Ensure SVM outputs probabilities for ROC-AUC
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric="logloss"), 
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}


def model_prediction(name, model):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        precision, recall, f_beta, support = precision_recall_fscore_support(y_test, y_pred, beta=2, pos_label=1, average='weighted', zero_division=1)
        auc = roc_auc_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model name : {name}")
        print("="*50)
        cm = confusion_matrix(y_test, y_pred)

        # Create a DataFrame with appropriate labels
        cdf = pd.DataFrame(
            cm,
            index=["Actual 0", "Actual 1"],   # Rows labeled as actual classes
            columns=["Predicted 0", "Predicted 1"]  # Columns labeled as predicted classes
        )

        # Display the confusion matrix DataFrame
        print(cdf)
        print("="*50)
        print(f"Accuracy is: {accuracy:.2f}")
        print(f"Precision is: {precision:.2f}")
        print(f"Recall is: {recall:.2f}")
        print(f"Fscore is: {f_beta:.2f}")
        print(f"AUC is: {auc:.2f}\n")
        print("\nClassification Report (Test):")
        print(classification_report(y_test, y_pred))

for name, model in models.items():
    model_prediction(name, model)

Model name : SVM
          Predicted 0  Predicted 1
Actual 0         6724          316
Actual 1         1316          644
Accuracy is: 0.82
Precision is: 0.80
Recall is: 0.82
Fscore is: 0.81
AUC is: 0.64


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.84      0.96      0.89      7040
           1       0.67      0.33      0.44      1960

    accuracy                           0.82      9000
   macro avg       0.75      0.64      0.67      9000
weighted avg       0.80      0.82      0.79      9000

Model name : Decision Tree Classifier
          Predicted 0  Predicted 1
Actual 0         5748         1292
Actual 1         1150          810
Accuracy is: 0.73
Precision is: 0.74
Recall is: 0.73
Fscore is: 0.73
AUC is: 0.61


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      7040
           1       0.39      0.41      0.40      1960

    accuracy    

Parameters: { "use_label_encoder" } are not used.



Model name : XGBClassifier
          Predicted 0  Predicted 1
Actual 0         6580          460
Actual 1         1254          706
Accuracy is: 0.81
Precision is: 0.79
Recall is: 0.81
Fscore is: 0.80
AUC is: 0.65


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      7040
           1       0.61      0.36      0.45      1960

    accuracy                           0.81      9000
   macro avg       0.72      0.65      0.67      9000
weighted avg       0.79      0.81      0.79      9000

Model name : CatBoost Classifier
          Predicted 0  Predicted 1
Actual 0         6653          387
Actual 1         1242          718
Accuracy is: 0.82
Precision is: 0.80
Recall is: 0.82
Fscore is: 0.81
AUC is: 0.66


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      7040
           1       0.65      0.37      0.47      1960

    accurac



Model name : AdaBoost Classifier
          Predicted 0  Predicted 1
Actual 0         6752          288
Actual 1         1363          597
Accuracy is: 0.82
Precision is: 0.80
Recall is: 0.82
Fscore is: 0.80
AUC is: 0.63


Classification Report (Test):
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      7040
           1       0.67      0.30      0.42      1960

    accuracy                           0.82      9000
   macro avg       0.75      0.63      0.66      9000
weighted avg       0.80      0.82      0.79      9000



### 4. Results 

In this case we will consider recall for evaluating the credit card prediction model as under recall we evaluate out of all the predicted values how many are correctly predicted

As Random Forest & XGBoost are having same recall ,so we can consider any 1 of them.So will we considering Random Forest for Model Prediction

In [23]:
rf = RandomForestClassifier()
rf

In [24]:
rf.fit(X_train, y_train)

In [25]:
y_pred = rf.predict(X_test)

In [26]:
accuracy_score(y_test, y_pred)

0.8154444444444444

In [27]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
2308,0,0,0
22404,0,0,0
23397,0,0,0
25058,0,0,0
2664,1,0,1
...,...,...,...
3941,1,0,1
17854,0,0,0
95,0,0,0
6279,0,0,0


In [28]:
params = {'max_depth' : [1, 2, 3,4, 10, 20, 30],
         'n_estimators': [50, 100, 200, 300, 400, 1000],
         'criterion': ["gini", 'entropy']}

In [29]:
clf = GridSearchCV(rf, param_grid=params, cv = 5, scoring = "accuracy", verbose=3)

In [30]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.782 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.780 total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.777 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.793 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=50;, score=0.781 total time=   0.2s
[CV 1/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.785 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.784 total time=   0.5s
[CV 3/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.794 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.782 total time=   0.6s
[CV 5/5] END criterion=gini, max_depth=1, n_estimators=100;, score=0.785 total time=   0.5s
[CV 1/5] END criterion=

In [31]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}

In [32]:
clf.best_score_

0.8196190476190477

In [33]:
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame with appropriate labels
cdf = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],   # Rows labeled as actual classes
    columns=["Predicted 0", "Predicted 1"]  # Columns labeled as predicted classes
)

# Display the confusion matrix DataFrame
print(cdf)

          Predicted 0  Predicted 1
Actual 0         6626          414
Actual 1         1247          713


In [34]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
2308,0,0,0
22404,0,0,0
23397,0,0,0
25058,0,0,0
2664,1,0,1
...,...,...,...
3941,1,0,1
17854,0,0,0
95,0,0,0
6279,0,0,0
