# Credit Risk Modelling —

In [43]:
!pip install imbalanced-learn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve

import warnings
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip




## Load the dataset

In [44]:
data = pd.read_csv('loan_detection.csv')
data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,Loan_Status_label
0,56,1,999,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
1,57,1,999,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,37,1,999,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,40,1,999,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,56,1,999,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


## Basic EDA

In [4]:
print('Data shape:', data.shape)
print('Dtypes summary:\n', data.dtypes.value_counts())
data.info()
data.describe()


Data shape: (41188, 60)
Dtypes summary:
 int64    60
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 60 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age                            41188 non-null  int64
 1   campaign                       41188 non-null  int64
 2   pdays                          41188 non-null  int64
 3   previous                       41188 non-null  int64
 4   no_previous_contact            41188 non-null  int64
 5   not_working                    41188 non-null  int64
 6   job_admin.                     41188 non-null  int64
 7   job_blue-collar                41188 non-null  int64
 8   job_entrepreneur               41188 non-null  int64
 9   job_housemaid                  41188 non-null  int64
 10  job_management                 41188 non-null  int64
 11  job_retired                    41188 non-null  int64


Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,Loan_Status_label
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,...,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,2.567593,962.475454,0.172963,0.963217,0.087623,0.253035,0.224677,0.03535,0.025736,...,0.013839,0.190031,0.206711,0.209357,0.196416,0.197485,0.103234,0.863431,0.033335,0.112654
std,10.42125,2.770014,186.910907,0.494901,0.18823,0.282749,0.434756,0.417375,0.184665,0.158348,...,0.116824,0.39233,0.404951,0.406855,0.397292,0.398106,0.304268,0.343396,0.179512,0.316173
min,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,1.0,999.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,38.0,2.0,999.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,47.0,3.0,999.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,98.0,56.0,999.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
print("Missing Values:\n", data.isnull().sum())

Missing Values:
 age                              0
campaign                         0
pdays                            0
previous                         0
no_previous_contact              0
not_working                      0
job_admin.                       0
job_blue-collar                  0
job_entrepreneur                 0
job_housemaid                    0
job_management                   0
job_retired                      0
job_self-employed                0
job_services                     0
job_student                      0
job_technician                   0
job_unemployed                   0
job_unknown                      0
marital_divorced                 0
marital_married                  0
marital_single                   0
marital_unknown                  0
education_basic.4y               0
education_basic.6y               0
education_basic.9y               0
education_high.school            0
education_illiterate             0
education_professional.course    0
edu

In [45]:
print("\nDuplicates:", data.duplicated().sum())
data = data.drop_duplicates()
print("After removing duplicates, new shape:", data.shape)



Duplicates: 2417
After removing duplicates, new shape: (38771, 60)


In [46]:
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

target_col = 'Loan_Status_label'
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

print('Numeric cols:', len(numeric_cols))
print('Categorical cols:', len(cat_cols))


Numeric cols: 59
Categorical cols: 0


In [47]:
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
Q1 = data[numeric_cols].quantile(0.25)
Q3 = data[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ((data[numeric_cols] < (Q1 - 1.5 * IQR)) | (data[numeric_cols] > (Q3 + 1.5 * IQR)))
outlier_counts = outlier_mask.sum().sort_values(ascending=False)
print('Top numeric columns by outlier count:')
print(outlier_counts[outlier_counts>0].head(15))

Top numeric columns by outlier count:
job_admin.               9635
education_high.school    9007
job_blue-collar          8669
default_no               8181
default_unknown          8178
day_of_week_mon          8108
day_of_week_thu          8056
day_of_week_wed          7607
day_of_week_tue          7570
day_of_week_fri          7430
loan_no                  7133
month_jul                6626
job_technician           6273
loan_yes                 6154
education_basic.9y       5687
dtype: int64


In [48]:
print("Class Distribution in Target Column:")
print(data["Loan_Status_label"].value_counts())

Class Distribution in Target Column:
Loan_Status_label
0    34179
1     4592
Name: count, dtype: int64


##  Split features and target

In [49]:
X = data.drop(columns=[target_col])
y = data[target_col]

In [50]:
X
y

0        0
1        0
2        0
3        0
4        0
        ..
41183    1
41184    0
41185    0
41186    1
41187    0
Name: Loan_Status_label, Length: 38771, dtype: int64

##  Train-test split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('Train:', X_train.shape, 'Test:', X_test.shape)

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)


Train: (31016, 59) Test: (7755, 59)


In [52]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE only on training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_sc, y_train)

print("Before SMOTE:", X_train_sc.shape, y_train.value_counts().to_dict())
print("After SMOTE:", X_train_res.shape, y_train_res.value_counts().to_dict())


Before SMOTE: (31016, 59) {0: 27342, 1: 3674}
After SMOTE: (54684, 59) {0: 27342, 1: 27342}


## Logistic Regression

In [53]:
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

lr_pred_train = lr.predict(X_train_res)
lr_pred_test = lr.predict(X_test_sc)

In [54]:
print(confusion_matrix(y_train_res, lr_pred_train))
print()
print(accuracy_score(y_train_res, lr_pred_train))

[[14998 12344]
 [ 6309 21033]]

0.6588947406919757


In [55]:
print(confusion_matrix(y_test, lr_pred_test))
print()
print(accuracy_score(y_test, lr_pred_test))

[[3720 3117]
 [ 193  725]]

0.5731785944551901


In [56]:
print("ROC-AUC:", roc_auc_score(y_test, lr.predict_proba(X_test)[:,1]))

ROC-AUC: 0.7701457340123249


## Random forest

In [57]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred_train = rf.predict(X_train_res)
rf_pred_test = rf.predict(X_test_sc)

In [58]:
print(confusion_matrix(y_train_res, rf_pred_train))
print()
print(accuracy_score(y_train_res, rf_pred_train))

[[20716  6626]
 [12593 14749]]

0.648544363982152


In [59]:
print(confusion_matrix(y_test, rf_pred_test))
print()
print(accuracy_score(y_test, rf_pred_test))
print("\nClassification Report (Test):\n", classification_report(y_test, rf_pred_test))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))

[[5064 1773]
 [ 452  466]]

0.7130883301096067

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.92      0.74      0.82      6837
           1       0.21      0.51      0.30       918

    accuracy                           0.71      7755
   macro avg       0.56      0.62      0.56      7755
weighted avg       0.83      0.71      0.76      7755

ROC-AUC: 0.7357748098182929


## Decision Tree

In [60]:
dt = DecisionTreeClassifier(
    max_depth=None,       
    random_state=42
)
dt.fit(X_train_res, y_train_res)

y_pred_train = dt.predict(X_train_sc)
y_pred_test = dt.predict(X_test_sc)

print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

print("\nConfusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test))
print("\nClassification Report (Test):\n", classification_report(y_test, y_pred_test))
print("ROC-AUC:", roc_auc_score(y_test, dt.predict_proba(X_test)[:,1]))

Train Accuracy: 0.9928101624967759
Test Accuracy: 0.8233397807865893

Confusion Matrix (Test):
 [[6097  740]
 [ 630  288]]

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.91      0.89      0.90      6837
           1       0.28      0.31      0.30       918

    accuracy                           0.82      7755
   macro avg       0.59      0.60      0.60      7755
weighted avg       0.83      0.82      0.83      7755

ROC-AUC: 0.4191356909396297


## AdaBoost

In [62]:
ad = AdaBoostClassifier(
    n_estimators=100,   
    random_state=42
)
ad.fit(X_train_res, y_train_res)

y_pred_train = ad.predict(X_train_sc)
y_pred_test = ad.predict(X_test_sc)

print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

print("\nConfusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test))
print("\nClassification Report (Test):\n", classification_report(y_test, y_pred_test))
print("ROC-AUC:", roc_auc_score(y_test, ad.predict_proba(X_test)[:,1]))


Train Accuracy: 0.8848336342532886
Test Accuracy: 0.8857511283043198

Confusion Matrix (Test):
 [[6558  279]
 [ 607  311]]

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.92      0.96      0.94      6837
           1       0.53      0.34      0.41       918

    accuracy                           0.89      7755
   macro avg       0.72      0.65      0.67      7755
weighted avg       0.87      0.89      0.87      7755

ROC-AUC: 0.5740436424516989


##  Define models

In [63]:
best_model = models['AdaBoost']

print(f"\nBest Model: {best_name}\n")

y_pred = best_model.predict(X_test)
print("Confusion Matrix:")
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC:", roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))


NameError: name 'models' is not defined