In [None]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import gc
from datetime import datetime 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score,auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score,f1_score, confusion_matrix, classification_report, roc_auc_score,precision_score,recall_score, precision_recall_curve)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

In [None]:
# Load the credit-card default dataset into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where that exact file exists.
csv_path = r"E:\Data_Science\Capstone_project\CAPSTONE_PROJECT_1\Good and Bad Customers for Granting Credit\Credit_Card_Default.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for the columns of df, as a first look at ranges and scales.
df.describe() #looking into data

In [None]:
# Print the distinct codes found in each demographic column.
demographic_columns = ("SEX", "EDUCATION", "MARRIAGE")
for column in demographic_columns:
    distinct_values = df[column].unique()
    print(f"\n{column} unique values:", distinct_values)

In [None]:
# Drop the "ID" column, which is not used as a feature.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell
# safe to re-run: a second run no longer raises KeyError for the missing column.
df = df.drop(columns="ID", errors="ignore")

In [None]:
# Display the column labels of df.
df.columns #checking for column names

In [None]:
# Distinct codes present in the "EDUCATION" column.
df['EDUCATION'].unique() #checking "EDUCATION" column for unique values.

In [None]:
# Recode EDUCATION 6 -> 5 and 0 -> 5, and MARRIAGE 0 -> 3.
# (Per the original note, 5 and 6 denote the same category; 0 is folded into 5 too.)
# Assign the result back instead of calling replace(inplace=True) on df[col]:
# in-place mutation of a selected column is chained assignment, which pandas
# warns about and which does not update df when Copy-on-Write is enabled.
df['EDUCATION'] = df['EDUCATION'].replace({6: 5, 0: 5})
df['MARRIAGE'] = df['MARRIAGE'].replace({0: 3})
print(df['EDUCATION'].unique())
print(df['MARRIAGE'].unique())

In [None]:
# Print the distinct values of every PAY_* column.
pay_columns = ('PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6')
for pay_col in pay_columns:
    pay_values = df[pay_col].unique()
    print(f"\n{pay_col} unique values:", pay_values)

In [None]:
# Count rows where PAY_0 is either missing or equal to 0.
pay0 = df['PAY_0']
zero_or_missing = pay0.isnull() | (pay0 == 0)
print(zero_or_missing.sum())

In [None]:
# Number of null values in each column of df.
df.isnull().sum() #checking for null values in the dataset.

In [None]:
# One histogram (with a KDE overlay) per numeric column of df.
numeric_columns = df.select_dtypes(include=['number']).columns
for col in numeric_columns:
    plt.figure(figsize=(4, 4))
    hist_ax = sns.histplot(df[col], kde=True, bins=30)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Count")
    plt.show()

In [None]:
# Class balance of the target: print the counts and draw them as a labelled bar chart.
temp = df['default.payment.next.month'].value_counts()
print(temp)
df_temp = pd.DataFrame({'default.payment.next.month': temp.index, 'values': temp.values})

plt.figure(figsize=(5, 5))
sns.set_color_codes("pastel")
plt.title("Default=1,Not-Default=0")
ax = sns.barplot(x='default.payment.next.month', y="values", data=df_temp,
                 hue='default.payment.next.month')

# Write the exact count above each bar. bar_label works on the bar containers,
# so only real bars are annotated (any other rectangle that ends up in
# ax.patches is skipped), and the unused plt.xticks() lookup is gone.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', fontsize=12, color='black')
plt.show()

In [None]:
# Pairwise correlation of all columns, drawn as a square-celled heatmap.
corr = df.corr()

# fmt only takes effect when annot=True is passed; annotations are left off here.
heatmap_options = dict(
    fmt=".2f",        # decimal places for annotations
    cmap="coolwarm",  # diverging colormap
    cbar=True,        # show the color bar
    square=True,      # square cells
)

plt.figure(figsize=(12, 12))
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap", fontsize=16)
plt.show()

In [None]:
# Separate the target from the features.
target_column = 'default.payment.next.month'
Y = df[target_column]              # target
X = df.drop(columns=target_column)  # features

# Stratified 80/20 train/test split with a fixed seed.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
X_train, X_test, Y_train, Y_test = split_parts
print('Train_shape', X_train.shape, "Test_shape", X_test.shape)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline logistic regression trained on the scaled training features.
lr = LogisticRegression(random_state=42, max_iter=2000)
lr.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Test-set outputs of the baseline model: positive-class probability and hard labels.
lr_test_proba = lr.predict_proba(X_test_scaled)
Y_prob_lr = lr_test_proba[:, 1]
Y_pred_lr = lr.predict(X_test_scaled)

In [None]:
# Metrics of the baseline logistic regression on the test split.
print("Logistic Regression Results")
for metric_label, metric_value in (
    ("Accuracy:", accuracy_score(Y_test, Y_pred_lr)),
    ("ROC AUC:", roc_auc_score(Y_test, Y_prob_lr)),
):
    print(metric_label, metric_value)
print(classification_report(Y_test, Y_pred_lr))

# Confusion matrix heatmap.
cm_lr = confusion_matrix(Y_test, Y_pred_lr)
ax_lr = sns.heatmap(cm_lr, annot=True, fmt="d", cmap="Blues")
ax_lr.set_title("Confusion Matrix - Logistic Regression")
ax_lr.set_xlabel("Predicted")
ax_lr.set_ylabel("Actual")
plt.show()

### Handling class imbalance using SMOTE

In [None]:
# Oversample the scaled training split with SMOTE (fixed seed).
sm = SMOTE(random_state=42)
smote_resampled = sm.fit_resample(X_train_scaled, Y_train)
X_train_sm, Y_train_sm = smote_resampled

In [None]:
# Logistic regression trained on the SMOTE-resampled (scaled) training data,
# evaluated on the untouched scaled test split.

lr_sm = LogisticRegression(max_iter=2000, random_state=42)
lr_sm.fit(X_train_sm, Y_train_sm)

Y_pred_sm = lr_sm.predict(X_test_scaled)
Y_prob_sm = lr_sm.predict_proba(X_test_scaled)[:, 1]

# Results (header typo "regressio" corrected).
print('Logistic regression with SMOTE results')
print("Accuracy:", accuracy_score(Y_test, Y_pred_sm))
print("ROC AUC:", roc_auc_score(Y_test, Y_prob_sm))
print(classification_report(Y_test, Y_pred_sm))

# Confusion matrix
sns.heatmap(confusion_matrix(Y_test, Y_pred_sm), annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Logistic Regression with SMOTE")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Resample the scaled training split with SMOTEENN (fixed seed).
smn = SMOTEENN(random_state=42)
smoteenn_resampled = smn.fit_resample(X_train_scaled, Y_train)
X_train_smn, Y_train_smn = smoteenn_resampled

In [None]:
# Logistic regression trained on the SMOTEENN-resampled (scaled) training data.
lr_smn = LogisticRegression(random_state=42, max_iter=2000)
lr_smn.fit(X_train_smn, Y_train_smn)

# Test-set probabilities and labels.
Y_prob_smn = lr_smn.predict_proba(X_test_scaled)[:, 1]
Y_pred_smn = lr_smn.predict(X_test_scaled)

# Metrics.
print('SMOTEENN results')
for metric_label, metric_value in (
    ("Accuracy:", accuracy_score(Y_test, Y_pred_smn)),
    ("ROC AUC:", roc_auc_score(Y_test, Y_prob_smn)),
):
    print(metric_label, metric_value)
print(classification_report(Y_test, Y_pred_smn))

# Confusion matrix heatmap.
cm_smn = confusion_matrix(Y_test, Y_pred_smn)
ax_smn = sns.heatmap(cm_smn, annot=True, fmt="d", cmap="Blues")
ax_smn.set_title("Confusion Matrix - Logistic Regression with SMOTEENN")
ax_smn.set_xlabel("Predicted")
ax_smn.set_ylabel("Actual")
plt.show()

**With SMOTE and SMOTEENN we get the same results, and they are not good.**

##

##

# RANDOM FOREST

In [None]:
# Basic Random Forest on the unscaled features, with balanced class weights.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, Y_train)

# Test-set probabilities and labels.
Y_prob_rf = rf.predict_proba(X_test)[:, 1]
Y_pred_rf = rf.predict(X_test)

# Metrics.
print("\nRandom Forest Results")
for metric_label, metric_value in (
    ("Accuracy:", accuracy_score(Y_test, Y_pred_rf)),
    ("ROC AUC:", roc_auc_score(Y_test, Y_prob_rf)),
):
    print(metric_label, metric_value)
print(classification_report(Y_test, Y_pred_rf))

# Confusion matrix heatmap.
cm_rf = confusion_matrix(Y_test, Y_pred_rf)
ax_rf = sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Greens")
ax_rf.set_title("Confusion Matrix - Basic Random Forest")
ax_rf.set_xlabel("Predicted")
ax_rf.set_ylabel("Actual")
plt.show()

In [None]:
# Search space shared by every Random Forest grid search in this notebook
# (2 * 3 * 2 * 2 = 24 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [None]:
# Grid search (3-fold CV, ROC AUC) over param_grid for a class-weight-balanced
# Random Forest, then evaluation of the refitted best estimator on the test split.
balanced_forest = RandomForestClassifier(random_state=42, class_weight='balanced')
grid = GridSearchCV(
    estimator=balanced_forest,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, Y_train)

print("\nBest RF Params:", grid.best_params_)
best_rf = grid.best_estimator_

# Test-set probabilities and labels.
Y_prob_best = best_rf.predict_proba(X_test)[:, 1]
Y_pred_best = best_rf.predict(X_test)

print("Tuned RF Accuracy:", accuracy_score(Y_test, Y_pred_best))
print("Tuned RF ROC AUC:", roc_auc_score(Y_test, Y_prob_best))
print(classification_report(Y_test, Y_pred_best))

# Confusion matrix heatmap.
cm_best = confusion_matrix(Y_test, Y_pred_best)
ax_best = sns.heatmap(cm_best, annot=True, fmt="d", cmap="Greens")
ax_best.set_title("Confusion Matrix - Random Forest with weight balanced and hyperparameter tuning")
ax_best.set_xlabel("Predicted")
ax_best.set_ylabel("Actual")
plt.show()

In [None]:
# Random Forest grid search on SMOTE-resampled (unscaled) training data.
# NOTE(review): resampling happens before GridSearchCV splits the data, so the
# CV validation folds contain synthetic samples; the CV scores (and therefore
# the chosen parameters) may be optimistic. Consider resampling inside a pipeline.
smote_resampled_rf = sm.fit_resample(X_train, Y_train)
X_train_smr, Y_train_smr = smote_resampled_rf

grid_smr = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_smr.fit(X_train_smr, Y_train_smr)

print("\nBest RF Params:", grid_smr.best_params_)
best_rf_smr = grid_smr.best_estimator_

# Test-set probabilities and labels.
Y_prob_best_smr = best_rf_smr.predict_proba(X_test)[:, 1]
Y_pred_best_smr = best_rf_smr.predict(X_test)

print("Tuned RF Accuracy:", accuracy_score(Y_test, Y_pred_best_smr))
print("Tuned RF ROC AUC:", roc_auc_score(Y_test, Y_prob_best_smr))
print(classification_report(Y_test, Y_pred_best_smr))

# Confusion matrix heatmap.
cm_smr = confusion_matrix(Y_test, Y_pred_best_smr)
ax_smr = sns.heatmap(cm_smr, annot=True, fmt="d", cmap="Greens")
ax_smr.set_title("Confusion Matrix - Random Forest with SMOTE")
ax_smr.set_xlabel("Predicted")
ax_smr.set_ylabel("Actual")
plt.show()

In [None]:
# Random Forest grid search on SMOTEENN-resampled (unscaled) training data,
# evaluated on the untouched test split.
# NOTE(review): resampling happens before GridSearchCV splits the data, so the
# CV validation folds contain resampled points; CV scores may be optimistic.
X_train_smnr, Y_train_smnr = smn.fit_resample(X_train, Y_train)

grid_smnr = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid_smnr.fit(X_train_smnr, Y_train_smnr)

print("\nBest RF Params:", grid_smnr.best_params_)
best_rf_smnr = grid_smnr.best_estimator_

Y_pred_best_smnr = best_rf_smnr.predict(X_test)
Y_prob_best_smnr = best_rf_smnr.predict_proba(X_test)[:, 1]

print("Tuned RF Accuracy:", accuracy_score(Y_test, Y_pred_best_smnr))
print("Tuned RF ROC AUC:", roc_auc_score(Y_test, Y_prob_best_smnr))
print(classification_report(Y_test, Y_pred_best_smnr))

# The title now names the resampler, so this figure can be told apart from the
# other Random Forest confusion matrices.
sns.heatmap(confusion_matrix(Y_test, Y_pred_best_smnr), annot=True, fmt="d", cmap="Greens")
plt.title("Confusion Matrix - Random Forest with SMOTEENN")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# SMOTETomek resampler with a fixed seed (random_state=42).
smtk = SMOTETomek(random_state=42)

In [None]:
# Random Forest grid search on SMOTETomek-resampled (unscaled) training data.
# NOTE(review): resampling happens before GridSearchCV splits the data, so the
# CV validation folds contain resampled points; CV scores may be optimistic.
smotetomek_resampled = smtk.fit_resample(X_train, Y_train)
X_train_smtk, Y_train_smtk = smotetomek_resampled

grid_smtk = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_smtk.fit(X_train_smtk, Y_train_smtk)

print("\nBest RF Params:", grid_smtk.best_params_)
best_rf_smtk = grid_smtk.best_estimator_

# Test-set probabilities and labels.
Y_prob_best_smtk = best_rf_smtk.predict_proba(X_test)[:, 1]
Y_pred_best_smtk = best_rf_smtk.predict(X_test)

print("Tuned RF Accuracy:", accuracy_score(Y_test, Y_pred_best_smtk))
print("Tuned RF ROC AUC:", roc_auc_score(Y_test, Y_prob_best_smtk))
print(classification_report(Y_test, Y_pred_best_smtk))



In [None]:
# Confusion matrix heatmap for the SMOTETomek Random Forest.
cm_smtk = confusion_matrix(Y_test, Y_pred_best_smtk)
ax_smtk = sns.heatmap(cm_smtk, annot=True, fmt="d", cmap="Greens")
ax_smtk.set_title("Confusion Matrix - Random Forest with SMOTOMEK")
ax_smtk.set_xlabel("Predicted")
ax_smtk.set_ylabel("Actual")
plt.show()

# XGBoost

In [None]:

# XGBoost on the unscaled features. Class imbalance is handled through
# scale_pos_weight = (# negative samples) / (# positive samples) in the training split.
class_counts = Y_train.value_counts()  # computed once, indexed by class label
scale = class_counts[0] / class_counts[1]
print(f"scale_pos_weight: {scale:.2f}")

xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    reg_lambda=1,
    scale_pos_weight=scale,
    random_state=42,
    eval_metric='auc'
)

xgb_clf.fit(X_train, Y_train)

Y_pred_xgb = xgb_clf.predict(X_test)
Y_pred_prob_xgb = xgb_clf.predict_proba(X_test)[:, 1]

# The accuracy line was previously labelled "Tuned RF Accuracy", a leftover
# from the Random Forest cells; it reports the XGBoost model.
print("XGBoost Accuracy:", accuracy_score(Y_test, Y_pred_xgb))
print("ROC AUC Score:", roc_auc_score(Y_test, Y_pred_prob_xgb))
print("\nClassification Report:\n", classification_report(Y_test, Y_pred_xgb))


# Confusion matrix (title names the model, like the other confusion-matrix plots)
cm = confusion_matrix(Y_test, Y_pred_xgb)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - XGBoost")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Precision-recall curve of the XGBoost probabilities on the test split,
# with the area under the curve shown in the legend.
precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred_prob_xgb)
pr_auc = auc(recall, precision)

plt.figure(figsize=(6,4))
plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.3f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
# The title previously contained a mis-encoded dash ("â€“"); a plain ASCII
# hyphen is used so it renders correctly regardless of file encoding.
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

In [None]:
# Pick the probability threshold that maximises F1 along the precision-recall
# curve and re-evaluate the XGBoost predictions at that threshold.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)

# precision/recall from precision_recall_curve have one more entry than
# thresholds (the final point has no threshold), so that last entry is
# excluded to keep the argmax a valid index into `thresholds`.
best_idx = int(np.argmax(f1_scores[:-1]))
best_thresh = thresholds[best_idx]
print(f"Best threshold for F1: {best_thresh:.3f}")

# NOTE(review): if precision/recall were computed on the test split (the same
# Y_test used in the report below), the threshold is tuned on test data and the
# adjusted metrics are optimistic. Consider tuning on a validation split.
Y_pred_adj = (Y_pred_prob_xgb >= best_thresh).astype(int)
print("\nAdjusted Classification Report:\n", classification_report(Y_test, Y_pred_adj))

# Confusion matrix
cm = confusion_matrix(Y_test, Y_pred_adj)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - XGBoost (adjusted threshold)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## Creating a table of metrics for each algorithm, sorted by ROC-AUC

In [None]:
# Test-set outputs of every experiment: name -> (true labels, predicted labels,
# positive-class probabilities).
models = {
    "Basic Logistic Regression": (Y_test, Y_pred_lr, Y_prob_lr),
    "Logistic Regression with SMOTE": (Y_test, Y_pred_sm, Y_prob_sm),
    "Logistic Regression with SMOTEENN": (Y_test, Y_pred_smn, Y_prob_smn),

    "Basic Random Forest": (Y_test, Y_pred_rf, Y_prob_rf),
    "Random Forest with hyperparametertuning": (Y_test, Y_pred_best, Y_prob_best),
    "Random Forest with SMOTE": (Y_test, Y_pred_best_smr, Y_prob_best_smr),
    "Random Forest with SMOTEENN": (Y_test, Y_pred_best_smnr, Y_prob_best_smnr),
    "Random Forest with SMOTomek": (Y_test, Y_pred_best_smtk, Y_prob_best_smtk),

    "XGBoost": (Y_test, Y_pred_xgb, Y_pred_prob_xgb),
    "XGBoost adjusted": (Y_test, Y_pred_adj, Y_pred_prob_xgb),
}


def _metrics_row(model_name, y_true, y_pred, y_proba):
    """Return one results-table row (a dict of metrics) for a single experiment."""
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision (Default)": precision_score(y_true, y_pred, pos_label=1),
        "Recall (Default)": recall_score(y_true, y_pred, pos_label=1),
        "F1-Score (Default)": f1_score(y_true, y_pred, pos_label=1),
        "ROC AUC": roc_auc_score(y_true, y_proba),
    }


# One row per experiment, in the insertion order of `models`.
results = [_metrics_row(model_name, *outputs) for model_name, outputs in models.items()]

# Table sorted from best to worst ROC AUC.
results_df = pd.DataFrame(results).sort_values(by="ROC AUC", ascending=False)
results_df

### Observation: From the table above we can see that Random Forest with hyperparameter tuning gives the best ROC AUC value. This model also uses the `class_weight='balanced'` parameter to handle class imbalance.

### Plotting F1 score and ROC AUC of various models

In [None]:
# Horizontal grouped bars comparing F1 (default class) and ROC AUC per model.
comparison_long = results_df.melt(
    id_vars="Model",
    value_vars=["F1-Score (Default)", "ROC AUC"],
)

plt.figure(figsize=(12, 6))
sns.barplot(data=comparison_long, x="value", y="Model", hue="variable")
plt.title("Model Comparison: F1-Score and ROC AUC")
plt.xlabel("Score")
plt.ylabel("Model")
plt.legend(title="")
plt.tight_layout()
plt.show()

In [None]:
def _selection_key(entry):
    """Rank a results row by F1 (default class) first, then ROC AUC as tie-breaker."""
    return (entry['F1-Score (Default)'], entry['ROC AUC'])


# Row with the highest (F1, ROC AUC) pair, and the name of its experiment.
best_entry = max(results, key=_selection_key)
best_model_name = best_entry['Model']

In [None]:
# Report the selected experiment and its two ranking metrics.
best_f1 = best_entry['F1-Score (Default)']
best_auc = best_entry['ROC AUC']
print(f"Best model based on F1 and ROC AUC: {best_model_name}")
print(f"F1: {best_f1:.4f}, ROC AUC: {best_auc:.4f}")

In [None]:
# Step 2: retrieve actual trained model object by name
# `models` maps each name to a (Y_test, predictions, probabilities) tuple, so
# indexing it returned prediction arrays, not an estimator. Look the winner up
# in a name -> fitted-estimator mapping that uses the same keys instead.
trained_models = {
    # Logistic regressions were fitted on StandardScaler-transformed features,
    # so new data must go through `scaler` before calling predict.
    "Basic Logistic Regression": lr,
    "Logistic Regression with SMOTE": lr_sm,
    "Logistic Regression with SMOTEENN": lr_smn,

    "Basic Random Forest": rf,
    "Random Forest with hyperparametertuning": best_rf,
    "Random Forest with SMOTE": best_rf_smr,
    "Random Forest with SMOTEENN": best_rf_smnr,
    "Random Forest with SMOTomek": best_rf_smtk,

    "XGBoost": xgb_clf,
    # Same estimator as "XGBoost"; the adjusted results come from thresholding
    # predict_proba at `best_thresh`, which must be applied by whoever loads it.
    "XGBoost adjusted": xgb_clf,
}
best_model = trained_models[best_model_name]

In [None]:
# Display the object bound to best_model.
best_model

In [None]:
# Step 3: serialise best_model to disk with pickle.
import pickle

output_file = "best_model.pkl"
with open(output_file, "wb") as handle:
    pickle.dump(best_model, handle)

print("Best model saved as 'best_model.pkl'")