In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Specify the file path (change this to the path where your CSV file is located)
file_path = 'loan_data.csv' 

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to ensure it's loaded correctly
print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
   credit.policy             purpose  int.rate  installment  log.annual.inc  \
0              1  debt_consolidation    0.1189       829.10       11.350407   
1              1         credit_card    0.1071       228.22       11.082143   
2              1  debt_consolidation    0.1357       366.86       10.373491   
3              1  debt_consolidation    0.1008       162.34       11.350407   
4              1         credit_card    0.1426       102.92       11.299732   

     dti  fico  days.with.cr.line  revol.bal  revol.util  inq.last.6mths  \
0  19.48   737        5639.958333      28854        52.1               0   
1  14.29   707        2760.000000      33623        76.7               0   
2  11.63   682        4710.000000       3511        25.6               1   
3   8.10   712        2699.958333      33667        73.2               1   
4  14.97   667        4066.000000       4740        39.5               0   

   delinq.2yrs  pub.rec  not.fully.paid

In [2]:
# Handle missing values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

In [3]:
# Encode categorical variables
df = pd.get_dummies(df, columns=['purpose'], drop_first=True)

# Split into test set and the rest
X = df.drop('not.fully.paid', axis=1)
y = df['not.fully.paid']
_, X_test, _, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Separate class 0 and class 1
class_0 = df[df['not.fully.paid'] == 0].reset_index(drop=True)
class_1 = df[df['not.fully.paid'] == 1].reset_index(drop=True)

X_0 = class_0.drop('not.fully.paid', axis=1)
y_0 = class_0['not.fully.paid']
X_1 = class_1.drop('not.fully.paid', axis=1)
y_1 = class_1['not.fully.paid']

In [4]:
ratio = 2
threshold = 0.46
seeds = [10]
ann_accuracies = []

In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

print("\n Artificial Neural Network (1:2 Balanced Sampling)")
print(f" Ratio Used: 1:{ratio}")
print(f" Threshold Used: {threshold}\n")

for i, seed in enumerate(seeds):
    print(f" Iteration {i+1}")

    tf.random.set_seed(seed)
    np.random.seed(seed)

    X_0_sampled = X_0.sample(n=ratio * len(class_1), random_state=seed)
    y_0_sampled = y_0.loc[X_0_sampled.index]
    X_balanced = pd.concat([X_0_sampled, X_1])
    y_balanced = pd.concat([y_0_sampled, y_1])
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=seed)

    # Normalize features
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_balanced = scaler.fit_transform(X_balanced)
    X_test_scaled = scaler.transform(X_test)

    # Build model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_balanced.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # Fit model with early stopping
    model.fit(X_balanced, y_balanced, epochs=50, batch_size=32, verbose=0,
              callbacks=[EarlyStopping(patience=5, restore_best_weights=True)])

    # Predict
    y_probs = model.predict(X_test_scaled).flatten()
    y_pred = (y_probs >= threshold).astype(int)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(" Accuracy:", accuracy)
    print(" Confusion Matrix:\n", cm)
    print(" Classification Report:\n", report)
    print("-" * 60)

    ann_accuracies.append(accuracy)


 Artificial Neural Network (1:2 Balanced Sampling)
 Ratio Used: 1:2
 Threshold Used: 0.46

 Iteration 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  current = self.get_monitor_value(logs)


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
 Accuracy: 0.7943632567849687
 Confusion Matrix:
 [[1361  248]
 [ 146  161]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.87      1609
           1       0.39      0.52      0.45       307

    accuracy                           0.79      1916
   macro avg       0.65      0.69      0.66      1916
weighted avg       0.82      0.79      0.81      1916

------------------------------------------------------------
