In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.utils import resample

import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
# Read the loan dataset (path is relative to the notebook) and preview the first rows.
loan_csv_path = '../Data/loan_data.csv'
data = pd.read_csv(loan_csv_path)
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [24]:
# Show the column labels of the loaded frame.
data.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')

# <b><span style='color:#CDE8E5'>Step 2.1 |</span><span style='color:#4D869C'> Handling Missing Values</span></b>

In [None]:
# Percentage of missing entries per column; only columns that have
# at least one missing value are listed.
null_counts = data.isnull().sum()
(null_counts[null_counts > 0] / data.shape[0]) * 100

# <b><span style='color:#CDE8E5'>Step 2.2 |</span><span style='color:#4D869C'> Handling Duplicates</span></b>

In [None]:
# Count rows that are exact repeats of an earlier row and report the total.
duplicate_count = data.duplicated().sum()
print(f"\nThe dataset contains {duplicate_count} duplicate rows.")

In [None]:
# find and show duplicated rows
# data[data.duplicated(keep=False)].head(5)

<a id="eda"></a>
# <p style="background-color: #4D869C; font-family:calibri; color:white; font-size:135%; font-family:Verdana; text-align:center; border-radius:15px 50px;  padding: 15px;">Step 3 | Data Preprocessing:</p>
⬆️ [Table of Contents](#contents_tabel)

# <b><span style='color:#CDE8E5'>Step 3.1 |</span><span style='color:#4D869C'> Label Encoder:</span></b>

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'purpose' strings with integer codes. The fitted encoder is kept
# in `label_encoder` because it is pickled later and reused on new samples.
label_encoder = LabelEncoder()
label_encoder.fit(data['purpose'])
data['purpose'] = label_encoder.transform(data['purpose'])
data.head()

# <b><span style='color:#CDE8E5'>Step 3.2 |</span><span style='color:#4D869C'> Normalize features</span></b>

In [None]:
def normal_dis(data, feature:str):
  """Square-root transform one column in place and plot its distribution.

  Parameters
  ----------
  data : frame-like object indexed by column label; it is modified in place.
  feature : label of the column to transform and plot.

  The column `feature` is overwritten with its element-wise square root
  (no new column is created), then a histogram with a KDE curve is shown.
  Calling the function twice on the same column applies the root twice.
  """
  # Overwrite the column with its square root.
  data[feature] = np.sqrt(data[feature])

  # Plot the transformed values, labelled with the column's own name.
  transformed = data[feature]
  column_label = transformed.name
  sns.histplot(x=transformed, label=column_label, kde =True)
  plt.legend()
  plt.title(f'Distribution of {column_label}')
  plt.show()

In [None]:
# Columns that get the in-place square-root transform (and a distribution plot each).
unnormal_features = ["revol.bal", "days.with.cr.line"]
for feature_name in unnormal_features:
  normal_dis(data, feature_name)

In [None]:
# Discretise five numeric columns into four ordinal bins labelled 1-4.
# One shared label range and two loops replace five copy-pasted stanzas,
# so the binning rule for each column is visible in one place.
# The columns are overwritten in place, so this cell must run only once
# per freshly loaded `data`.
bin_labels = range(1, 5)

# Quantile bins (pd.qcut): bin edges are chosen from the column's quartiles.
for column in ("fico", "revol.util"):
    data[column] = pd.qcut(data[column], q=4, labels=bin_labels)

# Equal-width bins (pd.cut): the observed value range is cut into four
# intervals of equal width.
for column in ("inq.last.6mths", "delinq.2yrs", "pub.rec"):
    data[column] = pd.cut(data[column], bins=4, labels=bin_labels)


# <b><span style='color:#CDE8E5'>Step 3.3 |</span><span style='color:#4D869C'> Treating Imbalanced data</span></b>


In [None]:
# Split the frame by target value: rows with not.fully.paid == 0 form the
# majority class, rows with not.fully.paid == 1 the minority class.
df_majority = data[data['not.fully.paid'] == 0]
df_minority = data[data['not.fully.paid'] == 1]

# Upsample the minority class (sampling with replacement) until it has as many
# rows as the majority class. The target size is read from the data instead of
# a hard-coded 8045 so the classes stay balanced if the dataset changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match majority class size
                                 random_state=60)             # reproducible results

# Combine majority class with upsampled minority class.
# NOTE(review): this resampling happens before the train/test split, so copies
# of the same minority row can end up in both splits and inflate the test
# metrics. Consider splitting first and upsampling only the training part.
data_upsampled = pd.concat([df_minority_upsampled, df_majority])

In [None]:
"""
from sklearn.utils import resample

# Combine the training data
X_train_combined = pd.concat(
    [pd.DataFrame(X_train, columns=data.columns[:-1]), pd.DataFrame(y_train, columns=['not.fully.paid'])], 
    axis=1)
X_train_combined

# Separate the minority and majority classes
not_fully_paid = X_train_combined[X_train_combined['not.fully.paid'] == 1]
fully_paid = X_train_combined[X_train_combined['not.fully.paid'] == 0]

# Upsample the minority class
not_fully_paid_upsampled = resample(not_fully_paid,
                                    replace=True, # sample with replacement
                                    n_samples=len(fully_paid), # match number of majority class
                                    random_state=42)

# Combine the majority class with the upsampled minority class
upsampled = pd.concat([fully_paid, not_fully_paid_upsampled])

# Separate back into X and y
X_train_upsampled = upsampled.drop('not.fully.paid', axis=1)
y_train_upsampled = upsampled['not.fully.paid']

"""

# <b><span style='color:#CDE8E5'>Step 3.4 |</span><span style='color:#4D869C'> Split the Dataset:</span></b>

In [None]:
# Features are every column except the last one; the target is the last column.
feature_frame = data_upsampled.iloc[:, :-1]
target_column = data_upsampled.iloc[:, -1]
X = feature_frame.values
y = target_column.values

In [None]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=100,
)

# <b><span style='color:#CDE8E5'>Step 3.5 |</span><span style='color:#4D869C'> Feature Scaling:</span></b>

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

<a id="modeling"></a>
# <p style="background-color: #4D869C; font-family:calibri; color:white; font-size:135%; font-family:Verdana; text-align:center; border-radius:15px 50px;  padding: 15px;">Step 4 | Neural Networks</p>
⬆️ [Table of Contents](#contents_tabel)

In [None]:
# One-hot encode both target vectors into two columns, matching the
# two-unit softmax output of the network.
y_train, y_test = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_test)
)

In [None]:
# He-normal weight initializer, created without an explicit seed (seed=None).
initializer = tf.keras.initializers.HeNormal(seed=None)

In [None]:
# Define the model: a fully connected classifier whose input width is the
# number of feature columns in X_train and whose output is a 2-way softmax.
#
# Every hidden Dense layer names its initializer by string ("he_normal") so
# that Keras creates a separate initializer object per layer. Passing one
# shared HeNormal instance to all layers makes every call draw from the same
# seed, so the layers' starting weights are not independent random draws.
model = tf.keras.Sequential(
    [
        tf.keras.Input(shape=X_train.shape[1:]),
        tf.keras.layers.Dense(256, activation="relu", kernel_initializer="he_normal"),
        tf.keras.layers.Dense(256, activation="relu", kernel_initializer="he_normal"),
        tf.keras.layers.BatchNormalization(axis=-1, momentum=0.99),
        tf.keras.layers.Dense(128, activation="relu", kernel_initializer="he_normal"),
        tf.keras.layers.Dense(128, activation="relu", kernel_initializer="he_normal"),
        tf.keras.layers.BatchNormalization(axis=-1, momentum=0.99),
        tf.keras.layers.Dense(64, activation="relu", kernel_initializer="he_normal"),
        tf.keras.layers.Dense(64, activation="relu", kernel_initializer="he_normal"),
        # Two output units, one per class of the one-hot encoded target.
        tf.keras.layers.Dense(2, activation="softmax"),
    ]
)

In [None]:
# Configure training: Adam with a small learning rate, categorical
# cross-entropy on the one-hot targets, and categorical accuracy as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
#plot_model(model, show_layer_activations=True, show_shapes=True)

In [None]:
# Train for 100 epochs in mini-batches of 128 rows.
# The test split doubles as the validation set reported after each epoch.
model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the trained model on the held-out split and print both numbers.
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores
print(f"Loss: {loss}, Accuracy: {accuracy}")

In [None]:
# Class probabilities for every test row.
y_pred = model.predict(X_test)

# Collapse the probabilities and the one-hot targets back to class indices.
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_true, y_pred_classes)
print(report)

In [None]:
# Count test predictions per (true class, predicted class) pair.
conf_matrix = confusion_matrix(y_true, y_pred_classes)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix Neural Networks')
plt.show()

In [None]:
import pickle

# Persist the fitted label encoder with pickle.
# The '.h5' suffix is only a file name here; the content is a pickle, not HDF5.
with open('label_encoder.h5', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [None]:
# Persist the fitted scaler with pickle.
# The '.h5' suffix is only a file name here; the content is a pickle, not HDF5.
with open('scaler.h5', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Write the trained model to disk in the .keras format.
model.save("my_model.keras")

## Predict

In [20]:
# Take one raw, unprocessed row (all columns except the target) as the sample
# to score. The row is re-read from the CSV because `data` has already been
# label-encoded, square-root transformed and binned in place by the
# preprocessing cells, so `data.iloc[1]` no longer holds the original
# 'purpose' string that the label encoder expects further down.
raw_data = pd.read_csv('../Data/loan_data.csv')
newx = list(raw_data.iloc[1, :-1].values)
# NOTE(review): the square-root and binning steps applied to the training
# features are not replayed on this sample before it is scaled. Confirm that
# the saved scaler and model were fitted on matching feature values.
newx

[1,
 'credit_card',
 0.1071,
 228.22,
 11.08214255,
 14.29,
 707,
 2760.0,
 33623,
 76.7,
 0,
 0,
 0]

In [25]:
# Wrap the single sample in a one-row DataFrame so columns can be addressed by
# name; the labels are the feature columns in training order (target excluded).
feature_columns = [
    'credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
    'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
    'inq.last.6mths', 'delinq.2yrs', 'pub.rec',
]
temp_df = pd.DataFrame([newx], columns=feature_columns)


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0


In [26]:
import pickle

# Restore the scaler and label encoder that were pickled earlier in this notebook.
# pickle.load can execute code stored in the file, so only load files you trust.
with open('scaler.h5', 'rb') as scaler_file:
    ss = pickle.load(scaler_file)

with open('label_encoder.h5', 'rb') as encoder_file:
    le = pickle.load(encoder_file)

In [27]:
# Encode the sample's 'purpose' with the restored label encoder, then show the frame.
temp_df["purpose"] = le.transform(temp_df["purpose"])
temp_df

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0


In [34]:
# Show the sample as a plain array, as it will be passed to the scaler.
temp_df.values

array([[1.00000000e+00, 1.00000000e+00, 1.07100000e-01, 2.28220000e+02,
        1.10821426e+01, 1.42900000e+01, 7.07000000e+02, 2.76000000e+03,
        3.36230000e+04, 7.67000000e+01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

In [33]:
# Apply the restored scaler to the one-row sample; `newx` now holds the
# scaled array instead of the original list of raw values.
sample_matrix = temp_df.values
newx = ss.transform(sample_matrix)
newx

array([[ 0.57926401, -0.58139022, -0.72299016, -0.47110374,  0.2757736 ,
         0.20988242,  0.03307794, -0.70941302,  0.38141814,  0.96282773,
        -0.75152688, -0.31378359, -0.26952753]])

In [29]:
# Reload the saved model from disk and display its repr.
loaded_model = tf.keras.models.load_model("my_model.keras")
loaded_model

<Sequential name=sequential, built=True>

In [None]:
# Sanity check: the reloaded model must give (numerically) the same prediction as the in-memory one.
assert np.allclose(model.predict(newx), loaded_model.predict(newx))

In [30]:
# Predicted class index for the single sample: position of the larger softmax output.
sample_probabilities = loaded_model.predict(newx)
np.argmax(sample_probabilities, axis=1)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step


0

In [None]:
# Shape of a single test row.
X_test[1, :].shape

In [None]:
# Softmax outputs for every test row (displays the full array).
model.predict(X_test)

In [None]:
# Predicted class index for every test row.
np.argmax(model.predict(X_test), axis=1)

In [None]:
# True class indices of the test rows, for comparison with the predictions.
y_true

In [None]:
# Second row of the unscaled feature matrix, kept two-dimensional (1 row, all columns).
new = X[1:2, :]

In [None]:
# Display the selected row.
new

In [None]:
# The same row taken with a slice, which keeps the result two-dimensional.
X[1:2, :]

In [None]:
# Score the single row. `new` is taken from the unscaled matrix X, while the
# model was trained on StandardScaler output, so the row is scaled with the
# fitted scaler before it is passed to the network.
model.predict(scaler.transform(new))

In [None]:
# end