In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
# Load the loan default data
df = pd.read_csv('Loan_Default.csv')
df.describe()

Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,Status,dtir1
count,148670.0,148670.0,148670.0,112231.0,112031.0,109028.0,148629.0,133572.0,139520.0,148670.0,133572.0,148670.0,124549.0
mean,99224.5,2019.0,331117.7,4.045476,0.441656,3224.996127,335.136582,497893.5,6957.338876,699.789103,72.746457,0.246445,37.732932
std,42917.476598,0.0,183909.3,0.561391,0.513043,3251.12151,58.409084,359935.3,6496.586382,115.875857,39.967603,0.430942,10.545435
min,24890.0,2019.0,16500.0,0.0,-3.638,0.0,96.0,8000.0,0.0,500.0,0.967478,0.0,5.0
25%,62057.25,2019.0,196500.0,3.625,0.076,581.49,360.0,268000.0,3720.0,599.0,60.47486,0.0,31.0
50%,99224.5,2019.0,296500.0,3.99,0.3904,2596.45,360.0,418000.0,5760.0,699.0,75.13587,0.0,39.0
75%,136391.75,2019.0,436500.0,4.375,0.7754,4812.5,360.0,628000.0,8520.0,800.0,86.184211,0.0,45.0
max,173559.0,2019.0,3576500.0,8.0,3.357,60000.0,360.0,16508000.0,578580.0,900.0,7831.25,1.0,61.0


In [4]:
# Split the data into numerical and categorical features
num_features = df.select_dtypes(include=[np.number]).columns.tolist()
num_features.remove('Status')
cat_features = df.select_dtypes(include=[object]).columns.tolist()
print(num_features)
print(1)
# Impute missing values for numerical features with the mean
num_imputer = SimpleImputer(strategy='mean')
df[num_features] = num_imputer.fit_transform(df[num_features])
print(2)
# Impute missing values for categorical features with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_features] = cat_imputer.fit_transform(df[cat_features])
print(3)

['ID', 'year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1']
1
2
3


In [5]:
# Preprocess the numerical features
scaler = StandardScaler()
X_num = scaler.fit_transform(df[num_features])

# Preprocess the categorical features
encoder = OneHotEncoder()
X_cat = encoder.fit_transform(df[cat_features])

In [6]:
X = pd.concat([pd.DataFrame(X_num), pd.DataFrame(X_cat.toarray())], axis=1)
y = df["Status"]
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,-1.732039,0.0,-1.166980,1.820924e-15,1.246439e-16,-1.633359e-16,0.425737,-1.113507,-0.829008,0.502357,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-1.732016,0.0,-0.677607,1.820924e-15,1.246439e-16,-1.633359e-16,0.425737,0.000000,-0.314189,-1.275413,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.731993,0.0,0.409890,1.054866e+00,-5.426102e-01,-9.446405e-01,0.425737,0.029623,0.400838,1.158234,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,-1.731969,0.0,0.681764,4.193110e-01,5.374204e-01,-1.633359e-16,0.425737,0.469289,0.782185,-0.973365,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-1.731946,0.0,1.986759,-9.323350e-02,-3.086410e-01,-1.158352e+00,0.425737,0.762399,0.553377,-0.843916,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,1.731946,0.0,0.573014,-1.887139e+00,-4.143986e-01,2.419075e+00,-2.656410,0.322734,0.143428,-0.352008,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
148666,1.731969,0.0,1.388636,2.346478e+00,9.267703e-01,-1.158352e+00,0.425737,0.850332,0.029024,-1.128704,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
148667,1.731993,0.0,0.627389,-1.887139e+00,-8.084638e-01,-7.177684e-01,-2.656410,0.674466,-0.009111,0.019080,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
148668,1.732016,0.0,-0.731981,-1.118323e+00,3.160253e-01,3.944989e-01,-2.656410,-0.644530,0.029024,0.321128,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [8]:
# Train the logistic regression model
clf = LogisticRegression(random_state=0)
clf.fit(X_train.values, y_train.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model's performance
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: {:.2f}".format(acc))
print("Precision: {:.2f}".format(prec))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))

Accuracy: 0.87
Precision: 0.95
Recall: 0.49
F1 Score: 0.65


In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.99      0.92     22326
           1       0.95      0.49      0.65      7408

    accuracy                           0.87     29734
   macro avg       0.90      0.74      0.78     29734
weighted avg       0.88      0.87      0.85     29734



In [11]:
# Train the logistic regression model
clf = LogisticRegression(random_state=0,class_weight={1: 2})
clf.fit(X_train.values, y_train.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Make predictions on the test data
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91     22326
           1       0.82      0.58      0.68      7408

    accuracy                           0.86     29734
   macro avg       0.85      0.77      0.80     29734
weighted avg       0.86      0.86      0.86     29734



In [24]:
# Get the predicted probabilities
y_prob = clf.predict_proba(X_test)[:, 1]

# Threshold the probabilities to make predictions
y_pred = np.where(y_prob > 0.53, 1, 0)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92     22326
           1       0.86      0.56      0.68      7408

    accuracy                           0.87     29734
   macro avg       0.86      0.77      0.80     29734
weighted avg       0.87      0.87      0.86     29734

