In [24]:
# Importing required modules
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the data
df = pd.read_csv('Loan_Data.csv')

In [3]:
# Visualise the data
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [4]:
# Data Preprocessing
# Clean and preprocess the data, handling missing values and encoding categorical variables 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


In [25]:
# Change datatype of customer id column
df['customer_id'] = df['customer_id'].astype('str')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  object 
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 625.1+ KB


In [26]:
# Generate descriptive statistics
df.describe()

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


In [8]:
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [10]:
# Feature scaling is used to 
# Standardize or normalize numerical features to ensure that they are on a similar scale
# we will going to scale fico_score

In [11]:
X = df.drop(['default', 'customer_id'], axis=1)
y = df['default']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
scaler = MinMaxScaler()

In [13]:
X_train['fico_score'] = scaler.fit_transform(X_train[['fico_score']].values)
X_test['fico_score'] = scaler.fit_transform(X_test[['fico_score']].values)

In [14]:
# Initialize and train the logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [15]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [16]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [17]:
# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

Accuracy: 0.99

Confusion Matrix:
[[1644    8]
 [  19  329]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1652
           1       0.98      0.95      0.96       348

    accuracy                           0.99      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



In [18]:
# Extract coefficients and intercept
coefficients = model.coef_[0]
print('Coefficients: ', coefficients)
intercept = model.intercept_[0]
print('Intercept: ', intercept)

Coefficients:  [ 9.55372452e-02  7.03332502e-04  1.71950259e-03 -3.76174983e-04
 -4.31345514e-01 -4.70975849e-02]
Intercept:  -0.012727623143099758


In [19]:
def scaled(score):
    scaled_point = (score - df['fico_score'].min()) / (df['fico_score'].max() - df['fico_score'].min())
    return scaled_point

In [20]:
# Define the logistic function
def logistic_function(features):
    features[-1] = scaled(features[-1])
    # Calculate the linear combination of features and coefficients
    linear_combination = np.dot(features, coefficients) + intercept
    # Apply the logistic function (sigmoid)
    return 1 / (1 + np.exp(-linear_combination))

In [21]:
def expected_loss_on_loan(credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score, recovery_rate=0.10):
    '''
    Takes in the properties of a loan and output the expected loss (£)
    
    Parameters:
    - credit_lines_outstanding: the number of active credit lines that a borrower has
    - loan_amt_outstanding: the total amount of money that a borrower still owes on their outstanding loans
    - total_debt_outstanding:  the total debt outstanding of a borrower
    - income: borrower's income
    - years_employed: number of years borrower was employed
    - fico_score: credit score of borrower
    
    Returns:
    - expected_loss: expected loss (£) in case of default
    '''
    features = [credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding, income, years_employed, fico_score]
    probability_of_default = logistic_function(features)
    expected_loss = round((probability_of_default - recovery_rate) * features[1], 1)
    if expected_loss > 0:
        expected_loss = expected_loss
    if expected_loss <= 0:
        expected_loss = f'We anticipate Probability of Default equal to {round(probability_of_default*100, 0)}% on this loan.'
    return expected_loss

In [22]:
expected_loss_on_loan(0, 5221, 3915, 78039, 5, 605, 0.10)

'We anticipate Probability of Default equal to 0.0% on this loan.'

In [23]:
expected_loss_on_loan(5, 1959, 8229, 26648, 2, 572, 0.10)

1751.1