In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv ('Task 3 and 4_Loan_data.csv')

In [5]:
print(data.isna().sum())
print(data.head())

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64
   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347

# Building the model

In [6]:
X = data[['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 'income', 'years_employed', 'fico_score']]
y = data['default']

print(X.describe(), y.describe())

       credit_lines_outstanding  loan_amt_outstanding  total_debt_outstanding  \
count              10000.000000          10000.000000            10000.000000   
mean                   1.461200           4159.677034             8718.916797   
std                    1.743846           1421.399078             6627.164762   
min                    0.000000             46.783973               31.652732   
25%                    0.000000           3154.235371             4199.836020   
50%                    1.000000           4052.377228             6732.407217   
75%                    2.000000           5052.898103            11272.263740   
max                    5.000000          10750.677810            43688.784100   

              income  years_employed    fico_score  
count   10000.000000    10000.000000  10000.000000  
mean    70039.901401        4.552800    637.557700  
std     20072.214143        1.566862     60.657906  
min      1000.000000        0.000000    408.000000  
25%  

In [9]:
# Split for train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1111)

# Standardize the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Using f1 and RAROC to see which model performs the best

### Logistic regression

In [10]:
# Instantiate Logistic Regression Model for Classification and fit the model
logit = LogisticRegression().fit(X_train, y_train)
# Predict the value
y_pred_logit = logit.predict(X_test)
# Predict the probability
y_pred_proba_logit = logit.predict_proba(X_test)[:,1]
# Calculate the RAROC value and f1 for the model
logit_score = roc_auc_score(y_test, y_pred_proba_logit)
print(logit_score)
logit_f1 = f1_score(y_test, y_pred_logit)
print(logit_f1)

0.9999985261334955
0.997289972899729


### Decision tree

In [11]:
# Instantiate Decision Tree Model for Classification and fit the model
dt = DecisionTreeClassifier().fit(X_train, y_train)
# Predict the value
y_pred_dt = dt.predict(X_test)
# Predict the probability
y_pred_proba_dt = dt.predict_proba(X_test)[:,1]
# Calculate the RAROC value and f1 for the model
dt_score = roc_auc_score(y_test, y_pred_proba_dt)
print(dt_score)
dt_f1 = f1_score(y_test, y_pred_dt)
print(dt_f1)

0.9949649035538606
0.9901168014375561


## Random forest classifier

In [12]:
# Instantiate Random Forest Model for Classification and fit the model
rfc = RandomForestClassifier().fit(X_train, y_train)
# Predict the value
y_pred_rfc = rfc.predict(X_test)
# Predict the probability
y_pred_proba_rfc = rfc.predict_proba(X_test)[:,1]
# Calculate the RAROC value and f1 for the model
rfc_score = roc_auc_score(y_test, y_pred_proba_rfc)
print(rfc_score)
rfc_f1 = f1_score(y_test, y_pred_rfc)
print(rfc_f1)

0.9998695628143481
0.9927667269439422


## Modelling the function using logistic regression

In [16]:
# Define varibles for EL
ead = data['loan_amt_outstanding']
rr = 0.1
lgd = 1 - rr
pd = np.append(logit.predict_proba(X_train)[:,1], logit.predict_proba(X_test)[:,1])

def exp_loss(lgd, ead, pd):
    el = lgd * ead * pd
    return el

In [None]:
# Prompt the user to input the index of the line they want to see
index = int(input("Enter the index of the line you want to see the expected loss for: "))

# Check if the index is within the valid range
if index >= 0 and index < len(ead):
    # Calculate expected loss for the specified index
    expected_loss = exp_loss(lgd, ead[index], pd[index])
    print("Expected loss for index {}: {}".format(index, expected_loss))
else:
    print("Invalid index. Please enter a valid index within the range of the dataset.")
# Define varibles for EL
ead = data['loan_amt_outstanding']
rr = 0.1
lgd = 1 - rr
pd = np.append(logit.predict_proba(X_train)[:,1], logit.predict_proba(X_test)[:,1])

def exp_loss(lgd, ead, pd):
    el = lgd * ead * pd
    return el

# Prompt the user to input the index of the line they want to see
index = int(input("Enter the index of the line you want to see the expected loss for: "))

# Check if the index is within the valid range
if index >= 0 and index < len(ead):
    # Calculate expected loss for the specified index
    expected_loss = exp_loss(lgd, ead[index], pd[index])
    print("Expected loss for index {}: {}".format(index, expected_loss))
else:
    print("Invalid index. Please enter a valid index within the range of the dataset.")


Enter the index of the line you want to see the expected loss for:  4


Expected loss for index 4: 2.6450685491853204e-05
