In [2]:
## Import the libraries
import pandas as pd
import numpy as np

## Read the loan records from the CSV file and preview the first five rows
loan_data = pd.read_csv(filepath_or_buffer='loan2+(1).csv')
loan_data.head(n=5)

Unnamed: 0,id,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,loan_status
0,1077501,5000,5000,10.65,162.87,10,24000.0,Low Risk
1,1077430,2500,2500,15.27,59.83,1,30000.0,High Risk
2,1077175,2400,2400,15.96,84.33,10,12252.0,Low Risk
3,1076863,10000,10000,13.49,339.31,10,49200.0,Low Risk
4,1075358,3000,3000,12.69,67.79,1,80000.0,Medium Risk


In [4]:
## Summarise the dataset: number of rows, column names, non-null counts and dtypes
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38642 entries, 0 to 38641
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           38642 non-null  int64  
 1   loan_amnt    38642 non-null  int64  
 2   funded_amnt  38642 non-null  int64  
 3   int_rate     38642 non-null  float64
 4   installment  38642 non-null  float64
 5   emp_length   38642 non-null  int64  
 6   annual_inc   38642 non-null  float64
 7   loan_status  38642 non-null  object 
dtypes: float64(3), int64(4), object(1)
memory usage: 2.4+ MB


In [6]:
## Absolute number of loans in each risk class of the target variable
loan_data['loan_status'].value_counts(normalize=False)

Low Risk       32145
High Risk       5399
Medium Risk     1098
Name: loan_status, dtype: int64

In [11]:
## Share of each risk class, expressed as a percentage of all loans
loan_data['loan_status'].value_counts(normalize=True).mul(100)

Low Risk       83.186688
High Risk      13.971844
Medium Risk     2.841468
Name: loan_status, dtype: float64

### Data Cleaning

In [13]:
## Total number of missing cells across the whole frame (0 means nothing to impute)
loan_data.isna().sum().sum()

0

In [15]:
## Number of rows that exactly repeat an earlier row (the first occurrence is not counted)
loan_data.duplicated(keep='first').sum()

0

### Feature Creation

In [16]:
# Derived features (both are plain ratios, not values scaled to 0-100, despite the `_perc` suffix)
## fund_perc: funded amount divided by the requested loan amount
loan_data['fund_perc'] = loan_data['funded_amnt'].div(loan_data['loan_amnt'])

## incToloan_perc: annual income divided by the requested loan amount
loan_data['incToloan_perc'] = loan_data['annual_inc'].div(loan_data['loan_amnt'])

In [18]:
## Summary statistics for every numeric column; the 90th, 95th and 100th percentiles
## are requested in addition to the quartiles to expose the upper tail
loan_data.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 1.0])

Unnamed: 0,id,loan_amnt,funded_amnt,int_rate,installment,emp_length,annual_inc,fund_perc,incToloan_perc
count,38642.0,38642.0,38642.0,38642.0,38642.0,38642.0,38642.0,38642.0,38642.0
mean,681040.4,11291.615988,11017.101211,12.052427,326.760477,5.09205,69608.28,0.985571,8.91595
std,211304.5,7462.136215,7193.038828,3.716705,209.143908,3.408338,64253.2,0.070317,13.845454
min,54734.0,500.0,500.0,5.42,15.69,1.0,4000.0,0.10125,1.204819
25%,513435.0,5500.0,5500.0,9.32,168.4425,2.0,41400.0,1.0,4.0
50%,662770.5,10000.0,9950.0,11.86,282.83,4.0,60000.0,1.0,6.066667
75%,836491.2,15000.0,15000.0,14.59,434.3975,9.0,83199.99,1.0,10.016699
90%,1006751.0,22250.0,21000.0,16.95,625.63,10.0,117000.0,1.0,17.026487
95%,1040032.0,25000.0,25000.0,18.62,767.8755,10.0,143975.0,1.0,23.75
100%,1077501.0,35000.0,35000.0,24.59,1305.19,10.0,6000000.0,1.0,1266.666667


In [19]:
## Column names after feature creation (the derived fund_perc and incToloan_perc columns are now listed)
loan_data.columns

Index(['id', 'loan_amnt', 'funded_amnt', 'int_rate', 'installment',
       'emp_length', 'annual_inc', 'loan_status', 'fund_perc',
       'incToloan_perc'],
      dtype='object')

### Train-Test split

In [23]:
## Feature matrix: every numeric column except id and funded_amnt
X = loan_data.select_dtypes(include='number').drop(columns=['id', 'funded_amnt'])

## Target: the risk class label
y = loan_data['loan_status']

## 70/30 train/test split with a fixed seed so the split is repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=100
)

## Standardize the features: statistics are learned from the training rows only
## and then applied unchanged to both the training and the test rows
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model Building

In [27]:
## Model, multiclass-wrapper and metric imports used by the modelling cells below
## NOTE(review): SVC is not used in the cells visible here; confirm it is needed further down
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report
import warnings
## Suppress every Python warning for the rest of the session.
## NOTE(review): this blanket filter also hides warnings from pandas and scikit-learn that may be
## worth seeing (e.g. assignment to a slice, undefined precision for a class that is never
## predicted); consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [28]:
### One-vs-rest: a separate binary logistic regression is trained for each risk class
LR = LogisticRegression()
oneVsrest = OneVsRestClassifier(LR)

## Train on the standardized training features
oneVsrest.fit(X=X_train_scaled, y=y_train)

### Model Prediction

In [30]:
## Predict a risk class for every held-out row
prediction_oneVsrest = oneVsrest.predict(X_test_scaled)

## Overall accuracy (as a percentage) and per-class precision / recall / F1
ovr_accuracy_pct = accuracy_score(y_test, prediction_oneVsrest) * 100
ovr_report = classification_report(y_test, prediction_oneVsrest)
print(f'Test set Accuracy:{ovr_accuracy_pct}')
## NOTE(review): there is no newline after the label, so the report's header row is printed
## on the same line as the label and appears shifted relative to the rows beneath it
print(f'Classification Report :{ovr_report}')

Test set Accuracy:82.94660571034245
Classification Report :              precision    recall  f1-score   support

   High Risk       0.47      0.01      0.02      1665
    Low Risk       0.83      1.00      0.91      9612
 Medium Risk       0.00      0.00      0.00       316

    accuracy                           0.83     11593
   macro avg       0.43      0.34      0.31     11593
weighted avg       0.76      0.83      0.75     11593



In [60]:
## Display the test-set frame.
## NOTE(review): this cell's execution count (60) is later than that of the cell further down
## that adds the prediction/probability columns (34), so the recorded output already shows
## those columns. On a fresh top-to-bottom run only the seven feature columns exist here.
X_test

Unnamed: 0,loan_amnt,int_rate,installment,emp_length,annual_inc,fund_perc,incToloan_perc,Scaled_features,Actual,prediction_oneVsrest,prob_pred_oneVsrest,prob_pred_oneVsrest_highRisk,prob_pred_oneVsrest_lowRisk,prob_pred_oneVsrest_mediumRisk
13544,24000,11.99,533.75,10,84996.0,1.0,3.541500,"[1.6976067970323159, -0.017103257487488508, 0....",Low Risk,Low Risk,"[0.14540704828756565, 0.6705568181703935, 0.18...",0.145407,0.670557,0.184036
20268,12500,13.43,423.77,1,58000.0,1.0,4.640000,"[0.15662313180383905, 0.37089505172108733, 0.4...",Low Risk,Low Risk,"[0.13443332466561736, 0.8637730013278412, 0.00...",0.134433,0.863773,0.001794
35271,1700,13.16,57.41,1,14000.0,1.0,8.235294,"[-1.2905615277150784, 0.29814536874447944, -1....",Low Risk,Low Risk,"[0.1752122244548113, 0.8066509312052971, 0.018...",0.175212,0.806651,0.018137
29133,10000,12.73,335.67,10,60000.0,1.0,6.000000,"[-0.1783733171588733, 0.1822847625224742, 0.03...",High Risk,Low Risk,"[0.1383502039899746, 0.8570853687482315, 0.004...",0.138350,0.857085,0.004564
2974,1300,7.90,40.68,1,41000.0,1.0,31.538462,"[-1.3441609595491122, -1.1191262329479577, -1....",Low Risk,Low Risk,"[0.07183469705597736, 0.9259854258182477, 0.00...",0.071835,0.925985,0.002180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36909,7500,13.16,253.28,1,36000.0,1.0,4.800000,"[-0.5133697661215857, 0.29814536874447944, -0....",High Risk,Low Risk,"[0.15141497851236188, 0.8430564928374713, 0.00...",0.151415,0.843056,0.005529
20896,4000,9.25,127.67,10,45000.0,1.0,11.250000,"[-0.982364794669383, -0.7553778180649178, -0.9...",Low Risk,Low Risk,"[0.09624537309186205, 0.8968119003535345, 0.00...",0.096245,0.896812,0.006943
16851,34000,11.11,741.11,10,191000.0,1.0,5.617647,"[3.0375925928831653, -0.25421333533717405, 1.9...",Low Risk,Low Risk,"[0.07304848926278627, 0.6086913361669488, 0.31...",0.073048,0.608691,0.318260
24550,10000,15.95,242.92,10,100000.0,1.0,10.000000,"[-0.1783733171588733, 1.049892092836095, -0.40...",Low Risk,Low Risk,"[0.19784159082758154, 0.6999106214661841, 0.10...",0.197842,0.699911,0.102248


### Analysing the probabilities and classification values

In [34]:
## Attach the scaled inputs, the true label and the one-vs-rest outputs to the test frame
## so that each row can be inspected side by side.

## X_test came out of train_test_split; take an explicit copy so the column assignments
## below act on an independent frame rather than on a possible view of X
X_test = X_test.copy()

## Scaled feature array (one list of standardized values per row)
X_test['Scaled_features'] = X_test_scaled.tolist()

## Actual target variable (aligned on the row index shared with X_test)
X_test['Actual'] = y_test

## OnevsRest target prediction
X_test['prediction_oneVsrest'] = prediction_oneVsrest

## OnevsRest class probabilities: computed once and reused for every column below
ovr_proba = oneVsrest.predict_proba(X_test_scaled)
X_test['prob_pred_oneVsrest'] = ovr_proba.tolist()

## OnevsRest individual class probabilities. The column of each class is looked up by its
## label in classes_ rather than by a hard-coded position, so the mapping stays correct
## even if the label order changes.
ovr_class_order = list(oneVsrest.classes_)
ovr_column_suffix = {
    'High Risk': 'highRisk',
    'Low Risk': 'lowRisk',
    'Medium Risk': 'mediumRisk',
}
for class_label, column_suffix in ovr_column_suffix.items():
    class_position = ovr_class_order.index(class_label)
    X_test[f'prob_pred_oneVsrest_{column_suffix}'] = ovr_proba[:, class_position]

In [35]:
## Preview the first five test rows together with the prediction and probability columns
X_test.head(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,emp_length,annual_inc,fund_perc,incToloan_perc,Scaled_features,Actual,prediction_oneVsrest,prob_pred_oneVsrest,prob_pred_oneVsrest_highRisk,prob_pred_oneVsrest_lowRisk,prob_pred_oneVsrest_mediumRisk
13544,24000,11.99,533.75,10,84996.0,1.0,3.5415,"[1.6976067970323159, -0.017103257487488508, 0....",Low Risk,Low Risk,"[0.14540704828756565, 0.6705568181703935, 0.18...",0.145407,0.670557,0.184036
20268,12500,13.43,423.77,1,58000.0,1.0,4.64,"[0.15662313180383905, 0.37089505172108733, 0.4...",Low Risk,Low Risk,"[0.13443332466561736, 0.8637730013278412, 0.00...",0.134433,0.863773,0.001794
35271,1700,13.16,57.41,1,14000.0,1.0,8.235294,"[-1.2905615277150784, 0.29814536874447944, -1....",Low Risk,Low Risk,"[0.1752122244548113, 0.8066509312052971, 0.018...",0.175212,0.806651,0.018137
29133,10000,12.73,335.67,10,60000.0,1.0,6.0,"[-0.1783733171588733, 0.1822847625224742, 0.03...",High Risk,Low Risk,"[0.1383502039899746, 0.8570853687482315, 0.004...",0.13835,0.857085,0.004564
2974,1300,7.9,40.68,1,41000.0,1.0,31.538462,"[-1.3441609595491122, -1.1191262329479577, -1....",Low Risk,Low Risk,"[0.07183469705597736, 0.9259854258182477, 0.00...",0.071835,0.925985,0.00218


### Display the coefficients and intercept values for each Logistic Regression model

In [53]:
## Class labels, in the order in which the per-class binary models are stored
print('Classes:',oneVsrest.classes_)

## Coefficient matrix of each fitted binary model
print('\n\n\n')
coef_headings = [
    'Coefficients for first model:\n',
    'Coefficients for second model:\n',
    'Coefficients for third model:\n',
]
for model_idx, coef_heading in enumerate(coef_headings):
    print(coef_heading, oneVsrest.estimators_[model_idx].coef_)

## Intercept of each fitted binary model
print('\n\n\n')
intercept_headings = [
    'Intercept for first model:\n',
    'Intercept for second model:\n',
    'Intercept for third model:\n',
]
for model_idx, intercept_heading in enumerate(intercept_headings):
    print(intercept_heading, oneVsrest.estimators_[model_idx].intercept_)



Classes: ['High Risk' 'Low Risk' 'Medium Risk']




Coefficients for first model:
 [[ 0.39851727  0.59347116 -0.38860129  0.04331238 -0.39891493  0.02934282
  -0.01457014]]
Coefficients for second model:
 [[-1.16799074 -0.63873289  1.13309613 -0.07050531  0.30833649 -0.21800131
   0.09024585]]
Coefficients for third model:
 [[ 4.9602886   0.96920326 -5.59110894  0.17542626  0.1334617   1.39128615
  -0.64155322]]




Intercept for first model:
 [-1.98702862]
Intercept for second model:
 [1.81436432]
Intercept for third model:
 [-5.11248147]


### Analyse probability values for one test sample

In [54]:
## One-vs-rest probability list of the first test row
print(X_test['prob_pred_oneVsrest'].iloc[0])

[0.14540704828756565, 0.6705568181703935, 0.18403613354204082]


# --------------------------------------------------------------------

### Building Logistic Regression Model and using it in One vs One classifier

In [81]:
## One-vs-one: a separate binary logistic regression is trained for each pair of risk classes
LR1 = LogisticRegression()
OnevsOne = OneVsOneClassifier(LR1)

## Train on the standardized training features
OnevsOne.fit(X=X_train_scaled, y=y_train)

## Predict a risk class for every held-out row
prediction_OnevsOne = OnevsOne.predict(X=X_test_scaled)

### Model Prediction

In [83]:
## Overall accuracy (printed as a fraction between 0 and 1) and per-class precision / recall / F1
ovo_accuracy = accuracy_score(y_test, prediction_OnevsOne)
ovo_report = classification_report(y_test, prediction_OnevsOne)
print(f'Accuracy of the model:{ovo_accuracy}')
print(f'Classification report:\n\n{ovo_report}')

Accuracy of the model:0.8292072802553265
Classification report:

              precision    recall  f1-score   support

   High Risk       0.50      0.00      0.01      1665
    Low Risk       0.83      1.00      0.91      9612
 Medium Risk       0.42      0.02      0.03       316

    accuracy                           0.83     11593
   macro avg       0.58      0.34      0.32     11593
weighted avg       0.77      0.83      0.75     11593



### Analysing the probabilities and classification values

In [85]:
### Store the one-vs-one class prediction next to the one-vs-rest columns and preview two rows
X_test['prediction_oneVsone'] = prediction_OnevsOne
X_test.head(n=2)

Unnamed: 0,loan_amnt,int_rate,installment,emp_length,annual_inc,fund_perc,incToloan_perc,Scaled_features,Actual,prediction_oneVsrest,prob_pred_oneVsrest,prob_pred_oneVsrest_highRisk,prob_pred_oneVsrest_lowRisk,prob_pred_oneVsrest_mediumRisk,prediction_oneVsone
13544,24000,11.99,533.75,10,84996.0,1.0,3.5415,"[1.6976067970323159, -0.017103257487488508, 0....",Low Risk,Low Risk,"[0.14540704828756565, 0.6705568181703935, 0.18...",0.145407,0.670557,0.184036,Low Risk
20268,12500,13.43,423.77,1,58000.0,1.0,4.64,"[0.15662313180383905, 0.37089505172108733, 0.4...",Low Risk,Low Risk,"[0.13443332466561736, 0.8637730013278412, 0.00...",0.134433,0.863773,0.001794,Low Risk


In [86]:
## Class labels known to the fitted one-vs-one classifier
print(OnevsOne.classes_)

['High Risk' 'Low Risk' 'Medium Risk']


In [87]:
## Fitted binary models held by the one-vs-one classifier (three models for the three classes)
print(OnevsOne.estimators_)

(LogisticRegression(), LogisticRegression(), LogisticRegression())


In [88]:
## Coefficients of the first pairwise model
## (presumably the 'High Risk' vs 'Low Risk' pair; confirm against scikit-learn's pair ordering)
first_pairwise_model = OnevsOne.estimators_[0]
print('Coefficients for the first model:', first_pairwise_model.coef_)


Coefficients for the first model: [[-0.58297804 -0.6251806   0.5553151  -0.05221415  0.40056105 -0.06786675
   0.0207779 ]]
