In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/YBIFoundation/Dataset/main/Credit%20Default.csv')

In [3]:
df.head()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
0,66155.9251,59.017015,8106.532131,0.122537,0
1,34415.15397,48.117153,6564.745018,0.190752,0
2,57317.17006,63.108049,8020.953296,0.13994,0
3,42709.5342,45.751972,6103.64226,0.142911,0
4,66952.68885,18.584336,8770.099235,0.13099,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Income          2000 non-null   float64
 1   Age             2000 non-null   float64
 2   Loan            2000 non-null   float64
 3   Loan to Income  2000 non-null   float64
 4   Default         2000 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 78.3 KB


In [5]:
df.describe()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,45331.600018,40.927143,4444.369695,0.098403,0.1415
std,14326.327119,13.26245,3045.410024,0.05762,0.348624
min,20014.48947,18.055189,1.37763,4.9e-05,0.0
25%,32796.45972,29.062492,1939.708847,0.047903,0.0
50%,45789.11731,41.382673,3974.719418,0.099437,0.0
75%,57791.28167,52.596993,6432.410625,0.147585,0.0
max,69995.68558,63.971796,13766.05124,0.199938,1.0


In [6]:
df.columns

Index(['Income', 'Age', 'Loan', 'Loan to Income', 'Default'], dtype='object')

In [7]:
df.shape

(2000, 5)

In [8]:
x = df[['Income', 'Age', 'Loan', 'Loan to Income']]

In [9]:
x.shape

(2000, 4)

In [10]:
x

Unnamed: 0,Income,Age,Loan,Loan to Income
0,66155.92510,59.017015,8106.532131,0.122537
1,34415.15397,48.117153,6564.745018,0.190752
2,57317.17006,63.108049,8020.953296,0.139940
3,42709.53420,45.751972,6103.642260,0.142911
4,66952.68885,18.584336,8770.099235,0.130990
...,...,...,...,...
1995,59221.04487,48.518179,1926.729397,0.032535
1996,69516.12757,23.162104,3503.176156,0.050394
1997,44311.44926,28.017167,5522.786693,0.124636
1998,43756.05660,63.971796,1622.722598,0.037086


In [11]:
y = df['Default']

In [12]:
y

0       0
1       0
2       0
3       0
4       1
       ..
1995    0
1996    0
1997    1
1998    0
1999    0
Name: Default, Length: 2000, dtype: int64

In [13]:
y.shape

(2000,)

In [14]:
xtr,xte,ytr,yte = train_test_split(x,y,test_size=0.2,random_state=2529)

In [15]:
xtr,xte,ytr,yte

(           Income        Age         Loan  Loan to Income
 1578  29849.96714  39.928724  3678.899676        0.123246
 331   24698.66931  48.912557  2427.650788        0.098291
 608   34399.20978  31.764889  6019.834423        0.174999
 457   20686.23909  33.280524  3052.576691        0.147566
 641   66871.26736  62.689364  3614.268185        0.054048
 ...           ...        ...          ...             ...
 740   63661.38333  25.595524  6095.308749        0.095746
 399   24037.16514  23.311574  2469.364426        0.102731
 828   68100.73562  47.752940  8124.598980        0.119303
 1586  34163.62565  45.782718  6617.400172        0.193697
 1376  60624.81537  39.857788  6740.716136        0.111187
 
 [1600 rows x 4 columns],
            Income        Age         Loan  Loan to Income
 1317  62125.25811  21.085868  5700.457195        0.091757
 705   53330.76714  42.377246  2343.497556        0.043943
 1881  24406.89381  37.905318  1733.403111        0.071021
 1725  34428.97264  27.36841

In [16]:
model = LogisticRegression(max_iter=10000)

In [17]:
model.fit(xtr,ytr)

In [18]:
ypred = model.predict(xte)

In [19]:
ypred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [20]:
accuracy_score(yte,ypred)

0.9125

In [21]:
confusion_matrix(yte,ypred)

array([[340,   7],
       [ 28,  25]], dtype=int64)

In [22]:
print(classification_report(yte,ypred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       347
           1       0.78      0.47      0.59        53

    accuracy                           0.91       400
   macro avg       0.85      0.73      0.77       400
weighted avg       0.91      0.91      0.90       400

