# Credit Card Default Prediction

The data set consists of 2000 samples from each of two categories. Five variables are
->Income
->Age
->Loan
->Loan to Income (engineered feature)
->Default

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
data=pd.read_csv('https://github.com/ybifoundation/Dataset/raw/main/Credit%20Default.csv')
data

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
0,66155.92510,59.017015,8106.532131,0.122537,0
1,34415.15397,48.117153,6564.745018,0.190752,0
2,57317.17006,63.108049,8020.953296,0.139940,0
3,42709.53420,45.751972,6103.642260,0.142911,0
4,66952.68885,18.584336,8770.099235,0.130990,1
...,...,...,...,...,...
1995,59221.04487,48.518179,1926.729397,0.032535,0
1996,69516.12757,23.162104,3503.176156,0.050394,0
1997,44311.44926,28.017167,5522.786693,0.124636,1
1998,43756.05660,63.971796,1622.722598,0.037086,0


In [6]:
data.head()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
0,66155.9251,59.017015,8106.532131,0.122537,0
1,34415.15397,48.117153,6564.745018,0.190752,0
2,57317.17006,63.108049,8020.953296,0.13994,0
3,42709.5342,45.751972,6103.64226,0.142911,0
4,66952.68885,18.584336,8770.099235,0.13099,1


In [7]:
data.shape

(2000, 5)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Income          2000 non-null   float64
 1   Age             2000 non-null   float64
 2   Loan            2000 non-null   float64
 3   Loan to Income  2000 non-null   float64
 4   Default         2000 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 78.3 KB


In [8]:
data.columns

Index(['Income', 'Age', 'Loan', 'Loan to Income', 'Default'], dtype='object')

In [9]:
data.dtypes

Income            float64
Age               float64
Loan              float64
Loan to Income    float64
Default             int64
dtype: object

In [10]:
data.nunique()

Income            2000
Age               2000
Loan              2000
Loan to Income    2000
Default              2
dtype: int64

In [11]:
data.value_counts()

Income       Age        Loan         Loan to Income  Default
20014.48947  43.202204  2426.306223  0.121227        0          1
53601.81244  20.240621  9601.375482  0.179124        1          1
53825.53674  49.713904  5272.804792  0.097961        0          1
53825.43058  45.356690  431.450161   0.008016        0          1
53812.22648  44.919152  3245.041667  0.060303        0          1
                                                               ..
36840.60366  36.674583  6557.940331  0.178008        0          1
36837.53085  54.728504  1598.183569  0.043385        0          1
36801.90718  43.027943  5406.344926  0.146904        0          1
36727.74600  58.184654  7287.540764  0.198421        0          1
69995.68558  52.719673  2084.370861  0.029779        0          1
Name: count, Length: 2000, dtype: int64

In [12]:
data.duplicated().any()

False

In [13]:
# missing values in each column
missing_data = data.isnull().sum()
print(missing_data)

Income            0
Age               0
Loan              0
Loan to Income    0
Default           0
dtype: int64


In [14]:
data.describe()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,45331.600018,40.927143,4444.369695,0.098403,0.1415
std,14326.327119,13.26245,3045.410024,0.05762,0.348624
min,20014.48947,18.055189,1.37763,4.9e-05,0.0
25%,32796.45972,29.062492,1939.708847,0.047903,0.0
50%,45789.11731,41.382673,3974.719418,0.099437,0.0
75%,57791.28167,52.596993,6432.410625,0.147585,0.0
max,69995.68558,63.971796,13766.05124,0.199938,1.0


In [5]:
data['Default'].value_counts()

Default
0    1717
1     283
Name: count, dtype: int64

In [None]:
# Split data into train and test sets

In [16]:
y=default['Default']
x=default.drop(['Default'],axis=1)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size=0.8)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1600, 4), (400, 4), (1600,), (400,))

In [None]:
# Importing Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train,y_train)

In [None]:
# the intercept (often labeled the constant) is the expected value of Y when all X=0.

In [18]:
model.intercept_

array([9.35932553])

In [None]:
# coef is the output of the logistic regression function

In [19]:
model.coef_

array([[-2.34294672e-04, -3.45247220e-01,  1.70651551e-03,
         1.50230309e+00]])

In [20]:
y_pred=model.predict(x_test)
y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# coef is the output of the logistic regression function

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
confusion_matrix(y_test,y_pred)

array([[323,  13],
       [ 14,  50]], dtype=int64)

In [23]:
# accuracy

In [24]:
accuracy_score(y_test,y_pred)

0.9325

In [25]:
# classification_report displays the model’s precision, recall, F1 score and support.

In [26]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       336
           1       0.79      0.78      0.79        64

    accuracy                           0.93       400
   macro avg       0.88      0.87      0.87       400
weighted avg       0.93      0.93      0.93       400

