In [17]:
!pip install ucimlrepo



# <font color = 'pickle'>***Setting the environment and Loading Data***

In [2]:
# importing packages
import numpy as np
import pandas as pd
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from ucimlrepo import fetch_ucirepo

In [19]:
default_info = fetch_ucirepo(id = 350)
default_info

{'data': {'ids':           ID
  0          1
  1          2
  2          3
  3          4
  4          5
  ...      ...
  29995  29996
  29996  29997
  29997  29998
  29998  29999
  29999  30000
  
  [30000 rows x 1 columns],
  'features':            X1  X2  X3  X4  X5  X6  X7  X8  ...    X16    X17    X18    X19    X20   X21    X22   X23
  0       20000   2   2   1  24   2   2  -1  ...      0      0      0    689      0     0      0     0
  1      120000   2   2   2  26  -1   2   0  ...   3455   3261      0   1000   1000  1000      0  2000
  2       90000   2   2   2  34   0   0   0  ...  14948  15549   1518   1500   1000  1000   1000  5000
  3       50000   2   2   1  37   0   0   0  ...  28959  29547   2000   2019   1200  1100   1069  1000
  4       50000   1   2   1  57  -1   0  -1  ...  19146  19131   2000  36681  10000  9000    689   679
  ...       ...  ..  ..  ..  ..  ..  ..  ..  ...    ...    ...    ...    ...    ...   ...    ...   ...
  29995  220000   1   3   1  39   0   0  

In [9]:
# reading in the data
default_risk = pd.read_excel('https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls', header=1)

In [10]:
default_risk.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


<font color = 'pickle'>*The target variable is the last column of the dataset, 'default_payment_next_month.' The other columns with the exception of the id column will be features relevant to the classification task.*

In [12]:
default_risk.rename({'default_payment_next_month':'default'}, axis = 'columns', inplace = True)

default_risk.drop('ID', axis = 1, inplace = True)

default_risk.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


# <font color = 'pickle'>***Data Preprocessing***

<font color  = 'pickle'>*Notice there are no n/a or missing values from any of the columns.*

In [13]:
default_risk.isna().sum()

Unnamed: 0,0
LIMIT_BAL,0
SEX,0
EDUCATION,0
MARRIAGE,0
AGE,0
PAY_0,0
PAY_2,0
PAY_3,0
PAY_4,0
PAY_5,0


In [14]:
default_risk.isnull().sum()

Unnamed: 0,0
LIMIT_BAL,0
SEX,0
EDUCATION,0
MARRIAGE,0
AGE,0
PAY_0,0
PAY_2,0
PAY_3,0
PAY_4,0
PAY_5,0


<font color = 'pickle'>*Below we have some invalid values present in the marriage and education columns. Education is mentioned in the data card as having values from 1-6 and marriage is mentioned as having values from 1-3. These erroneous values need to be removed from the dataset.*

In [16]:
# Checking the unique values within each of the following columns.

print(default_risk['EDUCATION'].unique())
print(default_risk['MARRIAGE'].unique())
print(default_risk['SEX'].unique())
print(default_risk['AGE'].unique())

[2 1 3 5 4 6 0]
[1 2 3 0]
[2 1]
[24 26 34 37 57 29 23 28 35 51 41 30 49 39 40 27 47 33 32 54 58 22 25 31
 46 42 43 45 56 44 53 38 63 36 52 48 55 60 50 75 61 73 59 21 67 66 62 70
 72 64 65 71 69 68 79 74]


In [28]:
print(len(default_risk.loc[default_risk['EDUCATION']== 0 ]))

print(len(default_risk.loc[default_risk['MARRIAGE']==0]))


14
54


In [31]:
default_risk = default_risk.loc[(default_risk['EDUCATION'] != 0 ) & (default_risk['MARRIAGE'] != 0)]
print(len(default_risk.loc[(default_risk['EDUCATION']==0)]| (default_risk['MARRIAGE']==0)))

0


In [None]:
plt.figure(figsize = (12,8))