Import libraries necessary for this project

In [173]:
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Pretty display for notebooks
%matplotlib inline

Below is a data dictionary from https://github.com/meauxt/credit-card-default/blob/master/credit_card_default.ipynb

Load the datasets

In [174]:
# Read the train and test splits. header=1 takes the column names from the
# second row of each file; the row before it is skipped.
train = pd.read_csv("DataFiles/CreditCard_train.csv", header=1)
test = pd.read_csv("DataFiles/CreditCard_test.csv", header=1)

# Stack both splits into one frame for exploration. The frames loaded above are
# named train/test (df_train/df_test are never defined in this notebook).
# Each split's original row index is kept, so index labels repeat across splits.
all_data = pd.concat([train, test])

In [175]:
# Print the index range, per-column non-null counts and dtypes of the combined frame.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 0 to 5999
Data columns (total 25 columns):
ID                            30000 non-null int64
LIMIT_BAL                     30000 non-null int64
SEX                           30000 non-null int64
EDUCATION                     30000 non-null int64
MARRIAGE                      30000 non-null int64
AGE                           30000 non-null int64
PAY_0                         30000 non-null int64
PAY_2                         30000 non-null int64
PAY_3                         30000 non-null int64
PAY_4                         30000 non-null int64
PAY_5                         30000 non-null int64
PAY_6                         30000 non-null int64
BILL_AMT1                     30000 non-null int64
BILL_AMT2                     30000 non-null int64
BILL_AMT3                     30000 non-null int64
BILL_AMT4                     30000 non-null int64
BILL_AMT5                     30000 non-null int64
BILL_AMT6                

In [176]:
# Rename PAY_0 to PAY_1 in both splits (presumably so the numbering is
# contiguous with PAY_2..PAY_6). all_data was built before this cell and is
# not touched here, so it still carries the PAY_0 name.
pay_rename = {"PAY_0": "PAY_1"}
for frame in (train, test):
    frame.rename(columns=pay_rename, inplace=True)

Separate the features (x) from the target column (y) in both the train and test sets for later training

In [151]:
# Separate the label column from the remaining columns for each split.
# NOTE(review): only the label is dropped, so ID stays among the features.
target_col = 'default payment next month'
train_x, train_y = train.drop(columns=target_col), train[target_col]
test_x, test_y = test.drop(columns=target_col), test[target_col]

In [152]:
# Compute the summary statistics once, then slice them by column position into
# four six-column groups (ID..AGE, PAY_*, BILL_AMT*, PAY_AMT*). The original
# called describe() separately for every slice.
all_stats = all_data.describe()
general_stats = all_stats.iloc[:, 0:6]
pay_status_stats = all_stats.iloc[:, 6:12]
bill_stats = all_stats.iloc[:, 12:18]
payed_stats = all_stats.iloc[:, 18:24]

In [153]:
# Show describe() output for ID, LIMIT_BAL, SEX, EDUCATION, MARRIAGE and AGE.
general_stats

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904
min,1.0,10000.0,1.0,0.0,0.0,21.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0


In [154]:
# Show describe() output for PAY_0 and PAY_2..PAY_6.
pay_status_stats

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911
std,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988
min,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,8.0,8.0,8.0,8.0,8.0,8.0


In [155]:
# Show describe() output for BILL_AMT1..BILL_AMT6.
bill_stats

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604
std,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537
min,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0
25%,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0
50%,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0
75%,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25
max,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0


In [156]:
# Show describe() output for PAY_AMT1..PAY_AMT6.
payed_stats

Unnamed: 0,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567
std,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1000.0,833.0,390.0,296.0,252.5,117.75
50%,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0
75%,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0
max,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0


Check if the data contains labels not included in the dictionary

In [157]:
# List the distinct values of the columns at positions 2..11 (SEX through
# PAY_6) so they can be compared against the data dictionary.
for col_name, col_values in all_data.iloc[:, 2:12].items():
    distinct = col_values.unique()
    print(col_name, "-----", distinct)

SEX ----- [2 1]
EDUCATION ----- [2 1 3 5 4 6 0]
MARRIAGE ----- [1 2 3 0]
AGE ----- [24 26 34 37 57 29 23 28 35 51 41 30 49 39 40 27 47 33 32 54 58 22 25 31
 46 42 43 45 56 44 53 38 63 36 52 48 55 60 50 75 61 73 59 21 67 66 62 70
 72 64 65 71 69 68 79 74]
PAY_0 ----- [ 2 -1  0 -2  1  3  4  8  7  5  6]
PAY_2 ----- [ 2  0 -1 -2  3  5  7  4  1  6  8]
PAY_3 ----- [-1  0  2 -2  3  4  6  7  1  5  8]
PAY_4 ----- [-1  0 -2  2  3  4  5  7  6  1  8]
PAY_5 ----- [-2  0 -1  2  3  5  4  7  8  6]
PAY_6 ----- [-2  2  0 -1  3  6  4  7  8  5]


In the EDUCATION column, 5 and 6 both encode 'unknown'. There are 3 ways to eliminate this duplicate:
1. Remove all 5s and 6s and replace them with NaN.
2. Convert 5 and 6 to 4, which encodes 'others'.
3. Keep the label 'unknown' and convert 6 to 5.

In [134]:
# Fold EDUCATION codes 5 and 6 (both 'unknown') into 4 ('others'): the label
# 'others' does not carry more information than 'unknown'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train["EDUCATION"]: that chained form acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
# NOTE(review): train_x/test_x are derived from train/test via drop(); re-run
# the feature/target split after this cell so the features see the recoding.
train["EDUCATION"] = train["EDUCATION"].replace([5, 6], 4)
test["EDUCATION"] = test["EDUCATION"].replace([5, 6], 4)

In [143]:
# If we were to remove the unknown values instead, we should consider how much
# data would be lost: count the rows whose EDUCATION is 5 or 6 and express that
# as a fraction of all rows (the row count is derived, not hard-coded).
unknown_mask = all_data['EDUCATION'].isin([5, 6])
sum_unknown = unknown_mask.sum()
sum_unknown / len(all_data)

0.011033333333333332

In [146]:
# all_data[['BILL_AMT1','PAY_AMT1']][(all_data['PAY_0'] == -2)]

Unnamed: 0,BILL_AMT1,PAY_AMT1
9,0,0
23,5512,19428
33,10929,4152
34,13709,5006
45,0,0
52,13465,7875
55,1905,3640
65,152519,0
92,-2000,7555
100,672,10212


In [98]:
# Baseline model: fit a 15-estimator AdaBoost classifier (fixed seed) on the
# training split and report its mean accuracy on the test split.
clf = AdaBoostClassifier(n_estimators=15, random_state=41).fit(train_x, train_y)
accuracy = clf.score(test_x, test_y)
print(accuracy)

0.8285
