In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

In [2]:
# Load the credit-card dataset, rename the target column to 'Default' and
# drop the 'ID' column.
# A forward-slash relative path is used instead of a backslash-separated one
# so the cell also runs on Linux/macOS; Windows accepts forward slashes too.
# The frame is built in one chain (no inplace mutation), so re-running the
# cell always yields the same `df`.
df = (
    pd.read_csv("data/UCI_Credit_Card.csv")
    .rename(columns={'default.payment.next.month': 'Default'})
    .drop(columns='ID')
)
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'Default'],
      dtype='object')

In [3]:
# Distinct codes that occur in the SEX column of the loaded frame.
df.loc[:, 'SEX'].unique()


array([2, 1], dtype=int64)

In [4]:
# Distinct codes that occur in the EDUCATION column of the loaded frame.
df.loc[:, 'EDUCATION'].unique()

array([2, 1, 3, 5, 4, 6, 0], dtype=int64)

In [5]:
# Distinct codes that occur in the MARRIAGE column of the loaded frame.
df.loc[:, 'MARRIAGE'].unique()

array([1, 2, 3, 0], dtype=int64)

In [6]:
# Number of rows whose EDUCATION or MARRIAGE value is the code 0.
education_is_zero = df['EDUCATION'] == 0
marriage_is_zero = df['MARRIAGE'] == 0
int((education_is_zero | marriage_is_zero).sum())

68

In [7]:
# Keep only rows where neither EDUCATION nor MARRIAGE is coded 0, then show
# the row counts before and after the filter.
has_zero_code = (df['EDUCATION'] == 0) | (df['MARRIAGE'] == 0)
df_clean = df.loc[~has_zero_code]
print(len(df), len(df_clean), sep='\n')


30000
29932


In [8]:
# Check which EDUCATION codes remain after filtering.
df_clean.loc[:, 'EDUCATION'].unique()


array([2, 1, 3, 5, 4, 6], dtype=int64)

In [9]:
# Check which MARRIAGE codes remain after filtering.
df_clean.loc[:, 'MARRIAGE'].unique()

array([1, 2, 3], dtype=int64)

In [10]:
# Split the cleaned frame by the value of the Default column.
# NOTE(review): the variable suffixes are inverted relative to the values
# they hold -- df_default_1 receives the Default == 0 rows and df_default_0
# the Default == 1 rows. The names are kept unchanged because later cells
# reference them.
default_is_zero = df_clean['Default'] == 0
default_is_one = df_clean['Default'] == 1
df_default_1 = df_clean.loc[default_is_zero]
df_default_0 = df_clean.loc[default_is_one]

In [13]:
# Draw 1000 rows without replacement from each group, using the same fixed
# seed for both draws so the sample is reproducible.
downsample_options = dict(replace=False, n_samples=1000, random_state=42)
df_default_0_downsample = resample(df_default_0, **downsample_options)
df_default_1_downsample = resample(df_default_1, **downsample_options)

In [16]:
# Stack the two downsampled groups row-wise (df_default_0_downsample first)
# and report the combined row count.
downsampled_parts = (df_default_0_downsample, df_default_1_downsample)
df_downsample = pd.concat(downsampled_parts, axis=0)
print(df_downsample.shape[0])

2000


In [17]:
# Show the column labels of the combined downsampled frame.
df_downsample.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'Default'],
      dtype='object')

In [23]:
# Separate the frame positionally: every column except the last goes into
# `value`, and the last column alone goes into `label` (kept as a one-column
# DataFrame). The column labels of each part are printed as a check.
feature_count = df_downsample.shape[1] - 1
value = df_downsample.iloc[:, :feature_count]
label = df_downsample.iloc[:, feature_count:]
for part in (value, label):
    print(part.columns)

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')
Index(['Default'], dtype='object')


In [29]:
# One-hot encode the coded (categorical) feature columns only.
# AGE is intentionally NOT in this list: it is a continuous numeric column,
# and expanding it into one indicator column per distinct age discards its
# ordering and adds many sparse columns. It is passed through unchanged,
# like the other numeric columns (LIMIT_BAL, BILL_AMT*, PAY_AMT*).
categorical_columns = [
    'SEX', 'EDUCATION', 'MARRIAGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
]
value_encode = pd.get_dummies(value, columns=categorical_columns)

In [30]:
# Hold out 20% of the encoded rows as a test set; the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(
    value_encode,
    label,
    test_size=0.2,
    random_state=2,
)
value_train, value_test, label_train, label_test = split_parts