## Setting

In [2]:
# set module
import pandas as pd
import numpy as np
import tqdm

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# train vaild split
from sklearn.model_selection import train_test_split

# modeling 
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# confusion matrix
from sklearn.metrics import confusion_matrix

# graph
from matplotlib import pyplot as plt
import seaborn as sns

In [6]:
# load data
data = pd.read_csv('../data/loan_train.csv')
submission = pd.read_csv('../data/loan_test.csv')

In [8]:
# search the NaN
data.isna().sum()

Gender                13
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                  14
Credit_History        50
Area                   0
Status                 0
dtype: int64

In [13]:
# 결측치가 포함된 행 전부 제거
complete_data = data.dropna()

- 결측치 제거 전 train : test 비율 약 6:4 (0.62)
- 결측치 제거 후 train : test 비율 약 6:4 (0.58)

In [20]:
# 데이터 확인
complete_data.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,Male,No,0,Graduate,No,584900,0.0,15000000,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,458300,150800.0,12800000,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,300000,0.0,6600000,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,258300,235800.0,12000000,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,600000,0.0,14100000,360.0,1.0,Urban,Y


### 적용해볼 수 있는 요소
- 범주형 데이터의 숫자화
- 표준화 ex) min-max, normalizations
- 이상치 탐지 및 대치

In [22]:
# 연속형 데이터 확인
complete_data.describe()

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History
count,499.0,499.0,499.0,499.0,499.0
mean,533612.8,156699.6,13952510.0,342.012024,0.851703
std,561815.7,258095.5,8345237.0,64.860368,0.35575
min,15000.0,0.0,0.0,36.0,0.0
25%,288550.0,0.0,9800000.0,360.0,1.0
50%,385900.0,108600.0,12600000.0,360.0,1.0
75%,582500.0,225150.0,16650000.0,360.0,1.0
max,8100000.0,3383700.0,60000000.0,480.0,1.0


- 특별히 튀는 값은 보이지 않음
- 단위가 너무 커서 표준화 적용해야 할 것으로 보임

In [27]:
# 데이터 형식 확인
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499 entries, 0 to 613
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Gender              499 non-null    object 
 1   Married             499 non-null    object 
 2   Dependents          499 non-null    object 
 3   Education           499 non-null    object 
 4   Self_Employed       499 non-null    object 
 5   Applicant_Income    499 non-null    int64  
 6   Coapplicant_Income  499 non-null    float64
 7   Loan_Amount         499 non-null    int64  
 8   Term                499 non-null    float64
 9   Credit_History      499 non-null    float64
 10  Area                499 non-null    object 
 11  Status              499 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 50.7+ KB


In [41]:
complete_data.dtypes

Gender                 object
Married                object
Dependents             object
Education              object
Self_Employed          object
Applicant_Income        int64
Coapplicant_Income    float64
Loan_Amount             int64
Term                  float64
Credit_History        float64
Area                   object
Status                 object
dtype: object

In [121]:
# 타겟의 카테고리별 개수
complete_data['Status'][complete_data['Status'] == 'N']

1      N
7      N
9      N
13     N
17     N
      ..
589    N
591    N
596    N
605    N
613    N
Name: Status, Length: 158, dtype: object

- 341:158 : 약 2.3:1 비율로 불균형이라고 보기도 애매하고 아니기도 애매하기에 일단은 그냥 진행

## 범주형 변수 - LabelEncoding
- 결정트리 모델을 활용하기 때문에 이것으로 적용가능
- 결정트리가 아닐 경우 one-hot encoding 권장

In [80]:
# 범주형 변수만 LabelEncoding
encode_cate = pd.DataFrame() # 범주변수 저장
encode_num = pd.DataFrame() # 연속 변수 저장
for attribute, col in zip(complete_data.dtypes, complete_data.columns): # 변수 타입과 열 이름
    if attribute == 'object':
        le = LabelEncoder() # 인코더 불러오기
        le.fit(complete_data[col]) # 적합
        encode_cate[col] = le.transform(complete_data[col]) # 변화시켜 저장
    else:
        encode_num[col] = complete_data[col] # 연속 변수는 그대로 저장

## 연속형 변수 - Normalization Scaling

In [81]:
# StandardScaler객체 생성
scaler = StandardScaler()

In [82]:
# StandardScaler 로 데이터 셋 변환 .fit( ) 과 .transform( ) 호출
scaler.fit(encode_num)
trans_num = scaler.transform(encode_num)

In [83]:
# transform( )시 scale 변환된 데이터 셋이 numpy ndarry로 반환되어 이를 DataFrame으로 변환
trans_num_scaled = pd.DataFrame(data=trans_num, columns=encode_num.columns)

In [84]:
# test set에도 똑같은 값으로 적용해야하기 때문에 그대로 사용
print('feature 들의 표준화 평균 값')
print(trans_num.mean())
print('\nfeature 들의 표준화 표준편차 값')
print(trans_num.std())

feature 들의 표준화 평균 값
8.187616694630413e-17

feature 들의 표준화 표준편차 값
1.0


## 데이터 결합

In [85]:
# 사용할 데이터 합치기
use_data = pd.concat([encode_cate, trans_num_scaled], axis = 1)

In [86]:
# 완성된 데이터 확인
use_data.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Area,Status,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History
0,1,0,0,0,0,2,1,0.09138,-0.607747,0.125646,0.277612,0.417274
1,1,1,1,0,0,0,0,-0.134187,-0.022881,-0.138242,0.277612,0.417274
2,1,1,0,0,1,2,1,-0.416235,-0.607747,-0.881926,0.277612,0.417274
3,1,1,0,1,0,2,1,-0.490533,0.306785,-0.234201,0.277612,0.417274
4,1,0,0,0,0,2,1,0.118284,-0.607747,0.017692,0.277612,0.417274


## Train/ Valid 데이터로 나누기

In [91]:
# 데이터 분류
train_x, valid_x, train_y, valid_y = train_test_split(use_data[use_data.columns.difference(['Status'])], 
                                                      use_data['Status'],
                                                      test_size=0.2,
                                                      random_state=42)

# 모델 적합

In [131]:
# 모델 불러오기
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()

In [132]:
# 각 모델 학습
rf.fit(train_x, train_y)
xgb.fit(train_x, train_y)
lgb.fit(train_x, train_y)

LGBMClassifier()

In [136]:
# valid 대입
rf.predict = rf.predict(valid_x)
xgb.predict = xgb.predict(valid_x)
lgb.predict = lgb.predict(valid_x)

In [138]:
# RF CM
confusion_matrix(rf.predict, valid_y)

array([[24,  6],
       [12, 58]], dtype=int64)

In [140]:
# XGB CM
confusion_matrix(xgb.predict, valid_y)

array([[25,  9],
       [11, 55]], dtype=int64)

In [141]:
# LGBM CM
confusion_matrix(lgb.predict, valid_y)

array([[25,  9],
       [11, 55]], dtype=int64)