### 라이브러리

In [59]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

from xgboost import XGBClassifier
import xgboost as xgb

### 데이터 불러오기 

In [60]:
MY_PATH = '/content/drive/MyDrive'
train= pd.read_csv(MY_PATH + '/dacon프로젝트/소비자프로젝트/dataset/train.csv')
test=pd.read_csv(MY_PATH + '/dacon프로젝트/소비자프로젝트/dataset/test.csv')
sample_submission=pd.read_csv(MY_PATH + '/dacon프로젝트/소비자프로젝트/dataset/sample_submission.csv')

### train_test 합치기(concat)

In [61]:
data= pd.concat([train, test], axis =0);data.head(3)

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0


### 변수 확인 및 1차 제거(index, family_size, flag)

In [62]:
unique_len = data.apply(lambda x : len(x.unique()));unique_len

index            36457
gender               2
car                  2
reality              2
child_num            9
income_total       265
income_type          5
edu_type             5
family_type          5
house_type           6
DAYS_BIRTH        7183
DAYS_EMPLOYED     3640
FLAG_MOBIL           1
work_phone           2
phone                2
email                2
occyp_type          19
family_size         10
begin_month         61
credit               4
dtype: int64

In [63]:
data = data.drop(['index','family_size','FLAG_MOBIL'], axis=1)

### 0,1 인코딩(1)

In [64]:
data['gender'] = data['gender'].replace(['F','M'],[0, 1])
data['car'] = data['car'].replace(['N','Y'],[0, 1])
data['reality'] = data['reality'].replace(['N','Y'],[0, 1])

### 0,1 인코딩(2) - occyp

In [65]:
data['occyp_type'].fillna(0,inplace=True)
data['occyp_type'].loc[(data['occyp_type'] == 0)]= 0
data['occyp_type'].loc[(data['occyp_type'] != 0)]= 1

In [66]:
# occyp가 object 형태이므로 labelencoding으로 float로 바꿔준다
label_encoder = LabelEncoder()
data['occyp_type'] = label_encoder.fit_transform(data['occyp_type'])
data['occyp_type'].value_counts()

1    25134
0    11323
Name: occyp_type, dtype: int64

### 레이블 인코딩 - 카테고리 3개 이상

In [67]:
label_encoder = LabelEncoder()

In [68]:
# child number
data.loc[data['child_num'] > 2, 'child_num'] = 2;data['child_num'].value_counts()

0    25201
1     7492
2     3764
Name: child_num, dtype: int64

In [69]:
# income_type
data['income_type'] = label_encoder.fit_transform(data['income_type'])
data['income_type'].value_counts()

4    18819
0     8490
1     6152
2     2985
3       11
Name: income_type, dtype: int64

In [70]:
# edu_type
data['edu_type'] = label_encoder.fit_transform(data['edu_type'])
data['edu_type'].value_counts()

4    24777
1     9864
2     1410
3      374
0       32
Name: edu_type, dtype: int64

In [71]:
# family_type
data['family_type'] = label_encoder.fit_transform(data['family_type'])
data['family_type'].value_counts()

1    25048
3     4829
0     2945
2     2103
4     1532
Name: family_type, dtype: int64

In [72]:
# house_type
data['house_type'] = label_encoder.fit_transform(data['house_type'])
data['house_type'].value_counts()

1    32548
5     1776
2     1128
4      575
3      262
0      168
Name: house_type, dtype: int64

### 정규화 전 CORR 확인

In [73]:
data.corr()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,begin_month,credit
gender,1.0,0.361379,-0.050758,0.082905,0.197805,0.105639,0.00588,-0.099289,0.070301,0.202352,-0.173434,0.064994,-0.026833,-0.003284,0.152854,0.005079,0.001562
car,0.361379,1.0,-0.015185,0.108653,0.215506,0.054817,-0.101272,-0.121716,0.016337,0.157144,-0.156452,0.021644,-0.014019,0.02175,0.118775,-0.029627,0.007761
reality,-0.050758,-0.015185,1.0,-0.005346,0.032719,-0.046543,0.010997,0.022993,-0.179187,-0.129838,0.093006,-0.207732,-0.066601,0.052194,-0.05478,0.006476,-0.009387
child_num,0.082905,0.108653,-0.005346,1.0,0.039223,0.110941,-0.060559,-0.172429,0.031028,0.361501,-0.243993,0.051692,-0.014736,0.020113,0.175118,-0.005181,0.003441
income_total,0.197805,0.215506,0.032719,0.039223,1.0,-0.072974,-0.226931,-0.001191,-0.00648,0.067908,-0.168611,-0.037746,0.017245,0.086681,0.119265,-0.017494,0.008555
income_type,0.105639,0.054817,-0.046543,0.110941,-0.072974,1.0,0.057225,-0.048532,0.035694,0.213509,-0.363363,0.165785,0.006529,-0.019567,0.243698,-0.000846,-0.008163
edu_type,0.00588,-0.101272,0.010997,-0.060559,-0.226931,0.057225,1.0,0.007747,-0.036322,-0.169024,0.121874,-0.021092,-0.044875,-0.098457,-0.04126,0.014839,0.01378
family_type,-0.099289,-0.121716,0.022993,-0.172429,-0.001191,-0.048532,0.007747,1.0,0.010025,-0.106486,0.120113,-0.063103,-0.015065,-0.010895,-0.079639,0.030053,-0.00523
house_type,0.070301,0.016337,-0.179187,0.031028,-0.00648,0.035694,-0.036322,0.010025,1.0,0.211562,-0.107957,0.03008,-0.020254,0.012879,0.061358,0.030031,-0.009023
DAYS_BIRTH,0.202352,0.157144,-0.129838,0.361501,0.067908,0.213509,-0.169024,-0.106486,0.211562,1.0,-0.616213,0.179054,-0.028659,0.105625,0.418938,0.053913,-0.025187


### 2차 제거

In [74]:
data = data.drop(['work_phone','phone'], axis=1)

In [75]:
data.shape

(36457, 15)

### 정규화

In [76]:
data[['DAYS_BIRTH','DAYS_EMPLOYED','income_total','begin_month']][:4]

Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,income_total,begin_month
0,-13899,-4709,202500.0,-6.0
1,-11380,-1540,247500.0,-5.0
2,-19087,-4434,450000.0,-22.0
3,-15088,-2092,202500.0,-37.0


In [77]:
# DAYS_BIRTH :양수 및 중위값 처리
data['DAYS_BIRTH'] = data['DAYS_BIRTH']* -1 ;data['DAYS_BIRTH'][:5]

0    13899
1    11380
2    19087
3    15088
4    15037
Name: DAYS_BIRTH, dtype: int64

In [78]:
# DAYS_EMPLOYED : 이상값 중위수 처리
data.sort_values(by='DAYS_EMPLOYED',ascending =False)
indexs = data[data['DAYS_EMPLOYED']==365243].index
data_employed = data[data['DAYS_EMPLOYED'] != 365243]

# 이상값 중위수 처리
median_employed = data_employed['DAYS_EMPLOYED'].median()

# 양수 변환
data['DAYS_EMPLOYED']=data['DAYS_EMPLOYED'].replace(365243, median_employed)
data['DAYS_EMPLOYED']=data['DAYS_EMPLOYED'] * -1

In [79]:
# begin_month : 양수 변환
data['begin_month']=data['begin_month'] * -1

### 4개 변수 정규화

In [80]:
data['DAYS_BIRTH'] = (data['DAYS_BIRTH'] - data['DAYS_BIRTH'].mean()) / data['DAYS_BIRTH'].std()
data['DAYS_EMPLOYED'] = (data['DAYS_EMPLOYED'] - data['DAYS_EMPLOYED'].mean()) / data['DAYS_EMPLOYED'].std()
data['income_total'] = (data['income_total'] - data['income_total'].mean()) / data['income_total'].std()
data['begin_month'] = (data['begin_month'] - data['begin_month'].mean()) / data['begin_month'].std()

In [81]:
data[['DAYS_BIRTH','DAYS_EMPLOYED','income_total','begin_month']][:4]

Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,income_total,begin_month
0,-0.494262,1.00448,0.155363,-1.221935
1,-1.093946,-0.46009,0.597453,-1.282534
2,0.740814,0.877387,2.586858,-0.252347
3,-0.211204,-0.204981,0.155363,0.656642


### 전처리 데이터 최종 CORR 확인

In [82]:
data.corr()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,email,occyp_type,begin_month,credit
gender,1.0,0.361379,-0.050758,0.082905,0.197805,0.105639,0.00588,-0.099289,0.070301,-0.202352,-0.095072,-0.003284,0.152854,-0.005079,0.001562
car,0.361379,1.0,-0.015185,0.108653,0.215506,0.054817,-0.101272,-0.121716,0.016337,-0.157144,-0.047415,0.02175,0.118775,0.029627,0.007761
reality,-0.050758,-0.015185,1.0,-0.005346,0.032719,-0.046543,0.010997,0.022993,-0.179187,0.129838,-0.004734,0.052194,-0.05478,-0.006476,-0.009387
child_num,0.082905,0.108653,-0.005346,1.0,0.039223,0.110941,-0.060559,-0.172429,0.031028,-0.361501,-0.035149,0.020113,0.175118,0.005181,0.003441
income_total,0.197805,0.215506,0.032719,0.039223,1.0,-0.072974,-0.226931,-0.001191,-0.00648,-0.067908,0.037324,0.086681,0.119265,0.017494,0.008555
income_type,0.105639,0.054817,-0.046543,0.110941,-0.072974,1.0,0.057225,-0.048532,0.035694,-0.213509,0.086133,-0.019567,0.243698,0.000846,-0.008163
edu_type,0.00588,-0.101272,0.010997,-0.060559,-0.226931,0.057225,1.0,0.007747,-0.036322,0.169024,0.024306,-0.098457,-0.04126,-0.014839,0.01378
family_type,-0.099289,-0.121716,0.022993,-0.172429,-0.001191,-0.048532,0.007747,1.0,0.010025,0.106486,-0.013287,-0.010895,-0.079639,-0.030053,-0.00523
house_type,0.070301,0.016337,-0.179187,0.031028,-0.00648,0.035694,-0.036322,0.010025,1.0,-0.211562,-0.0614,0.012879,0.061358,-0.030031,-0.009023
DAYS_BIRTH,-0.202352,-0.157144,0.129838,-0.361501,-0.067908,-0.213509,0.169024,0.106486,-0.211562,1.0,0.187958,-0.105625,-0.418938,0.053913,0.025187


### 데이터분리

In [83]:
train = data[:-10000]
test = data[-10000:]

In [84]:
train_x = train.drop('credit',axis=1)
train_y=train['credit']
test_x = test.drop('credit', axis = 1)
print(train_x.shape, train_y.shape, test_x.shape)

(26457, 14) (26457,) (10000, 14)


### XGBOOST_train_test_split

In [85]:
x_train, x_val, y_train, y_val = train_test_split(train_x, train_y, test_size = 0.25, random_state = 33)
dtrain = xgb.DMatrix(data = x_train, label = y_train)
dtest = xgb.DMatrix(data = x_val, label = y_val)
print(x_train.shape,y_train.shape,x_val.shape, y_val.shape)

(19842, 14) (19842,) (6615, 14) (6615,)


### 하이퍼 파라미터

In [86]:
model = XGBClassifier(silent=False,
                             objective='multi:softprob',
                             eval_metric='mlogloss',
                             n_estimator=600)

param_grid={'max_depth': [10], 'learning_rate' :[0.1], 'subsample' : [0.9],
            'gamma' : [0.01,0.1,1], 'lambda' : [0.01,0.1,1]}
cv = KFold(n_splits=3 , random_state=33)

### gcv 실행(100개씩 출력)

In [87]:
gcv = GridSearchCV(model, param_grid=param_grid, cv=cv)
gcv.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=3, random_state=33, shuffle=False),
             error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eval_metric='mlogloss',
                                     gamma=0, learning_rate=0.1,
                                     max_delta_step=0, max_depth=3,
                                     min_child_weight=1, missing=None,
                                     n_estimator=600, n_estimators=100,
                                     n_jobs=1, nthr...
                                     objective='multi:softprob', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None,
                                     silent=False, subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
       

### 최적파라미터 출력

In [88]:
print(gcv.best_params_, gcv.best_score_)

{'gamma': 0.01, 'lambda': 0.01, 'learning_rate': 0.1, 'max_depth': 10, 'subsample': 0.9} 0.708900312468501
