In [104]:
# basic module
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np
import os

# Light GBM
import lightgbm as lgb

# custom module
from Preprocessing import *

In [195]:
# Load the data: read the train and test CSVs from ./data under the current
# working directory and remove the 'FLAG_MOBIL' and 'index' variables from both.
cp = os.getcwd()
train, test = (
    pd.read_csv(os.path.join(cp, csv_name)).drop(columns=['FLAG_MOBIL', 'index'])
    for csv_name in ('data/train.csv', 'data/test.csv')
)

### 전처리

In [178]:
def preprocessing_train(data):
    """Prepare the labelled training frame for modelling.

    Casts the categorical columns to ``object`` and the numeric columns to
    ``float``, one-hot encodes the features with ``pd.get_dummies`` and
    separates the ``credit`` label.

    The input frame is not modified; all casts are applied to a new frame.

    Args:
        data: DataFrame containing the categorical columns, the numeric
            columns listed below, and the ``credit`` label column.

    Returns:
        Tuple ``(train_X, train_y)``: the one-hot encoded feature DataFrame
        (without ``credit``) and the ``credit`` Series cast to ``int``.

    Raises:
        KeyError: if any of the expected columns is missing from ``data``.
    """
    # Columns treated as categorical (cast to object so get_dummies encodes them,
    # including the 0/1 integer flags work_phone / phone / email).
    categorical_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type',
                        'family_type', 'house_type', 'work_phone', 'phone',
                        'email', 'occyp_type']
    # Columns treated as continuous.
    numeric_cols = ['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
                    'family_size', 'begin_month']

    # 'credit' is deliberately NOT part of the object cast: the previous code
    # listed it among the 12 target columns while supplying only 11 source
    # columns, which raises "Columns must be same length as key".
    dtype_map = {**dict.fromkeys(categorical_cols, object),
                 **dict.fromkeys(numeric_cols, float)}
    typed = data.astype(dtype_map)  # returns a new frame; caller's data is untouched

    # Create dummy variables and split into X / y.
    train_X = pd.get_dummies(typed.drop(['credit'], axis=1))
    train_y = typed['credit'].astype(int)

    return train_X, train_y
   

In [190]:
def preprocessing_test(data, columns=None):
    """Prepare an unlabelled frame for prediction.

    Casts the categorical columns to ``object`` and the numeric columns to
    ``float``, then one-hot encodes the frame with ``pd.get_dummies``.

    The input frame is not modified; all casts are applied to a new frame.

    Args:
        data: DataFrame containing the categorical and numeric columns
            listed below (no label column is expected).
        columns: optional iterable of feature names. When given, the encoded
            frame is reindexed to exactly these columns in this order;
            columns absent from ``data`` are filled with 0 and extra columns
            are dropped. Pass the training feature columns here so that the
            result lines up with what the model was trained on. When omitted
            the encoded frame is returned as produced by ``pd.get_dummies``.

    Returns:
        The one-hot encoded feature DataFrame.

    Raises:
        KeyError: if any of the expected columns is missing from ``data``.
    """
    # Columns treated as categorical (cast to object so get_dummies encodes them,
    # including the 0/1 integer flags work_phone / phone / email).
    categorical_cols = ['gender', 'car', 'reality', 'income_type', 'edu_type',
                        'family_type', 'house_type', 'work_phone', 'phone',
                        'email', 'occyp_type']
    # Columns treated as continuous.
    numeric_cols = ['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
                    'family_size', 'begin_month']

    dtype_map = {**dict.fromkeys(categorical_cols, object),
                 **dict.fromkeys(numeric_cols, float)}
    typed = data.astype(dtype_map)  # returns a new frame; caller's data is untouched

    # Create dummy variables.
    X = pd.get_dummies(typed)

    if columns is not None:
        # Categories seen only in one of train/test would otherwise yield
        # differently shaped feature matrices.
        X = X.reindex(columns=list(columns), fill_value=0)

    return X

### 데이터 분할

In [179]:
def data_split(train_X, train_y):
    """Split features and labels into a training part and a validation part.

    30% of the rows are held out for validation. Shuffling is disabled, so
    the rows keep their original order.

    Args:
        train_X: feature table to split.
        train_y: labels aligned row-by-row with ``train_X``.

    Returns:
        Tuple ``(train_X, valid_X, train_y, valid_y)``, each converted to a
        NumPy array.
    """
    parts = train_test_split(
        train_X,
        train_y,
        test_size=0.3,
        shuffle=False,
        random_state=100,
    )
    # train_test_split yields: X train, X valid, y train, y valid (in that order).
    return tuple(np.array(part) for part in parts)

In [180]:
# Encode the training frame into features/labels, then hold out part of it for validation.
# Note: train_X / train_y are rebound here from the full encoded set to the training portion.
train_X, train_y = preprocessing_train(train)
train_X, valid_X, train_y, valid_y = data_split(train_X, train_y)

In [181]:
def learning_model(train_X, valid_X, train_y, valid_y):
    """Train a 3-class LightGBM model and evaluate it on the validation data.

    Args:
        train_X: training features.
        valid_X: validation features.
        train_y: training labels.
        valid_y: validation labels.

    Returns:
        The trained model object returned by ``lgb.train`` after 100
        boosting rounds.
    """
    # Hyperparameters.
    params = {
        'learning_rate': 0.003,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'sub_feature': 0.5,
        'num_leaves': 10,
        'min_data': 50,
        'max_depth': 10,
        'num_class': 3,
    }

    # Wrap the arrays as LightGBM datasets; the validation set is only used
    # for metric reporting during training.
    fit_set = lgb.Dataset(train_X, label=train_y)
    eval_set = lgb.Dataset(valid_X, label=valid_y)

    return lgb.train(params, fit_set, num_boost_round=100, valid_sets=[eval_set])

In [182]:
# Train the model on the training split, using the validation split for evaluation.
model = learning_model(train_X, valid_X, train_y, valid_y)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 804
[LightGBM] [Info] Number of data points in the train set: 18519, number of used features: 52
[LightGBM] [Info] Start training from score -2.108312
[LightGBM] [Info] Start training from score -1.442434
[LightGBM] [Info] Start training from score -0.442847
[1]	valid_0's multi_logloss: 0.884693
[2]	valid_0's multi_logloss: 0.884649
[3]	valid_0's multi_logloss: 0.884608
[4]	valid_0's multi_logloss: 0.884353
[5]	valid_0's multi_logloss: 0.883849
[6]	valid_0's multi_logloss: 0.883811
[7]	valid_0's multi_logloss: 0.883519
[8]	valid_0's multi_logloss: 0.883006
[9]	valid_0's multi_logloss: 0.882498
[10]	valid_0's multi_logloss: 0.882238
[11]	valid_0's multi_logloss: 0.88196
[12]	valid_0's multi_logloss: 0.881705
[13]	valid_0's multi_logloss: 0.881433
[14]	valid_0's multi_logloss: 0.88119
[15]	valid_0's multi_logloss: 0.880702
[16]	valid_0's m

In [196]:
# Encode the test frame for prediction.
# NOTE(review): `test` is rebound to its encoded form, so re-running this cell
# without reloading the data presumably fails (the raw columns are gone) — confirm.
# NOTE(review): the dummy columns come from the test data alone; verify they
# match the feature columns the model was trained on.
test = preprocessing_test(test)

In [198]:
# Run the trained model on the encoded test features.
pred = model.predict(test)

In [199]:
# Display the values returned by model.predict.
pred

array([[0.11691949, 0.22609848, 0.65698202],
       [0.11963536, 0.22362204, 0.6567426 ],
       [0.11926132, 0.22564137, 0.65509731],
       ...,
       [0.11667745, 0.22320115, 0.6601214 ],
       [0.11341521, 0.22511236, 0.66147244],
       [0.12476193, 0.23705646, 0.6381816 ]])