In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
#데이터 불러오기
data = pd.read_csv('./Data/otto_train.csv')
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [3]:
'''
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (1~9)
'''

'\nid: 고유 아이디\nfeat_1 ~ feat_93: 설명변수\ntarget: 타겟변수 (1~9)\n'

In [5]:
nCar = data.shape[0] # 데이터 개수
nVar = data.shape[1] # 변수 개수
print(nCar,nVar)

61878 95


### 의미 없는 변수 제거

In [6]:
data = data.drop(['id'], axis = 1)

### 타겟 변수의 문자열을 숫자로 변환

In [7]:
mapping_dict = {"Class_1": 1,
                "Class_2": 2,
                "Class_3": 3,
                "Class_4": 4,
                "Class_5": 5,
                "Class_6": 6,
                "Class_7": 7,
                "Class_8": 8,
                "Class_9": 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

### 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [10]:
feature_columns = list(data.columns.difference(['target']))
x = data[feature_columns]
y = after_mapping_target
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 42) # 학습데이터와 평가데이터의 비율을 8:2 로 분할| 
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(49502, 93) (12376, 93) (49502,) (12376,)


### 학습 데이터를 AdaBoost 모형에 적합 후 평가 데이터로 검증

In [11]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree = DecisionTreeClassifier(max_depth=5)
AdaBoost = AdaBoostClassifier(base_estimator=tree,
                             n_estimators=20,
                             random_state = 42)
model1 = AdaBoost.fit(train_x,train_y)
pred1 = model1.predict(test_x)
print(accuracy_score(test_y,pred1) * 100)

63.23529411764706


### 더 많은 추정해보기

In [12]:
Adaboost2 = AdaBoostClassifier(base_estimator = tree, # 트리모델을 기본으로 추정
                                    n_estimators = 300, # 300회 추정
                                    random_state = 42) # 시드값 고정
model2 = Adaboost2.fit(train_x, train_y) # 학습 진행
predict2 = model2.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict2) * 100), "%") # 정확도 % 계산

Accuracy: 64.14 %


### 트리의 깊이를 늘려보기

In [13]:
tree_model2 = DecisionTreeClassifier(max_depth = 20) # 트리 최대 깊이 20으로 새로 정의
Adaboost_model3 = AdaBoostClassifier(base_estimator = tree_model2, # 새 트리 모델을 기본으로 추정
                                     n_estimators = 300, # 300회 추정
                                     random_state = 42) # 시드값 고정
model3 = Adaboost_model3.fit(train_x, train_y) # 학습 진행
predict3 = model3.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict3) * 100), "%") # 정확도 % 계산

Accuracy: 79.92 %


### 트리의 깊이를 최대로 늘려보기

In [14]:
tree_model3 = DecisionTreeClassifier(max_depth = 100) # 트리 최대 깊이 100으로 새로 정의
Adaboost_model4 = AdaBoostClassifier(base_estimator = tree_model3, # 새 트리 모델을 기본으로 추정
                                     n_estimators = 300, # 300회 추정
                                     random_state = 42) # 시드값 고정
model4 = Adaboost_model4.fit(train_x, train_y) # 학습 진행
predict4 = model4.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict4) * 100), "%") # 정확도 % 계산

Accuracy: 70.81 %


### 학습 데이터를 XGBoost 모형에 적합 후 평가 데이터로 검증

In [18]:
import xgboost as xgb
import time
start = time.time() #시작 시간 지정
xgb_dtrain = xgb.DMatrix(data=train_x, label=train_y) # 학습 데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data = test_x) # 평가 데이터를 XGBoost 모델에 맞게 변환
xgb_param = {'max_depth': 10, # 트리 깊이
         'learning_rate': 0.01, # Step Size
         'n_estimators': 100, # Number of trees, 트리 생성 개수
         'objective': 'multi:softmax', # 목적 함수
        'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # 학습 진행
xgb_model_predict = xgb_model.predict(xgb_dtest) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 76.67 %
Time: 7.59 seconds


In [19]:
xgb_model_predict

array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)

### 학습 데이터를 XGBoost 모형에 적합 후 평가 데이터로 검증

In [20]:
import lightgbm as lgb
start = time.time() # 시작 시간 지정
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 LightGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees, 트리 생성 개수
            'objective': 'multiclass', # 목적 함수
            'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 3.92 seconds


In [21]:
lgb_model.predict(test_x)

array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,
        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],
       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,
        3.25081119e-01, 9.38028846e-02, 6.50463131e-02],
       [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,
        1.42318289e-02, 3.40230275e-02, 2.14919364e-02],
       ...,
       [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,
        5.46934960e-02, 7.24513712e-02, 5.74635996e-01],
       [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,
        2.45870954e-02, 5.65410617e-02, 3.62344513e-02],
       [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,
        5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])

### 학습 데이터를 XGBoost 모형에 적합 후 평가 데이터로 검증

In [22]:
import catboost as cb
start = time.time() # 시작 시간 지정
cb_dtrain = cb.Pool(data = train_x, label = train_y) # 학습 데이터를 Catboost 모델에 맞게 변환
cb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees, 트리 생성 개수
            'eval_metric': 'Accuracy', # 평가 척도
            'loss_function': 'MultiClass'} # 손실 함수, 목적 함수
cb_model = cb.train(pool = cb_dtrain, params = cb_param) # 학습 진행
cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1 # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측, 인덱스의 순서를 맞추기 위해 +1
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

0:	learn: 0.5907034	total: 646ms	remaining: 1m 3s
1:	learn: 0.6356107	total: 1.34s	remaining: 1m 5s
2:	learn: 0.6411256	total: 2.05s	remaining: 1m 6s
3:	learn: 0.6480344	total: 2.72s	remaining: 1m 5s
4:	learn: 0.6508222	total: 3.39s	remaining: 1m 4s
5:	learn: 0.6499939	total: 4.13s	remaining: 1m 4s
6:	learn: 0.6507818	total: 4.87s	remaining: 1m 4s
7:	learn: 0.6548422	total: 5.67s	remaining: 1m 5s
8:	learn: 0.6559533	total: 6.37s	remaining: 1m 4s
9:	learn: 0.6560947	total: 7.08s	remaining: 1m 3s
10:	learn: 0.6568421	total: 7.76s	remaining: 1m 2s
11:	learn: 0.6588219	total: 8.61s	remaining: 1m 3s
12:	learn: 0.6592259	total: 9.31s	remaining: 1m 2s
13:	learn: 0.6611248	total: 10.1s	remaining: 1m 1s
14:	learn: 0.6625591	total: 10.8s	remaining: 1m 1s
15:	learn: 0.6631853	total: 11.5s	remaining: 1m
16:	learn: 0.6639328	total: 12.3s	remaining: 1m
17:	learn: 0.6668821	total: 13.1s	remaining: 59.6s
18:	learn: 0.6669630	total: 13.8s	remaining: 59s
19:	learn: 0.6675286	total: 14.7s	remaining: 58.6

In [23]:
cb_model.predict(test_x)

array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,
        -0.02059177, -0.2130643 ],
       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,
         0.2719157 ,  0.25089315],
       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,
        -0.24018767, -0.32984969],
       ...,
       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,
         0.12789417,  1.51166757],
       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,
        -0.49799871, -0.38136323],
       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,
         1.05515787, -0.20799899]])

### 새로운 데이터로 앙상블 모델 해보기

In [24]:
# 데이터 불러오기
data = pd.read_csv("./Data/kc_house_data.csv") 
data.head() # 데이터 확인

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [25]:
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # id, date, zipcode, lat, long  제거

In [26]:
feature_columns = list(data.columns.difference(['price'])) # Price를 제외한 모든 행
x = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 42) # 학습데이터와 평가데이터의 비율을 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(15129, 8) (6484, 8) (15129,) (6484,)


In [27]:
import lightgbm as lgb
start = time.time() # 시작 시간 지정
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 LightGBM 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees, 트리 생성 개수
            'objective': 'regression'} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537729.263666


In [28]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sqrt(mean_squared_error(lgb_model.predict(test_x),test_y))

210904.17249451784

### Ensemble의 Ensemble

In [29]:
import random
bagging_predict_result = [] # 빈 리스트 생성
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])] # 학습 데이터의 인덱스를 리스트로 변환
    random_data_index = np.random.choice(data_index, train_x.shape[0]) # 데이터의 1/10 크기만큼 랜덤 샘플링, // 는 소수점을 무시하기 위함
    print(len(set(random_data_index)))
    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # 학습 데이터를 LightGBM 모델에 맞게 변환
    lgb_param = {'max_depth': 14, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees, 트리 생성 개수
            'objective': 'regression'} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
    predict1 = lgb_model.predict(test_x) # 테스트 데이터 예측
    bagging_predict_result.append(predict1) # 반복문이 실행되기 전 빈 리스트에 결과 값 저장

9586
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536489.340274




9533
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 536186.398374
9561
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537924.224536
9556
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537944.965100
9559
You can set `force_row_wise=true` to remove the overhead.
And if 

In [30]:
bagging_predict_result

[array([507253.3866251 , 659586.02376282, 921778.26051183, ...,
        363994.64941803, 907335.59599342, 453267.07203624]),
 array([ 523041.42667995,  624390.04819792, 1035112.3486997 , ...,
         335089.07638276,  941202.27335475,  456945.59804323]),
 array([501082.07330118, 636448.01797546, 962702.29818413, ...,
        339826.90760119, 920179.88267951, 457641.25234293]),
 array([ 490709.93003947,  600085.42712056, 1018997.81337543, ...,
         339579.91144283,  901906.87483598,  455147.97379538]),
 array([514404.76663979, 656126.16586637, 906849.60459994, ...,
        319866.88132659, 915834.25849097, 464921.99126374]),
 array([487920.42338702, 639920.64526923, 954732.44311262, ...,
        348247.87685872, 989063.52285168, 464428.38884282]),
 array([492004.80111638, 614784.29377639, 900416.20156853, ...,
        336711.88827712, 946207.34003005, 486722.98770536]),
 array([496229.92496608, 587476.60824038, 912643.79691362, ...,
        349647.02515463, 893255.23710104, 462204.

In [31]:
# Bagging을 바탕으로 예측한 결과값에 대한 평균을 계산
bagging_predict = [] # 빈 리스트 생성
for lst2_index in range(test_x.shape[0]): # 테스트 데이터 개수만큼의 반복
    temp_predict = [] # 임시 빈 리스트 생성 (반복문 내 결과값 저장)
    for lst_index in range(len(bagging_predict_result)): # Bagging 결과 리스트 반복
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # 각 Bagging 결과 예측한 값 중 같은 인덱스를 리스트에 저장
    bagging_predict.append(np.mean(temp_predict)) # 해당 인덱스의 30개의 결과값에 대한 평균을 최종 리스트에 추가

In [32]:
# 예측한 결과값들의 평균을 계산하여 실제 테스트 데이트의 타겟변수와 비교하여 성능 평가

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 211204.12925643113


In [33]:
bagging_predict

[500101.0275659395,
 625251.8398773337,
 953607.1222845137,
 1625448.880721765,
 637409.6677421046,
 368633.7881387803,
 697387.7101177999,
 433982.1799216671,
 464701.00663997914,
 498286.4722826725,
 645024.2467170774,
 383204.3377799808,
 298688.99246290093,
 359742.64434098726,
 339211.33049235353,
 1265932.934480607,
 373711.9681859823,
 1046176.8498400789,
 317608.0474228215,
 523011.33408646256,
 375594.0518530584,
 1810688.1934308554,
 664327.6408161392,
 533051.7488045788,
 507620.35926165135,
 483741.8048304993,
 294410.82522664394,
 252533.02583555473,
 477728.5215858297,
 535181.3549899565,
 495618.0699986238,
 470948.06113383325,
 462634.15734258585,
 584360.1335150127,
 375504.63316235127,
 1034013.6669728311,
 910581.3145368124,
 524468.7119506458,
 353391.56530882284,
 1547420.642368413,
 387137.0286535895,
 277367.7831772934,
 509911.00167048787,
 342036.30631919584,
 253555.26528125428,
 248137.0870182556,
 332012.1862137965,
 332768.0448444452,
 352033.81930312794,
 