In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# 데이터 불러오기
data = pd.read_csv("C:/Users/rltmdals/Downloads/otto_train.csv") # Product Category
data.head() # 데이터 확인

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
'''
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (1~9)
'''

'\nid: 고유 아이디\nfeat_1 ~ feat_93: 설명변수\ntarget: 타겟변수 (1~9)\n'

In [6]:
nCar = data.shape[0] # 데이터 개수
nVar = data.shape[1] # 변수 개수
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 61878 nVar: 95


In [7]:
# 의미가 없다고 판단되는 변수 제거
data = data.drop(['id'], axis = 1) # id 제거

In [8]:
data['target'].unique()

array(['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
       'Class_7', 'Class_8', 'Class_9'], dtype=object)

In [9]:
# 타겟 변수의 문자열을 숫자로 변환
mapping_dict = {'Class_1': 1,
                'Class_2': 2,
               'Class_3': 3,
               'Class_4': 4,
               'Class_5': 5,
               'Class_6': 6,
               'Class_7': 7,
               'Class_8': 8,
               'Class_9': 9}
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

In [24]:
after_mapping_target

0        1
1        1
2        1
3        1
4        1
        ..
61873    9
61874    9
61875    9
61876    9
61877    9
Name: target, Length: 61878, dtype: int64

In [10]:
# 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리
feature_columns = list(data.columns.difference(['target'])) # target을 제외한 모든 행
X = data[feature_columns] # 설명변수
y = after_mapping_target # 타켓 변수
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42) # 학습 데이터와 평가 데이터의 비율을 8:2로 분할
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(49502, 93) (12376, 93) (49502,) (12376,)


In [27]:
# 학습 데이터를 랜덤포레스트 모형에 적합 후 평가 데이터로 검증
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
random_forest_model1 = RandomForestClassifier(n_estimators = 20, # 20번 추정
                                             max_depth = 5, # 트리 최대 깊이 5
                                             random_state = 42) # 시드값 고정
model1 = random_forest_model1.fit(train_x, train_y) # 학습 진행
predict1 = model1.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict1) * 100), "%") # 정확도 % 계산

Accuracy: 60.16 %


In [30]:
# 트리를 많이 만들어보는건 어떨까?
random_forest_model3 = RandomForestClassifier(n_estimators = 300, # 300번 추정
                                             max_depth = 5, # 트리 최대 깊이 5
                                             random_state = 42) # 시드값 고정
model3 = random_forest_model3.fit(train_x, train_y) # 학습 진행
predict3 = model3.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict3) * 100), "%") # 정확도 % 계산

Accuracy: 61.73 %


In [31]:
# 그렇다면, 트리의 깊이를 늘려보는건 어떨까?
random_forest_model3 = RandomForestClassifier(n_estimators = 300, # 300번 추정
                                             max_depth = 20, # 트리 최대 깊이 20
                                             random_state = 42) # 시드값 고정
model3 = random_forest_model3.fit(train_x, train_y) # 학습 진행
predict3 = model3.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict3) * 100), "%") # 정확도 % 계산

Accuracy: 78.09 %


In [32]:
# 그렇다면, 트리의 깊이를 최대로 늘려보자!
random_forest_model3 = RandomForestClassifier(n_estimators = 300, # 300번 추정
                                             max_depth = 100, # 트리 최대 깊이 100
                                             random_state = 42) # 시드값 고정
model3 = random_forest_model3.fit(train_x, train_y) # 학습 진행
predict3 = model3.predict(test_x) # 평가 데이터 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, predict3) * 100), "%") # 정확도 % 계산

Accuracy: 81.23 %


In [None]:
# 다른 하이퍼파라미터에 대한 정보를 얻고싶으면 링크를 참조
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [11]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-win_amd64.whl (89.1 MB)
     ---------------------------------------- 89.1/89.1 MB 9.8 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3


In [12]:
# 1. XGBoost
import xgboost as xgb
import time
start = time.time() # 시작 시간 지정
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y) # 학습 데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data = test_x) # 평가 데이터를 XGBoost 모델에 맞게 변환
xgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees, 트리 생성 개수
            'objective': 'multi:softmax', # 목적 함수
            'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class] -> num_class 보다 1 커야한다.
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain) # 학습 진행
xgb_model_predict = xgb_model.predict(xgb_dtest) # 평가 데이터 예측
print('Accuracy: %.2f' % (accuracy_score(test_y, xgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), 'seconds') # 코드 실행 시간 계산

# 본문에 있는 seconds: 6.35
# 1. 67
# 훨씬 적게 나왔으나 이유를 모름

Parameters: { "n_estimators" } are not used.

Accuracy: 76.67 %
Time: 1.67 seconds


In [14]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.4-py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 10.8 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.4


In [15]:
# 2. LightGBM
import lightgbm as lgb
start = time.time()
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 lgb 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 100, # Number of trees, 트리 생성 개수
            'objective': 'multiclass', # 목적 함수
            'num_class': len(set(train_y)) + 1} # 파라미터 추가, Label must be in [0, num_class] -> num_class 보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis = 1) # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측
print('Accuracy: %.2f' % (accuracy_score(test_y, lgb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), 'seconds') # 코드 실행 시간 계산

# 본문과 똑같은 경고창



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 1.39 seconds


In [16]:
lgb_model.predict(test_x)
# 평가 데이터

array([[1.01734061e-15, 2.25081693e-02, 3.62193933e-01, ...,
        3.24234521e-02, 5.82126692e-02, 3.67722414e-02],
       [1.14084116e-15, 5.36978636e-02, 1.90687128e-01, ...,
        3.25081119e-01, 9.38028846e-02, 6.50463131e-02],
       [5.94595781e-16, 9.66842220e-03, 5.82817482e-02, ...,
        1.42318289e-02, 3.40230275e-02, 2.14919364e-02],
       ...,
       [7.09105769e-16, 4.63740004e-02, 1.08297559e-01, ...,
        5.46934960e-02, 7.24513712e-02, 5.74635996e-01],
       [9.88127136e-16, 1.54895684e-02, 5.45515599e-01, ...,
        2.45870954e-02, 5.65410617e-02, 3.62344513e-02],
       [7.59617500e-16, 1.49480877e-02, 7.44570300e-02, ...,
        5.76695793e-01, 1.43227106e-01, 2.74567219e-02]])

In [17]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.1.1-cp39-none-win_amd64.whl (74.0 MB)
     ---------------------------------------- 74.0/74.0 MB 9.6 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [34]:
# 3. Catboost
import catboost as cb
start = time.time() # 시작 시간 지정
cb_dtrain = cb.Pool(data = train_x, label = train_y) # 학습 데이터를 cb 모델에 맞게 변환
cb_param = {'max_depth': 10,
           'learning_rate': 0.01,
           'n_estimators': 100, # Number of trees, 트리 생성 개수
           'eval_metric': 'Accuracy', # 평가 척도
           'loss_function': 'MultiClass'} # 손실 함수, 목적 함수
cb_model = cb.train(pool = cb_dtrain, params = cb_param) # 학습 진행
cb_model_predict = np.argmax(cb_model.predict(test_x), axis = 1) + 1
    # 평가 데이터 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측, 인덱스의 순서를 맞추기 위해 +1
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict) * 100), "%") # 정확도 % 계산
print("Time: %.2f" % (time.time() - start), "seconds") # 코드 실행 시간 계산

0:	learn: 0.5907034	total: 180ms	remaining: 17.8s
1:	learn: 0.6356107	total: 367ms	remaining: 18s
2:	learn: 0.6411256	total: 549ms	remaining: 17.7s
3:	learn: 0.6480344	total: 722ms	remaining: 17.3s
4:	learn: 0.6508222	total: 905ms	remaining: 17.2s
5:	learn: 0.6499939	total: 1.08s	remaining: 17s
6:	learn: 0.6507818	total: 1.26s	remaining: 16.8s
7:	learn: 0.6548422	total: 1.44s	remaining: 16.6s
8:	learn: 0.6559533	total: 1.61s	remaining: 16.3s
9:	learn: 0.6560947	total: 1.79s	remaining: 16.1s
10:	learn: 0.6568421	total: 1.96s	remaining: 15.9s
11:	learn: 0.6588219	total: 2.14s	remaining: 15.7s
12:	learn: 0.6592259	total: 2.32s	remaining: 15.5s
13:	learn: 0.6611248	total: 2.49s	remaining: 15.3s
14:	learn: 0.6625591	total: 2.68s	remaining: 15.2s
15:	learn: 0.6631853	total: 2.86s	remaining: 15s
16:	learn: 0.6639328	total: 3.05s	remaining: 14.9s
17:	learn: 0.6668821	total: 3.25s	remaining: 14.8s
18:	learn: 0.6669630	total: 3.44s	remaining: 14.7s
19:	learn: 0.6675286	total: 3.62s	remaining: 14

In [36]:
cb_model.predict(test_x)

array([[-0.35426047,  1.22109587,  0.44230101, ..., -0.1698448 ,
        -0.02059177, -0.2130643 ],
       [-0.07235138,  0.42535181,  0.20060428, ...,  0.21863604,
         0.2719157 ,  0.25089315],
       [-0.3315885 , -0.31862353, -0.31279765, ..., -0.29798357,
        -0.24018767, -0.32984969],
       ...,
       [ 0.05304325,  0.02500267, -0.14752573, ..., -0.20741963,
         0.12789417,  1.51166757],
       [-0.55093666,  1.7691278 ,  0.99746884, ..., -0.3420542 ,
        -0.49799871, -0.38136323],
       [-0.3033724 ,  0.09352675, -0.11808658, ...,  0.65825036,
         1.05515787, -0.20799899]])

In [37]:
# 데이터 불러오기
data = pd.read_csv("C:/Users/rltmdals/Downloads/archive/kc_house_data.csv")
data.head() # 데이터 확인

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503


In [38]:
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis = 1) # id, date, zipcode, lat, long 제거

In [39]:
feature_columns = list(data.columns.difference(['price'])) # Price를 제외한 모든 행
X = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = 42) # 학습 데이터와 평가데이터의 비율을 7:3
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(15129, 15) (6484, 15) (15129,) (6484,)


In [40]:
import lightgbm as lgb
start = time.time() # 시작 시간 지정
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y) # 학습 데이터를 lgb 모델에 맞게 변환
lgb_param = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees, 트리 생성 개수
            'objective': 'regression'} # 파라미터 추가, Label must be in [0, num_class] -> num_class보다 1 커야한다.
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1748
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 15
[LightGBM] [Info] Start training from score 537640.173177


In [41]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

sqrt(mean_squared_error(lgb_model.predict(test_x),test_y))

194414.563521239

In [42]:
# Ensemble의 Ensemble
import random
bagging_predict_result = [] # 빈 리스트 생성
for _ in range(10):
    data_index = [data_index for data_index in range(train_x.shape[0])] # 학습 데이터의 인덱스를 리스트로 변환
    random_data_index = np.random.choice(data_index, train_x.shape[0]) # 데이터의 1/10 크기만큼 랜덤 샘플링, // 는 소수점을 무시하기 위함
    print(len(set(random_data_index)))
    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index,], label = train_y.iloc[random_data_index,]) # 학습 데이터를 LightGBM 모델에 맞게 변환
    lgb_param = {'max_depth': 14, # 트리 깊이
            'learning_rate': 0.01, # Step Size
            'n_estimators': 500, # Number of trees, 트리 생성 개수
            'objective': 'regression'} # 파라미터 추가, Label must be in [0, num_class) -> num_class보다 1 커야한다.
    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain) # 학습 진행
    predict1 = lgb_model.predict(test_x) # 테스트 데이터 예측
    bagging_predict_result.append(predict1) # 반복문이 실행되기 전 빈 리스트에 결과 값 저장

9542
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1713
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 15
[LightGBM] [Info] Start training from score 536568.129817




9546
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1717
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 15
[LightGBM] [Info] Start training from score 543752.281777
9523
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1711
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 15
[LightGBM] [Info] Start training from score 533275.197039
9637
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1718
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 15
[LightGBM] [Info] Start training from score 539615.313768
9644
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1713
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 15
[LightGBM] [Info] Start training from score 534813.650274
9579
You

In [43]:
bagging_predict_result

[array([ 522369.86511186,  685872.94483676, 1054786.09826361, ...,
         318796.08840448,  861476.88291368,  579165.37611421]),
 array([519000.07387236, 703715.79716215, 998367.30175623, ...,
        326857.63950465, 846197.34579836, 501522.03118248]),
 array([ 480384.90798231,  725594.77994711, 1056930.73281768, ...,
         320956.22422497,  922797.97267726,  568258.54845257]),
 array([494701.87197675, 634900.60715389, 965470.25885294, ...,
        317731.85405132, 867831.17532469, 552084.42577147]),
 array([474791.01606616, 682978.05304722, 989918.74782277, ...,
        322651.39625573, 873558.03984669, 519080.67697301]),
 array([465859.71991054, 706277.67591311, 960961.88292524, ...,
        329942.86024912, 862894.77391578, 537530.72584571]),
 array([ 513905.50291342,  747838.94029182, 1152160.09089167, ...,
         326854.45772701,  821286.42926266,  497326.69882469]),
 array([492564.28337081, 656993.44077947, 997969.54805493, ...,
        315805.82713717, 931122.23093821, 4

In [44]:
# Bagging을 바탕으로 예측한 결과값에 대한 평균을 계산
bagging_predict = [] # 빈 리스트 생성
for lst2_index in range(test_x.shape[0]): # 테스트 데이터 개수만큼의 반복
    temp_predict = [] # 임시 빈 리스트 생성 (반복문 내 결과값 저장)
    for lst_index in range(len(bagging_predict_result)): # Bagging 결과 리스트 반복
        temp_predict.append(bagging_predict_result[lst_index][lst2_index]) # 각 Bagging 결과 예측한 값 중 같은 인덱스를 리스트에 저장
    bagging_predict.append(np.mean(temp_predict)) # 해당 인덱스의 30개의 결과값에 대한 평균을 최종 리스트에 추가

In [45]:
# 예측한 결과값들의 평균을 계산하여 실제 테스트 데이트의 타겟변수와 비교하여 성능 평가

print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

RMSE: 194338.6933637178


In [46]:
bagging_predict

[495843.93293519877,
 689389.2914028508,
 1022102.8040399983,
 1444224.0159512465,
 653538.989769556,
 341478.7147594405,
 639265.6224328497,
 446175.8599202386,
 436701.3306284988,
 508469.16988159565,
 669594.0428821519,
 374995.4249257528,
 324236.9642219649,
 399236.2045122292,
 323676.7794997232,
 1159399.198179995,
 416480.15114795783,
 947463.3245112037,
 277911.60746277776,
 424568.1458704059,
 322069.47077677224,
 1981426.2959421617,
 700142.2622568869,
 661679.7266763524,
 478242.2393693449,
 444601.4742277799,
 275369.4856138602,
 255569.92797829016,
 457180.7643892265,
 488964.64383606176,
 346011.7988703665,
 536585.5417488688,
 435089.4517497874,
 422540.6748966217,
 503748.37916394806,
 1061132.7051865633,
 862996.4089264369,
 484129.7120183059,
 338096.9163346468,
 1759775.421635089,
 415746.9971057721,
 285805.34370833053,
 489571.3162551214,
 357007.50937248516,
 249449.0211955873,
 257219.81699725013,
 357108.58885737037,
 385970.5496549489,
 312164.7737380976,
 7250