In [98]:
import pickle
import time
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from hyperopt import Trials, fmin, hp, tpe
from lightgbm import LGBMClassifier, early_stopping
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm

### 2015 ~ 2020년도 데이터 가져오기

In [27]:
# One CSV path per season, 2015 through 2020 inclusive.
filenames = [
    f'C:/workspace/p-tag/KBO_prediction_data/baseball_{season}.csv'
    for season in range(2015, 2021)
]
filenames

['C:/workspace/p-tag/KBO_prediction_data/baseball_2015.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2016.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2017.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2018.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2019.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2020.csv']

In [28]:
# Read every season file and concatenate them in a single call. Growing a
# DataFrame with pd.concat inside the loop re-copies all previously loaded
# rows on each iteration. Row labels of each file are kept (no ignore_index),
# exactly as the loop version did.
data = pd.concat([pd.read_csv(filename) for filename in tqdm(filenames)])

100%|██████████| 6/6 [00:00<00:00, 26.62it/s]


In [29]:
# Work on a copy of the concatenated frame instead of on `data` itself.
baseball_data = data.copy()

### 무승부 제거

In [30]:
# Keep only rows whose 'win' value is not 0.5 (the heading labels 0.5 rows as draws).
baseball_data = baseball_data.loc[baseball_data['win'].ne(0.5)]

### 팀 명 라벨링

In [31]:
# Encode team identifiers as integers. The encoder is fitted on the labels of
# BOTH team columns so a team that only ever appears in VS_T_ID cannot make
# transform() raise on an unseen label.
le = LabelEncoder()
le.fit(pd.concat([baseball_data['T_ID'], baseball_data['VS_T_ID']]))
# assign() builds a new frame with replaced columns. Writing through
# `.loc[:, col] = ...` would (a) write into a frame that is itself a filtered
# slice and (b) on pandas >= 2.0 set the values into the existing column
# in place, so an object-dtype column would keep object dtype.
baseball_data = baseball_data.assign(
    T_ID=le.transform(baseball_data['T_ID']),
    VS_T_ID=le.transform(baseball_data['VS_T_ID']),
)
baseball_data.head()

Unnamed: 0,G_ID,GDAY_DS,T_ID,VS_T_ID,HEADER_NO,TB_SC,PA,AB,RBI,RUN,...,HP,KK,GD,LOB,P_HRA_RT,P_AB_CN,P_HIT_CN,OBP,OOO,win
0,20150328HHNE0,20150328,0,6,0,T,54.0,42.0,4.0,4.0,...,1.0,7.0,0.0,24.0,0.133333,15,2,0.352941,0.238095,0.0
1,20150328HHNE0,20150328,6,0,0,B,46.0,40.0,5.0,5.0,...,0.0,7.0,1.0,18.0,0.090909,11,1,0.304348,0.2,1.0
2,20150328KTLT0,20150328,2,4,0,T,46.0,35.0,8.0,9.0,...,1.0,8.0,1.0,23.0,0.285714,14,4,0.511111,0.4,0.0
3,20150328KTLT0,20150328,4,2,0,B,42.0,37.0,12.0,12.0,...,1.0,8.0,1.0,17.0,0.4375,16,7,0.439024,0.378378,1.0
4,20150328LGHT0,20150328,3,1,0,T,34.0,29.0,1.0,1.0,...,0.0,6.0,2.0,17.0,0.2,10,2,0.333333,0.241379,0.0


### train_data, train_target 나누기

In [32]:
# Model inputs are a fixed set of feature columns; the target is the 'win' column.
train_data = baseball_data.loc[:, ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
train_target = baseball_data.loc[:, 'win']

In [33]:
# Hold out 20% of the rows for evaluation (80/20 train/test split).
train_x, test_x, train_y, test_y = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
)
# Check the size of each split.
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(6817, 7) (1705, 7) (6817,) (1705,)


## RandomForest

### GridSearchCV - RandomForest

In [34]:
# Hyperparameter grid explored for the random forest.
params = dict(
    max_depth=list(range(4, 25, 4)),
    min_samples_leaf=list(range(6, 22, 3)),
    min_samples_split=list(range(8, 57, 8)),
)

# Exhaustive 5-fold search over the grid with a 50-tree forest.
rfc = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
grid_cv = GridSearchCV(estimator=rfc, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1,
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [4, 8, 12, 16, 20, 24],
                         'min_samples_leaf': [6, 9, 12, 15, 18, 21],
                         'min_samples_split': [8, 16, 24, 32, 40, 48, 56]})

In [35]:
# Report the best parameter combination found by the grid search and its score.
print('최적 하이퍼 파라미터:\n',grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 12, 'min_samples_leaf': 6, 'min_samples_split': 16}
최고 예측 정확도: 0.8102


### 정확도 측정

In [36]:
# NOTE(review): accuracy_score is already imported in the first cell; this re-import is redundant.
from sklearn.metrics import accuracy_score

# Take the best estimator selected by the grid search and score it on the held-out split.
rfc_model = grid_cv.best_estimator_
pred = rfc_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7982


### 확률 구하기

In [38]:
# Per-row class probabilities from the tuned forest on the test split
# (one column per class; two columns in the output shown below).
pred_probs_RandomForest = rfc_model.predict_proba(test_x)
pred_probs_RandomForest

array([[0.98995634, 0.01004366],
       [0.69927466, 0.30072534],
       [0.68268686, 0.31731314],
       ...,
       [0.19706674, 0.80293326],
       [0.00791667, 0.99208333],
       [0.88065818, 0.11934182]])

### 모델 저장

In [39]:
# Serialize the tuned random forest to a pickle file. The context manager
# closes the handle deterministically, so the file is fully flushed before
# anything reads it back.
with open('C:/workspace/p-tag/model/random_forest.pkl', 'wb') as model_file:
    pickle.dump(rfc_model, model_file)


### 모델 불러오기

In [40]:
# Load the persisted forest back; `with` closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that this project wrote itself.
with open('C:/workspace/p-tag/model/random_forest.pkl', 'rb') as model_file:
    pickled_RandomForest_model = pickle.load(model_file)

### 불러온 모델로 정확도 측정

In [41]:
# Score the reloaded model on the same test split to confirm the pickle round trip.
pred = pickled_RandomForest_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7982


## XGBoost

In [42]:
# Restart the XGBoost section from a fresh copy of the concatenated frame in `data`.
baseball_data = data.copy()

In [43]:
# Encode team identifiers as integers. The encoder is fitted on the labels of
# BOTH team columns so a team that only ever appears in VS_T_ID cannot make
# transform() raise on an unseen label.
le = LabelEncoder()
le.fit(pd.concat([baseball_data['T_ID'], baseball_data['VS_T_ID']]))
# assign() builds a new frame with replaced columns. On pandas >= 2.0,
# `.loc[:, col] = ...` sets values into the existing column in place, so an
# object-dtype column would keep object dtype instead of becoming integer.
baseball_data = baseball_data.assign(
    T_ID=le.transform(baseball_data['T_ID']),
    VS_T_ID=le.transform(baseball_data['VS_T_ID']),
)
baseball_data.head()

Unnamed: 0,G_ID,GDAY_DS,T_ID,VS_T_ID,HEADER_NO,TB_SC,PA,AB,RBI,RUN,...,HP,KK,GD,LOB,P_HRA_RT,P_AB_CN,P_HIT_CN,OBP,OOO,win
0,20150328HHNE0,20150328,0,6,0,T,54.0,42.0,4.0,4.0,...,1.0,7.0,0.0,24.0,0.133333,15,2,0.352941,0.238095,0.0
1,20150328HHNE0,20150328,6,0,0,B,46.0,40.0,5.0,5.0,...,0.0,7.0,1.0,18.0,0.090909,11,1,0.304348,0.2,1.0
2,20150328KTLT0,20150328,2,4,0,T,46.0,35.0,8.0,9.0,...,1.0,8.0,1.0,23.0,0.285714,14,4,0.511111,0.4,0.0
3,20150328KTLT0,20150328,4,2,0,B,42.0,37.0,12.0,12.0,...,1.0,8.0,1.0,17.0,0.4375,16,7,0.439024,0.378378,1.0
4,20150328LGHT0,20150328,3,1,0,T,34.0,29.0,1.0,1.0,...,0.0,6.0,2.0,17.0,0.2,10,2,0.333333,0.241379,0.0


In [44]:
# Keep only rows whose 'win' value is not 0.5.
baseball_data = baseball_data.loc[baseball_data['win'].ne(0.5)]

In [45]:
# Feature matrix and target vector for the XGBoost section.
train_data = baseball_data.loc[:, ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
train_target = baseball_data.loc[:, 'win']

In [50]:
# First split: 80% train / 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=156,
)

# Second split: carve 10% of the training part off as a validation set.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=156,
)

In [52]:
# Shapes of the outer train/test split, then of the inner train/validation split.
print(X_train.shape , X_test.shape)
print(X_tr.shape, X_val.shape)

(6817, 7) (1705, 7)
(6135, 7) (682, 7)


In [53]:
# Wrap each split in XGBoost's DMatrix container (features plus labels).
dtr = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [54]:
# Booster settings: depth-3 trees, learning rate 0.05, binary logistic
# objective, log loss as the evaluation metric.
params = dict(
    max_depth=3,
    eta=0.05,
    objective='binary:logistic',
    eval_metric='logloss',
)
# Upper bound on the number of boosting rounds.
num_rounds = 400

In [55]:
# Datasets evaluated after every boosting round: the training part is labelled
# 'train' and the validation part 'eval'.
eval_list = [(dtr, 'train'), (dval, 'eval')]

# Train for at most num_rounds rounds with a 50-round early-stopping patience.
xgb_model = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=num_rounds,
    evals=eval_list,
    early_stopping_rounds=50,
)

[0]	train-logloss:0.67441	eval-logloss:0.67564
[1]	train-logloss:0.65708	eval-logloss:0.65964
[2]	train-logloss:0.64133	eval-logloss:0.64524
[3]	train-logloss:0.62707	eval-logloss:0.63236
[4]	train-logloss:0.61410	eval-logloss:0.62045
[5]	train-logloss:0.60204	eval-logloss:0.60943
[6]	train-logloss:0.59092	eval-logloss:0.59940
[7]	train-logloss:0.58086	eval-logloss:0.59013
[8]	train-logloss:0.57123	eval-logloss:0.58168
[9]	train-logloss:0.56257	eval-logloss:0.57352
[10]	train-logloss:0.55422	eval-logloss:0.56571
[11]	train-logloss:0.54649	eval-logloss:0.55863
[12]	train-logloss:0.53912	eval-logloss:0.55131
[13]	train-logloss:0.53213	eval-logloss:0.54510
[14]	train-logloss:0.52566	eval-logloss:0.53839
[15]	train-logloss:0.51980	eval-logloss:0.53311
[16]	train-logloss:0.51384	eval-logloss:0.52745
[17]	train-logloss:0.50853	eval-logloss:0.52230
[18]	train-logloss:0.50361	eval-logloss:0.51737
[19]	train-logloss:0.49875	eval-logloss:0.51283
[20]	train-logloss:0.49419	eval-logloss:0.50821
[2

In [59]:
# predict() yields probabilities here (see the first printed line below).
pred_probs = xgb_model.predict(dtest)
print('predict( ) 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨')
print(np.round(pred_probs[:10],3))

# Label a row 1 when its probability is above 0.5, otherwise 0, and keep the
# labels as a plain Python list named preds.
preds = (pred_probs > 0.5).astype(int).tolist()
print('예측값 10개만 표시:',preds[:10])

predict( ) 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨
[0.259 0.034 0.484 0.919 0.022 0.971 0.258 0.181 0.494 0.965]
예측값 10개만 표시: [0, 0, 0, 1, 0, 1, 0, 0, 0, 1]


### 정확도 측정

In [62]:
# Accuracy of the thresholded XGBoost predictions against the test labels.
accuracy_score(y_test , preds)

0.8011730205278592

### 모델 저장

In [60]:
# Serialize the trained booster to a pickle file; the context manager closes
# the handle so the file is fully written before it is read back.
with open('C:/workspace/p-tag/model/xgboost.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

### 모델 불러오기

In [61]:
# Load the persisted booster back; `with` closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that this project wrote itself.
with open('C:/workspace/p-tag/model/xgboost.pkl', 'rb') as model_file:
    pickled_XGBoost_model = pickle.load(model_file)

### 불러온 모델로 정확도 측정하기

In [63]:
# Score the reloaded booster: probabilities -> 0/1 labels at the 0.5 threshold -> accuracy.
pred_probs = pickled_XGBoost_model.predict(dtest)
preds = (pred_probs > 0.5).astype(int).tolist()
accuracy_score(y_test, preds)

0.8011730205278592

## GBM

In [64]:
# Restart the GBM section from a fresh copy of the concatenated frame in `data`.
baseball_data = data.copy()

In [66]:
# Encode team identifiers as integers. The encoder is fitted on the labels of
# BOTH team columns so a team that only ever appears in VS_T_ID cannot make
# transform() raise on an unseen label.
le = LabelEncoder()
le.fit(pd.concat([baseball_data['T_ID'], baseball_data['VS_T_ID']]))
# assign() builds a new frame with replaced columns. On pandas >= 2.0,
# `.loc[:, col] = ...` sets values into the existing column in place, so an
# object-dtype column would keep object dtype instead of becoming integer.
baseball_data = baseball_data.assign(
    T_ID=le.transform(baseball_data['T_ID']),
    VS_T_ID=le.transform(baseball_data['VS_T_ID']),
)
baseball_data.head()

Unnamed: 0,G_ID,GDAY_DS,T_ID,VS_T_ID,HEADER_NO,TB_SC,PA,AB,RBI,RUN,...,HP,KK,GD,LOB,P_HRA_RT,P_AB_CN,P_HIT_CN,OBP,OOO,win
0,20150328HHNE0,20150328,0,6,0,T,54.0,42.0,4.0,4.0,...,1.0,7.0,0.0,24.0,0.133333,15,2,0.352941,0.238095,0.0
1,20150328HHNE0,20150328,6,0,0,B,46.0,40.0,5.0,5.0,...,0.0,7.0,1.0,18.0,0.090909,11,1,0.304348,0.2,1.0
2,20150328KTLT0,20150328,2,4,0,T,46.0,35.0,8.0,9.0,...,1.0,8.0,1.0,23.0,0.285714,14,4,0.511111,0.4,0.0
3,20150328KTLT0,20150328,4,2,0,B,42.0,37.0,12.0,12.0,...,1.0,8.0,1.0,17.0,0.4375,16,7,0.439024,0.378378,1.0
4,20150328LGHT0,20150328,3,1,0,T,34.0,29.0,1.0,1.0,...,0.0,6.0,2.0,17.0,0.2,10,2,0.333333,0.241379,0.0


In [67]:
# Keep only rows whose 'win' value is not 0.5.
baseball_data = baseball_data.loc[baseball_data['win'].ne(0.5)]

In [68]:
# Feature matrix and target vector for the GBM section.
train_data = baseball_data.loc[:, ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
train_target = baseball_data.loc[:, 'win']

In [69]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (80/20 train/test split).
train_x, test_x, train_y, test_y = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(6817, 7) (1705, 7) (6817,) (1705,)


### RandomSearch로 훈련

In [75]:
start_time = time.time()

# Sampling distributions for the randomized search. scipy's randint(low, high)
# excludes `high`; uniform(loc, scale) covers [loc, loc + scale].
params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(1, 20),
    'min_samples_leaf': randint(2, 20),
    'min_samples_split': randint(2, 20),
    'learning_rate': uniform(0.001, 0.2)
}

gb_clf = GradientBoostingClassifier(random_state = 0)
# random_state seeds the parameter sampling: without it each run draws a
# different set of 100 candidates, so the selected model and the reported
# scores could not be reproduced.
random_cv = RandomizedSearchCV(gb_clf, params, n_iter=100, n_jobs=-1, random_state=0)
random_cv.fit(train_x, train_y)
best_rs_gbm = random_cv.best_estimator_

print('최적의 파라미터 :',random_cv.best_params_)
print('최고의 예측 정확도 :{0:.4f}'.format(random_cv.best_score_))
print('RandomSearchCV 수행 시간: {0:.1f} 초'.format(time.time() - start_time))

최적의 파라미터 : {'learning_rate': 0.15606830094214136, 'max_depth': 2, 'min_samples_leaf': 6, 'min_samples_split': 11, 'n_estimators': 120}
최고의 예측 정확도 :0.8116
RandomSearchCV 수행 시간: 712.7 초


In [77]:
# Score the best estimator from the randomized search on the held-out split.
gb_pred3 = best_rs_gbm.predict(test_x)
gb_accuracy3 = accuracy_score(test_y, gb_pred3)
print('예측 정확도: {0:.4f}'.format(gb_accuracy3))

예측 정확도: 0.8000


### 모델 저장

In [78]:
# Serialize the tuned gradient-boosting model; the context manager closes the
# handle so the file is fully written before it is read back.
with open('C:/workspace/p-tag/model/GBM.pkl', 'wb') as model_file:
    pickle.dump(best_rs_gbm, model_file)

### 모델 불러오기

In [79]:
# Load the persisted model back; `with` closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that this project wrote itself.
with open('C:/workspace/p-tag/model/GBM.pkl', 'rb') as model_file:
    pickled_GBM_model = pickle.load(model_file)

### 불러온 모델로 정확도 측정

In [80]:
# Score the reloaded model on the same test split to confirm the pickle round trip.
pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8000


## LightGBM

In [91]:
# Restart the LightGBM section from a fresh copy of the concatenated frame in `data`.
baseball_data = data.copy()

In [92]:
# Keep only rows whose 'win' value is not 0.5.
baseball_data = baseball_data.loc[baseball_data['win'].ne(0.5)]

In [93]:
# Encode team identifiers as integers. The encoder is fitted on the labels of
# BOTH team columns so a team that only ever appears in VS_T_ID cannot make
# transform() raise on an unseen label.
le = LabelEncoder()
le.fit(pd.concat([baseball_data['T_ID'], baseball_data['VS_T_ID']]))
# assign() builds a new frame with replaced columns. Writing through
# `.loc[:, col] = ...` would (a) write into a frame that is itself a filtered
# slice and (b) on pandas >= 2.0 set the values into the existing column
# in place, so an object-dtype column would keep object dtype.
baseball_data = baseball_data.assign(
    T_ID=le.transform(baseball_data['T_ID']),
    VS_T_ID=le.transform(baseball_data['VS_T_ID']),
)
baseball_data.head()

Unnamed: 0,G_ID,GDAY_DS,T_ID,VS_T_ID,HEADER_NO,TB_SC,PA,AB,RBI,RUN,...,HP,KK,GD,LOB,P_HRA_RT,P_AB_CN,P_HIT_CN,OBP,OOO,win
0,20150328HHNE0,20150328,0,6,0,T,54.0,42.0,4.0,4.0,...,1.0,7.0,0.0,24.0,0.133333,15,2,0.352941,0.238095,0.0
1,20150328HHNE0,20150328,6,0,0,B,46.0,40.0,5.0,5.0,...,0.0,7.0,1.0,18.0,0.090909,11,1,0.304348,0.2,1.0
2,20150328KTLT0,20150328,2,4,0,T,46.0,35.0,8.0,9.0,...,1.0,8.0,1.0,23.0,0.285714,14,4,0.511111,0.4,0.0
3,20150328KTLT0,20150328,4,2,0,B,42.0,37.0,12.0,12.0,...,1.0,8.0,1.0,17.0,0.4375,16,7,0.439024,0.378378,1.0
4,20150328LGHT0,20150328,3,1,0,T,34.0,29.0,1.0,1.0,...,0.0,6.0,2.0,17.0,0.2,10,2,0.333333,0.241379,0.0


In [94]:
# Feature matrix and label vector for the LightGBM section.
X_features = baseball_data.loc[:, ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
y_labels = baseball_data.loc[:, 'win']

In [95]:
# Outer split: 80% train / 20% test. This rebinds X_train / X_test / y_train /
# y_test that the XGBoost section defined earlier.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                    test_size=0.2, random_state=0)

In [96]:
# Inner split: 30% of the training part becomes the validation set that the
# later fit() calls pass as the second entry of eval_set.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
                                                    test_size=0.3, random_state=0)

In [99]:
# Baseline LightGBM model with default hyperparameters and up to 500 trees.
lgbm_clf = LGBMClassifier(n_estimators=500)

# Early stopping (patience 100 rounds, metric AUC) is requested through the
# early_stopping callback: the `early_stopping_rounds` argument of fit() is
# deprecated and was removed in LightGBM 4.0, while the callback works on
# both older and newer releases.
eval_set=[(X_tr, y_tr), (X_val, y_val)]
lgbm_clf.fit(X_tr, y_tr, eval_metric="auc", eval_set=eval_set,
             callbacks=[early_stopping(stopping_rounds=100)])

# ROC AUC on the test split, from the predicted probability of class 1.
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.885155	training's binary_logloss: 0.649735	valid_1's auc: 0.876037	valid_1's binary_logloss: 0.650289
[2]	training's auc: 0.887104	training's binary_logloss: 0.61421	valid_1's auc: 0.877542	valid_1's binary_logloss: 0.615538
[3]	training's auc: 0.891744	training's binary_logloss: 0.584687	valid_1's auc: 0.88508	valid_1's binary_logloss: 0.586158
[4]	training's auc: 0.893374	training's binary_logloss: 0.559743	valid_1's auc: 0.885055	valid_1's binary_logloss: 0.56225
[5]	training's auc: 0.895899	training's binary_logloss: 0.537961	valid_1's auc: 0.884701	valid_1's binary_logloss: 0.542258
[6]	training's auc: 0.899379	training's binary_logloss: 0.518664	valid_1's auc: 0.888836	valid_1's binary_logloss: 0.52369
[7]	training's auc: 0.900141	training's binary_logloss: 0.502841	valid_1's auc: 0.888672	valid_1's binary_logloss: 0.509255
[8]	training's auc: 0.901524	training's binary_logloss: 0.488366	valid_1's auc: 0.890798	valid_1's binary_logloss: 0.495203
[9]	training

In [100]:
# Hyperopt search space. quniform(label, low, high, 1) is used for the three
# parameters that the objective function casts to int; uniform draws a float.
lgbm_search_space = dict(
    num_leaves=hp.quniform('num_leaves', 32, 64, 1),
    max_depth=hp.quniform('max_depth', 100, 160, 1),
    min_child_samples=hp.quniform('min_child_samples', 60, 100, 1),
    subsample=hp.uniform('subsample', 0.7, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
)

In [101]:
def objective_func(search_space):
    """Hyperopt objective: negative mean 3-fold ROC AUC of an LGBMClassifier.

    Args:
        search_space: Mapping with the keys 'num_leaves', 'max_depth',
            'min_child_samples', 'subsample' and 'learning_rate'. The first
            three are cast to int before being passed to the classifier.

    Returns:
        The mean validation ROC AUC over the folds multiplied by -1, because
        hyperopt searches for the input that minimizes the objective.

    Reads the notebook-level X_train / y_train. The fold variables assigned
    inside the loop are local and do not touch the notebook-level
    X_tr / X_val / y_tr / y_val.
    """
    # NOTE(review): LightGBM applies `subsample` only when subsample_freq > 0,
    # which is not set here -- confirm whether tuning it has any effect.
    lgbm_clf =  LGBMClassifier(n_estimators=100, num_leaves=int(search_space['num_leaves']),
                               max_depth=int(search_space['max_depth']),
                               min_child_samples=int(search_space['min_child_samples']),
                               subsample=search_space['subsample'],
                               learning_rate=search_space['learning_rate'])
    # ROC AUC of each of the 3 folds.
    roc_auc_list = []

    # 3-fold cross-validation over the training part.
    kf = KFold(n_splits=3)
    for tr_index, val_index in kf.split(X_train):
        # Positional indices from kf.split select this fold's train/validation rows.
        X_tr, y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Fit the LGBMClassifier on this fold with a 30-round early-stopping
        # patience. The callback replaces the `early_stopping_rounds` fit()
        # argument, which is deprecated and was removed in LightGBM 4.0.
        lgbm_clf.fit(X_tr, y_tr, eval_metric="auc",
                     eval_set=[(X_tr, y_tr), (X_val, y_val)],
                     callbacks=[early_stopping(stopping_rounds=30)])

        # ROC AUC of the class-1 probabilities on this fold's validation rows.
        score = roc_auc_score(y_val, lgbm_clf.predict_proba(X_val)[:, 1])
        roc_auc_list.append(score)

    # Return the mean fold AUC negated, since hyperopt minimizes the objective.
    return -1*np.mean(roc_auc_list)

In [102]:
trials = Trials()

# Run the TPE search for at most 50 evaluations of objective_func and keep the
# input values that produced the smallest objective value.
best = fmin(
    fn=objective_func,
    space=lgbm_search_space,
    algo=tpe.suggest,
    max_evals=50,  # maximum number of evaluations
    trials=trials,
    rstate=np.random.default_rng(seed=30),
)

print('best:', best)

[1]	training's auc: 0.887765	training's binary_logloss: 0.682542	valid_1's auc: 0.862369	valid_1's binary_logloss: 0.683194
[2]	training's auc: 0.888196	training's binary_logloss: 0.672424	valid_1's auc: 0.862347	valid_1's binary_logloss: 0.673741
[3]	training's auc: 0.888554	training's binary_logloss: 0.662774	valid_1's auc: 0.862287	valid_1's binary_logloss: 0.664688
[4]	training's auc: 0.8896	training's binary_logloss: 0.653518	valid_1's auc: 0.861841	valid_1's binary_logloss: 0.65615
[5]	training's auc: 0.890892	training's binary_logloss: 0.644643	valid_1's auc: 0.864399	valid_1's binary_logloss: 0.647895
[6]	training's auc: 0.892105	training's binary_logloss: 0.636154	valid_1's auc: 0.864666	valid_1's binary_logloss: 0.640021
[7]	training's auc: 0.893231	training's binary_logloss: 0.628003	valid_1's auc: 0.865335	valid_1's binary_logloss: 0.632642
[8]	training's auc: 0.893755	training's binary_logloss: 0.620166	valid_1's auc: 0.866099	valid_1's binary_logloss: 0.625432
[9]	trainin

In [103]:
# Rebuild the classifier with the hyperparameters selected by hyperopt
# (integer-valued ones cast to int, float-valued ones rounded to 5 decimals)
# and a larger budget of 500 trees.
lgbm_clf =  LGBMClassifier(n_estimators=500, num_leaves=int(best['num_leaves']),
                           max_depth=int(best['max_depth']),
                           min_child_samples=int(best['min_child_samples']),
                           subsample=round(best['subsample'], 5),
                           learning_rate=round(best['learning_rate'], 5)
                          )

# Train with AUC as the evaluation metric and a 100-round early-stopping
# patience. The callback replaces the `early_stopping_rounds` fit() argument,
# which is deprecated and was removed in LightGBM 4.0.
lgbm_clf.fit(X_tr, y_tr, eval_metric="auc",
             eval_set=[(X_tr, y_tr), (X_val, y_val)],
             callbacks=[early_stopping(stopping_rounds=100)])

# ROC AUC on the test split, from the predicted probability of class 1.
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.878704	training's binary_logloss: 0.652437	valid_1's auc: 0.879099	valid_1's binary_logloss: 0.65175
[2]	training's auc: 0.880219	training's binary_logloss: 0.619039	valid_1's auc: 0.879006	valid_1's binary_logloss: 0.618061
[3]	training's auc: 0.882467	training's binary_logloss: 0.591147	valid_1's auc: 0.881471	valid_1's binary_logloss: 0.590415
[4]	training's auc: 0.885627	training's binary_logloss: 0.567326	valid_1's auc: 0.882926	valid_1's binary_logloss: 0.566786
[5]	training's auc: 0.886979	training's binary_logloss: 0.546861	valid_1's auc: 0.882601	valid_1's binary_logloss: 0.547078
[6]	training's auc: 0.889618	training's binary_logloss: 0.529127	valid_1's auc: 0.886386	valid_1's binary_logloss: 0.529031
[7]	training's auc: 0.891581	training's binary_logloss: 0.513388	valid_1's auc: 0.888812	valid_1's binary_logloss: 0.513148
[8]	training's auc: 0.892996	training's binary_logloss: 0.499696	valid_1's auc: 0.888598	valid_1's binary_logloss: 0.500737
[9]	train

In [114]:
# Accuracy of the tuned LightGBM classifier's predictions on the test split.
y_pred = lgbm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8105571847507331


In [115]:
# Class probabilities from the tuned LightGBM model on the test split.
# This rebinds `pred_probs`, which the XGBoost section used earlier.
pred_probs = lgbm_clf.predict_proba(X_test)

### 모델 저장

In [116]:
# Serialize the tuned LightGBM classifier; the context manager closes the
# handle so the file is fully written before it is read back.
with open('C:/workspace/p-tag/model/LightGBM.pkl', 'wb') as model_file:
    pickle.dump(lgbm_clf, model_file)

### 모델 불러오기

In [117]:
# Load the persisted classifier back; `with` closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files
# that this project wrote itself.
with open('C:/workspace/p-tag/model/LightGBM.pkl', 'rb') as model_file:
    pickled_LightGBM_model = pickle.load(model_file)

### 불러온 모델로 정확도 측정

In [118]:
# Score the reloaded model on the same test split to confirm the pickle round trip.
pred = pickled_LightGBM_model.predict(X_test)
print('예측 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

예측 정확도: 0.8106
