# KBO 경기 승패 예측 모델 학습 (RandomForest, XGBoost, GBM, LightGBM)

## 필요한 모듈 import

In [2]:
import pickle
import time
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import Trials, fmin, hp, tpe
from lightgbm import LGBMClassifier, early_stopping
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm


## 2015 ~ 2020년도 데이터 가져오기

In [3]:
# One CSV path per season, 2015 through 2020 inclusive.
filenames = [
    'C:/workspace/p-tag/KBO_prediction_data/baseball_' + str(season) + '.csv'
    for season in range(2015, 2021)
]
filenames

['C:/workspace/p-tag/KBO_prediction_data/baseball_2015.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2016.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2017.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2018.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2019.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2020.csv']

In [5]:
# Read every season file once and concatenate them in a single call.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# loaded rows on each pass. Row labels from each file are kept as-is
# (no ignore_index), matching the previous behaviour.
data = pd.concat([pd.read_csv(filename) for filename in tqdm(filenames)])

100%|██████████| 6/6 [00:00<00:00, 32.79it/s]


In [18]:
# Work on a copy so the concatenated `data` frame itself is not rebound or altered.
baseball_data = data.copy()

## 전처리 함수

In [19]:
def preprocessing(df):
    """Prepare raw game records for the classifiers.

    Steps:
      1. Drop drawn games (rows whose ``win`` equals 0.5).
      2. Rename team code ``NE`` to ``WO`` in both team columns
         (the Nexen club was renamed Kiwoom).
      3. Label-encode ``T_ID`` and ``VS_T_ID`` with one shared encoder.

    Args:
        df: DataFrame containing at least ``win``, ``T_ID`` and ``VS_T_ID``.

    Returns:
        A new DataFrame with draws removed and both team columns replaced by
        integer codes. The frame passed in is not modified.
    """
    # Take an explicit copy of the filtered rows so the column assignments
    # below act on an independent frame rather than on a slice of the input.
    df = df[df['win'] != 0.5].copy()

    team_cols = ['T_ID', 'VS_T_ID']

    # Team rename: Nexen (NE) became Kiwoom (WO).
    for col in team_cols:
        df[col] = df[col].replace('NE', 'WO')

    # Fit on the codes of BOTH columns: fitting on T_ID alone makes
    # transform() raise for a team that only ever appears as the opponent.
    le = LabelEncoder()
    le.fit(pd.concat([df['T_ID'], df['VS_T_ID']]))

    # Plain column assignment replaces the column, so it takes the encoder's
    # integer dtype. `df.loc[:, col] = ...` writes into the existing object
    # column on pandas >= 2.0, leaving an object dtype the boosters reject.
    for col in team_cols:
        df[col] = le.transform(df[col])

    return df


## 1. RandomForest 모델

### 전처리

In [21]:
# Run preprocessing() on the copy and rebind the name to the returned frame.
baseball_data = preprocessing(baseball_data)

### train, test set 나누기

In [23]:
# Feature matrix: the seven model input columns.
# NOTE(review): OBP comes before VS_T_ID here, while the XGBoost / GBM / LightGBM
# sections list VS_T_ID before OBP.
train_data = baseball_data[['RUN','AB','PA','OOO','T_ID','OBP','VS_T_ID']]
# Target: the `win` column.
train_target = baseball_data['win']

In [24]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(train_data, train_target, test_size = 0.2, random_state = 42)
# Print the four shapes as a quick check of the split sizes.
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) 

(6817, 7) (1705, 7) (6817,) (1705,)


### GridSearchCV

In [25]:
# Hyper-parameter grid: 6 x 6 x 7 = 252 combinations.
params={
    'max_depth':[4,8,12,16,20,24],
    'min_samples_leaf':[6,9,12,15,18,21],
    'min_samples_split':[8,16,24,32,40,48,56]
}

# NOTE(review): both the forest and the grid search request n_jobs=-1;
# confirm this nesting does not oversubscribe the CPU on the target machine.
rfc = RandomForestClassifier(n_estimators=50,random_state=42,n_jobs=-1)
# 5-fold cross-validation per combination; no `scoring` argument is passed.
grid_cv=GridSearchCV(rfc, param_grid=params,cv=5,n_jobs=-1)
grid_cv.fit(train_x,train_y)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1,
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [4, 8, 12, 16, 20, 24],
                         'min_samples_leaf': [6, 9, 12, 15, 18, 21],
                         'min_samples_split': [8, 16, 24, 32, 40, 48, 56]})

In [26]:
# Report the best parameter combination and its cross-validated score.
print('최적 하이퍼 파라미터:\n',grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 16, 'min_samples_leaf': 6, 'min_samples_split': 16}
최고 예측 정확도: 0.8106


### 정확도 측정

In [27]:
# accuracy_score is already imported in the notebook's import cell, so the
# duplicate mid-notebook import has been dropped.

# Take the best estimator found by the grid search and score it on the
# held-out test split.
rfc_model = grid_cv.best_estimator_
pred = rfc_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7994


### RandomForest 모델 저장
- model 폴더에 저장

In [28]:
# Persist the tuned forest. The context manager closes the file handle as soon
# as the dump finishes instead of leaving it to garbage collection.
with open('C:/workspace/p-tag/model/random_forest.pkl', 'wb') as model_file:
    pickle.dump(rfc_model, model_file)

## 2. XGBoost 모델

In [30]:
# Start this section again from a fresh copy of the concatenated raw `data`.
baseball_data = data.copy()

### 전처리

In [31]:
# Run preprocessing() on the fresh copy and rebind the name to the result.
baseball_data = preprocessing(baseball_data)

### train, test, val set 나누기

In [32]:
# Feature matrix (seven input columns) and target (`win`) for the XGBoost section.
train_data = baseball_data[['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
train_target = baseball_data['win']

In [33]:
# First split: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test=train_test_split(train_data, train_target,
                                         test_size=0.2, random_state=156 )

# Second split: carve 10% of the remaining training rows off as a validation set.
X_tr, X_val, y_tr, y_val= train_test_split(X_train, y_train, test_size=0.1, random_state=156 )

In [35]:
# Wrap the train / validation / test splits in DMatrix objects for xgb.train().
dtr = xgb.DMatrix(data=X_tr, label=y_tr)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test , label=y_test)

In [36]:
# Booster settings: depth-3 trees, learning rate (eta) 0.05, binary logistic
# objective, evaluated with log-loss.
params = { 'max_depth':3,
          'eta': 0.05,
          'objective':'binary:logistic',
          'eval_metric':'logloss'
         }
# Upper bound on boosting rounds passed to xgb.train().
num_rounds = 400

In [37]:
# Datasets evaluated after every boosting round, with their display names.
eval_list = [(dtr, 'train'), (dval, 'eval')]

# Train for at most `num_rounds` rounds with a 50-round early-stopping patience.
xgb_model = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=num_rounds,
    early_stopping_rounds=50,
    evals=eval_list,
)

[0]	train-logloss:0.67441	eval-logloss:0.67564
[1]	train-logloss:0.65708	eval-logloss:0.65964
[2]	train-logloss:0.64133	eval-logloss:0.64524
[3]	train-logloss:0.62707	eval-logloss:0.63236
[4]	train-logloss:0.61410	eval-logloss:0.62045
[5]	train-logloss:0.60204	eval-logloss:0.60943
[6]	train-logloss:0.59092	eval-logloss:0.59940
[7]	train-logloss:0.58086	eval-logloss:0.59013
[8]	train-logloss:0.57123	eval-logloss:0.58168
[9]	train-logloss:0.56257	eval-logloss:0.57352
[10]	train-logloss:0.55422	eval-logloss:0.56571
[11]	train-logloss:0.54649	eval-logloss:0.55863
[12]	train-logloss:0.53912	eval-logloss:0.55131
[13]	train-logloss:0.53213	eval-logloss:0.54510
[14]	train-logloss:0.52566	eval-logloss:0.53839
[15]	train-logloss:0.51980	eval-logloss:0.53311
[16]	train-logloss:0.51384	eval-logloss:0.52745
[17]	train-logloss:0.50853	eval-logloss:0.52230
[18]	train-logloss:0.50361	eval-logloss:0.51737
[19]	train-logloss:0.49875	eval-logloss:0.51283
[20]	train-logloss:0.49419	eval-logloss:0.50821
[2

### 정확도 측정

In [39]:
# Predicted values for the test matrix, thresholded at 0.5 into 0/1 labels.
pred_probs = xgb_model.predict(dtest)
preds = np.where(pred_probs > 0.5, 1, 0).tolist()
accuracy_score(y_test, preds)

0.7976539589442815

### 모델 저장

In [40]:
# Persist the trained booster. The context manager closes the file handle as
# soon as the dump finishes instead of leaving it to garbage collection.
with open('C:/workspace/p-tag/model/xgboost.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

## 3. GBM 모델

In [41]:
# Start this section again from a fresh copy of the concatenated raw `data`.
baseball_data = data.copy()

### 전처리

In [42]:
# Run preprocessing() on the fresh copy and rebind the name to the result.
baseball_data = preprocessing(baseball_data)

### train, test set 나누기

In [43]:
# Feature matrix (seven input columns) and target (`win`) for the GBM section.
train_data = baseball_data[['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
train_target = baseball_data['win']

# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(train_data, train_target, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(6817, 7) (1705, 7) (6817,) (1705,)


### RandomizedSearchCV

In [44]:
# Wall-clock timer for the whole search.
start_time = time.time()

# Sampling distributions for the randomized search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) draws from
# [loc, loc + scale], i.e. learning_rate is sampled from [0.001, 0.201].
params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(1, 20),
    'min_samples_leaf': randint(2, 20),
    'min_samples_split': randint(2, 20),
    'learning_rate': uniform(0.001, 0.2)
}

gb_clf = GradientBoostingClassifier(random_state = 0)
# random_state seeds the candidate sampling; without it every run draws a
# different set of 100 candidates and the reported best parameters cannot be
# reproduced.
random_cv = RandomizedSearchCV(gb_clf, params, n_iter=100, n_jobs=-1, random_state=0)
random_cv.fit(train_x, train_y)
best_rs_gbm = random_cv.best_estimator_

print('최적의 파라미터 :',random_cv.best_params_)
print('최고의 예측 정확도 :{0:.4f}'.format(random_cv.best_score_))
print('RandomSearchCV 수행 시간: {0:.1f} 초'.format(time.time() - start_time))

최적의 파라미터 : {'learning_rate': 0.1514343996306351, 'max_depth': 2, 'min_samples_leaf': 12, 'min_samples_split': 13, 'n_estimators': 129}
최고의 예측 정확도 :0.8128
RandomSearchCV 수행 시간: 707.8 초


In [45]:
# Score the best estimator from the randomized search on the held-out test split.
gb_pred3 = best_rs_gbm.predict(test_x)
gb_accuracy3 = accuracy_score(test_y, gb_pred3)
print('예측 정확도: {0:.4f}'.format(gb_accuracy3))

예측 정확도: 0.8012


### 모델 저장

In [46]:
# Persist the tuned GBM. The context manager closes the file handle as soon as
# the dump finishes instead of leaving it to garbage collection.
with open('C:/workspace/p-tag/model/GBM.pkl', 'wb') as model_file:
    pickle.dump(best_rs_gbm, model_file)

## 4. LightGBM 모델

In [49]:
# Start this section again from a fresh copy of the concatenated raw `data`.
baseball_data = data.copy()

### 전처리

In [50]:
# Run preprocessing() on the fresh copy and rebind the name to the result.
baseball_data = preprocessing(baseball_data)

### train, test, val set 나누기

In [51]:
# Feature matrix (seven input columns) and target (`win`) for the LightGBM section.
X_features = baseball_data[['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
y_labels = baseball_data['win']

# First split: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                    test_size=0.2, random_state=0)

# Second split: 30% of the remaining training rows become the validation set.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
                                                    test_size=0.3, random_state=0)

### 모델 훈련

In [52]:
# Baseline LightGBM with default hyper-parameters and up to 500 trees.
lgbm_clf = LGBMClassifier(n_estimators=500)

# Evaluate on both the training split and the validation split each round.
eval_set=[(X_tr, y_tr), (X_val, y_val)]
# Early stopping (patience 100) is requested through the callback API: the
# `early_stopping_rounds` keyword of fit() was removed in LightGBM 4.0, while
# the callback works on older releases as well.
lgbm_clf.fit(X_tr, y_tr, eval_metric="auc", eval_set=eval_set,
             callbacks=[early_stopping(stopping_rounds=100)])

# ROC AUC on the held-out test split, using the predicted probability of class 1.
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.885155	training's binary_logloss: 0.649735	valid_1's auc: 0.876037	valid_1's binary_logloss: 0.650289
[2]	training's auc: 0.886743	training's binary_logloss: 0.614228	valid_1's auc: 0.877704	valid_1's binary_logloss: 0.6156
[3]	training's auc: 0.891534	training's binary_logloss: 0.584721	valid_1's auc: 0.885381	valid_1's binary_logloss: 0.586048
[4]	training's auc: 0.891988	training's binary_logloss: 0.559921	valid_1's auc: 0.885899	valid_1's binary_logloss: 0.561648
[5]	training's auc: 0.895238	training's binary_logloss: 0.538236	valid_1's auc: 0.885171	valid_1's binary_logloss: 0.541897
[6]	training's auc: 0.898838	training's binary_logloss: 0.518782	valid_1's auc: 0.889046	valid_1's binary_logloss: 0.523196
[7]	training's auc: 0.899584	training's binary_logloss: 0.502714	valid_1's auc: 0.888826	valid_1's binary_logloss: 0.508352
[8]	training's auc: 0.900847	training's binary_logloss: 0.488296	valid_1's auc: 0.890662	valid_1's binary_logloss: 0.494223
[9]	traini

In [53]:
# HyperOpt search space for LightGBM. quniform entries are sampled in steps of 1;
# uniform entries are continuous.
# NOTE(review): max_depth is searched over 100-160 while num_leaves is capped at
# 64 -- confirm that such a large depth range is intended.
lgbm_search_space = {'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
                     'max_depth': hp.quniform('max_depth', 100, 160, 1),
                     'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1),
                     'subsample': hp.uniform('subsample', 0.7, 1),
                     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)

                    }

In [54]:
def objective_func(search_space):
    """HyperOpt objective: negative mean 3-fold ROC AUC of an LGBMClassifier.

    Reads the notebook-level ``X_train`` / ``y_train`` and splits them into
    three folds; the classifier is fitted on each training part with early
    stopping and scored on the matching validation part.

    Args:
        search_space: Mapping with ``num_leaves``, ``max_depth``,
            ``min_child_samples``, ``subsample`` and ``learning_rate``.
            The first three are cast to int before use.

    Returns:
        The mean validation ROC AUC over the three folds multiplied by -1,
        because HyperOpt minimises the objective.
    """
    lgbm_clf = LGBMClassifier(n_estimators=100, num_leaves=int(search_space['num_leaves']),
                              max_depth=int(search_space['max_depth']),
                              min_child_samples=int(search_space['min_child_samples']),
                              subsample=search_space['subsample'],
                              learning_rate=search_space['learning_rate'])
    # ROC AUC of each of the three folds.
    roc_auc_list = []

    kf = KFold(n_splits=3)
    for tr_index, val_index in kf.split(X_train):
        # Fold-local names, so the notebook-level X_tr / X_val / y_tr / y_val
        # are not shadowed inside this function.
        X_fold_tr, y_fold_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Fit the LGBMClassifier with a 30-round early-stopping patience.
        # The callback replaces the `early_stopping_rounds` keyword of fit(),
        # which was removed in LightGBM 4.0.
        lgbm_clf.fit(X_fold_tr, y_fold_tr, eval_metric="auc",
                     eval_set=[(X_fold_tr, y_fold_tr), (X_fold_val, y_fold_val)],
                     callbacks=[early_stopping(stopping_rounds=30)])

        # Score the fold with the predicted probability of class 1.
        score = roc_auc_score(y_fold_val, lgbm_clf.predict_proba(X_fold_val)[:, 1])
        roc_auc_list.append(score)

    # HyperOpt searches for the minimum, so return the mean AUC negated.
    return -1*np.mean(roc_auc_list)

In [55]:
# Trials object that records every evaluation made by fmin().
trials = Trials()

# Call fmin(): evaluate the objective max_evals times with the TPE algorithm and
# return the inputs that gave the smallest objective value.
best = fmin(fn=objective_func, space=lgbm_search_space, algo=tpe.suggest,
            max_evals=50, # maximum number of evaluations
            trials=trials, rstate=np.random.default_rng(seed=30))

print('best:', best)

[1]	training's auc: 0.887568	training's binary_logloss: 0.682551	valid_1's auc: 0.861869	valid_1's binary_logloss: 0.683211
[2]	training's auc: 0.887739	training's binary_logloss: 0.672442	valid_1's auc: 0.862048	valid_1's binary_logloss: 0.673757
[3]	training's auc: 0.888002	training's binary_logloss: 0.662795	valid_1's auc: 0.862407	valid_1's binary_logloss: 0.664699
[4]	training's auc: 0.888976	training's binary_logloss: 0.653541	valid_1's auc: 0.862516	valid_1's binary_logloss: 0.656124
[5]	training's auc: 0.890277	training's binary_logloss: 0.644676	valid_1's auc: 0.864416	valid_1's binary_logloss: 0.647885
[6]	training's auc: 0.891576	training's binary_logloss: 0.636195	valid_1's auc: 0.864689	valid_1's binary_logloss: 0.640017
[7]	training's auc: 0.892943	training's binary_logloss: 0.628048	valid_1's auc: 0.865327	valid_1's binary_logloss: 0.632641
[8]	training's auc: 0.893256	training's binary_logloss: 0.620214	valid_1's auc: 0.866075	valid_1's binary_logloss: 0.625425
[9]	trai

### 최적의 파라미터로 모델 훈련

In [56]:
# Rebuild the classifier from the HyperOpt result: the integer-valued
# parameters are cast to int and the continuous ones rounded to 5 decimals.
lgbm_clf =  LGBMClassifier(n_estimators=500, num_leaves=int(best['num_leaves']),
                           max_depth=int(best['max_depth']),
                           min_child_samples=int(best['min_child_samples']), 
                           subsample=round(best['subsample'], 5),
                           learning_rate=round(best['learning_rate'], 5)
                          )

# Train with AUC as the evaluation metric and a 100-round early-stopping
# patience. The callback replaces the `early_stopping_rounds` keyword of fit(),
# which was removed in LightGBM 4.0.
lgbm_clf.fit(X_tr, y_tr, eval_metric="auc",
             eval_set=[(X_tr, y_tr), (X_val, y_val)],
             callbacks=[early_stopping(stopping_rounds=100)])

# ROC AUC on the held-out test split, using the predicted probability of class 1.
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.878734	training's binary_logloss: 0.661187	valid_1's auc: 0.878983	valid_1's binary_logloss: 0.660608
[2]	training's auc: 0.879775	training's binary_logloss: 0.633711	valid_1's auc: 0.87909	valid_1's binary_logloss: 0.632851
[3]	training's auc: 0.880644	training's binary_logloss: 0.609922	valid_1's auc: 0.878413	valid_1's binary_logloss: 0.608991
[4]	training's auc: 0.881896	training's binary_logloss: 0.58928	valid_1's auc: 0.879775	valid_1's binary_logloss: 0.588427
[5]	training's auc: 0.884318	training's binary_logloss: 0.571023	valid_1's auc: 0.88138	valid_1's binary_logloss: 0.570282
[6]	training's auc: 0.885584	training's binary_logloss: 0.554804	valid_1's auc: 0.882635	valid_1's binary_logloss: 0.553975
[7]	training's auc: 0.88648	training's binary_logloss: 0.540324	valid_1's auc: 0.883442	valid_1's binary_logloss: 0.539637
[8]	training's auc: 0.888552	training's binary_logloss: 0.526841	valid_1's auc: 0.886299	valid_1's binary_logloss: 0.525922
[9]	training

In [57]:
# Accuracy of the tuned LightGBM model on the held-out test split.
y_pred = lgbm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8164222873900293


### 모델 저장

In [58]:
# Persist the tuned LightGBM model. The context manager closes the file handle
# as soon as the dump finishes instead of leaving it to garbage collection.
with open('C:/workspace/p-tag/model/LightGBM.pkl', 'wb') as model_file:
    pickle.dump(lgbm_clf, model_file)