In [106]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

import xgboost as xgb

### 2015 ~ 2020년도 데이터 가져오기

In [107]:
filenames=['C:/workspace/p-tag/KBO_prediction_data/baseball_' + str(x) + '.csv' for x in list(range(2015,2021))]
filenames

['C:/workspace/p-tag/KBO_prediction_data/baseball_2015.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2016.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2017.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2018.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2019.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2020.csv']

In [108]:
data = pd.DataFrame()
for filename in tqdm(filenames):
    temp = pd.read_csv(filename)
    data = pd.concat([data,temp])

100%|██████████| 6/6 [00:00<00:00, 84.61it/s]


In [109]:
baseball_data = data.copy()

### 전처리

In [110]:
def preprocessing(df):
    # 무승부 제거
    df = df[df['win']!=0.5]

    # 넥센 팀 명이 키움으로 변경됨
    df['T_ID'] = df['T_ID'].replace('NE','WO')
    df['VS_T_ID'] = df['VS_T_ID'].replace('NE','WO')

    # 팀 명 라벨링
    le = LabelEncoder()
    le.fit(df['T_ID'])
    df.loc[:,'T_ID'] = le.transform(df['T_ID'])
    df.loc[:,'VS_T_ID'] = le.transform(df['VS_T_ID'])
    
    return df

In [111]:
baseball_data = preprocessing(baseball_data)

In [112]:
train_data = baseball_data[['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']]
train_target = baseball_data['win']

train_x, test_x, train_y, test_y = train_test_split(train_data, train_target, test_size = 0.2, random_state = 42)

### RandomForest pred probs 구하기

In [113]:
pickled_RandomForest_model = pickle.load(open('C:/workspace/p-tag/model/random_forest.pkl', 'rb'))

In [114]:
pred = pickled_RandomForest_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8018


In [115]:
pred_probs_RandomForest = pickled_RandomForest_model.predict_proba(test_x)
pred_probs_RandomForest

array([[1.        , 0.        ],
       [0.59209104, 0.40790896],
       [0.62880041, 0.37119959],
       ...,
       [0.24332827, 0.75667173],
       [0.02423878, 0.97576122],
       [0.87817573, 0.12182427]])

### XGBoost pred probs 구하기

In [116]:
pickled_XGBoost_model = pickle.load(open('C:/workspace/p-tag/model/xgboost.pkl', 'rb'))

In [117]:
test_x

Unnamed: 0,RUN,AB,PA,OOO,T_ID,VS_T_ID,OBP
297,2.0,34.0,36.0,0.205882,2,1,0.250000
278,4.0,30.0,38.0,0.200000,3,9,0.297297
1158,4.0,37.0,41.0,0.243243,3,4,0.317073
1090,6.0,41.0,43.0,0.365854,5,8,0.395349
862,8.0,37.0,44.0,0.378378,4,1,0.452381
...,...,...,...,...,...,...,...
1461,0.0,28.0,32.0,0.142857,9,6,0.250000
388,5.0,36.0,41.0,0.277778,8,7,0.325000
608,6.0,35.0,38.0,0.342857,1,4,0.394737
415,9.0,37.0,41.0,0.432432,2,9,0.450000


In [118]:
dtest = xgb.DMatrix(data=test_x , label=test_y)

percent = pickled_XGBoost_model.predict(dtest)
preds = [ 1 if x > 0.5 else 0 for x in percent ]
accuracy_score(test_y , preds)

0.8240469208211144

In [119]:
pred_probs_xgboost = np.array([arr for arr in zip(1-percent,percent)])
pred_probs_xgboost

array([[0.97413635, 0.02586364],
       [0.5953207 , 0.4046793 ],
       [0.72930217, 0.27069783],
       ...,
       [0.26483792, 0.7351621 ],
       [0.01077127, 0.9892287 ],
       [0.9025383 , 0.09746172]], dtype=float32)

### GBM pred probs 구하기

In [120]:
pickled_GBM_model = pickle.load(open('C:/workspace/p-tag/model/GBM.pkl', 'rb'))

pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8012


In [121]:
pred_probs_GBM = pickled_GBM_model.predict_proba(test_x)
pred_probs_GBM

array([[0.94994869, 0.05005131],
       [0.53519716, 0.46480284],
       [0.75279493, 0.24720507],
       ...,
       [0.26636396, 0.73363604],
       [0.02154523, 0.97845477],
       [0.8692936 , 0.1307064 ]])

### LightGBM pred probs 구하기

In [122]:
pickled_LightGBM_model = pickle.load(open('C:/workspace/p-tag/model/LightGBM.pkl', 'rb'))

In [123]:
pred = pickled_LightGBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8158


In [124]:
pred_probs_LightGBM = pickled_LightGBM_model.predict_proba(test_x)
pred_probs_LightGBM

array([[0.94526932, 0.05473068],
       [0.65863036, 0.34136964],
       [0.76076573, 0.23923427],
       ...,
       [0.27434672, 0.72565328],
       [0.03806091, 0.96193909],
       [0.89002575, 0.10997425]])

## 각 모델들의 pred probs를 합산하여 승부를 예측한 결과의 정확도 구하기

In [125]:
total_pred_probs = (pred_probs_RandomForest + pred_probs_xgboost + pred_probs_GBM + pred_probs_LightGBM)/4


In [126]:
preds = [1 if total_pred_probs[index][0] < total_pred_probs[index][1] else 0 for index in range(len(total_pred_probs))]

print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, preds)))

예측 정확도: 0.8117


## 정리
- test set 데이터에 대한 각 모델들의 승부 예측 정확도와 모델 합산 시, 승부 예측 정확도

In [127]:
pred = pickled_RandomForest_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8018


In [128]:
pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8012


In [129]:
pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8012


In [130]:
pred = pickled_LightGBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8158


In [131]:
total_pred_probs = (pred_probs_RandomForest + pred_probs_xgboost + pred_probs_GBM + pred_probs_LightGBM)/4
preds = [1 if total_pred_probs[index][0] < total_pred_probs[index][1] else 0 for index in range(len(total_pred_probs))]

print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, preds)))

예측 정확도: 0.8117
