In [1]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

import xgboost as xgb

### 2015 ~ 2020년도 데이터 가져오기

In [2]:
filenames=['C:/workspace/p-tag/KBO_prediction_data/baseball_' + str(x) + '.csv' for x in list(range(2015,2021))]
filenames

['C:/workspace/p-tag/KBO_prediction_data/baseball_2015.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2016.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2017.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2018.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2019.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2020.csv']

In [3]:
data = pd.DataFrame()
for filename in tqdm(filenames):
    temp = pd.read_csv(filename)
    data = pd.concat([data,temp])

100%|██████████| 6/6 [00:00<00:00, 27.00it/s]


In [4]:
baseball_data = data.copy()

### 전처리

In [5]:
def preprocessing(df):
    # 무승부 제거
    df = df[df['win']!=0.5]

    # 넥센 팀 명이 키움으로 변경됨
    df['T_ID'] = df['T_ID'].replace('NE','WO')
    df['VS_T_ID'] = df['VS_T_ID'].replace('NE','WO')

    # 팀 명 라벨링
    le = LabelEncoder()
    le.fit(df['T_ID'])
    df.loc[:,'T_ID'] = le.transform(df['T_ID'])
    df.loc[:,'VS_T_ID'] = le.transform(df['VS_T_ID'])
    
    return df

In [6]:
baseball_data = preprocessing(baseball_data)

In [7]:
train_data = baseball_data[['RUN','AB','PA','OOO','T_ID','OBP']]
train_target = baseball_data['win']

train_x, test_x, train_y, test_y = train_test_split(train_data, train_target, test_size = 0.2, random_state = 42)

### RandomForest pred probs 구하기

In [8]:
pickled_RandomForest_model = pickle.load(open('C:/workspace/p-tag/model/random_forest(except vs_t_id).pkl', 'rb'))

In [9]:
pred = pickled_RandomForest_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7988


In [10]:
pred_probs_RandomForest = pickled_RandomForest_model.predict_proba(test_x)
pred_probs_RandomForest

array([[0.94540602, 0.05459398],
       [0.640831  , 0.359169  ],
       [0.68648816, 0.31351184],
       ...,
       [0.24791193, 0.75208807],
       [0.03281916, 0.96718084],
       [0.8919361 , 0.1080639 ]])

### XGBoost pred probs 구하기

In [12]:
pickled_XGBoost_model = pickle.load(open('C:/workspace/p-tag/model/xgboost(except vs_t_id).pkl', 'rb'))

In [13]:
test_x

Unnamed: 0,RUN,AB,PA,OOO,T_ID,OBP
297,2.0,34.0,36.0,0.205882,2,0.250000
278,4.0,30.0,38.0,0.200000,3,0.297297
1158,4.0,37.0,41.0,0.243243,3,0.317073
1090,6.0,41.0,43.0,0.365854,5,0.395349
862,8.0,37.0,44.0,0.378378,4,0.452381
...,...,...,...,...,...,...
1461,0.0,28.0,32.0,0.142857,9,0.250000
388,5.0,36.0,41.0,0.277778,8,0.325000
608,6.0,35.0,38.0,0.342857,1,0.394737
415,9.0,37.0,41.0,0.432432,2,0.450000


In [14]:
dtest = xgb.DMatrix(data=test_x , label=test_y)

percent = pickled_XGBoost_model.predict(dtest)
preds = [ 1 if x > 0.5 else 0 for x in percent ]
accuracy_score(test_y , preds)

0.8134897360703812

In [15]:
pred_probs_xgboost = np.array([arr for arr in zip(1-percent,percent)])
pred_probs_xgboost

array([[0.9670341 , 0.0329659 ],
       [0.62697506, 0.3730249 ],
       [0.7429482 , 0.25705183],
       ...,
       [0.2835937 , 0.7164063 ],
       [0.01320791, 0.9867921 ],
       [0.9045191 , 0.09548094]], dtype=float32)

### GBM pred probs 구하기

In [16]:
pickled_GBM_model = pickle.load(open('C:/workspace/p-tag/model/GBM(except vs_t_id).pkl', 'rb'))

pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7965


In [17]:
pred_probs_GBM = pickled_GBM_model.predict_proba(test_x)
pred_probs_GBM

array([[0.96347006, 0.03652994],
       [0.50443855, 0.49556145],
       [0.80561121, 0.19438879],
       ...,
       [0.33722187, 0.66277813],
       [0.05350528, 0.94649472],
       [0.88942103, 0.11057897]])

### LightGBM pred probs 구하기

In [18]:
pickled_LightGBM_model = pickle.load(open('C:/workspace/p-tag/model/LightGBM(except vs_t_id).pkl', 'rb'))

In [19]:
pred = pickled_LightGBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8164


In [20]:
pred_probs_LightGBM = pickled_LightGBM_model.predict_proba(test_x)
pred_probs_LightGBM

array([[0.95495848, 0.04504152],
       [0.70067167, 0.29932833],
       [0.80037761, 0.19962239],
       ...,
       [0.35961459, 0.64038541],
       [0.04452885, 0.95547115],
       [0.90062176, 0.09937824]])

## 각 모델들의 pred probs를 합산하여 승부를 예측한 결과의 정확도 구하기

In [21]:
total_pred_probs = (pred_probs_RandomForest + pred_probs_xgboost + pred_probs_GBM + pred_probs_LightGBM)/4


In [22]:
preds = [1 if total_pred_probs[index][0] < total_pred_probs[index][1] else 0 for index in range(len(total_pred_probs))]

print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, preds)))

예측 정확도: 0.8082


## 정리
- test set 데이터에 대한 각 모델들의 승부 예측 정확도와 모델 합산 시, 승부 예측 정확도

In [23]:
pred = pickled_RandomForest_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7988


In [24]:
pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7965


In [25]:
pred = pickled_GBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7965


In [26]:
pred = pickled_LightGBM_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8164


In [27]:
total_pred_probs = (pred_probs_RandomForest + pred_probs_xgboost + pred_probs_GBM + pred_probs_LightGBM)/4
preds = [1 if total_pred_probs[index][0] < total_pred_probs[index][1] else 0 for index in range(len(total_pred_probs))]

print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, preds)))

예측 정확도: 0.8082
