## 필요한 모듈 import

In [1]:
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from hyperopt import hp
from sklearn.model_selection import KFold
from hyperopt import fmin, tpe, Trials

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier


## 2015 ~ 2020년도 데이터 가져오기

In [2]:
# One CSV path per season, 2015 through 2020 inclusive.
filenames = [
    'C:/workspace/p-tag/KBO_prediction_data/baseball_' + str(season) + '.csv'
    for season in range(2015, 2021)
]
filenames

['C:/workspace/p-tag/KBO_prediction_data/baseball_2015.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2016.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2017.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2018.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2019.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2020.csv']

In [3]:
# Read every season file first and concatenate once: growing a DataFrame
# inside the loop re-copies all previously loaded rows on each iteration.
frames = [pd.read_csv(filename) for filename in tqdm(filenames)]
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
data = pd.concat(frames) if frames else pd.DataFrame()

100%|██████████| 6/6 [00:00<00:00, 30.15it/s]


In [38]:
# Work on a deep copy so the loaded `data` frame stays untouched.
baseball_data = data.copy(deep=True)

## 전처리 함수

In [15]:
def preprocessing(df):
    """Drop drawn games and integer-encode the two team-id columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'win', 'T_ID' and 'VS_T_ID'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the rows where
        'win' == 0.5, in which 'T_ID' and 'VS_T_ID' hold integer codes.
        Codes are assigned in sorted order of the team ids and are shared
        by both columns.
    """
    # Remove draws. Take an explicit copy so the assignments below write to
    # an independent frame rather than to a slice of the caller's data.
    df = df[df['win'] != 0.5].copy()

    team_columns = ('T_ID', 'VS_T_ID')

    # The Nexen team name was changed to Kiwoom: fold 'NE' into 'WO'.
    for column in team_columns:
        df[column] = df[column].replace('NE', 'WO')

    # Build the vocabulary from BOTH columns so that a team that only ever
    # appears as the opponent still receives a code instead of failing.
    teams = sorted(set(df['T_ID']) | set(df['VS_T_ID']))
    team_codes = {team: code for code, team in enumerate(teams)}

    # Plain column assignment replaces the column, so the result has an
    # integer dtype instead of an object column holding integers.
    for column in team_columns:
        df[column] = df[column].map(team_codes).astype('int64')

    return df


## 모델을 저장하는 함수

In [18]:
def model_save(model, path, model_name):
    """Pickle `model` to the file `path + model_name + '.pkl'`.

    `path` is concatenated as-is, so it must already end with a path
    separator when it names a directory.
    """
    # The context manager closes (and flushes) the file even if pickling fails.
    with open(path + model_name + '.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

## 모델을 불러온 후 정확도를 측정하는 함수

In [20]:
def model_load(path, model_name, test_x, test_y):
    """Load a pickled model and return its accuracy on the given test data.

    The model is read from `path + model_name + '.pkl'`; its `predict`
    method is called on `test_x` and the predictions are scored against
    `test_y` with `accuracy_score`.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    # The context manager guarantees the file handle is closed.
    with open(path + model_name + '.pkl', 'rb') as model_file:
        model_df = pickle.load(model_file)

    pred = model_df.predict(test_x)
    return accuracy_score(test_y, pred)

## 정확도를 측정하는 함수

In [23]:
def accuracy(model, test_x, test_y):
    """Return `accuracy_score` of `model.predict(test_x)` against `test_y`."""
    predictions = model.predict(test_x)
    return accuracy_score(test_y, predictions)

## 일정 비율로 데이터를 나누는 함수

In [31]:
def train_test_data(data, target, size):
    """Split `data` and `target` into train and test parts.

    `size` is passed to `train_test_split` as `test_size`. The random state
    is fixed at 42, so repeated calls with the same inputs give the same
    split.

    Returns the tuple `(train_x, test_x, train_y, test_y)`.
    """
    parts = train_test_split(
        data,
        target,
        test_size=size,
        random_state=42,
    )
    x_train, x_test, y_train, y_test = parts
    return x_train, x_test, y_train, y_test

## RandomForest 모델 함수

In [32]:
def RandomForest_model(data_df, feature_list, target):
    """Grid-search a RandomForestClassifier and return the best estimator.

    Parameters
    ----------
    data_df : DataFrame holding both the feature columns and the target.
    feature_list : list of feature column names,
        e.g. ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP'].
    target : name of the label column, e.g. 'win'.

    Returns
    -------
    The `best_estimator_` of a 5-fold GridSearchCV fitted on the training
    part of an 80/20 split made with `train_test_data`.
    """
    features = data_df[feature_list]
    labels = data_df[target]

    # Only the training part of the split is needed for the search; the test
    # part is discarded here.
    fit_x, _, fit_y, _ = train_test_data(features, labels, 0.2)

    # Hyper-parameter grid explored by GridSearchCV.
    search_space = {
        'max_depth': [4, 8, 12, 16, 20, 24],
        'min_samples_leaf': [6, 9, 12, 15, 18, 21],
        'min_samples_split': [8, 16, 24, 32, 40, 48, 56],
    }

    forest = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
    search = GridSearchCV(forest, param_grid=search_space, cv=5, n_jobs=-1)
    search.fit(fit_x, fit_y)

    return search.best_estimator_


## 함수를 이용하여 RandomForest 돌리기

In [39]:
# Run the preprocessing step on the working copy and preview the first rows.
baseball_data = baseball_data.pipe(preprocessing)
baseball_data.head(n=5)

Unnamed: 0,G_ID,GDAY_DS,T_ID,VS_T_ID,HEADER_NO,TB_SC,PA,AB,RBI,RUN,...,HP,KK,GD,LOB,P_HRA_RT,P_AB_CN,P_HIT_CN,OBP,OOO,win
0,20150328HHNE0,20150328,0,6,0,T,54.0,42.0,4.0,4.0,...,1.0,7.0,0.0,24.0,0.133333,15,2,0.352941,0.238095,0.0
1,20150328HHNE0,20150328,6,0,0,B,46.0,40.0,5.0,5.0,...,0.0,7.0,1.0,18.0,0.090909,11,1,0.304348,0.2,1.0
2,20150328KTLT0,20150328,2,4,0,T,46.0,35.0,8.0,9.0,...,1.0,8.0,1.0,23.0,0.285714,14,4,0.511111,0.4,0.0
3,20150328KTLT0,20150328,4,2,0,B,42.0,37.0,12.0,12.0,...,1.0,8.0,1.0,17.0,0.4375,16,7,0.439024,0.378378,1.0
4,20150328LGHT0,20150328,3,1,0,T,34.0,29.0,1.0,1.0,...,0.0,6.0,2.0,17.0,0.2,10,2,0.333333,0.241379,0.0


In [40]:
# Inputs for the model: label column, feature columns and the source frame.
target = 'win'
feature_list = ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP']
data_df = baseball_data

# Build the model.
test_model = RandomForest_model(data_df, feature_list, target)

# Re-create the train/test split. train_test_data uses a fixed random_state,
# so this is the same 80/20 split that RandomForest_model made internally.
train_x, test_x, train_y, test_y = train_test_data(
    data_df[feature_list],
    data_df[target],
    0.2,
)

# Compute and report the accuracy on the test part.
pred = test_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.7982


## XGBoost 모델 함수

In [None]:
def XGBoost_model(data_df, feature_list, target):
    """Train an XGBoost binary classifier with early stopping.

    Parameters
    ----------
    data_df : DataFrame holding both the feature columns and the target.
    feature_list : list of feature column names,
        e.g. ['RUN','AB','PA','OOO','T_ID','VS_T_ID','OBP'].
    target : name of the label column, e.g. 'win'.

    Returns
    -------
    The model object returned by `xgb.train`.
    """
    features = data_df[feature_list]
    labels = data_df[target]

    # Hold out 20% as the test part. It is not used for fitting here; callers
    # can rebuild the same split with train_test_data(..., 0.2).
    train_x, _, train_y, _ = train_test_data(features, labels, 0.2)

    # Take the validation set from the TRAINING part only. Splitting the full
    # data again would put held-out test rows into the training and
    # validation sets and leak them into model fitting and early stopping.
    X_tr, X_val, y_tr, y_val = train_test_data(train_x, train_y, 0.1)

    dtr = xgb.DMatrix(data=X_tr, label=y_tr)
    dval = xgb.DMatrix(data=X_val, label=y_val)

    params = {
        'max_depth': 3,
        'eta': 0.05,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }
    num_rounds = 400

    # The validation set is listed last; per the xgboost documentation, early
    # stopping monitors the last entry of `evals`.
    eval_list = [(dtr, 'train'), (dval, 'eval')]

    xgb_model = xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=num_rounds,
        early_stopping_rounds=50,
        evals=eval_list,
    )

    return xgb_model
