In [1]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

import os
from tqdm import tqdm
import time
import warnings
# Install a blanket 'ignore' filter (no category or module restriction), so
# warnings raised after this line are suppressed.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Plotting / visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Location of the baseball_2023.xlsx workbook. The default is the original
# author's absolute path; set the KBO_DATA_PATH environment variable to load
# the file from another machine or directory without editing this cell.
BASEBALL_DATA_PATH = os.environ.get(
    'KBO_DATA_PATH',
    '/Users/jun/Git/407_Project/KBO_prediction_data/baseball_2023.xlsx',
)
baseball_data = pd.read_excel(BASEBALL_DATA_PATH)

In [3]:
# Preview the first five rows of the loaded frame.
baseball_data.head()

Unnamed: 0,순,T_ID,팀,G,AB,PA,RUN,안타,2타,3타,...,희타,희비,OOO,OBP,장타,OPS,wOBA,wRC+,WAR*,WPA
0,1,NC,23,25,81,76,14,26,5,1,...,1,1,0.342,0.363,0.513,0.876,0.389,133.7,0.59,0.21
1,2,LG,23,30,94,81,16,27,6,0,...,1,1,0.333,0.409,0.407,0.816,0.386,118.2,0.53,0.24
2,3,KT,23,31,91,82,20,25,6,1,...,0,2,0.305,0.352,0.512,0.864,0.383,115.5,0.45,0.18
3,4,두산,23,24,83,70,12,18,2,0,...,2,1,0.257,0.346,0.371,0.717,0.331,97.7,0.29,0.64
4,5,키움,23,24,85,78,10,24,7,1,...,0,0,0.308,0.365,0.423,0.788,0.364,107.0,0.23,0.9


In [4]:
# List every column label in the frame.
baseball_data.columns

Index(['순', 'T_ID', '팀', 'G', 'AB', 'PA', 'RUN', '안타', '2타', '3타', '홈런', '루타',
       '타점', '도루', '도실', '볼넷', '사구', '고4', '삼진', '병살', '희타', '희비', 'OOO',
       'OBP', '장타', 'OPS', 'wOBA', 'wRC+', 'WAR*', 'WPA'],
      dtype='object')

### 데이터의 결측치와 데이터 형태 확인

In [5]:
# Print each column's non-null count and dtype to check for missing values.
baseball_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   순       10 non-null     int64  
 1   T_ID    10 non-null     object 
 2   팀       10 non-null     int64  
 3   G       10 non-null     int64  
 4   AB      10 non-null     int64  
 5   PA      10 non-null     int64  
 6   RUN     10 non-null     int64  
 7   안타      10 non-null     int64  
 8   2타      10 non-null     int64  
 9   3타      10 non-null     int64  
 10  홈런      10 non-null     int64  
 11  루타      10 non-null     int64  
 12  타점      10 non-null     int64  
 13  도루      10 non-null     int64  
 14  도실      10 non-null     int64  
 15  볼넷      10 non-null     int64  
 16  사구      10 non-null     int64  
 17  고4      10 non-null     int64  
 18  삼진      10 non-null     int64  
 19  병살      10 non-null     int64  
 20  희타      10 non-null     int64  
 21  희비      10 non-null     int64  
 22  OOO  

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
baseball_data.describe()

Unnamed: 0,순,팀,G,AB,PA,RUN,안타,2타,3타,홈런,...,희타,희비,OOO,OBP,장타,OPS,wOBA,wRC+,WAR*,WPA
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,5.5,23.0,25.4,83.8,73.5,11.9,20.8,3.9,0.5,1.3,...,0.9,0.7,0.2809,0.3553,0.3996,0.755,0.3496,100.48,0.277,0.299
std,3.02765,0.0,2.75681,6.646637,7.33712,3.842742,4.516636,2.13177,0.527046,1.159502,...,0.737865,0.674949,0.042289,0.033116,0.064708,0.079793,0.031907,20.012207,0.188034,0.321367
min,1.0,23.0,23.0,74.0,63.0,8.0,13.0,1.0,0.0,0.0,...,0.0,0.0,0.206,0.284,0.346,0.63,0.294,64.7,0.0,-0.1
25%,3.25,23.0,24.0,80.25,67.0,9.25,18.25,2.0,0.0,0.25,...,0.25,0.0,0.25625,0.3445,0.35275,0.7055,0.33125,88.425,0.1625,0.0675
50%,5.5,23.0,24.0,82.0,77.0,11.0,19.5,4.0,0.5,1.0,...,1.0,1.0,0.279,0.354,0.368,0.7365,0.348,101.2,0.215,0.215
75%,7.75,23.0,25.0,89.5,78.0,13.5,24.75,5.75,1.0,2.0,...,1.0,1.0,0.30725,0.3645,0.419,0.809,0.37825,113.375,0.41,0.54
max,10.0,23.0,31.0,94.0,82.0,20.0,27.0,7.0,1.0,3.0,...,2.0,2.0,0.342,0.409,0.513,0.876,0.389,133.7,0.59,0.9


### 팀명 라벨 인코딩

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the string team identifiers in T_ID with integer labels.
# NOTE(review): the encoder is fit on this frame only. If these codes are fed
# to a model trained elsewhere, the label-to-integer mapping must match the
# one used at training time — TODO confirm.
le = LabelEncoder()

# Assign the column directly instead of through .loc[:, 'T_ID']: pandas 2.x
# writes `.loc[:, col] = values` into the existing object-dtype column in
# place, which would leave the codes stored as Python objects rather than as
# an integer column. fit_transform learns the labels and encodes in one step,
# and leaves `le` fitted for later use.
baseball_data['T_ID'] = le.fit_transform(baseball_data['T_ID'])
baseball_data.head()

Unnamed: 0,순,T_ID,팀,G,AB,PA,RUN,안타,2타,3타,...,희타,희비,OOO,OBP,장타,OPS,wOBA,wRC+,WAR*,WPA
0,1,3,23,25,81,76,14,26,5,1,...,1,1,0.342,0.363,0.513,0.876,0.389,133.7,0.59,0.21
1,2,2,23,30,94,81,16,27,6,0,...,1,1,0.333,0.409,0.407,0.816,0.386,118.2,0.53,0.24
2,3,1,23,31,91,82,20,25,6,1,...,0,2,0.305,0.352,0.512,0.864,0.383,115.5,0.45,0.18
3,4,5,23,24,83,70,12,18,2,0,...,2,1,0.257,0.346,0.371,0.717,0.331,97.7,0.29,0.64
4,5,8,23,24,85,78,10,24,7,1,...,0,0,0.308,0.365,0.423,0.788,0.364,107.0,0.23,0.9


- train_x, train_y : 훈련 데이터
- test_x, test_y : 테스트 데이터

In [10]:
# Columns handed to the model, in this exact order.
feature_columns = ['RUN', 'OBP', 'AB', 'PA', 'OOO', 'T_ID']
train_data = baseball_data.loc[:, feature_columns]
# No target is selected here: this frame has no 'win' column.

In [11]:
import pickle

# Location of the pickled classifier. The default is the original author's
# absolute path; set the KBO_MODEL_PATH environment variable to load the model
# from another machine or directory without editing this cell.
MODEL_PATH = os.environ.get(
    'KBO_MODEL_PATH',
    '/Users/jun/Git/407_Project/model/random_forest2.pkl',
)

# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load model files that come from a trusted source.
# The context manager closes the file handle; the previous bare open() call
# never closed it.
with open(MODEL_PATH, 'rb') as model_file:
    test_model = pickle.load(model_file)

# Predict one label per row of the selected feature frame.
print(test_model.predict(train_data))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [12]:
# Display the feature frame that was passed to the model's predict call.
train_data

Unnamed: 0,RUN,OBP,AB,PA,OOO,T_ID
0,14,0.363,81,76,0.342,3
1,16,0.409,94,81,0.333,2
2,20,0.352,91,82,0.305,1
3,12,0.346,83,70,0.257,5
4,10,0.365,85,78,0.308,8
5,8,0.356,74,63,0.27,7
6,10,0.392,80,66,0.288,0
7,12,0.344,92,78,0.244,6
8,9,0.342,77,63,0.206,4
9,8,0.284,81,78,0.256,9
