In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### 2015 ~ 2020년도 데이터 가져오기

In [4]:
filenames=['C:/workspace/p-tag/KBO_prediction_data/baseball_' + str(x) + '.csv' for x in list(range(2015,2021))]
filenames

['C:/workspace/p-tag/KBO_prediction_data/baseball_2015.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2016.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2017.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2018.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2019.csv',
 'C:/workspace/p-tag/KBO_prediction_data/baseball_2020.csv']

In [5]:
data = pd.DataFrame()
for filename in tqdm(filenames):
    temp = pd.read_csv(filename)
    data = pd.concat([data,temp])

100%|██████████| 6/6 [00:00<00:00, 23.18it/s]


In [6]:
baseball_data = data.copy()

### 날짜 데이터, HEADER_NO 삭제
- HEADER_NO 데이터의 경우 0 값만 존재하므로 삭제

In [7]:
baseball_data = baseball_data.drop(['G_ID','GDAY_DS','HEADER_NO'], axis=1)
baseball_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8656 entries, 0 to 1367
Data columns (total 25 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   T_ID      8656 non-null   object 
 1   VS_T_ID   8656 non-null   object 
 2   TB_SC     8656 non-null   object 
 3   PA        8656 non-null   float64
 4   AB        8656 non-null   float64
 5   RBI       8656 non-null   float64
 6   RUN       8656 non-null   float64
 7   HIT       8656 non-null   float64
 8   H2        8656 non-null   int64  
 9   H3        8656 non-null   int64  
 10  HR        8656 non-null   float64
 11  SB        8656 non-null   int64  
 12  CS        8656 non-null   int64  
 13  SF        8656 non-null   int64  
 14  BB        8656 non-null   float64
 15  HP        8656 non-null   float64
 16  KK        8656 non-null   float64
 17  GD        8656 non-null   float64
 18  LOB       8656 non-null   float64
 19  P_HRA_RT  8656 non-null   float64
 20  P_AB_CN   8656 non-null   int6

### 무승부 제거

In [8]:
baseball_data = baseball_data[baseball_data['win']!=0.5]

In [9]:
len(baseball_data)

8522

### 팀 명 중 넥센을 키움으로 변경

In [10]:
baseball_data['T_ID'] = baseball_data['T_ID'].replace('NE','WO')
baseball_data['VS_T_ID'] = baseball_data['VS_T_ID'].replace('NE','WO')

In [11]:
train_data = baseball_data[['RUN','AB','PA','OOO','T_ID','VS_T_ID']]
train_target = baseball_data['win']

### 팀명 라벨 인코딩

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train_data['T_ID'])
train_data.loc[:,'T_ID'] = le.transform(train_data['T_ID'])
train_data.loc[:,'VS_T_ID'] = le.transform(train_data['VS_T_ID'])
train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:,'T_ID'] = le.transform(train_data['T_ID'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:,'VS_T_ID'] = le.transform(train_data['VS_T_ID'])


Unnamed: 0,RUN,AB,PA,OOO,T_ID,VS_T_ID
0,4.0,42.0,54.0,0.238095,0,9
1,5.0,40.0,46.0,0.2,9,0
2,9.0,35.0,46.0,0.4,2,4
3,12.0,37.0,42.0,0.378378,4,2
4,1.0,29.0,34.0,0.241379,3,1


In [13]:
train_x, test_x, train_y, test_y = train_test_split(train_data, train_target, test_size = 0.2, random_state = 42) # 학습데이터와 평가데이터의 비율을 8:2 로 분할| 
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(6817, 6) (1705, 6) (6817,) (1705,)


In [14]:
params={
    'max_depth':[4,8,12,16,20,24],
    'min_samples_leaf':[6,9,12,15,18,21],
    'min_samples_split':[8,16,24,32,40,48,56]
}

rfc = RandomForestClassifier(n_estimators=50,random_state=42,n_jobs=-1)
grid_cv=GridSearchCV(rfc, param_grid=params,cv=5,n_jobs=-1)
grid_cv.fit(train_x,train_y)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1,
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [4, 8, 12, 16, 20, 24],
                         'min_samples_leaf': [6, 9, 12, 15, 18, 21],
                         'min_samples_split': [8, 16, 24, 32, 40, 48, 56]})

In [15]:
print('최적 하이퍼 파라미터:\n',grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 24, 'min_samples_leaf': 9, 'min_samples_split': 8}
최고 예측 정확도: 0.8097


### 정확도 측정

In [16]:
from sklearn.metrics import accuracy_score

rfc_model = grid_cv.best_estimator_
pred = rfc_model.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

예측 정확도: 0.8047


### 확률 구하기

In [17]:
pred_probs = rfc_model.predict_proba(test_x)
pred_probs

array([[0.98549535, 0.01450465],
       [0.59050008, 0.40949992],
       [0.67264158, 0.32735842],
       ...,
       [0.22251691, 0.77748309],
       [0.03711752, 0.96288248],
       [0.89864775, 0.10135225]])