In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### 2015 ~ 2020년도 데이터 가져오기

In [None]:
# One CSV path per season, 2015 through 2020 inclusive (range end is exclusive).
filenames = [
    f'/content/drive/MyDrive/teamproject/data/baseball_{season}.csv'
    for season in range(2015, 2021)
]
filenames

In [None]:
# Load every season's CSV first, then concatenate them in a single call.
# Calling pd.concat inside the loop re-copies all rows accumulated so far on
# every iteration; collecting the frames and concatenating once avoids that.
season_frames = [pd.read_csv(filename) for filename in tqdm(filenames)]

# pd.concat raises on an empty list, so fall back to an empty frame to keep
# the previous result when `filenames` is empty.
# Row labels are kept exactly as each file provides them (no ignore_index),
# matching the earlier behaviour.
data = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [None]:
# Work on an independent deep copy so that changes made to baseball_data
# cannot affect the loaded frame `data`.
baseball_data = data.copy(deep=True)

### G_ID, 날짜 데이터(GDAY_DS), HEADER_NO 삭제
- HEADER_NO 데이터의 경우 0 값만 존재하므로 삭제

In [None]:
# Columns removed before modelling: G_ID, GDAY_DS and HEADER_NO.
dropped_columns = ['G_ID', 'GDAY_DS', 'HEADER_NO']
baseball_data = baseball_data.drop(columns=dropped_columns)

# Print the remaining columns, their dtypes and non-null counts.
baseball_data.info()

### 무승부 제거

In [None]:
# Rows whose 'win' value equals 0.5 are removed; every other row is kept.
is_decided = baseball_data['win'].ne(0.5)
baseball_data = baseball_data.loc[is_decided]

In [None]:
# Number of rows currently in baseball_data.
baseball_data.shape[0]

In [None]:
# Feature columns used as model input; 'win' is the prediction target.
feature_columns = ['RUN', 'AB', 'PA', 'OOO', 'T_ID', 'VS_T_ID']

# Take an explicit copy: a plain column subset is still tracked by pandas as
# derived from baseball_data, so assigning into it later would raise
# SettingWithCopyWarning. With .copy(), train_data is an independent frame.
train_data = baseball_data[feature_columns].copy()
train_target = baseball_data['win']

### 팀명 라벨 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode team identifiers as integer codes, using ONE encoder for both the
# team column (T_ID) and the opponent column (VS_T_ID) so that the same team
# always maps to the same code.
le = LabelEncoder()

# Fit on the values of both columns. Fitting on T_ID alone makes transform()
# raise for any team that only ever appears as an opponent.
le.fit(train_data[['T_ID', 'VS_T_ID']].to_numpy().ravel())

# Build a new frame with the encoded columns instead of writing through
# .loc[:, col]. This avoids SettingWithCopyWarning on a derived frame and
# gives the encoded columns the encoder's integer dtype.
train_data = train_data.assign(
    T_ID=le.transform(train_data['T_ID']),
    VS_T_ID=le.transform(train_data['VS_T_ID']),
)
train_data.head()

In [None]:
# Split features and target into training and evaluation sets at an 8:2
# ratio; random_state is fixed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
)

# Check the number of rows/columns in each split.
print(*(split.shape for split in (train_x, test_x, train_y, test_y)))

In [None]:
# Hyper-parameter grid for the random forest: 6 x 6 x 7 = 252 combinations,
# each evaluated with 5-fold cross-validation.
params = {
    'max_depth': list(range(4, 25, 4)),          # 4, 8, 12, 16, 20, 24
    'min_samples_leaf': list(range(6, 22, 3)),   # 6, 9, 12, 15, 18, 21
    'min_samples_split': list(range(8, 57, 8)),  # 8, 16, 24, 32, 40, 48, 56
}

# Base estimator: 50 trees, fixed seed, all available cores.
rfc = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)

# Exhaustive search over `params` on the training split.
grid_cv = GridSearchCV(estimator=rfc, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

In [None]:
# Report the best hyper-parameter combination found by the grid search and
# its cross-validation score, formatted to four decimal places.
best_params = grid_cv.best_params_
best_cv_score = grid_cv.best_score_

print('최적 하이퍼 파라미터:\n', best_params)
print(f'최고 예측 정확도: {best_cv_score:.4f}')

### 정확도 측정

In [None]:
from sklearn.metrics import accuracy_score

# Take the estimator selected by the grid search and score it on the
# held-out evaluation split.
rfc_model = grid_cv.best_estimator_
pred = rfc_model.predict(test_x)

# Fraction of evaluation rows whose predicted label matches the true label.
test_accuracy = accuracy_score(test_y, pred)
print(f'예측 정확도: {test_accuracy:.4f}')

### 확률 구하기

In [None]:
# Class-probability estimates for every evaluation row: one row per sample,
# one column per class, in the order given by rfc_model.classes_.
pred_probs = rfc_model.predict_proba(X=test_x)
pred_probs