In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

### 2015 ~ 2020년도 데이터 가져오기
- 드라이브 마운트 필요

In [None]:
# One CSV path per season, 2015 through 2020 inclusive (range end is exclusive).
filenames = [
    '/content/drive/MyDrive/teamproject/data/baseball_' + str(season) + '.csv'
    for season in range(2015, 2021)
]
filenames

In [None]:
# Read every season file first and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all previously loaded rows on each pass.
# Row labels are kept exactly as read from each file (no ignore_index).
season_frames = [pd.read_csv(filename) for filename in tqdm(filenames)]
# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [None]:
# Work on an independent (deep) copy so the loaded `data` frame stays untouched.
baseball_data = data.copy(deep=True)

In [None]:
# Preview the first rows of the combined frame.
baseball_data.head()

In [None]:
# List the available column names.
baseball_data.columns

### 데이터의 결측치와 데이터 형태 확인

In [None]:
# Per-column non-null counts and dtypes (used to spot missing values).
baseball_data.info()

In [None]:
# Summary statistics for the frame's columns.
baseball_data.describe()

### G_ID, 날짜 데이터(GDAY_DS), HEADER_NO 삭제
- HEADER_NO 데이터의 경우 0 값만 존재하므로 삭제

In [None]:
# Drop G_ID, GDAY_DS and HEADER_NO. The result is derived from the untouched
# `data` frame rather than from `baseball_data` itself, so re-running this cell
# does not raise a KeyError for columns that were already removed.
baseball_data = data.drop(columns=['G_ID', 'GDAY_DS', 'HEADER_NO'])
baseball_data.info()

### feature 간에 상관 관계(corr)

In [None]:
# Correlate numeric/bool columns only. Since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises on string values.
# NOTE(review): T_ID / VS_T_ID are label-encoded further down, which suggests
# they hold strings at this point — confirm.
baseball_data_corr = baseball_data.select_dtypes(include=['number', 'bool']).corr()
baseball_data_corr

In [None]:
# Draw the correlation matrix as a heatmap on an explicit 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(baseball_data_corr, ax=ax)

### 상관관계 정도를 그래프로 그리기

In [None]:
# Correlation of every column with `win`, strongest positive first.
(
    baseball_data_corr
    .loc[:, 'win']
    .sort_values(ascending=False)
)

In [None]:
# Same ranking as a horizontal bar chart.
win_corr_sorted = baseball_data_corr['win'].sort_values(ascending=False)
win_corr_sorted.plot(kind='barh')

=> corr로 상관성 분석 시, RUN, RBI, OBP, OOO, P_HIT_CN, HIT feature들은 win과의 상관계수(절댓값)가 0.4 이상으로 나타남

## RUN, RBI, OBP, OOO, P_HIT_CN, HIT, T_ID, VS_T_ID로 RandomForest 학습

### win 데이터에서 무승부를 0.5에서 2로 값 변경

In [None]:
# Recode draws in the `win` column from 0.5 to 2 so that the target has three
# distinct class labels; only the `win` column is searched.
draw_lookup = {'win': 0.5}
baseball_data = baseball_data.replace(to_replace=draw_lookup, value=2)
baseball_data['win'].unique()

In [None]:
# Class balance of the target after recoding.
baseball_data['win'].value_counts()

### 팀명 라벨 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit on the union of both team columns so every id that appears as an
# opponent is known to the encoder. Fitting on T_ID alone makes transform()
# raise ValueError for any id found only in VS_T_ID. When both columns hold
# the same set of ids the resulting codes are unchanged.
le = LabelEncoder()
le.fit(pd.concat([baseball_data['T_ID'], baseball_data['VS_T_ID']]))

# Assign the encoded arrays as whole new columns. Writing through
# `.loc[:, col] = ...` sets values into the existing column, which on recent
# pandas can keep the original dtype instead of switching to integers.
baseball_data['T_ID'] = le.transform(baseball_data['T_ID'])
baseball_data['VS_T_ID'] = le.transform(baseball_data['VS_T_ID'])
baseball_data.head()

In [None]:
# Model inputs: the columns picked from the correlation ranking plus both
# encoded team ids. The target is the `win` column.
# NOTE(review): RUN / RBI / HIT look like same-game box-score values — confirm
# they are meant to be available when predicting the outcome.
feature_columns = ['RUN', 'RBI', 'OBP', 'OOO', 'P_HIT_CN', 'HIT', 'T_ID', 'VS_T_ID']
train_data = baseball_data[feature_columns]
train_target = baseball_data.loc[:, 'win']

In [None]:
# Display the selected feature frame.
train_data

In [None]:
# Display the target series.
train_target

- train_x, train_y : 훈련 데이터
- test_x, test_y : 테스트 데이터

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (80:20 train/test split), fixed seed.
split_parts = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
)
train_x, test_x, train_y, test_y = split_parts
# Show the shape of each partition as a quick row-count check.
print(*(part.shape for part in split_parts))

### GridSearchCV로 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 6 x 6 x 4 = 144 combinations, each scored with 5-fold CV.
params = dict(
    max_depth=[4, 8, 12, 16, 20, 24],
    min_samples_leaf=[1, 3, 6, 9, 12, 15],
    min_samples_split=[2, 8, 12, 16],
)

# Base forest with a fixed seed; both the forest and the search use all cores.
rfc = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
grid_cv = GridSearchCV(estimator=rfc, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

In [None]:
# Report the winning parameter combination and its cross-validation score.
best_params = grid_cv.best_params_
best_score = grid_cv.best_score_
print('최적 하이퍼 파라미터:\n', best_params)
print('최고 예측 정확도: {0:.4f}'.format(best_score))

### 정확도 측정

In [None]:
from sklearn.metrics import accuracy_score

# Build the final forest from the grid-search winner instead of hand-copied
# literals, so the evaluated model cannot drift from `grid_cv.best_params_`
# when the data or the search grid changes. The remaining settings mirror the
# base estimator used for the search.
rfc_2 = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
    **grid_cv.best_params_,
)
rfc_2.fit(train_x, train_y)

# Accuracy on the held-out test split.
pred = rfc_2.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))