In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn import preprocessing

### 2015 ~ 2020년도 데이터 가져오기

In [None]:
# Build one CSV path per season, 2015 through 2020 inclusive.
filenames = []
for season in range(2015, 2021):
    filenames.append('/content/drive/MyDrive/teamproject/data/baseball_' + str(season) + '.csv')
filenames

In [None]:
# Read every season file, then concatenate once. Calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration and starts
# from an empty DataFrame, which is unnecessary.
# The per-file row index is kept as-is (not reset), so index labels repeat
# across seasons exactly as they did with the incremental concat.
data = pd.concat([pd.read_csv(filename) for filename in tqdm(filenames)])

In [None]:
# Work on an independent deep copy so the loaded frame `data` stays untouched.
baseball_data = data.copy(deep=True)

### 날짜 데이터, HEADER_NO 삭제 (삭제 컬럼: G_ID, GDAY_DS, HEADER_NO)

In [None]:
# Remove the three columns that are not used as model inputs, then print the
# remaining schema (column names, dtypes, non-null counts).
columns_to_drop = ['G_ID', 'GDAY_DS', 'HEADER_NO']
baseball_data = baseball_data.drop(columns=columns_to_drop)
baseball_data.info()

### win 데이터에서 무승부를 0.5에서 2로 값 변경

In [None]:
# Recode the value 0.5 in the `win` column to 2 (the section heading describes
# 0.5 as a draw); every other column is left unchanged.
recoded_win = baseball_data['win'].replace(0.5, 2)
baseball_data = baseball_data.assign(win=recoded_win)
baseball_data['win'].unique()

In [None]:
# Class balance of the target after recoding.
win_column = baseball_data['win']
win_column.value_counts()

### 결정 트리를 이용하여 주요 변수 알아내기
- 결정 트리의 경우 스케일링과 같은 별도의 수치 전처리가 필요 없음 (단, 문자열 컬럼은 아래에서 라벨 인코딩함)
- train_data1, train_target1

In [None]:
# Feature matrix and target for the exploratory decision tree.
# .copy() makes train_data1 an independent frame: later cells overwrite its
# columns with label-encoded values, and assigning into a bare column slice of
# baseball_data triggers SettingWithCopyWarning.
feature_columns = ['T_ID', 'VS_T_ID', 'TB_SC', 'PA', 'AB', 'RBI', 'RUN', 'HIT', 'H2', 'H3',
       'HR', 'SB', 'CS', 'SF', 'BB', 'HP', 'KK', 'GD', 'LOB', 'P_HRA_RT',
       'P_AB_CN', 'P_HIT_CN', 'OBP', 'OOO']
train_data1 = baseball_data[feature_columns].copy()
train_target1 = baseball_data['win']

### 팀 명 라벨 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder is shared by both team columns so a team gets the same integer
# whether it appears as T_ID or VS_T_ID. It is fitted on the union of both
# columns: fitting on T_ID alone makes transform() raise ValueError for any
# team that only ever appears in VS_T_ID.
le = LabelEncoder()
le.fit(pd.concat([train_data1['T_ID'], train_data1['VS_T_ID']]))

# Plain column assignment replaces the column (and its dtype) with the integer
# codes. `.loc[:, col] = ...` on pandas 2.x writes into the existing object
# column in place, leaving the dtype as object.
train_data1['T_ID'] = le.transform(train_data1['T_ID'])
train_data1['VS_T_ID'] = le.transform(train_data1['VS_T_ID'])
train_data1.head()

### 데이터 형태가 object인 값(TB_SC)을 라벨 인코딩

In [None]:
# Encode the object-typed TB_SC column as integer codes.
# Plain column assignment is used so the column's dtype actually becomes
# integer; `.loc[:, col] = ...` on pandas 2.x sets values in place and keeps
# the object dtype, which defeats the purpose of this step.
# NOTE: `le` is rebound here, so the team encoder from the previous cell is no
# longer reachable under this name.
le = LabelEncoder()
train_data1['TB_SC'] = le.fit_transform(train_data1['TB_SC'])
train_data1.head()

### 데이터 세트 나누기

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_data1,
    train_target1,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) of the features and (rows,) of the target per split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

### 결정 트리 모델 학습

In [None]:
# Fit a tree with default (unconstrained) settings on all candidate features;
# it is used below only to inspect feature importances.
# fit() returns the estimator itself, so the chained form binds the fitted model.
dt_clf = DecisionTreeClassifier(random_state=156).fit(x_train, y_train)
dt_clf

### 결정 트리 모델 시각화

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to tree.dot in Graphviz format.
# class_names are applied in ascending order of the fitted class labels, so
# this list assumes labels 0, 1, 2 mean loss, win, draw respectively
# (2 is the recoded draw value) -- TODO confirm the meaning of 0 and 1.
export_graphviz(
    dt_clf,
    out_file='tree.dot',
    class_names=['패', '승', '무'],
    feature_names=x_train.columns,
    impurity=True,
    filled=True,
)

In [None]:
import graphviz

# Read the exported tree back and render it inline.
# The encoding is given explicitly: scikit-learn opens the output path with
# encoding='utf-8', and the class names written into the file are non-ASCII,
# so relying on the platform default encoding can mis-decode or fail on
# systems whose locale is not UTF-8.
with open('tree.dot', encoding='utf-8') as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

### 피처별 중요도

In [None]:
import seaborn as sns
import numpy as np

# Importance scores of the fitted tree, aligned with the training columns.
importances = dt_clf.feature_importances_
feature_labels = x_train.columns

# Raw importance vector, rounded to three decimals.
print('Feature importances:\n{0}'.format(np.round(importances, 3)))

# One "name : score" line per feature.
for label, score in zip(feature_labels, importances):
    print('{0} : {1:.3f}'.format(label, score))

# Horizontal bar chart: one bar per feature.
sns.barplot(x=importances, y=feature_labels)

- 상관도에서의 주요 피처 : RUN, RBI, OBP, OOO, P_HIT_CN, HIT
- 결정 트리의 주요 피처 : RUN, AB, PA, OOO

### 결정 트리로 확인한 주요 피처로 학습

### 1. DecisionTree 학습

In [None]:
# Preview the first five rows of the cleaned frame.
baseball_data.head(n=5)

In [None]:
# Distinct target values present before modelling.
target_column = baseball_data['win']
target_column.unique()

In [None]:
# Reduced feature set: the four features chosen from the tree importances plus
# the two team identifiers.
# .copy() makes train_data an independent frame, because the next cell
# overwrites the team columns with encoded values; assigning into a bare
# column slice of baseball_data triggers SettingWithCopyWarning.
selected_columns = ['RUN','AB','PA','OOO','T_ID','VS_T_ID']
train_data = baseball_data[selected_columns].copy()
train_target = baseball_data['win']

#### 팀명 라벨 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder is shared by both team columns so a team maps to the same
# integer in T_ID and VS_T_ID. It is fitted on the union of both columns:
# fitting on T_ID alone makes transform() raise ValueError for any team that
# only ever appears in VS_T_ID.
le = LabelEncoder()
le.fit(pd.concat([train_data['T_ID'], train_data['VS_T_ID']]))

# Plain column assignment replaces the column (and its dtype) with the integer
# codes. `.loc[:, col] = ...` on pandas 2.x writes into the existing object
# column in place, leaving the dtype as object.
train_data['T_ID'] = le.transform(train_data['T_ID'])
train_data['VS_T_ID'] = le.transform(train_data['VS_T_ID'])
train_data.head()

- train_x, train_y : 훈련 데이터
- test_x, test_y : 테스트 데이터

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and evaluation sets at an 8:2 ratio with a fixed seed.
split_parts = train_test_split(train_data, train_target, test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = split_parts
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)  # check the row counts

### GridSearchCV로 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space shared by the decision-tree search here and the random-forest
# search in a later cell.
params = dict(
    max_depth=[4, 8, 12, 16, 20, 24],
    min_samples_leaf=[1, 3, 6, 9, 12, 15],
    min_samples_split=[2, 4, 6, 8, 10, 12, 14],
)

# NOTE: rebinding dt_clf replaces the exploratory tree fitted earlier.
dt_clf = DecisionTreeClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search over `params`, using all cores.
grid_cv = GridSearchCV(estimator=dt_clf, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
best_params = grid_cv.best_params_
best_cv_score = grid_cv.best_score_
print('최적 하이퍼 파라미터:\n', best_params)
print('최고 예측 정확도: {0:.4f}'.format(best_cv_score))

### 정확도 측정

In [None]:
from sklearn.metrics import accuracy_score

# Build the final tree from the hyperparameters the grid search selected
# instead of hand-copying them: literal values silently go stale whenever the
# data, the split, or the search grid changes.
# `grid_cv` is rebound by the random-forest search in a later cell, so guard
# against this cell being re-run out of order with the wrong search results.
assert isinstance(grid_cv.best_estimator_, DecisionTreeClassifier), \
    'grid_cv does not hold the decision-tree search; re-run the DecisionTree GridSearchCV cell first'

dt_clf_2 = DecisionTreeClassifier(random_state=42, **grid_cv.best_params_)
dt_clf_2.fit(train_x,train_y)

# Accuracy on the held-out 20% split.
pred = dt_clf_2.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))

### 2. RandomForest 학습

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base forest: 50 trees, fixed seed, all cores.
forest_settings = {'n_estimators': 50, 'random_state': 42, 'n_jobs': -1}
rfc = RandomForestClassifier(**forest_settings)

# Same search space and 5-fold CV as the decision-tree search.
# NOTE: this rebinds grid_cv, replacing the decision-tree search results.
grid_cv = GridSearchCV(estimator=rfc, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(train_x, train_y)

In [None]:
# Report the winning random-forest parameters and their mean cross-validated score.
forest_best_params = grid_cv.best_params_
forest_best_score = grid_cv.best_score_
print('최적 하이퍼 파라미터:\n', forest_best_params)
print('최고 예측 정확도: {0:.4f}'.format(forest_best_score))

### 정확도 측정

In [None]:
from sklearn.metrics import accuracy_score

# Build the final forest from the hyperparameters the grid search selected
# instead of hand-copying them: literal values silently go stale whenever the
# data, the split, or the search grid changes.
# Guard against `grid_cv` still holding the decision-tree search, which
# happens if the RandomForest search cell was skipped.
assert isinstance(grid_cv.best_estimator_, RandomForestClassifier), \
    'grid_cv does not hold the random-forest search; re-run the RandomForest GridSearchCV cell first'

rfc_2 = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, **grid_cv.best_params_)
rfc_2.fit(train_x,train_y)

# Accuracy on the held-out 20% split.
pred = rfc_2.predict(test_x)
print('예측 정확도: {0:.4f}'.format(accuracy_score(test_y, pred)))