### Import python library

In [1]:
import tensorflow as tf
import numpy as np

from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
# from tensorflow.keras.utils import np_utils
from tensorflow.keras.layers import Dense, LSTM, Dropout
import tensorflow.keras.backend as K 

from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.express as px
# import plotly.graph_objects as go

from math import sqrt

### 1. Read data: augment_24group_1620.csv필요

In [2]:
# lstm_train_X = pd.read_csv("lstm_data/new_AVG_train_X.csv")
# lstm_train_y = pd.read_csv("lstm_data/AVG_lstm_final_train_y.csv")

# lstm_test_X = pd.read_csv("lstm_data/new_AVG_test_X.csv")
# lstm_test_y = pd.read_csv("lstm_data/AVG_lstm_final_test_y.csv")

lstm_train_X = pd.read_csv("lstm_data/new_PCT_train_X.csv")
lstm_train_y = pd.read_csv("lstm_data/PCT_lstm_final_train_y.csv")

lstm_test_X = pd.read_csv("lstm_data/new_PCT_test_X.csv")
lstm_test_y = pd.read_csv("lstm_data/PCT_lstm_final_test_y.csv")

In [3]:
team = list(lstm_train_X.T_ID.unique())
year = list(lstm_train_y.YEAR.unique())

In [4]:
team, year

(['LG', 'HH', 'NC', 'HT', 'SK', 'KT', 'WO', 'LT', 'SS', 'OB'],
 [2016, 2017, 2018, 2019])

#### 4) input shape로 변경 (row, timestep=2, feature)

ex) 
timestep = 2

* X_train_v 구성예시: [[1 ~ 24경기 데이터, 25 ~ 48경기 데이터], [49 ~ 72경기 데이터, 73 ~ 96경기 데이터] ]  
X_train_v.shape >> (2,2*x)             # x: 각 24group에 대한 변수 개수
* y_train_v 구성예시: 97 ~ 120 경기 승률

=> reshape

* X_train_v.shape >> (2,2,x)  # row, timestep, feature

### 2. 모델 구성(LSTM)
- optimizer: RMSprop -> lr(learning rate) 조절
- LSTM: 모델이 계속 동일한 결과값이 나올 때, input 뉴런 개수를 늘려야 한다는 글을 읽고 계속 input 노드 개수를 바꿔주면서 모델 생성중
- loss: MSE

- early_stop: patience를 크게하면 과적합 되는 경우가 있어서 최대한 작게 설정해둠
- batch_size: 모델이 계속 동일한 결과값이 나올 때, 데이터가 적어 batch size를 줄여보라는 글을 읽고 1로 설정해둠

### <span style = "color:red"> MSE로 통일했습니다! </span>

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
model_dict = dict()
hist_dict = dict()
test_pred_df = pd.DataFrame([],columns = ['YEAR','T_ID','y','y_pred'])


X = lstm_train_X.drop(["T_ID","YEAR"],axis = 1)
y = lstm_train_y.drop(["T_ID","YEAR"],axis=1)
X_test = lstm_test_X.drop(["T_ID","YEAR"],axis=1)
y_test = lstm_test_y.drop(["T_ID","YEAR"],axis=1)
        
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2, shuffle=False, random_state=1004)    
    

X_train_v = X_train.values
y_train_v = y_train.values

X_valid_v = X_valid.values
y_valid_v = y_valid.values

X_test_v = X_test.values
y_test_v = y_test.values

X_train_t = X_train_v.reshape(X_train_v.shape[0], 2,X_train_v.shape[1]//2)
X_valid_t = X_valid_v.reshape(X_valid_v.shape[0],2,X_valid_v.shape[1]//2)
X_test_t = X_test_v.reshape(X_test_v.shape[0], 2,X_test_v.shape[1]//2)


## model
K.clear_session() 

model = Sequential()
optimizer = Adam(lr=0.01)
model.add(LSTM(100,input_shape = (2,X_train_v.shape[1]//2))) # (timestep, feature)
model.add(Dense(1)) # output = 1
model.compile(loss='mean_squared_error', optimizer=optimizer,metrics=['mae'])

model.summary()

early_stop = EarlyStopping(monitor='val_loss', mode = 'min',patience=1, verbose=1)

hist1 = model.fit(X_train_t, y_train_v, epochs=100,
          batch_size=1, verbose=1, callbacks=[early_stop], validation_data = (X_valid_t, y_valid_v))
##

y_pred = model.predict(X_test_t)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               86000     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 86,101
Trainable params: 86,101
Non-trainable params: 0
_________________________________________________________________
Train on 1568 samples, validate on 392 samples
Epoch 1/100
Epoch 2/100
Epoch 00002: early stopping


## 성능평가

In [10]:
test_pred_df = lstm_test_y.copy()
test_pred_df['PCT_pred'] = y_pred

In [11]:
test_pred_df

Unnamed: 0,T_ID,YEAR,PCT,PCT_pred
0,HH,2016,0.5,0.579627
1,HT,2016,0.458333,0.579637
2,KT,2016,0.291667,0.579624
3,LG,2016,0.608696,0.579627
4,LT,2016,0.5,0.579627
5,NC,2016,0.565217,0.579636
6,OB,2016,0.666667,0.579641
7,SK,2016,0.458333,0.579627
8,SS,2016,0.5,0.579627
9,WO,2016,0.375,0.579647


In [13]:
mean_squared_error(test_pred_df['PCT'],test_pred_df['PCT_pred'])

0.017394340714362685

In [14]:
from sklearn.metrics import r2_score
tmp = test_pred_df.copy()
# tmp['half']= 0.5
r2_y_predict = r2_score(tmp['PCT'], tmp['PCT_pred'])
r2_y_predict

-0.5524908576095005