### Import python library

In [1]:
import tensorflow as tf
import numpy as np

from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
# from tensorflow.keras.utils import np_utils
from tensorflow.keras.layers import Dense, LSTM, Dropout
import tensorflow.keras.backend as K 

from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.express as px
# import plotly.graph_objects as go

from math import sqrt

### 1. Read data: augment_24group_1620.csv필요

In [2]:
PCT_lstm_train_X = pd.read_csv("lstmPCT/PCT_lstm_final_train_X.csv")
PCT_lstm_train_y = pd.read_csv("lstmPCT/PCT_lstm_final_train_y.csv")

PCT_lstm_test_X = pd.read_csv("lstmPCT/PCT_lstm_final_test_X.csv")
PCT_lstm_test_y = pd.read_csv("lstmPCT/PCT_lstm_final_test_y.csv")

In [3]:
team = list(PCT_lstm_train_X.T_ID.unique())
year = list(PCT_lstm_train_y.YEAR.unique())

In [4]:
team, year

(['LG', 'HH', 'NC', 'HT', 'SK', 'KT', 'WO', 'LT', 'SS', 'OB'],
 [2016, 2017, 2018, 2019])

#### 4) input shape로 변경 (row, timestep=2, feature)

ex) 
timestep = 2

* X_train_v 구성예시: [[1 ~ 24경기 데이터, 25 ~ 48경기 데이터], [49 ~ 72경기 데이터, 73 ~ 96경기 데이터] ]  
X_train_v.shape >> (2,2*x)             # x: 각 24group에 대한 변수 개수
* y_train_v 구성예시: 97 ~ 120 경기 승률

=> reshape

* X_train_v.shape >> (2,2,x)  # row, timestep, feature

### 2. 모델 구성(LSTM)
- optimizer: RMSprop -> lr(learning rate) 조절
- LSTM: 모델이 계속 동일한 결과값이 나올 때, input 뉴런 개수를 늘려야 한다는 글을 읽고 계속 input 노드 개수를 바꿔주면서 모델 생성중
- loss: MSE

- early_stop: patience를 크게하면 과적합 되는 경우가 있어서 최대한 작게 설정해둠
- batch_size: 모델이 계속 동일한 결과값이 나올 때, 데이터가 적어 batch size를 줄여보라는 글을 읽고 1로 설정해둠

In [5]:
model_dict = dict()
hist_dict = dict()
test_pred_df = pd.DataFrame([],columns = ['YEAR','T_ID','y','y_pred',"shift_PCT_1","shift_PCT_2",'rms','rms0.5'])

idx = 0

tmp1 = PCT_lstm_train_X
tmp2 = PCT_lstm_train_y
tmp3 = PCT_lstm_test_X
tmp4 = PCT_lstm_test_y

for t in team:
    name = '{}'.format(t)
    print(name,"=======================================")

    X_train = tmp1[tmp1["T_ID"] == t].drop(["T_ID","YEAR"],axis = 1)
    y_train = tmp2[tmp2["T_ID"] == t].drop(["T_ID","YEAR"],axis=1)
    X_test = tmp3[tmp3["T_ID"] == t].drop(["T_ID",'YEAR'],axis=1)
    y_test = tmp4[tmp4["T_ID"] == t].drop(["T_ID",'YEAR'],axis=1)

    X_train_v = X_train.values
    y_train_v = y_train.values

    X_test_v = X_test.values
    y_test_v = y_test.values

    X_train_t = X_train_v.reshape(X_train_v.shape[0], 2,X_train_v.shape[1]//2)
    X_test_t = X_test_v.reshape(X_test_v.shape[0], 2,X_test_v.shape[1]//2)

    ## model
    K.clear_session() 

    model = Sequential()
    optimizer = Adam(lr=0.01)
#         optimizer = RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)

    model.add(LSTM(100,input_shape = (2,X_train_v.shape[1]//2))) # (timestep, feature)
    model.add(Dense(1)) # output = 1
    model.compile(loss='mean_squared_error', optimizer=optimizer,metrics=['mae'])

    model.summary()

#         hist1 = model.fit(X_train_t, y_train_v, epochs=100, batch_size=1, verbose=1)

    early_stop = EarlyStopping(monitor='loss', mode = 'min',patience=2, verbose=1)

    hist1 = model.fit(X_train_t, y_train_v, epochs=100,
              batch_size=2, verbose=1, callbacks=[early_stop])
    ##

    model_dict[name] = model
    hist_dict[name] = hist1
    
    
    year = 2016
    for tt in range(len(X_test_t)):
        ttmp = X_test_t[tt].reshape(1,X_test_t[tt].shape[0],X_test_t[tt].shape[1])
        y_pred = model.predict(ttmp)
        rms = sqrt(mean_squared_error(y_test_v[tt], y_pred))
        rms05 = sqrt(mean_squared_error(y_test_v[tt], [0.5]))
    #     rms_avg = sqrt(mean_squared_error(y_test_v,[y_train.mean()[0]]))


        test_pred_df.loc[idx,:] = [year,t,y_test_v[tt].reshape(-1)[0],y_pred.reshape(-1)[0],
                                  X_test.loc[X_test.index[tt],["shift_PCT_1"]][0],
                                  X_test.loc[X_test.index[tt],["shift_PCT_2"]][0], rms,rms05]
        year += 1
        idx += 1

test_pred_df[['y','y_pred',"shift_PCT_1","shift_PCT_2",'rms','rms0.5']] = test_pred_df[['y','y_pred',"shift_PCT_1","shift_PCT_2",'rms','rms0.5']].astype(float)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               63200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 63,301
Trainable params: 63,301
Non-trainable params: 0
_________________________________________________________________
Train on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               63200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 63,301
Trainable params: 63,301
Non-trainable params:

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               63200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 63,301
Trainable params: 63,301
Non-trainable params: 0
_________________________________________________________________
Train on 196 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  

In [11]:
from sklearn.metrics import r2_score
tmp = test_pred_df.copy()
# tmp['half']= 0.5
r2_y_predict = r2_score(tmp['y'], tmp['y_pred'])
r2_y_predict

-0.08037503385826161

In [7]:
test_pred_df.groupby(["YEAR"]).mean()

Unnamed: 0_level_0,y,y_pred,shift_PCT_1,shift_PCT_2,rms,rms0.5
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016,0.492391,0.497217,0.504167,0.508514,0.084939,0.075725
2017,0.49837,0.485499,0.491848,0.521937,0.095652,0.07663
2018,0.507971,0.493422,0.500362,0.49748,0.050633,0.071377
2019,0.505072,0.509464,0.508152,0.49529,0.115817,0.105072


In [7]:
test_pred_df.sort_values(by = ["YEAR","T_ID"]).to_csv("PCT_t2_noseason.csv", index = False)

In [8]:
mean_squared_error(test_pred_df['y'],test_pred_df['y_pred'])

0.014882024328722784

In [40]:
test_pred_df.groupby(["YEAR"]).mean()

Unnamed: 0_level_0,y,y_pred,shift_PCT_1,shift_PCT_2,rms,rms0.5
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016,0.492391,0.510814,0.504167,0.508514,0.070072,0.075725
2017,0.49837,0.488735,0.491848,0.521937,0.105351,0.07663
2018,0.507971,0.511459,0.500362,0.49748,0.042255,0.071377
2019,0.505072,0.520403,0.508152,0.49529,0.12183,0.105072


In [41]:
test_pred_df

Unnamed: 0,YEAR,T_ID,y,y_pred,shift_PCT_1,shift_PCT_2,rms,rms0.5
0,2016,LG,0.608696,0.650341,0.583333,0.458333,0.041645,0.108696
1,2017,LG,0.434783,0.505163,0.304348,0.652174,0.07038,0.065217
2,2018,LG,0.416667,0.464049,0.291667,0.478261,0.047382,0.083333
3,2019,LG,0.541667,0.497537,0.541667,0.5,0.044129,0.041667
4,2016,HH,0.5,0.465625,0.5,0.608696,0.034375,0.0
5,2017,HH,0.434783,0.472319,0.541667,0.333333,0.037537,0.065217
6,2018,HH,0.5,0.424813,0.416667,0.583333,0.075187,0.0
7,2019,HH,0.583333,0.435061,0.375,0.25,0.148272,0.083333
8,2016,NC,0.565217,0.460554,0.5,0.541667,0.104664,0.065217
9,2017,NC,0.5,0.455474,0.458333,0.541667,0.044526,0.0


In [33]:
test_pred_df.groupby(["T_ID"]).mean()

Unnamed: 0_level_0,y,y_pred,shift_PCT_1,shift_PCT_2,rms,rms0.5
T_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HH,0.504529,0.532457,0.458333,0.443841,0.089852,0.037138
HT,0.5,0.493976,0.504982,0.539402,0.019504,0.020833
KT,0.431159,0.376189,0.401268,0.434824,0.0847,0.089674
LG,0.500453,0.489523,0.430254,0.522192,0.04477,0.074728
LT,0.510417,0.482492,0.493659,0.435771,0.161231,0.15625
NC,0.511775,0.556893,0.505435,0.494565,0.051286,0.053442
OB,0.631793,0.594275,0.674366,0.580616,0.042592,0.131793
SK,0.479167,0.515187,0.510417,0.5625,0.144313,0.083333
SS,0.457428,0.458,0.443388,0.450593,0.054178,0.053442
WO,0.48279,0.587607,0.589221,0.59375,0.105755,0.121377


In [34]:
test_pred_df["diff"] = test_pred_df["rms"] - test_pred_df["rms0.5"]
test_pred_df

Unnamed: 0,T_ID,y,y_pred,shift_PCT_1,shift_PCT_2,rms,rms0.5,diff
0,LG,0.608696,0.562782,0.583333,0.458333,0.045914,0.108696,-0.062782
1,LG,0.434783,0.475488,0.304348,0.652174,0.040705,0.065217,-0.024512
2,LG,0.416667,0.443642,0.291667,0.478261,0.026975,0.083333,-0.056358
3,LG,0.541667,0.476182,0.541667,0.5,0.065484,0.041667,0.023818
4,HH,0.5,0.547024,0.5,0.608696,0.047024,0.0,0.047024
5,HH,0.434783,0.623317,0.541667,0.333333,0.188535,0.065217,0.123317
6,HH,0.5,0.49421,0.416667,0.583333,0.00579,0.0,0.00579
7,HH,0.583333,0.465275,0.375,0.25,0.118058,0.083333,0.034725
8,NC,0.565217,0.565169,0.5,0.541667,4.9e-05,0.065217,-0.065169
9,NC,0.5,0.558253,0.458333,0.541667,0.058253,0.0,0.058253
