### Import python library

In [6]:
import tensorflow as tf
import numpy as np

from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
# from tensorflow.keras.utils import np_utils
from tensorflow.keras.layers import Dense, LSTM, Dropout
import tensorflow.keras.backend as K 

from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.express as px
# import plotly.graph_objects as go

from math import sqrt

### 1. Read data: augment_24group_1620.csv필요

In [7]:
# PCT_lstm_train_X = pd.read_csv("AVG_lstm_final_train_X_hit.csv")
# PCT_lstm_train_y = pd.read_csv("AVG_lstm_final_train_y_hit.csv")

# PCT_lstm_test_X = pd.read_csv("AVG_lstm_final_test_X_hit.csv")
# PCT_lstm_test_y = pd.read_csv("AVG_lstm_final_test_y_hit.csv")

lstm_train_X = pd.read_csv("lstm_data/new_AVG_train_X.csv")
lstm_train_y = pd.read_csv("lstm_data/AVG_lstm_final_train_y.csv")

lstm_test_X = pd.read_csv("lstm_data/new_AVG_test_X.csv")
lstm_test_y = pd.read_csv("lstm_data/AVG_lstm_final_test_y.csv")

In [9]:
team = list(lstm_train_X.T_ID.unique())
year = list(lstm_train_y.YEAR.unique())

In [10]:
team, year

(['LG', 'HH', 'NC', 'HT', 'SK', 'KT', 'WO', 'LT', 'SS', 'OB'],
 [2016, 2017, 2018, 2019])

#### 4) input shape로 변경 (row, timestep=2, feature)

ex) 
timestep = 2

* X_train_v 구성예시: [[1 ~ 24경기 데이터, 25 ~ 48경기 데이터], [49 ~ 72경기 데이터, 73 ~ 96경기 데이터] ]  
X_train_v.shape >> (2,2*x)             # x: 각 24group에 대한 변수 개수
* y_train_v 구성예시: 97 ~ 120 경기 승률

=> reshape

* X_train_v.shape >> (2,2,x)  # row, timestep, feature

### 2. 모델 구성(LSTM)
- optimizer: RMSprop -> lr(learning rate) 조절
- LSTM: 모델이 계속 동일한 결과값이 나올 때, input 뉴런 개수를 늘려야 한다는 글을 읽고 계속 input 노드 개수를 바꿔주면서 모델 생성중
- loss: MSE

- early_stop: patience를 크게하면 과적합 되는 경우가 있어서 최대한 작게 설정해둠
- batch_size: 모델이 계속 동일한 결과값이 나올 때, 데이터가 적어 batch size를 줄여보라는 글을 읽고 1로 설정해둠

In [12]:
model_dict = dict()
hist_dict = dict()
test_pred_df = pd.DataFrame([],columns = ['YEAR','T_ID','y','y_pred',"shift_AVG_1","shift_AVG_2",'MSE','MSE_avg'])

idx = 0
for y in year:
    tmp1 = lstm_train_X[lstm_train_X["YEAR"] == y]
    tmp2 = lstm_train_y[lstm_train_y["YEAR"] == y]
    tmp3 = lstm_test_X[lstm_test_X["YEAR"] == y]
    tmp4 = lstm_test_y[lstm_test_y["YEAR"] == y]
    for t in team:
        name = '{}{}'.format(y,t)
        print(name,"=======================================")
        
        X_train = tmp1[tmp1["T_ID"] == t].drop(["T_ID","YEAR"],axis = 1)
        y_train = tmp2[tmp2["T_ID"] == t].drop(["T_ID","YEAR"],axis=1)
        X_test = tmp3[tmp3["T_ID"] == t].drop(["T_ID","YEAR"],axis=1)
        y_test = tmp4[tmp4["T_ID"] == t].drop(["T_ID","YEAR"],axis=1)
        
        X_train_v = X_train.values
        y_train_v = y_train.values

        X_test_v = X_test.values
        y_test_v = y_test.values
        
        X_train_t = X_train_v.reshape(X_train_v.shape[0], 2,X_train_v.shape[1]//2)
        X_test_t = X_test_v.reshape(X_test_v.shape[0], 2,X_test_v.shape[1]//2)
        
        ## model
        K.clear_session() 

        model = Sequential()
        optimizer = Adam(lr=0.01)
#         optimizer = RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)

        model.add(LSTM(100,input_shape = (2,X_train_v.shape[1]//2))) # (timestep, feature)
        model.add(Dense(1)) # output = 1
        model.compile(loss='mean_squared_error', optimizer=optimizer,metrics=['mae'])

        model.summary()
        
#         hist1 = model.fit(X_train_t, y_train_v, epochs=100, batch_size=1, verbose=1)
        
        early_stop = EarlyStopping(monitor='loss', mode = 'min',patience=2, verbose=1)

        hist1 = model.fit(X_train_t, y_train_v, epochs=100,
                  batch_size=1, verbose=1, callbacks=[early_stop])
        ##
        
        model_dict[name] = model
        hist_dict[name] = hist1
        
        y_pred = model.predict(X_test_t)
        mse = mean_squared_error(y_test_v, y_pred)
        mse_avg = mean_squared_error(y_test_v,[y_train.mean()[0]])
        
        
        test_pred_df.loc[idx,:] = [y,t,y_test_v.reshape(-1)[0],y_pred.reshape(-1)[0],
                                  X_test.loc[X_test.index[0],["shift_AVG_1"]][0],
                                  X_test.loc[X_test.index[0],["shift_AVG_2"]][0], mse,mse_avg]

        idx += 1

test_pred_df[['y','y_pred',"shift_AVG_1","shift_AVG_2",'MSE','MSE_avg']] = test_pred_df[['y','y_pred',"shift_AVG_1","shift_AVG_2",'MSE','MSE_avg']].astype(float)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,30

Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
____________________________________________________

Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (No

Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
________________________________________________________

Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
____________

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 77,301
Trainable params: 77,301
Non-trainable params: 0
_________________________________________________________________
Train on 49 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               77200     
____________________________

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping


In [13]:
mean_squared_error(test_pred_df['y'],test_pred_df['y_pred'])

0.0005603615437343463

In [15]:
test_pred_df

Unnamed: 0,YEAR,T_ID,y,y_pred,shift_AVG_1,shift_AVG_2,MSE,MSE_avg
0,2016,LG,0.296069,0.29771,0.294471,0.29711,2.69336e-06,1.517778e-05
1,2016,HH,0.288575,0.301862,0.293083,0.314581,0.0001765523,0.0002450427
2,2016,NC,0.28744,0.291479,0.294611,0.286241,1.631802e-05,3.194852e-07
3,2016,HT,0.256739,0.299684,0.293286,0.31386,0.001844245,0.002007821
4,2016,SK,0.305263,0.300787,0.286055,0.308046,2.003944e-05,1.099573e-05
5,2016,KT,0.295455,0.276185,0.281437,0.262626,0.0003713209,0.0007989715
6,2016,WO,0.289941,0.31066,0.297398,0.326291,0.0004292818,0.0003572876
7,2016,LT,0.309893,0.275043,0.25625,0.285211,0.001214532,0.0008075197
8,2016,SS,0.283863,0.316396,0.329186,0.283688,0.001058382,4.781198e-05
9,2016,OB,0.298225,0.293107,0.297974,0.285885,2.618811e-05,4.693716e-05


In [14]:
# test_pred_df.sort_values(by=["YEAR","T_ID"]).to_csv("AVG_t2.csv",index = False)

### 아래 데이터 이용

In [26]:
# PCT_lstm_train_X = pd.read_csv("AVG_lstm_final_train_X_hit.csv")
# PCT_lstm_train_y = pd.read_csv("AVG_lstm_final_train_y_hit.csv")

# PCT_lstm_test_X = pd.read_csv("AVG_lstm_final_test_X_hit.csv")
# PCT_lstm_test_y = pd.read_csv("AVG_lstm_final_test_y_hit.csv")

In [24]:
test_pred_df.groupby(["YEAR"]).mean()

Unnamed: 0_level_0,y,y_pred,shift_AVG_1,shift_AVG_2,rms,rms_avg
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016,0.291146,0.287639,0.292375,0.296354,0.021336,0.015759
2017,0.291601,0.283259,0.284698,0.293109,0.021382,0.022077
2018,0.285146,0.284798,0.293297,0.283993,0.015053,0.012738
2019,0.257812,0.27512,0.269043,0.272767,0.019414,0.015692


In [25]:
test_pred_df

Unnamed: 0,YEAR,T_ID,y,y_pred,shift_AVG_1,shift_AVG_2,rms,rms_avg
0,2016,LG,0.296069,0.284477,0.294471,0.29711,0.011592,0.003896
1,2016,HH,0.288575,0.316201,0.293083,0.314581,0.027626,0.015654
2,2016,NC,0.28744,0.270977,0.294611,0.286241,0.016463,0.000565
3,2016,HT,0.256739,0.315041,0.293286,0.31386,0.058302,0.044809
4,2016,SK,0.305263,0.299972,0.286055,0.308046,0.005292,0.003316
5,2016,KT,0.295455,0.268301,0.281437,0.262626,0.027153,0.028266
6,2016,WO,0.289941,0.293155,0.297398,0.326291,0.003215,0.018902
7,2016,LT,0.309893,0.265357,0.25625,0.285211,0.044536,0.028417
8,2016,SS,0.283863,0.276533,0.329186,0.283688,0.00733,0.006915
9,2016,OB,0.298225,0.286373,0.297974,0.285885,0.011852,0.006851


In [9]:
from sklearn.metrics import r2_score
tmp = test_pred_df.copy()
# tmp['half']= 0.5
r2_y_predict = r2_score(tmp['y'], tmp['y_pred'])
r2_y_predict

-0.3438337229787254

#### 아래 파일 이용

In [None]:
# PCT_lstm_train_X = pd.read_csv("AVG_lstm_final_train_X.csv")
# PCT_lstm_train_y = pd.read_csv("AVG_lstm_final_train_y.csv")

# PCT_lstm_test_X = pd.read_csv("AVG_lstm_final_test_X.csv")
# PCT_lstm_test_y = pd.read_csv("AVG_lstm_final_test_y.csv")

In [14]:
test_pred_df.groupby(["YEAR"]).mean()

Unnamed: 0_level_0,y,y_pred,shift_AVG_1,shift_AVG_2,rms,rms_avg
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016,0.291146,0.292641,0.292375,0.296354,0.01838,0.015759
2017,0.291601,0.290575,0.284698,0.293109,0.020686,0.022077
2018,0.285146,0.285081,0.293297,0.283993,0.012775,0.012738
2019,0.257812,0.261197,0.269043,0.272767,0.016623,0.015692


In [15]:
test_pred_df

Unnamed: 0,YEAR,T_ID,y,y_pred,shift_AVG_1,shift_AVG_2,rms,rms_avg
0,2016,LG,0.296069,0.293986,0.294471,0.29711,0.002082,0.003896
1,2016,HH,0.288575,0.31035,0.293083,0.314581,0.021776,0.015654
2,2016,NC,0.28744,0.311469,0.294611,0.286241,0.024029,0.000565
3,2016,HT,0.256739,0.299722,0.293286,0.31386,0.042983,0.044809
4,2016,SK,0.305263,0.296655,0.286055,0.308046,0.008608,0.003316
5,2016,KT,0.295455,0.266835,0.281437,0.262626,0.02862,0.028266
6,2016,WO,0.289941,0.293779,0.297398,0.326291,0.003839,0.018902
7,2016,LT,0.309893,0.273169,0.25625,0.285211,0.036723,0.028417
8,2016,SS,0.283863,0.29061,0.329186,0.283688,0.006747,0.006915
9,2016,OB,0.298225,0.289828,0.297974,0.285885,0.008397,0.006851
