#Import Libaries

In [1]:
import numpy
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import tensorflow as tf


pd.set_option("display.max_rows", 1000)#可显示1000行
pd.set_option("display.max_columns", 1000)#可显示1000列
%matplotlib inline

#Load Data

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
import os
os.chdir("/content/drive/My Drive/CA683 Assignment2")

Mounted at /content/drive/


In [3]:
df = pd.read_csv('Premier League_2014_2022.csv')
df = df[df['ranking']!='None']

#Data Processing

In [4]:
MOVING_AVERAGE = 5
MINIMUM_RECORD = 80

In [5]:
dict_player = {}
dict_player_name = {}
for player_id in df['player_id'].unique():
  list_rate = list(df[df['player_id']==player_id]['ranking'].astype(float).rolling(MOVING_AVERAGE).mean())
  if(len(list_rate)>=MINIMUM_RECORD):
    dict_player[player_id] = list_rate[MOVING_AVERAGE-1:]
    dict_player_name[player_id] = df[df['player_id']==player_id]['player_name'].unique()[0]

In [6]:
def create_inout_sequences(input_data, tw):
    feature_seq = []
    result_seq = []
    L = len(input_data)
    for i in range(L-tw):
        train_seq = input_data[i:i+tw]
        train_label = input_data[i+tw:i+tw+1]#预测time_step之后的第一个数值
        feature_seq.append(train_seq)#feature_seq内的数据不断更新，但是总量只有tw个
        result_seq.append(train_label)

    return feature_seq,result_seq
def rmsle(predicted,real):
  sum=0.0
  for x in range(len(predicted)):
    p = np.log(predicted[x]+1)
    r = np.log(real[x]+1)
    sum = sum + (p - r)**2
  return (sum/len(predicted))**0.5

In [7]:
len(dict_player)

359

In [8]:
import random
test_player_list = random.sample(dict_player_name.keys(), 5)
test_player_list

[36865, 121338, 33104, 23067, 11796]

In [9]:
test_player_list

[36865, 121338, 33104, 23067, 11796]

#Grid Search Para

In [11]:
class MODEL_CONFIG:
  TEST_DATA_SIZE = 20
  INITIAL_WINDOW = 40
  WINDOW_LENTH = 20
  Filename = './Comparison_Work/RNN_ParaResultCV.csv'
  Foldername = './Comparison_Work/Pic20220402/'
  def __init__(self, \
        LEARNING_RATE = 0.001,\
        TIME_STEP = 10,\
        DENSE = 1,\
        NUM_NODES = 30,\
        EPOCHS = 500,\
        OPTIMIZER = 'SGD'):
    self.LEARNING_RATE = LEARNING_RATE
    self.TIME_STEP = TIME_STEP
    self.DENSE = DENSE
    self.NUM_NODES = NUM_NODES
    self.EPOCHS = EPOCHS
    if OPTIMIZER == "SGD":
      self.OPTIMIZER = tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE, decay=1e-6, momentum=0.9, nesterov=True)
    if OPTIMIZER == "ADAM":
      self.OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,beta_1=0.9,beta_2=0.999,epsilon=1e-07,amsgrad=False)
    if OPTIMIZER == "RMSprop":
      self.OPTIMIZER = tf.keras.optimizers.RMSprop(learning_rate=LEARNING_RATE,rho=0.9,momentum=0.0,epsilon=1e-07,centered=False)

In [12]:
#reload
test_player_list = [3088, 36301, 95390, 4618, 63912]

In [13]:
def sliding_cv_fit(data, config):
  list_mae = []
  list_mse = []
  list_r2 = []
  list_rmsle = []
  
  model = Sequential()
  model.add(SimpleRNN(config.NUM_NODES, input_shape=(1, config.TIME_STEP)))
  model.add(Dense(config.DENSE))
  model.compile(loss='mean_squared_error', optimizer=config.OPTIMIZER)
  for train_mark in range(config.INITIAL_WINDOW, (len(data)//config.WINDOW_LENTH)*config.WINDOW_LENTH, config.WINDOW_LENTH):
    data_piece = data[train_mark - config.INITIAL_WINDOW : train_mark + config.WINDOW_LENTH]
    data_seq,result_seq = create_inout_sequences(data_piece, config.TIME_STEP)
    
    train_X = np.array(data_seq[:-config.TEST_DATA_SIZE])
    test_X = np.array(data_seq[-config.TEST_DATA_SIZE:])
    train_Y = np.array(result_seq[:-config.TEST_DATA_SIZE])
    test_Y = np.array(result_seq[-config.TEST_DATA_SIZE:])
    
    train_X = np.reshape(train_X, (train_X.shape[0],1,train_X.shape[1]))
    test_X = np.reshape(test_X, (test_X.shape[0],1,test_X.shape[1]))
    
    model.fit(train_X, train_Y, epochs=config.EPOCHS, batch_size=1, verbose=0)

    trainPredict = model(train_X)
    testPredict = model(test_X)
  
    list_mae.append(mean_absolute_error(test_Y, testPredict))
    list_mse.append(mean_squared_error(test_Y, testPredict))
    list_r2.append(r2_score(test_Y, testPredict))
    list_rmsle.append(rmsle(test_Y, testPredict))
  #print('\nlist_mae  \t'+str(list_mae))
  #print('list_mse  \t'+str(list_mse))
  #print('list_r2  \t'+str(list_r2))
  #print('list_rmsle \t'+str(list(list_rmsle)))

  return np.mean(list_mae),np.mean(list_mse),np.mean(list_r2),np.mean(list_rmsle)

In [14]:
def fit_model(player_list,config):
  list_mae = []
  list_mse = []
  list_r2 = []
  list_rmsle = []
  for player_id in tqdm(player_list):
    data = dict_player[player_id]
    sliding_cv_fit(data,config)
    mae,mse,r2,rmsle = sliding_cv_fit(data,config)
    list_mae.append(mae)
    list_mse.append(mse)
    list_r2.append(r2)
    list_rmsle.append(rmsle)
  #print('\nlist_mae  \t'+str(list_mae))
  #print('list_mse  \t'+str(list_mse))
  #print('list_r2  \t'+str(list_r2))
  #print('list_rmsle \t'+str(list(list_rmsle)))
  return np.mean(list_mae),np.mean(list_mse),np.mean(list_r2),np.mean(list_rmsle)

In [15]:
#LEARNING_RATE = 0.01 0.001 0.0001
#TIME_STEP = 6 7 8
#DENSE = 1 2 3
#NUM_NODES = 20 30 40
#EPOCHS = 400 500
#OPTIMIZER = SGD ADAM
#for LEARNING_RATE in [0.01,0.001,0.0001]:
for LEARNING_RATE in [0.001]:
  for TIME_STEP in [6,7,8]:
    for DENSE in [1]:
      for NUM_NODES in [20,30,40]:
        for EPOCHS in [400,500]:
          for OPTIMIZER in ["SGD","ADAM"]:
            config = MODEL_CONFIG(LEARNING_RATE,TIME_STEP,DENSE,NUM_NODES,EPOCHS,OPTIMIZER)
            df_para = pd.read_csv(config.Filename)
            if len(df_para[  (df_para['LEARNING_RATE']==LEARNING_RATE) &\
                      (df_para['TIME_STEP']==TIME_STEP) &\
                      (df_para['DENSE']==DENSE) &\
                      (df_para['NUM_NODES']==NUM_NODES) &\
                      (df_para['EPOCHS']==EPOCHS) &\
                      (df_para['OPTIMIZER']==OPTIMIZER) \
                      ]) > 0:
                continue
            print("CURRENT TEST: \t"+str(len(df_para)+1))            
            print("\tLEARNING_RATE: \t"+str(LEARNING_RATE))
            print("\tTIME_STEP:   \t"+str(TIME_STEP))
            print("\tDENSE:     \t"+str(DENSE))
            print("\tNUM_NODES:   \t"+str(NUM_NODES))
            print("\tEPOCHS:     \t"+str(EPOCHS))
            print("\tOPTIMIZER:   \t"+OPTIMIZER)
            MAE,MSE,R2,RMSLE = fit_model(test_player_list,config)
            df_para = df_para.append({'LEARNING_RATE':LEARNING_RATE,\
                          'TIME_STEP':TIME_STEP,\
                          'DENSE':DENSE,\
                          'NUM_NODES':NUM_NODES,\
                          'EPOCHS':EPOCHS,\
                          'OPTIMIZER':OPTIMIZER,\
                          'TIME_STEP':TIME_STEP,\
                          'MAE':MAE,\
                          'MSE':MSE,\
                          'RMSLE':RMSLE,\
                          'R2':R2},ignore_index='true')
            df_para.to_csv(config.Filename,index=0)


CURRENT TEST: 	1
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:29<00:00, 77.83s/it]


CURRENT TEST: 	2
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:34<00:00, 78.98s/it]


CURRENT TEST: 	3
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [10:06<00:00, 121.37s/it]


CURRENT TEST: 	4
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [10:18<00:00, 123.70s/it]


CURRENT TEST: 	5
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:23<00:00, 76.76s/it]


CURRENT TEST: 	6
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:52<00:00, 82.51s/it]


CURRENT TEST: 	7
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [09:19<00:00, 111.95s/it]


CURRENT TEST: 	8
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [10:48<00:00, 129.78s/it]


CURRENT TEST: 	9
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:18<00:00, 75.67s/it]


CURRENT TEST: 	10
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:49<00:00, 81.89s/it]


CURRENT TEST: 	11
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [09:37<00:00, 115.55s/it]


CURRENT TEST: 	12
	LEARNING_RATE: 	0.001
	TIME_STEP:   	6
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [11:03<00:00, 132.79s/it]


CURRENT TEST: 	13
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:20<00:00, 76.13s/it]


CURRENT TEST: 	14
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:36<00:00, 79.25s/it]


CURRENT TEST: 	15
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [10:08<00:00, 121.77s/it]


CURRENT TEST: 	16
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [09:27<00:00, 113.59s/it]


CURRENT TEST: 	17
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:32<00:00, 78.59s/it]


CURRENT TEST: 	18
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:47<00:00, 81.51s/it]


CURRENT TEST: 	19
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [11:08<00:00, 133.64s/it]


CURRENT TEST: 	20
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [10:42<00:00, 128.41s/it]


CURRENT TEST: 	21
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:23<00:00, 76.67s/it]


CURRENT TEST: 	22
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:38<00:00, 79.78s/it]


CURRENT TEST: 	23
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [09:51<00:00, 118.35s/it]


CURRENT TEST: 	24
	LEARNING_RATE: 	0.001
	TIME_STEP:   	7
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [10:37<00:00, 127.53s/it]


CURRENT TEST: 	25
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:07<00:00, 73.51s/it]


CURRENT TEST: 	26
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:35<00:00, 79.19s/it]


CURRENT TEST: 	27
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [09:22<00:00, 112.47s/it]


CURRENT TEST: 	28
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	20
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [10:50<00:00, 130.10s/it]


CURRENT TEST: 	29
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:11<00:00, 74.23s/it]


CURRENT TEST: 	30
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:41<00:00, 80.21s/it]


CURRENT TEST: 	31
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [08:40<00:00, 104.16s/it]


CURRENT TEST: 	32
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	30
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [11:41<00:00, 140.29s/it]


CURRENT TEST: 	33
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	400
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [06:31<00:00, 78.22s/it]


CURRENT TEST: 	34
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	400
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [06:38<00:00, 79.69s/it]


CURRENT TEST: 	35
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	500
	OPTIMIZER:   	SGD


100%|██████████| 5/5 [08:23<00:00, 100.72s/it]


CURRENT TEST: 	36
	LEARNING_RATE: 	0.001
	TIME_STEP:   	8
	DENSE:     	1
	NUM_NODES:   	40
	EPOCHS:     	500
	OPTIMIZER:   	ADAM


100%|██████████| 5/5 [11:06<00:00, 133.26s/it]


#Not in Use(Previous Code)

In [10]:
#初始化文件，更换文件名称后的第一次点（做一个空文件），做的过程中千万不要点，点了就清空了。
df_para = pd.DataFrame(columns = ['LEARNING_RATE','TIME_STEP','DENSE','NUM_NODES','EPOCHS','OPTIMIZER','TIME_STEP','MAE','MSE','RMSLE','R2'],dtype = float)
df_para['OPTIMIZER'] = df_para['OPTIMIZER'].astype(str)
df_para.to_csv('./Comparison_Work/RNN_ParaResultCV.csv',index=0)