In [33]:
import pandas as pd
import os
from datetime import date
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [34]:
if not os.getcwd().endswith('Football Forecasting Version 2'):
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

dir_path = os.getcwd()

In [35]:
current_season = int(input('What is the current season?'))

In [36]:
today = date.today()

df_train = pd.read_csv(fr'{dir_path}\{current_season}_{current_season + 1}_Clean_Data\{current_season}_{current_season + 1}_{today.strftime("%b%d")}_historical_data.csv', index_col = 0)
df = df_train

In [37]:
## RESULT ENCODING ##
def encode_result(x):
    if x['Result'] == 'W':
        val= 2
    elif x['Result'] == 'D':
        val= 1
    else:
        val = 0
    return val

df['Result'] = df.apply(encode_result, axis = 1) 

#POINTS DIFF#
df['Points_Diff'] = (df['Pts_x']/df['Pl_x'] - df['Pts_y']/df['Pl_y'])/3


##SPI Diff##
df['SPI_Diff'] = df['spi_x'] - df['spi_y']
df['SPI_Diff'] = (df['SPI_Diff'] - df.groupby(['Season'])['SPI_Diff'].transform(min))/(df.groupby(['Season'])['SPI_Diff'].transform(max) - df.groupby(['Season'])['SPI_Diff'].transform(min))
df['Off_Diff'] = df['off_x'] - df['off_y']
df['Off_Diff'] = (df['Off_Diff'] - df.groupby(['Season'])['Off_Diff'].transform(min))/(df.groupby(['Season'])['Off_Diff'].transform(max) - df.groupby(['Season'])['Off_Diff'].transform(min))
df['Def_Diff'] = df['def_x'] - df['def_y']
df['Def_Diff'] = (df['Def_Diff'] - df.groupby(['Season'])['Def_Diff'].transform(min))/(df.groupby(['Season'])['Def_Diff'].transform(max) - df.groupby(['Season'])['Def_Diff'].transform(min))

##FORM##
df['Form_Diff'] = (df['Form_x'] - df['Form_y'])/15

# Creating form for goals and goals conceded and expected values for last 5 games
df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['Avg_GF_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['Avg_GA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['Avg_xG_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['Avg_xGA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Avg_Poss_last_5'].rolling(5).mean().reset_index([0,1],drop=True)

#Standardisation
df['Avg_GF_last_5'] = (df['Avg_GF_last_5'] - df.groupby(['Season'])['Avg_GF_last_5'].transform(min))/(df.groupby(['Season'])['Avg_GF_last_5'].transform(max) - df.groupby(['Season'])['Avg_GF_last_5'].transform(min))
df['Avg_GA_last_5'] = (df['Avg_GA_last_5'] - df.groupby(['Season'])['Avg_GA_last_5'].transform(min))/(df.groupby(['Season'])['Avg_GA_last_5'].transform(max) - df.groupby(['Season'])['Avg_GA_last_5'].transform(min))
df['Avg_xG_last_5'] = (df['Avg_xG_last_5'] - df.groupby(['Season'])['Avg_xG_last_5'].transform(min))/(df.groupby(['Season'])['Avg_xG_last_5'].transform(max) - df.groupby(['Season'])['Avg_xG_last_5'].transform(min))
df['Avg_xGA_last_5'] = (df['Avg_xGA_last_5'] - df.groupby(['Season'])['Avg_xGA_last_5'].transform(min))/(df.groupby(['Season'])['Avg_xGA_last_5'].transform(max) - df.groupby(['Season'])['Avg_xGA_last_5'].transform(min))
df['Avg_Poss_last_5'] = (df['Avg_Poss_last_5'] - df.groupby(['Season'])['Avg_Poss_last_5'].transform(min))/(df.groupby(['Season'])['Avg_Poss_last_5'].transform(max) - df.groupby(['Season'])['Avg_Poss_last_5'].transform(min))

##SEASON STATS##
df['Avg_GF_season'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
df['Avg_GF_season'] = df.groupby(['Season'])['Avg_GF_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_GA_season'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
df['Avg_GA_season'] = df.groupby(['Season'])['Avg_GA_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_xG_season'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
df['Avg_xG_season'] = df.groupby(['Season'])['Avg_xG_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_xGA_season'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
df['Avg_xGA_season'] = df.groupby(['Season'])['Avg_xGA_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_Poss_season'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
df['Avg_Poss_season'] = df.groupby(['Season'])['Avg_Poss_season'].expanding(1).mean().reset_index([0],drop=True)

#Standardisation
df['Avg_GF_season'] = (df['Avg_GF_season'] - df.groupby(['Season'])['Avg_GF_season'].transform(min))/(df.groupby(['Season'])['Avg_GF_season'].transform(max) - df.groupby(['Season'])['Avg_GF_season'].transform(min))
df['Avg_GA_season'] = (df['Avg_GA_season'] - df.groupby(['Season'])['Avg_GA_season'].transform(min))/(df.groupby(['Season'])['Avg_GA_season'].transform(max) - df.groupby(['Season'])['Avg_GA_season'].transform(min))
df['Avg_xG_season'] = (df['Avg_xG_season'] - df.groupby(['Season'])['Avg_xG_season'].transform(min))/(df.groupby(['Season'])['Avg_xG_season'].transform(max) - df.groupby(['Season'])['Avg_xG_season'].transform(min))
df['Avg_xGA_season'] = (df['Avg_xGA_season'] - df.groupby(['Season'])['Avg_xGA_season'].transform(min))/(df.groupby(['Season'])['Avg_xGA_season'].transform(max) - df.groupby(['Season'])['Avg_xGA_season'].transform(min))
df['Avg_Poss_season'] = (df['Avg_Poss_season'] - df.groupby(['Season'])['Avg_Poss_season'].transform(min))/(df.groupby(['Season'])['Avg_Poss_season'].transform(max) - df.groupby(['Season'])['Avg_Poss_season'].transform(min))

##AGAINST OPPONENT##
df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['GF_x'].shift(1)
df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['Avg_GF_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['GA_x'].shift(1)
df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['Avg_GA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['xG_x'].shift(1)
df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['Avg_xG_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['xGA_x'].shift(1)
df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['Avg_xGA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Poss_x'].shift(1)
df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Avg_Poss_Opp'].rolling(2).mean().reset_index([0,1],drop=True)

#Standardisation
df['Avg_GF_Opp'] = (df['Avg_GF_Opp'] - df.groupby(['Opp'])['Avg_GF_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_GF_Opp'].transform(max) - df.groupby(['Opp'])['Avg_GF_Opp'].transform(min))
df['Avg_GA_Opp'] = (df['Avg_GA_Opp'] - df.groupby(['Opp'])['Avg_GA_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_GA_Opp'].transform(max) - df.groupby(['Opp'])['Avg_GA_Opp'].transform(min))
df['Avg_xG_Opp'] = (df['Avg_xG_Opp'] - df.groupby(['Opp'])['Avg_xG_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_xG_Opp'].transform(max) - df.groupby(['Opp'])['Avg_xG_Opp'].transform(min))
df['Avg_xGA_Opp'] = (df['Avg_xGA_Opp'] - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_xGA_Opp'].transform(max) - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(min))
df['Avg_Poss_Opp'] = (df['Avg_Poss_Opp'] - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_Poss_Opp'].transform(max) - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(min))

In [38]:
# ## RESULT ENCODING ##
# def encode_result(x):
#     if x['Result'] == 'W':
#         val= 2
#     elif x['Result'] == 'D':
#         val= 1
#     else:
#         val = 0
#     return val

# df['Result'] = df.apply(encode_result, axis = 1) 

# #POINTS DIFF#
# df['Points_Diff'] = (df['Pts_x']/df['Pl_x'] - df['Pts_y']/df['Pl_y'])/3


# ##SPI Diff##
# df['SPI_Diff'] = df['spi_x'] - df['spi_y']
# df['SPI_Diff'] = (df['SPI_Diff'] - df.groupby(['Season'])['SPI_Diff'].transform(np.mean))/df.groupby(['Season'])['SPI_Diff'].transform(np.std)
# df['Off_Diff'] = df['off_x'] - df['off_y']
# df['Off_Diff'] = (df['Off_Diff'] - df.groupby(['Season'])['Off_Diff'].transform(np.mean))/df.groupby(['Season'])['Off_Diff'].transform(np.std) 
# df['Def_Diff'] = df['def_x'] - df['def_y']
# df['Def_Diff'] = (df['Def_Diff'] - df.groupby(['Season'])['Def_Diff'].transform(np.mean))/df.groupby(['Season'])['Def_Diff'].transform(np.std)

# ##FORM##
# df['Form_Diff'] = (df['Form_x'] - df['Form_y'])/15

# # Creating form for goals and goals conceded and expected values for last 5 games
# df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
# df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['Avg_GF_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
# df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
# df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['Avg_GA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
# df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
# df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['Avg_xG_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
# df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
# df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['Avg_xGA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
# df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
# df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Avg_Poss_last_5'].rolling(5).mean().reset_index([0,1],drop=True)

# #Standardisation
# df['Avg_GF_last_5'] = (df['Avg_GF_last_5'] - df.groupby(['Season'])['Avg_GF_last_5'].transform(np.mean))/df.groupby(['Season'])['Avg_GF_last_5'].transform(np.std)
# df['Avg_GA_last_5'] = (df['Avg_GA_last_5'] - df.groupby(['Season'])['Avg_GA_last_5'].transform(np.mean))/df.groupby(['Season'])['Avg_GA_last_5'].transform(np.std) 
# df['Avg_xG_last_5'] = (df['Avg_xG_last_5'] - df.groupby(['Season'])['Avg_xG_last_5'].transform(np.mean))/df.groupby(['Season'])['Avg_xG_last_5'].transform(np.std) 
# df['Avg_xGA_last_5'] = (df['Avg_xGA_last_5'] - df.groupby(['Season'])['Avg_xGA_last_5'].transform(np.mean))/df.groupby(['Season'])['Avg_xGA_last_5'].transform(np.std) 
# df['Avg_Poss_last_5'] = (df['Avg_Poss_last_5'] - df.groupby(['Season'])['Avg_Poss_last_5'].transform(np.mean))/df.groupby(['Season'])['Avg_Poss_last_5'].transform(np.std) 

# ##SEASON STATS##
# df['Avg_GF_season'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
# df['Avg_GF_season'] = df.groupby(['Season'])['Avg_GF_season'].expanding(1).mean().reset_index([0],drop=True)
# df['Avg_GA_season'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
# df['Avg_GA_season'] = df.groupby(['Season'])['Avg_GA_season'].expanding(1).mean().reset_index([0],drop=True)
# df['Avg_xG_season'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
# df['Avg_xG_season'] = df.groupby(['Season'])['Avg_xG_season'].expanding(1).mean().reset_index([0],drop=True)
# df['Avg_xGA_season'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
# df['Avg_xGA_season'] = df.groupby(['Season'])['Avg_xGA_season'].expanding(1).mean().reset_index([0],drop=True)
# df['Avg_Poss_season'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
# df['Avg_Poss_season'] = df.groupby(['Season'])['Avg_Poss_season'].expanding(1).mean().reset_index([0],drop=True)

# #Standardisation
# df['Avg_GF_season'] = (df['Avg_GF_season'] - df.groupby(['Season'])['Avg_GF_season'].transform(np.mean))/df.groupby(['Season'])['Avg_GF_season'].transform(np.std) 
# df['Avg_GA_season'] = (df['Avg_GA_season'] - df.groupby(['Season'])['Avg_GA_season'].transform(np.mean))/df.groupby(['Season'])['Avg_GA_season'].transform(np.std) 
# df['Avg_xG_season'] = (df['Avg_xG_season'] - df.groupby(['Season'])['Avg_xG_season'].transform(np.mean))/df.groupby(['Season'])['Avg_xG_season'].transform(np.std) 
# df['Avg_xGA_season'] = (df['Avg_xGA_season'] - df.groupby(['Season'])['Avg_xGA_season'].transform(np.mean))/df.groupby(['Season'])['Avg_xGA_season'].transform(np.std) 
# df['Avg_Poss_season'] = (df['Avg_Poss_season'] - df.groupby(['Season'])['Avg_Poss_season'].transform(np.mean))/df.groupby(['Season'])['Avg_Poss_season'].transform(np.std) 

# ##AGAINST OPPONENT##
# df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['GF_x'].shift(1)
# df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['Avg_GF_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
# df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['GA_x'].shift(1)
# df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['Avg_GA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
# df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['xG_x'].shift(1)
# df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['Avg_xG_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
# df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['xGA_x'].shift(1)
# df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['Avg_xGA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
# df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Poss_x'].shift(1)
# df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Avg_Poss_Opp'].rolling(2).mean().reset_index([0,1],drop=True)

# #Standardisation
# df['Avg_GF_Opp'] = (df['Avg_GF_Opp'] - df.groupby(['Opp'])['Avg_GF_Opp'].transform(np.mean))/df.groupby(['Opp'])['Avg_GF_Opp'].transform(np.std) 
# df['Avg_GA_Opp'] = (df['Avg_GA_Opp'] - df.groupby(['Opp'])['Avg_GA_Opp'].transform(np.mean))/df.groupby(['Opp'])['Avg_GA_Opp'].transform(np.std) 
# df['Avg_xG_Opp'] = (df['Avg_xG_Opp'] - df.groupby(['Opp'])['Avg_xG_Opp'].transform(np.mean))/df.groupby(['Opp'])['Avg_xG_Opp'].transform(np.std) 
# df['Avg_xGA_Opp'] = (df['Avg_xGA_Opp'] - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(np.mean))/df.groupby(['Opp'])['Avg_xGA_Opp'].transform(np.std) 
# df['Avg_Poss_Opp'] = (df['Avg_Poss_Opp'] - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(np.mean))/df.groupby(['Opp'])['Avg_Poss_Opp'].transform(np.std) 

In [39]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

unwanted_columns = ['Date','Opp','Result','GF_x', 'xG_x', 'xGA_x',
       'Poss_x', 'Club', 'Poss_y', 'Position_x', 'Pl_x', 'Pts_x', 'Form_x',
       'Position_y', 'Pl_y', 'Pts_y', 'Form_y', 'spi_x', 'off_x', 'def_x',
       'Season', 'spi_y', 'off_y','def_y']
df_train = df_train.drop(columns = unwanted_columns)
df_test = df_test.drop(columns = unwanted_columns)
df_train = df_train.dropna()
df_test = df_test.dropna()
df_train= pd.get_dummies(df_train, columns=['Venue'])
df_test= pd.get_dummies(df_test, columns=['Venue'])

boo = df_train.drop(columns = 'GA_x')

X_train = df_train[boo.columns]
y_train = df_train['GA_x']

X_test = df_test[boo.columns]
y_test = df_test['GA_x']

df_test.columns

Index(['GA_x', 'Points_Diff', 'SPI_Diff', 'Off_Diff', 'Def_Diff', 'Form_Diff',
       'Avg_GF_last_5', 'Avg_GA_last_5', 'Avg_xG_last_5', 'Avg_xGA_last_5',
       'Avg_Poss_last_5', 'Avg_GF_season', 'Avg_GA_season', 'Avg_xG_season',
       'Avg_xGA_season', 'Avg_Poss_season', 'Avg_GF_Opp', 'Avg_GA_Opp',
       'Avg_xG_Opp', 'Avg_xGA_Opp', 'Avg_Poss_Opp', 'Venue_Away',
       'Venue_Home'],
      dtype='object')

In [40]:
dtr = RandomForestRegressor(n_estimators = 50, max_depth = 4, random_state = 0)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)


In [41]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE (test): {mse:.4f}')
print(f'RMSE (test): {mse**0.5:.4f}')
print(f'MAE (test): {mae:.4f}')
print(f'R-squared is: {r2}')

MSE (test): 1.6066
RMSE (test): 1.2675
MAE (test): 0.9379
R-squared is: 0.15082401688095604


In [42]:
def hyper_tuning_random_forest(X_train, y_train, X_test, y_test, est_list, depth_list):
    data = []
    for estimate in est_list:
        for depth in depth_list:
            inner_dict = {}
            inner_dict['depth'] = depth
            dt = RandomForestRegressor(n_estimators = estimate, max_depth = depth, random_state = 200)
            dt.fit(X_train, y_train)
            y_pred = dt.predict(X_test)
        
            mse = mean_squared_error(y_test, y_pred)
            rmse = mse**0.5
            mae = mean_absolute_error(y_test,y_pred)
            r2 = r2_score(y_test, y_pred)
        
            inner_dict['estimate'] = estimate
            inner_dict['mae'] = mae
            inner_dict['mse'] = mse
            inner_dict['rmse'] = rmse
            inner_dict['r2'] = r2
            data.append(inner_dict)
            
    summary_df = pd.DataFrame(data)
    summary_df.sort_values('rmse', inplace = True)
    return summary_df

In [43]:
depth_list = [3,4,5,6,7,8,9,10]
est_list = [30,40,50,60,70,80,90,100,110,125,150,200,250]

hyper_tuning_random_forest(X_train, y_train, X_test, y_test, est_list, depth_list).head()

Unnamed: 0,depth,estimate,mae,mse,rmse,r2
96,3,250,0.934816,1.595678,1.263202,0.156611
88,3,200,0.934918,1.597885,1.264075,0.155444
80,3,150,0.935049,1.599137,1.26457,0.154782
72,3,125,0.935591,1.600625,1.265158,0.153996
56,3,100,0.936668,1.601883,1.265655,0.153331


In [44]:
xg_reg = xgb.XGBRegressor(learning_rate = 0.1, max_depth = 4, n_estimators = 150)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE (test): {mse:.4f}')
print(f'RMSE (test): {mse**0.5:.4f}')
print(f'MAE (test): {mae:.4f}')
print(f'R-squared is: {r2}')

MSE (test): 1.7237
RMSE (test): 1.3129
MAE (test): 0.9772
R-squared is: 0.08894806264425059


In [45]:
def hyper_tuning_xg_boost(X_train, y_train, X_test, y_test, est_list, depth_list, rate_list):
    data = []
    for estimate in est_list:
        for depth in depth_list:
            for rate in rate_list:
                inner_dict = {}
                inner_dict['depth'] = depth
                dt = xgb.XGBRegressor(n_estimators = estimate, max_depth = depth, learning_rate = rate)
                dt.fit(X_train, y_train)
                y_pred = dt.predict(X_test)
            
                mse = mean_squared_error(y_test, y_pred)
                rmse = mse**0.5
                mae = mean_absolute_error(y_test,y_pred)
                r2 = r2_score(y_test, y_pred)
            
                inner_dict['estimate'] = estimate
                inner_dict['learning rate'] = rate
                inner_dict['mae'] = mae
                inner_dict['mse'] = mse
                inner_dict['rmse'] = rmse
                inner_dict['r2'] = r2
                data.append(inner_dict)
            
    summary_df = pd.DataFrame(data)
    summary_df.sort_values('rmse', inplace = True)
    return summary_df

In [46]:
depth_list = [3,4,5,6,7,8,9,10]
est_list = [30,40,50,60,70,80,90,100,110,125,150,200,250]
rate_list = [0.01, 0.02, 0.05, 0.07, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]

hyper_tuning_xg_boost(X_train, y_train, X_test, y_test, est_list, depth_list, rate_list).head()

Unnamed: 0,depth,estimate,learning rate,mae,mse,rmse,r2
268,3,60,0.1,0.932882,1.61472,1.270716,0.146546
91,3,40,0.07,0.928617,1.617498,1.271809,0.145078
92,3,40,0.1,0.93185,1.61888,1.272352,0.144348
180,3,50,0.1,0.935033,1.619034,1.272413,0.144266
356,3,70,0.1,0.933616,1.620954,1.273167,0.143251
