In [41]:
import pandas as pd
import os
from datetime import date
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [42]:
if not os.getcwd().endswith('Football Forecasting Version 2'):
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

dir_path = os.getcwd()

In [43]:
current_season = int(input('What is the current season?'))

In [44]:
today = date.today()

df_train = pd.read_csv(fr'{dir_path}\{current_season}_{current_season + 1}_Clean_Data\{current_season}_{current_season + 1}_{today.strftime("%b%d")}_historical_data.csv', index_col = 0)

df_pred = pd.read_csv(fr'{dir_path}\{current_season}_{current_season + 1}_Clean_Data\{current_season}_{current_season + 1}_{today.strftime("%b%d")}_matchweek_data.csv', index_col = 0)

In [45]:
df = pd.concat([df_train,df_pred])
df = df.sort_values(by = ['Club','Season','Date',])
df=df.reset_index(drop = True)
df['Date'] = pd.to_datetime(df['Date'])

In [46]:
## RESULT ENCODING ##
def encode_result(x):
    if x['Result'] == 'W':
        val= 2
    elif x['Result'] == 'D':
        val= 1
    else:
        val = 0
    return val

df['Result'] = df.apply(encode_result, axis = 1) 

#POINTS DIFF#
df['Points_Diff'] = (df['Pts_x']/df['Pl_x'] - df['Pts_y']/df['Pl_y'])/3


##SPI Diff##
df['SPI_Diff'] = df['spi_x'] - df['spi_y']
df['SPI_Diff'] = (df['SPI_Diff'] - df.groupby(['Season'])['SPI_Diff'].transform(min))/(df.groupby(['Season'])['SPI_Diff'].transform(max) - df.groupby(['Season'])['SPI_Diff'].transform(min))
df['Off_Diff'] = df['off_x'] - df['off_y']
df['Off_Diff'] = (df['Off_Diff'] - df.groupby(['Season'])['Off_Diff'].transform(min))/(df.groupby(['Season'])['Off_Diff'].transform(max) - df.groupby(['Season'])['Off_Diff'].transform(min))
df['Def_Diff'] = df['def_x'] - df['def_y']
df['Def_Diff'] = (df['Def_Diff'] - df.groupby(['Season'])['Def_Diff'].transform(min))/(df.groupby(['Season'])['Def_Diff'].transform(max) - df.groupby(['Season'])['Def_Diff'].transform(min))

##FORM##
df['Form_Diff'] = (df['Form_x'] - df['Form_y'])/15

# Creating form for goals and goals conceded and expected values for last 5 games
df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['Avg_GF_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['Avg_GA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['Avg_xG_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['Avg_xGA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Avg_Poss_last_5'].rolling(5).mean().reset_index([0,1],drop=True)

#Standardisation
df['Avg_GF_last_5'] = (df['Avg_GF_last_5'] - df.groupby(['Season'])['Avg_GF_last_5'].transform(min))/(df.groupby(['Season'])['Avg_GF_last_5'].transform(max) - df.groupby(['Season'])['Avg_GF_last_5'].transform(min))
df['Avg_GA_last_5'] = (df['Avg_GA_last_5'] - df.groupby(['Season'])['Avg_GA_last_5'].transform(min))/(df.groupby(['Season'])['Avg_GA_last_5'].transform(max) - df.groupby(['Season'])['Avg_GA_last_5'].transform(min))
df['Avg_xG_last_5'] = (df['Avg_xG_last_5'] - df.groupby(['Season'])['Avg_xG_last_5'].transform(min))/(df.groupby(['Season'])['Avg_xG_last_5'].transform(max) - df.groupby(['Season'])['Avg_xG_last_5'].transform(min))
df['Avg_xGA_last_5'] = (df['Avg_xGA_last_5'] - df.groupby(['Season'])['Avg_xGA_last_5'].transform(min))/(df.groupby(['Season'])['Avg_xGA_last_5'].transform(max) - df.groupby(['Season'])['Avg_xGA_last_5'].transform(min))
df['Avg_Poss_last_5'] = (df['Avg_Poss_last_5'] - df.groupby(['Season'])['Avg_Poss_last_5'].transform(min))/(df.groupby(['Season'])['Avg_Poss_last_5'].transform(max) - df.groupby(['Season'])['Avg_Poss_last_5'].transform(min))

##SEASON STATS##
df['Avg_GF_season'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
df['Avg_GF_season'] = df.groupby(['Season'])['Avg_GF_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_GA_season'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
df['Avg_GA_season'] = df.groupby(['Season'])['Avg_GA_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_xG_season'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
df['Avg_xG_season'] = df.groupby(['Season'])['Avg_xG_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_xGA_season'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
df['Avg_xGA_season'] = df.groupby(['Season'])['Avg_xGA_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_Poss_season'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
df['Avg_Poss_season'] = df.groupby(['Season'])['Avg_Poss_season'].expanding(1).mean().reset_index([0],drop=True)

#Standardisation
df['Avg_GF_season'] = (df['Avg_GF_season'] - df.groupby(['Season'])['Avg_GF_season'].transform(min))/(df.groupby(['Season'])['Avg_GF_season'].transform(max) - df.groupby(['Season'])['Avg_GF_season'].transform(min))
df['Avg_GA_season'] = (df['Avg_GA_season'] - df.groupby(['Season'])['Avg_GA_season'].transform(min))/(df.groupby(['Season'])['Avg_GA_season'].transform(max) - df.groupby(['Season'])['Avg_GA_season'].transform(min))
df['Avg_xG_season'] = (df['Avg_xG_season'] - df.groupby(['Season'])['Avg_xG_season'].transform(min))/(df.groupby(['Season'])['Avg_xG_season'].transform(max) - df.groupby(['Season'])['Avg_xG_season'].transform(min))
df['Avg_xGA_season'] = (df['Avg_xGA_season'] - df.groupby(['Season'])['Avg_xGA_season'].transform(min))/(df.groupby(['Season'])['Avg_xGA_season'].transform(max) - df.groupby(['Season'])['Avg_xGA_season'].transform(min))
df['Avg_Poss_season'] = (df['Avg_Poss_season'] - df.groupby(['Season'])['Avg_Poss_season'].transform(min))/(df.groupby(['Season'])['Avg_Poss_season'].transform(max) - df.groupby(['Season'])['Avg_Poss_season'].transform(min))

##AGAINST OPPONENT##
df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['GF_x'].shift(1)
df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['Avg_GF_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['GA_x'].shift(1)
df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['Avg_GA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['xG_x'].shift(1)
df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['Avg_xG_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['xGA_x'].shift(1)
df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['Avg_xGA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Poss_x'].shift(1)
df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Avg_Poss_Opp'].rolling(2).mean().reset_index([0,1],drop=True)

#Standardisation
df['Avg_GF_Opp'] = (df['Avg_GF_Opp'] - df.groupby(['Opp'])['Avg_GF_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_GF_Opp'].transform(max) - df.groupby(['Opp'])['Avg_GF_Opp'].transform(min))
df['Avg_GA_Opp'] = (df['Avg_GA_Opp'] - df.groupby(['Opp'])['Avg_GA_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_GA_Opp'].transform(max) - df.groupby(['Opp'])['Avg_GA_Opp'].transform(min))
df['Avg_xG_Opp'] = (df['Avg_xG_Opp'] - df.groupby(['Opp'])['Avg_xG_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_xG_Opp'].transform(max) - df.groupby(['Opp'])['Avg_xG_Opp'].transform(min))
df['Avg_xGA_Opp'] = (df['Avg_xGA_Opp'] - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_xGA_Opp'].transform(max) - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(min))
df['Avg_Poss_Opp'] = (df['Avg_Poss_Opp'] - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_Poss_Opp'].transform(max) - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(min))

In [47]:
df_test = df[df['Date'] < np.datetime64('today')]
df_pred = df[df['Date'] >  np.datetime64('today')]
df_pred['GF_x'] = 0
df_pred['GA_x'] = 0
df_poo = df_pred
display(df_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['GF_x'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['GA_x'] = 0


Unnamed: 0,Date,Venue,Result,GF_x,GA_x,Opp,xG_x,xGA_x,Poss_x,Club,...,Avg_GF_season,Avg_GA_season,Avg_xG_season,Avg_xGA_season,Avg_Poss_season,Avg_GF_Opp,Avg_GA_Opp,Avg_xG_Opp,Avg_xGA_Opp,Avg_Poss_Opp
204,2023-03-12,Away,0,0,0,FUL,,,,ARS,...,0.536537,0.634572,0.792799,0.599076,0.811538,0.333333,0.333333,0.666667,0.255814,1.0
336,2023-03-12,Away,0,0,0,WHU,,,,AVL,...,0.265102,0.817136,0.349688,0.834303,0.338194,0.125,0.428571,0.12069,0.225806,0.493827
537,2023-03-11,Away,0,0,0,LEE,,,,BHA,...,0.302653,0.83744,0.413178,0.817562,0.463779,0.181818,0.142857,0.347826,0.789474,0.384615
669,2023-03-11,Home,0,0,0,LIV,,,,BOU,...,0.176542,0.967775,0.189634,0.930752,0.157323,0.125,1.0,0.319149,0.433333,0.075949
728,2023-03-11,Away,0,0,0,EVE,,,,BRE,...,0.171077,0.955154,0.199282,0.923049,0.047967,0.4,0.428571,0.509434,0.634146,0.7625
1148,2023-03-11,Away,0,0,0,LEI,,,,CHE,...,0.104641,0.905604,0.142955,0.909185,0.144503,0.333333,0.2,0.467742,0.369231,0.643678
1352,2023-03-11,Home,0,0,0,MCI,,,,CRY,...,0.053744,0.895377,0.054103,0.914678,0.071751,0.4,0.333333,0.105263,0.448718,0.15942
1557,2023-03-11,Home,0,0,0,BRE,,,,EVE,...,0.005094,0.904593,0.0171,0.944712,0.020854,0.4,0.666667,0.510638,0.964286,0.0
1653,2023-03-12,Home,0,0,0,ARS,,,,FUL,...,0.018632,0.902076,0.011529,0.975359,0.004956,0.25,0.3,0.193548,0.716418,0.031915
1821,2023-03-11,Home,0,0,0,BHA,,,,LEE,...,0.004774,0.91839,0.008599,0.980495,0.00583,0.111111,0.333333,0.5,0.826087,0.489362


In [48]:
unwanted_columns = ['Date','Opp','Result','GF_x', 'xG_x', 'xGA_x',
       'Poss_x', 'Club', 'Poss_y', 'Position_x', 'Pl_x', 'Pts_x', 'Form_x',
       'Position_y', 'Pl_y', 'Pts_y', 'Form_y', 'spi_x', 'off_x', 'def_x',
       'Season', 'spi_y', 'off_y','def_y']
df_test_1 = df_test.drop(columns = unwanted_columns)
df_pred_1 = df_pred.drop(columns = unwanted_columns)
df_test_1 = df_test_1.dropna()
df_pred_1 = df_pred_1.dropna()


In [49]:
df_test_1= pd.get_dummies(df_test_1, columns=['Venue'])
df_pred_1= pd.get_dummies(df_pred_1, columns=['Venue'])

In [50]:
boo = df_test_1.drop(columns = 'GA_x')

X_train_1 = df_test_1[boo.columns]
y_train_1 = df_test_1['GA_x']

X_test_1 = df_pred_1[boo.columns]

rfr = RandomForestRegressor(n_estimators = 30, max_depth = 3, random_state = 0)
rfr.fit(X_train_1, y_train_1)
y_pred_1 = rfr.predict(X_test_1)


In [51]:
unwanted_columns = ['Date','Opp','Result','GA_x', 'xG_x', 'xGA_x',
       'Poss_x', 'Club', 'Poss_y', 'Position_x', 'Pl_x', 'Pts_x', 'Form_x',
       'Position_y', 'Pl_y', 'Pts_y', 'Form_y', 'spi_x', 'off_x', 'def_x',
       'Season', 'spi_y', 'off_y','def_y']
df_test = df_test.drop(columns = unwanted_columns)
df_pred = df_pred.drop(columns = unwanted_columns)
df_test = df_test.dropna()
df_pred = df_pred.dropna()


In [52]:
df_test.columns
df_test= pd.get_dummies(df_test, columns=['Venue'])
df_pred= pd.get_dummies(df_pred, columns=['Venue'])

In [53]:
boo = df_test.drop(columns = 'GF_x')

X_train = df_test[boo.columns]
y_train = df_test['GF_x']

X_test = df_pred[boo.columns]

rfr = xgb.XGBRegressor(learning_rate = 0.15, max_depth = 3, n_estimators = 30)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)


In [54]:
df_poo = df_poo[['Season','Club','Opp','Venue']]
a = pd.DataFrame(y_pred, columns = ['GF'], index = X_test.index)
b = pd.DataFrame(y_pred_1, columns = ['GA'], index = X_test_1.index)
c = pd.merge(df_poo,a, left_index=True, right_index=True)
d1 = pd.merge(c,b, left_index=True, right_index=True)
d2 = pd.merge(d1,d1, left_on = 'Club', right_on = 'Opp')
d2 = d2.drop(columns = ['Season_y','Club_y','Opp_y','Venue_y'])

d2['GF'] = (d2['GF_x'] + d2['GA_y']) /2
d2['GA'] = (d2['GF_y'] + d2['GA_x']) /2
d2 = d2.drop(columns = ['GF_x','GA_x','GF_y','GA_y'])

display (d2)

Unnamed: 0,Season_x,Club_x,Opp_x,Venue_x,GF,GA
0,2022,ARS,FUL,Away,1.844422,0.714883
1,2022,AVL,WHU,Away,1.221164,1.239447
2,2022,BHA,LEE,Away,2.028424,0.818597
3,2022,BOU,LIV,Home,0.84179,2.444313
4,2022,BRE,EVE,Away,1.622722,0.966606
5,2022,CHE,LEI,Away,1.439038,1.007018
6,2022,CRY,MCI,Home,0.686959,2.30587
7,2022,EVE,BRE,Home,0.966606,1.622722
8,2022,FUL,ARS,Home,0.714883,1.844422
9,2022,LEE,BHA,Home,0.818597,2.028424


In [55]:
d3 = d2.drop (columns = ['Venue_x','GA'])

In [56]:
d3.columns

Index(['Season_x', 'Club_x', 'Opp_x', 'GF'], dtype='object')

In [57]:


for score in range (6):
    d3['Score' + str(score)] = (d3['GF'] ** score * np.exp(- d3['GF']))/np.math.factorial(score)

d4 = pd.merge(d3,d3, left_on = 'Club_x', right_on = 'Opp_x')

score_list = []

win_list = []
draw_list = []
loss_list = []

for i in range (6):
    for j in range (6):
        d4[fr'{i} - {j}'] = d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']
        score_list.append(fr'{i} - {j}')

        if i > j:
            win_list.append(list(d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']))
        elif i == j:
            draw_list.append(list(d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']))
        else:
            loss_list.append(list(d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']))


d4[score_list].idxmax(axis = 1)
display(d4)

d2['Most likely score'] = list(d4[score_list].idxmax(axis = 1))

Unnamed: 0,Season_x_x,Club_x_x,Opp_x_x,GF_x,Score0_x,Score1_x,Score2_x,Score3_x,Score4_x,Score5_x,...,4 - 2,4 - 3,4 - 4,4 - 5,5 - 0,5 - 1,5 - 2,5 - 3,5 - 4,5 - 5
0,2022,ARS,FUL,1.844422,0.158117,0.291634,0.268948,0.165351,0.076244,0.028125,...,0.009532,0.002271,0.000406,5.8e-05,0.01376,0.009837,0.003516,0.000838,0.00015,2.1e-05
1,2022,AVL,WHU,1.221164,0.294887,0.360105,0.219874,0.089501,0.027324,0.006673,...,0.006077,0.002511,0.000778,0.000193,0.001932,0.002395,0.001484,0.000613,0.00019,4.7e-05
2,2022,BHA,LEE,2.028424,0.131543,0.266824,0.270616,0.182975,0.092788,0.037643,...,0.013712,0.003741,0.000766,0.000125,0.016602,0.013591,0.005563,0.001518,0.000311,5.1e-05
3,2022,BOU,LIV,0.84179,0.430938,0.36276,0.152684,0.042843,0.009016,0.001518,...,0.002337,0.001905,0.001164,0.000569,0.000132,0.000322,0.000394,0.000321,0.000196,9.6e-05
4,2022,BRE,EVE,1.622722,0.197361,0.320262,0.259848,0.140554,0.05702,0.018505,...,0.010132,0.003265,0.000789,0.000153,0.007039,0.006804,0.003288,0.00106,0.000256,4.9e-05
5,2022,CHE,LEI,1.439038,0.237156,0.341276,0.245555,0.117788,0.042375,0.012196,...,0.007849,0.002635,0.000663,0.000134,0.004455,0.004487,0.002259,0.000758,0.000191,3.8e-05
6,2022,CRY,MCI,0.686959,0.503104,0.345612,0.118711,0.027183,0.004668,0.000641,...,0.001237,0.000951,0.000548,0.000253,6.4e-05,0.000147,0.00017,0.000131,7.5e-05,3.5e-05
7,2022,EVE,BRE,0.966606,0.380372,0.36767,0.177696,0.057254,0.013836,0.002675,...,0.003595,0.001945,0.000789,0.000256,0.000528,0.000857,0.000695,0.000376,0.000153,4.9e-05
8,2022,FUL,ARS,0.714883,0.48925,0.349756,0.125017,0.029791,0.005324,0.000761,...,0.001432,0.00088,0.000406,0.00015,0.00012,0.000222,0.000205,0.000126,5.8e-05,2.1e-05
9,2022,LEE,BHA,0.818597,0.44105,0.361042,0.147774,0.040322,0.008252,0.001351,...,0.002233,0.00151,0.000766,0.000311,0.000178,0.00036,0.000366,0.000247,0.000125,5.1e-05


In [58]:
win_list = [sum(i) for i in list(zip(*win_list))]
draw_list = [sum(i) for i in list(zip(*draw_list))]
loss_list = [sum(i) for i in list(zip(*loss_list))]


In [59]:
d2['win_prob'] = win_list
d2['draw_prob'] = draw_list
d2['loss_prob'] = loss_list

In [60]:
display(d2)

Unnamed: 0,Season_x,Club_x,Opp_x,Venue_x,GF,GA,Most likely score,win_prob,draw_prob,loss_prob
0,2022,ARS,FUL,Away,1.844422,0.714883,1 - 0,0.631607,0.218336,0.138377
1,2022,AVL,WHU,Away,1.221164,1.239447,1 - 1,0.357695,0.272565,0.366344
2,2022,BHA,LEE,Away,2.028424,0.818597,2 - 0,0.638005,0.202536,0.141643
3,2022,BOU,LIV,Home,0.84179,2.444313,0 - 2,0.109337,0.164246,0.687781
4,2022,BRE,EVE,Away,1.622722,0.966606,1 - 0,0.520808,0.24788,0.224366
5,2022,CHE,LEI,Away,1.439038,1.007018,1 - 1,0.466584,0.265688,0.263459
6,2022,CRY,MCI,Home,0.686959,2.30587,0 - 2,0.094037,0.167153,0.708439
7,2022,EVE,BRE,Home,0.966606,1.622722,0 - 1,0.224366,0.24788,0.520808
8,2022,FUL,ARS,Home,0.714883,1.844422,0 - 1,0.138377,0.218336,0.631607
9,2022,LEE,BHA,Home,0.818597,2.028424,0 - 2,0.141643,0.202536,0.638005
