In [1]:
import pandas as pd
import os
from datetime import date
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
if not os.getcwd().endswith('Football Forecasting Version 2'):
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

dir_path = os.getcwd()

In [3]:
current_season = int(input('What is the current season?'))

In [4]:
today = date.today()

df_train = pd.read_csv(fr'{dir_path}\{current_season}_{current_season + 1}_Clean_Data\{current_season}_{current_season + 1}_{today.strftime("%b%d")}_historical_data.csv', index_col = 0)

df_pred = pd.read_csv(fr'{dir_path}\{current_season}_{current_season + 1}_Clean_Data\{current_season}_{current_season + 1}_{today.strftime("%b%d")}_matchweek_data.csv', index_col = 0)

In [5]:
df = pd.concat([df_train,df_pred])
df = df.sort_values(by = ['Club','Season','Date',])
df=df.reset_index(drop = True)
df['Date'] = pd.to_datetime(df['Date'])

In [6]:
## RESULT ENCODING ##
def encode_result(x):
    if x['Result'] == 'W':
        val= 2
    elif x['Result'] == 'D':
        val= 1
    else:
        val = 0
    return val

df['Result'] = df.apply(encode_result, axis = 1) 

#POINTS DIFF#
df['Points_Diff'] = (df['Pts_x']/df['Pl_x'] - df['Pts_y']/df['Pl_y'])/3


##SPI Diff##
df['SPI_Diff'] = df['spi_x'] - df['spi_y']
df['SPI_Diff'] = (df['SPI_Diff'] - df.groupby(['Season'])['SPI_Diff'].transform(min))/(df.groupby(['Season'])['SPI_Diff'].transform(max) - df.groupby(['Season'])['SPI_Diff'].transform(min))
df['Off_Diff'] = df['off_x'] - df['off_y']
df['Off_Diff'] = (df['Off_Diff'] - df.groupby(['Season'])['Off_Diff'].transform(min))/(df.groupby(['Season'])['Off_Diff'].transform(max) - df.groupby(['Season'])['Off_Diff'].transform(min))
df['Def_Diff'] = df['def_x'] - df['def_y']
df['Def_Diff'] = (df['Def_Diff'] - df.groupby(['Season'])['Def_Diff'].transform(min))/(df.groupby(['Season'])['Def_Diff'].transform(max) - df.groupby(['Season'])['Def_Diff'].transform(min))

##FORM##
df['Form_Diff'] = (df['Form_x'] - df['Form_y'])/15

# Creating form for goals and goals conceded and expected values for last 5 games
df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
df['Avg_GF_last_5'] = df.groupby(['Club','Season'])['Avg_GF_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
df['Avg_GA_last_5'] = df.groupby(['Club','Season'])['Avg_GA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
df['Avg_xG_last_5'] = df.groupby(['Club','Season'])['Avg_xG_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
df['Avg_xGA_last_5'] = df.groupby(['Club','Season'])['Avg_xGA_last_5'].rolling(5).mean().reset_index([0,1],drop=True)
df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
df['Avg_Poss_last_5'] = df.groupby(['Club','Season'])['Avg_Poss_last_5'].rolling(5).mean().reset_index([0,1],drop=True)

#Standardisation
df['Avg_GF_last_5'] = (df['Avg_GF_last_5'] - df.groupby(['Season'])['Avg_GF_last_5'].transform(min))/(df.groupby(['Season'])['Avg_GF_last_5'].transform(max) - df.groupby(['Season'])['Avg_GF_last_5'].transform(min))
df['Avg_GA_last_5'] = (df['Avg_GA_last_5'] - df.groupby(['Season'])['Avg_GA_last_5'].transform(min))/(df.groupby(['Season'])['Avg_GA_last_5'].transform(max) - df.groupby(['Season'])['Avg_GA_last_5'].transform(min))
df['Avg_xG_last_5'] = (df['Avg_xG_last_5'] - df.groupby(['Season'])['Avg_xG_last_5'].transform(min))/(df.groupby(['Season'])['Avg_xG_last_5'].transform(max) - df.groupby(['Season'])['Avg_xG_last_5'].transform(min))
df['Avg_xGA_last_5'] = (df['Avg_xGA_last_5'] - df.groupby(['Season'])['Avg_xGA_last_5'].transform(min))/(df.groupby(['Season'])['Avg_xGA_last_5'].transform(max) - df.groupby(['Season'])['Avg_xGA_last_5'].transform(min))
df['Avg_Poss_last_5'] = (df['Avg_Poss_last_5'] - df.groupby(['Season'])['Avg_Poss_last_5'].transform(min))/(df.groupby(['Season'])['Avg_Poss_last_5'].transform(max) - df.groupby(['Season'])['Avg_Poss_last_5'].transform(min))

##SEASON STATS##
df['Avg_GF_season'] = df.groupby(['Club','Season'])['GF_x'].shift(1)
df['Avg_GF_season'] = df.groupby(['Season'])['Avg_GF_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_GA_season'] = df.groupby(['Club','Season'])['GA_x'].shift(1)
df['Avg_GA_season'] = df.groupby(['Season'])['Avg_GA_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_xG_season'] = df.groupby(['Club','Season'])['xG_x'].shift(1)
df['Avg_xG_season'] = df.groupby(['Season'])['Avg_xG_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_xGA_season'] = df.groupby(['Club','Season'])['xGA_x'].shift(1)
df['Avg_xGA_season'] = df.groupby(['Season'])['Avg_xGA_season'].expanding(1).mean().reset_index([0],drop=True)
df['Avg_Poss_season'] = df.groupby(['Club','Season'])['Poss_x'].shift(1)
df['Avg_Poss_season'] = df.groupby(['Season'])['Avg_Poss_season'].expanding(1).mean().reset_index([0],drop=True)

#Standardisation
df['Avg_GF_season'] = (df['Avg_GF_season'] - df.groupby(['Season'])['Avg_GF_season'].transform(min))/(df.groupby(['Season'])['Avg_GF_season'].transform(max) - df.groupby(['Season'])['Avg_GF_season'].transform(min))
df['Avg_GA_season'] = (df['Avg_GA_season'] - df.groupby(['Season'])['Avg_GA_season'].transform(min))/(df.groupby(['Season'])['Avg_GA_season'].transform(max) - df.groupby(['Season'])['Avg_GA_season'].transform(min))
df['Avg_xG_season'] = (df['Avg_xG_season'] - df.groupby(['Season'])['Avg_xG_season'].transform(min))/(df.groupby(['Season'])['Avg_xG_season'].transform(max) - df.groupby(['Season'])['Avg_xG_season'].transform(min))
df['Avg_xGA_season'] = (df['Avg_xGA_season'] - df.groupby(['Season'])['Avg_xGA_season'].transform(min))/(df.groupby(['Season'])['Avg_xGA_season'].transform(max) - df.groupby(['Season'])['Avg_xGA_season'].transform(min))
df['Avg_Poss_season'] = (df['Avg_Poss_season'] - df.groupby(['Season'])['Avg_Poss_season'].transform(min))/(df.groupby(['Season'])['Avg_Poss_season'].transform(max) - df.groupby(['Season'])['Avg_Poss_season'].transform(min))

##AGAINST OPPONENT##
df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['GF_x'].shift(1)
df['Avg_GF_Opp'] = df.groupby(['Club','Opp'])['Avg_GF_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['GA_x'].shift(1)
df['Avg_GA_Opp'] = df.groupby(['Club','Opp'])['Avg_GA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['xG_x'].shift(1)
df['Avg_xG_Opp'] = df.groupby(['Club','Opp'])['Avg_xG_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['xGA_x'].shift(1)
df['Avg_xGA_Opp'] = df.groupby(['Club','Opp'])['Avg_xGA_Opp'].rolling(2).mean().reset_index([0,1],drop=True)
df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Poss_x'].shift(1)
df['Avg_Poss_Opp'] = df.groupby(['Club','Opp'])['Avg_Poss_Opp'].rolling(2).mean().reset_index([0,1],drop=True)

#Standardisation
df['Avg_GF_Opp'] = (df['Avg_GF_Opp'] - df.groupby(['Opp'])['Avg_GF_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_GF_Opp'].transform(max) - df.groupby(['Opp'])['Avg_GF_Opp'].transform(min))
df['Avg_GA_Opp'] = (df['Avg_GA_Opp'] - df.groupby(['Opp'])['Avg_GA_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_GA_Opp'].transform(max) - df.groupby(['Opp'])['Avg_GA_Opp'].transform(min))
df['Avg_xG_Opp'] = (df['Avg_xG_Opp'] - df.groupby(['Opp'])['Avg_xG_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_xG_Opp'].transform(max) - df.groupby(['Opp'])['Avg_xG_Opp'].transform(min))
df['Avg_xGA_Opp'] = (df['Avg_xGA_Opp'] - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_xGA_Opp'].transform(max) - df.groupby(['Opp'])['Avg_xGA_Opp'].transform(min))
df['Avg_Poss_Opp'] = (df['Avg_Poss_Opp'] - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(min))/(df.groupby(['Opp'])['Avg_Poss_Opp'].transform(max) - df.groupby(['Opp'])['Avg_Poss_Opp'].transform(min))

In [7]:
df_test = df[df['Date'] < np.datetime64('today')]
df_pred = df[df['Date'] >  np.datetime64('today')]
df_pred['GF_x'] = 0
df_pred['GA_x'] = 0
df_poo = df_pred
display(df_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['GF_x'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['GA_x'] = 0


Unnamed: 0,Date,Venue,Result,GF_x,GA_x,Opp,xG_x,xGA_x,Poss_x,Club,...,Avg_GF_season,Avg_GA_season,Avg_xG_season,Avg_xGA_season,Avg_Poss_season,Avg_GF_Opp,Avg_GA_Opp,Avg_xG_Opp,Avg_xGA_Opp,Avg_Poss_Opp
205,2023-03-19,Home,0,0,0,CRY,,,,ARS,...,0.554988,0.621457,0.791036,0.590748,0.794782,0.181818,0.625,0.333333,0.391304,0.662921
338,2023-03-18,Home,0,0,0,BOU,,,,AVL,...,0.27106,0.813353,0.355355,0.833703,0.347727,,,,,
540,2023-03-15,Home,0,0,0,CRY,,,,BHA,...,0.307246,0.846749,0.418213,0.81136,0.472698,0.181818,0.25,0.703704,0.195652,0.820225
673,2023-03-18,Away,0,0,0,AVL,,,,BOU,...,0.180241,0.967027,0.194324,0.920032,0.160006,,,,,
733,2023-03-15,Away,0,0,0,SOU,,,,BRE,...,0.166347,0.956,0.20928,0.911241,0.064626,0.5,0.0,0.602941,0.25,0.322222
1154,2023-03-18,Home,0,0,0,EVE,,,,CHE,...,0.109316,0.908591,0.154698,0.905025,0.155828,0.1,0.285714,0.54717,0.195122,1.0
1359,2023-03-15,Away,0,0,0,BHA,,,,CRY,...,0.054667,0.899306,0.059544,0.914654,0.077403,0.222222,0.333333,0.15,0.891304,0.042553
1565,2023-03-18,Away,0,0,0,CHE,,,,EVE,...,0.006724,0.905378,0.021713,0.944607,0.022113,0.25,0.111111,0.27907,0.457143,0.0
1662,2023-03-18,Away,0,0,0,LIV,,,,FUL,...,0.015964,0.910044,0.01345,0.976023,0.0053,0.25,0.0,0.553191,0.166667,0.151899
1831,2023-03-18,Away,0,0,0,WOL,,,,LEE,...,0.004578,0.928808,0.008331,0.980981,0.003374,0.5,0.6,0.516667,0.390244,0.531646


In [8]:
unwanted_columns = ['Date','Opp','Result','GF_x', 'xG_x', 'xGA_x',
       'Poss_x', 'Club', 'Poss_y', 'Position_x', 'Pl_x', 'Pts_x', 'Form_x',
       'Position_y', 'Pl_y', 'Pts_y', 'Form_y', 'spi_x', 'off_x', 'def_x',
       'Season', 'spi_y', 'off_y','def_y']
df_test_1 = df_test.drop(columns = unwanted_columns)
df_pred_1 = df_pred.drop(columns = unwanted_columns)
df_test_1 = df_test_1.dropna()
df_pred_1 = df_pred_1.dropna()


In [9]:
df_test_1= pd.get_dummies(df_test_1, columns=['Venue'])
df_pred_1= pd.get_dummies(df_pred_1, columns=['Venue'])

In [10]:
boo = df_test_1.drop(columns = 'GA_x')

X_train_1 = df_test_1[boo.columns]
y_train_1 = df_test_1['GA_x']

X_test_1 = df_pred_1[boo.columns]

rfr = RandomForestRegressor(n_estimators = 30, max_depth = 3, random_state = 0)
rfr.fit(X_train_1, y_train_1)
y_pred_1 = rfr.predict(X_test_1)


In [11]:
unwanted_columns = ['Date','Opp','Result','GA_x', 'xG_x', 'xGA_x',
       'Poss_x', 'Club', 'Poss_y', 'Position_x', 'Pl_x', 'Pts_x', 'Form_x',
       'Position_y', 'Pl_y', 'Pts_y', 'Form_y', 'spi_x', 'off_x', 'def_x',
       'Season', 'spi_y', 'off_y','def_y']
df_test = df_test.drop(columns = unwanted_columns)
df_pred = df_pred.drop(columns = unwanted_columns)
df_test = df_test.dropna()
df_pred = df_pred.dropna()


In [12]:
df_test.columns
df_test= pd.get_dummies(df_test, columns=['Venue'])
df_pred= pd.get_dummies(df_pred, columns=['Venue'])

In [13]:
boo = df_test.drop(columns = 'GF_x')

X_train = df_test[boo.columns]
y_train = df_test['GF_x']

X_test = df_pred[boo.columns]

rfr = xgb.XGBRegressor(learning_rate = 0.15, max_depth = 3, n_estimators = 30)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)


In [14]:
df_poo = df_poo[['Season','Club','Opp','Venue']]
a = pd.DataFrame(y_pred, columns = ['GF'], index = X_test.index)
b = pd.DataFrame(y_pred_1, columns = ['GA'], index = X_test_1.index)
c = pd.merge(df_poo,a, left_index=True, right_index=True)
d1 = pd.merge(c,b, left_index=True, right_index=True)
d2 = pd.merge(d1,d1, left_on = 'Club', right_on = 'Opp')
d2 = d2.drop(columns = ['Season_y','Club_y','Opp_y','Venue_y'])

d2['GF'] = (d2['GF_x'] + d2['GA_y']) /2
d2['GA'] = (d2['GF_y'] + d2['GA_x']) /2
d2 = d2.drop(columns = ['GF_x','GA_x','GF_y','GA_y'])

display (d2)

Unnamed: 0,Season_x,Club_x,Opp_x,Venue_x,GF,GA
0,2022,BHA,CRY,Home,2.143085,0.772497
1,2022,BHA,CRY,Home,1.816784,0.975316
2,2022,BRE,SOU,Away,1.370419,0.951679
3,2022,BRE,SOU,Away,1.636719,0.951822
4,2022,CHE,EVE,Home,1.692841,0.734044
5,2022,CRY,BHA,Away,0.71762,2.192422
6,2022,CRY,BHA,Away,0.772497,2.143085
7,2022,EVE,CHE,Away,0.734044,1.692841
8,2022,FUL,LIV,Away,0.685871,2.298706
9,2022,LEE,WOL,Away,1.172491,1.204267


In [15]:
d3 = d2.drop (columns = ['Venue_x','GA'])

In [23]:
df[(df['Club'] == 'AVL') & (df['Opp'] == 'BOU')]

Unnamed: 0,Date,Venue,Result,GF_x,GA_x,Opp,xG_x,xGA_x,Poss_x,Club,...,Avg_GF_season,Avg_GA_season,Avg_xG_season,Avg_xGA_season,Avg_Poss_season,Avg_GF_Opp,Avg_GA_Opp,Avg_xG_Opp,Avg_xGA_Opp,Avg_Poss_Opp
228,2020-02-01,Away,0,1.0,2.0,BOU,1.7,2.2,60.0,AVL,...,0.385965,0.156863,0.310777,0.284479,0.362242,,,,,
338,2023-03-18,Home,0,,,BOU,,,,AVL,...,0.27106,0.813353,0.355355,0.833703,0.347727,,,,,


In [17]:


for score in range (6):
    d3['Score' + str(score)] = (d3['GF'] ** score * np.exp(- d3['GF']))/np.math.factorial(score)

d4 = pd.merge(d3,d3, left_on = 'Club_x', right_on = 'Opp_x')

score_list = []

win_list = []
draw_list = []
loss_list = []

for i in range (6):
    for j in range (6):
        d4[fr'{i} - {j}'] = d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']
        score_list.append(fr'{i} - {j}')

        if i > j:
            win_list.append(list(d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']))
        elif i == j:
            draw_list.append(list(d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']))
        else:
            loss_list.append(list(d4[fr'Score{i}_x'] * d4[fr'Score{j}_y']))


d4[score_list].idxmax(axis = 1)
display(d4)

d2['Most likely score'] = list(d4[score_list].idxmax(axis = 1))

Unnamed: 0,Season_x_x,Club_x_x,Opp_x_x,GF_x,Score0_x,Score1_x,Score2_x,Score3_x,Score4_x,Score5_x,...,4 - 2,4 - 3,4 - 4,4 - 5,5 - 0,5 - 1,5 - 2,5 - 3,5 - 4,5 - 5
0,2022,BHA,CRY,2.143085,0.117292,0.251368,0.269351,0.192414,0.10309,0.044186,...,0.012951,0.003098,0.000556,8e-05,0.021559,0.015471,0.005551,0.001328,0.000238,3.4e-05
1,2022,BHA,CRY,2.143085,0.117292,0.251368,0.269351,0.192414,0.10309,0.044186,...,0.014207,0.003658,0.000706,0.000109,0.020408,0.015765,0.006089,0.001568,0.000303,4.7e-05
2,2022,BHA,CRY,1.816784,0.162548,0.295314,0.268261,0.162457,0.073787,0.026811,...,0.00927,0.002217,0.000398,5.7e-05,0.013082,0.009388,0.003368,0.000806,0.000145,2.1e-05
3,2022,BHA,CRY,1.816784,0.162548,0.295314,0.268261,0.162457,0.073787,0.026811,...,0.010168,0.002618,0.000506,7.8e-05,0.012383,0.009566,0.003695,0.000951,0.000184,2.8e-05
4,2022,BRE,SOU,1.370419,0.254,0.348087,0.238513,0.108954,0.037328,0.010231,...,0.006528,0.002071,0.000493,9.4e-05,0.00395,0.003759,0.001789,0.000568,0.000135,2.6e-05
5,2022,BRE,SOU,1.370419,0.254,0.348087,0.238513,0.108954,0.037328,0.010231,...,0.006245,0.001901,0.000434,7.9e-05,0.004105,0.003749,0.001712,0.000521,0.000119,2.2e-05
6,2022,BRE,SOU,1.636719,0.194618,0.318534,0.260675,0.142217,0.058193,0.019049,...,0.010176,0.003229,0.000768,0.000146,0.007354,0.006999,0.003331,0.001057,0.000251,4.8e-05
7,2022,BRE,SOU,1.636719,0.194618,0.318534,0.260675,0.142217,0.058193,0.019049,...,0.009735,0.002963,0.000676,0.000124,0.007644,0.00698,0.003187,0.00097,0.000221,4e-05
8,2022,CHE,EVE,1.692841,0.183996,0.311476,0.26364,0.148767,0.06296,0.021316,...,0.008141,0.001992,0.000366,5.4e-05,0.010231,0.00751,0.002756,0.000674,0.000124,1.8e-05
9,2022,CRY,BHA,0.71762,0.487912,0.350135,0.125632,0.030052,0.005391,0.000774,...,0.001452,0.001037,0.000556,0.000238,9.1e-05,0.000195,0.000208,0.000149,8e-05,3.4e-05


ValueError: Length of values (24) does not match length of index (16)

In [None]:
win_list = [sum(i) for i in list(zip(*win_list))]
draw_list = [sum(i) for i in list(zip(*draw_list))]
loss_list = [sum(i) for i in list(zip(*loss_list))]


In [None]:
d2['win_prob'] = win_list
d2['draw_prob'] = draw_list
d2['loss_prob'] = loss_list

In [None]:
display(d2)

Unnamed: 0,Season_x,Club_x,Opp_x,Venue_x,GF,GA,Most likely score,win_prob,draw_prob,loss_prob
0,2022,ARS,FUL,Away,1.844422,0.714883,1 - 0,0.631607,0.218336,0.138377
1,2022,AVL,WHU,Away,1.221164,1.239447,1 - 1,0.357695,0.272565,0.366344
2,2022,BHA,LEE,Away,2.028424,0.818597,2 - 0,0.638005,0.202536,0.141643
3,2022,BOU,LIV,Home,0.84179,2.444313,0 - 2,0.109337,0.164246,0.687781
4,2022,BRE,EVE,Away,1.622722,0.966606,1 - 0,0.520808,0.24788,0.224366
5,2022,CHE,LEI,Away,1.439038,1.007018,1 - 1,0.466584,0.265688,0.263459
6,2022,CRY,MCI,Home,0.686959,2.30587,0 - 2,0.094037,0.167153,0.708439
7,2022,EVE,BRE,Home,0.966606,1.622722,0 - 1,0.224366,0.24788,0.520808
8,2022,FUL,ARS,Home,0.714883,1.844422,0 - 1,0.138377,0.218336,0.631607
9,2022,LEE,BHA,Home,0.818597,2.028424,0 - 2,0.141643,0.202536,0.638005
