In [1]:
import pandas as pd

# read in the match data as a dataframe, and specify that the first column in our data is the index column
matches = pd.read_csv("matches.csv", index_col=0)

In [2]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,11.0,5.0,19.1,0.0,0,0,2025,Manchester City
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,...,Match Report,,13.0,4.0,17.8,1.0,1,1,2025,Manchester City
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,...,Match Report,,10.0,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion
1,2024-08-24,12:30,Premier League,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,...,Match Report,,14.0,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,...,Match Report,,18.0,6.0,17.3,0.0,0,0,2025,Arsenal


In [3]:
matches.shape

(800, 28)

In [4]:
# In the EPL, there are 38 matches played each season, and there are 20 teams each season
# If we have data for 2 complete seasons, we should have 38 * 20 * 2 = 1520 matches
# But, as of now we have fewer because we only have the first 2 weeks of matches played for the 2024-2025 season

In [5]:
# figure out how many matches we have for each team in our data
matches["team"].value_counts()

team
Manchester City             40
Brighton and Hove Albion    40
Arsenal                     40
Liverpool                   40
Tottenham Hotspur           40
Newcastle United            40
Nottingham Forest           40
Chelsea                     40
West Ham United             40
Manchester United           40
Fulham                      40
Aston Villa                 40
Brentford                   40
Bournemouth                 40
Crystal Palace              40
Everton                     40
Wolverhampton Wanderers     40
Burnley                     38
Luton Town                  38
Sheffield United            38
Leicester City               2
Ipswich Town                 2
Southampton                  2
Name: count, dtype: int64

In [6]:
# figure out which match week each match was played
# Matchweek 1 has 40 matches , because it is only matchweek to have finished for 2024-2025 and 2024-2024 seasons
matches["round"].value_counts()

round
Matchweek 1     40
Matchweek 2     40
Matchweek 3     20
Matchweek 4     20
Matchweek 5     20
Matchweek 6     20
Matchweek 7     20
Matchweek 8     20
Matchweek 9     20
Matchweek 10    20
Matchweek 11    20
Matchweek 12    20
Matchweek 13    20
Matchweek 14    20
Matchweek 15    20
Matchweek 16    20
Matchweek 17    20
Matchweek 19    20
Matchweek 20    20
Matchweek 21    20
Matchweek 22    20
Matchweek 23    20
Matchweek 24    20
Matchweek 25    20
Matchweek 18    20
Matchweek 26    20
Matchweek 27    20
Matchweek 28    20
Matchweek 30    20
Matchweek 31    20
Matchweek 32    20
Matchweek 33    20
Matchweek 29    20
Matchweek 35    20
Matchweek 36    20
Matchweek 37    20
Matchweek 34    20
Matchweek 38    20
Name: count, dtype: int64

In [7]:
# looking at the type of each column
# machine learning algorithms can only work with numeric data, so float or ints
# machine learning algorithms can not work with data that is an object
# we must convert the object columns into numeric columns so they can be used by machine learning algos
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [8]:
del matches["comp"]
del matches["notes"]

In [9]:
matches.head()

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,referee,match report,sh,sot,dist,fk,pk,pkatt,season,team
1,2024-08-18,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,0.8,...,Anthony Taylor,Match Report,11.0,5.0,19.1,0.0,0,0,2025,Manchester City
2,2024-08-24,15:00,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,3.3,...,Samuel Allison,Match Report,13.0,4.0,17.8,1.0,1,1,2025,Manchester City
0,2024-08-17,15:00,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,1.4,...,Simon Hooper,Match Report,10.0,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion
1,2024-08-24,12:30,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,2.1,...,Craig Pawson,Match Report,14.0,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion
0,2024-08-17,15:00,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,1.2,...,Jarred Gillett,Match Report,18.0,6.0,17.3,0.0,0,0,2025,Arsenal


In [10]:
# convert data column (object type) to type datetime
matches["date"] = pd.to_datetime(matches["date"])

In [11]:
matches.dtypes

date             datetime64[ns]
time                     object
round                    object
day                      object
venue                    object
result                   object
gf                      float64
ga                      float64
opponent                 object
xg                      float64
xga                     float64
poss                    float64
attendance              float64
captain                  object
formation                object
opp formation            object
referee                  object
match report             object
sh                      float64
sot                     float64
dist                    float64
fk                      float64
pk                        int64
pkatt                     int64
season                    int64
team                     object
dtype: object

In [12]:
# Creating a few predictors, ones added are a base/foundation that isn't too complex, can add onto it

# First predictor: Converting the venue column (Home or Away game) into a numeric column
# ".astype" converts it from a string into categories
# ".cat.codes" converts categories into integers
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [13]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,match report,sh,sot,dist,fk,pk,pkatt,season,team,venue_code
1,2024-08-18,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,0.8,...,Match Report,11.0,5.0,19.1,0.0,0,0,2025,Manchester City,0
2,2024-08-24,15:00,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,3.3,...,Match Report,13.0,4.0,17.8,1.0,1,1,2025,Manchester City,1
0,2024-08-17,15:00,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,1.4,...,Match Report,10.0,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion,0
1,2024-08-24,12:30,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,2.1,...,Match Report,14.0,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion,1
0,2024-08-17,15:00,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,1.2,...,Match Report,18.0,6.0,17.3,0.0,0,0,2025,Arsenal,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,0.8,...,Match Report,10.0,4.0,17.8,1.0,0,0,2024,Sheffield United,0
37,2024-04-27,15:00,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,1.5,...,Match Report,15.0,4.0,13.5,0.0,0,0,2024,Sheffield United,0
38,2024-05-04,15:00,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,2.2,...,Match Report,16.0,4.0,18.0,0.0,1,1,2024,Sheffield United,1
39,2024-05-11,15:00,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,0.6,...,Match Report,13.0,1.0,21.0,0.0,0,0,2024,Sheffield United,0


In [14]:
# Second predictor: creating a new code for each opponent squad using some method as first predictor 
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [15]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code
1,2024-08-18,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,0.8,...,11.0,5.0,19.1,0.0,0,0,2025,Manchester City,0,6
2,2024-08-24,15:00,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,3.3,...,13.0,4.0,17.8,1.0,1,1,2025,Manchester City,1,10
0,2024-08-17,15:00,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,1.4,...,10.0,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion,0,8
1,2024-08-24,12:30,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,2.1,...,14.0,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion,1,15
0,2024-08-17,15:00,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,1.2,...,18.0,6.0,17.3,0.0,0,0,2025,Arsenal,1,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,0.8,...,10.0,4.0,17.8,1.0,0,0,2024,Sheffield United,0,15
37,2024-04-27,15:00,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,1.5,...,15.0,4.0,13.5,0.0,0,0,2024,Sheffield United,0,16
38,2024-05-04,15:00,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,2.2,...,16.0,4.0,18.0,0.0,1,1,2024,Sheffield United,1,17
39,2024-05-11,15:00,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,0.6,...,13.0,1.0,21.0,0.0,0,0,2024,Sheffield United,0,8


In [16]:
# Third predictor: Looking at whether teams play better at certain times of the day
# Writing a regular expression to replace the colon and the minutes in the "time" column with nothing
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [17]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour
1,2024-08-18,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,0.8,...,5.0,19.1,0.0,0,0,2025,Manchester City,0,6,16
2,2024-08-24,15:00,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,3.3,...,4.0,17.8,1.0,1,1,2025,Manchester City,1,10,15
0,2024-08-17,15:00,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,1.4,...,5.0,13.8,0.0,0,0,2025,Brighton and Hove Albion,0,8,15
1,2024-08-24,12:30,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,2.1,...,4.0,14.2,1.0,0,0,2025,Brighton and Hove Albion,1,15,12
0,2024-08-17,15:00,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,1.2,...,6.0,17.3,0.0,0,0,2025,Arsenal,1,22,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,0.8,...,4.0,17.8,1.0,0,0,2024,Sheffield United,0,15,20
37,2024-04-27,15:00,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,1.5,...,4.0,13.5,0.0,0,0,2024,Sheffield United,0,16,15
38,2024-05-04,15:00,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,2.2,...,4.0,18.0,0.0,1,1,2024,Sheffield United,1,17,15
39,2024-05-11,15:00,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,0.6,...,1.0,21.0,0.0,0,0,2024,Sheffield United,0,8,15


In [18]:
# Fourth predictor: gets the day of week of each match
# "dt.dayofweek" get's the day of week property of that column
# Saturday is coded as 5, Sunday as 6
matches["day_code"] = matches["date"].dt.dayofweek

In [19]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
1,2024-08-18,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,0.8,...,19.1,0.0,0,0,2025,Manchester City,0,6,16,6
2,2024-08-24,15:00,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,3.3,...,17.8,1.0,1,1,2025,Manchester City,1,10,15,5
0,2024-08-17,15:00,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,1.4,...,13.8,0.0,0,0,2025,Brighton and Hove Albion,0,8,15,5
1,2024-08-24,12:30,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,2.1,...,14.2,1.0,0,0,2025,Brighton and Hove Albion,1,15,12,5
0,2024-08-17,15:00,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,1.2,...,17.3,0.0,0,0,2025,Arsenal,1,22,15,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,0.8,...,17.8,1.0,0,0,2024,Sheffield United,0,15,20,2
37,2024-04-27,15:00,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,1.5,...,13.5,0.0,0,0,2024,Sheffield United,0,16,15,5
38,2024-05-04,15:00,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,2.2,...,18.0,0.0,1,1,2024,Sheffield United,1,17,15,5
39,2024-05-11,15:00,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,0.6,...,21.0,0.0,0,0,2024,Sheffield United,0,8,15,5


In [20]:
# Setting up target: whether a team won or not
matches["target"] = (matches["result"] == "W").astype("int")
# If the result is a Loss or Draw, we will code it as a zero
# If the result is a Win, we will code it as a one
# True = 1, False = 0

# If want it more complex, could code it as 0, 1, or 2 for losses, draws, and wins


In [21]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2024-08-18,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,0.8,...,0.0,0,0,2025,Manchester City,0,6,16,6,1
2,2024-08-24,15:00,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,3.3,...,1.0,1,1,2025,Manchester City,1,10,15,5,1
0,2024-08-17,15:00,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,1.4,...,0.0,0,0,2025,Brighton and Hove Albion,0,8,15,5,1
1,2024-08-24,12:30,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,2.1,...,1.0,0,0,2025,Brighton and Hove Albion,1,15,12,5,1
0,2024-08-17,15:00,Matchweek 1,Sat,Home,W,2.0,0.0,Wolves,1.2,...,0.0,0,0,2025,Arsenal,1,22,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,0.8,...,1.0,0,0,2024,Sheffield United,0,15,20,2,0
37,2024-04-27,15:00,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,1.5,...,0.0,0,0,2024,Sheffield United,0,16,15,5,0
38,2024-05-04,15:00,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,2.2,...,0.0,1,1,2024,Sheffield United,1,17,15,5,0
39,2024-05-11,15:00,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,0.6,...,0.0,0,0,2024,Sheffield United,0,8,15,5,0


In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest is a ML model that can pick up non-linearities in the data
# Would be able to pick up the fact that oppcode would not have a linear relationship

'''
    n_estimators: number of individual decision trees we want to train
    min_samples_split: number of samples we want to have in a leaf of the decision tree before we actually split the node
    -the higher the number of samples the less likely we are to overfit but the lower our accuracy on the training data
    random_state: if we set a random state, means that if we run the random forest multiple times it will yield the same result, as long as data is the same
'''
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [23]:
# trainig set will consist of all matches before 2024-01-01
train = matches[matches["date"] < "2024-01-01"]
# test set will consist of all matches after 2024-01-01 (what we are trying to predict)
test = matches[matches["date"] > "2024-01-01"]

In [24]:
# create a list of all our predictors
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [25]:
# fit the random forest model to try and predict our target, using the training data and predictors
rf.fit(train[predictors], train["target"])

In [26]:
# obtaining predictions on the test data
preds = rf.predict(test[predictors])

In [27]:
# determining the accuracy of the model

In [28]:
from sklearn.metrics import accuracy_score
# what percent of the time your prediction was accurate

In [29]:
acc = accuracy_score(test["target"], preds)

In [30]:
acc

0.5985221674876847

In [31]:
# observing in which situations our accuracy was high versus low

# first create a dataframe, combine our actual values and predicted values
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [32]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,187,65
1,98,56


In [33]:
# For the times we predicted a win (1), we were wrong (65) more times than we were right (56)

In [34]:
from sklearn.metrics import precision_score
# when we predicted win, what percentage of the time did the team actually win

In [35]:
precision_score(test["target"], preds)

np.float64(0.4628099173553719)

In [36]:
# not great precision (around 46%)

In [37]:
# We are splitting our matches dataframe up by team, want to calculate rolling averages for individual team performance
grouped_matches = matches.groupby("team")

In [38]:
# grab Man city's matches
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [39]:
group

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2023-08-11,20:00,Matchweek 1,Fri,Away,W,3.0,0.0,Burnley,1.9,...,0.0,0,0,2024,Manchester City,0,5,20,4,1
3,2023-08-19,20:00,Matchweek 2,Sat,Home,W,1.0,0.0,Newcastle Utd,1.0,...,0.0,0,0,2024,Manchester City,1,16,20,5,1
4,2023-08-27,14:00,Matchweek 3,Sun,Away,W,2.0,1.0,Sheffield Utd,3.5,...,2.0,0,1,2024,Manchester City,0,18,14,6,1
5,2023-09-02,15:00,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,2.2,...,0.0,1,1,2024,Manchester City,1,9,15,5,1
6,2023-09-16,15:00,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,3.6,...,1.0,0,0,2024,Manchester City,0,21,15,5,1
8,2023-09-23,15:00,Matchweek 6,Sat,Home,W,2.0,0.0,Nott'ham Forest,1.3,...,2.0,0,0,2024,Manchester City,1,17,15,5,1
10,2023-09-30,15:00,Matchweek 7,Sat,Away,L,1.0,2.0,Wolves,0.9,...,1.0,0,0,2024,Manchester City,0,22,15,5,0
12,2023-10-08,16:30,Matchweek 8,Sun,Away,L,0.0,1.0,Arsenal,0.5,...,0.0,0,0,2024,Manchester City,0,0,16,6,0
13,2023-10-21,15:00,Matchweek 9,Sat,Home,W,2.0,1.0,Brighton,0.8,...,1.0,0,0,2024,Manchester City,1,4,15,5,1
15,2023-10-29,15:30,Matchweek 10,Sun,Away,W,3.0,0.0,Manchester Utd,4.0,...,2.0,1,1,2024,Manchester City,0,15,15,6,1


In [40]:
# grab Brighton Hove and Albion's matches
group_check = grouped_matches.get_group("Brighton and Hove Albion").sort_values("date")
group_check

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-12,15:00,Matchweek 1,Sat,Home,W,4.0,1.0,Luton Town,4.0,...,0.0,1,1,2024,Brighton and Hove Albion,1,13,15,5,1
1,2023-08-19,15:00,Matchweek 2,Sat,Away,W,4.0,1.0,Wolves,2.2,...,0.0,0,0,2024,Brighton and Hove Albion,0,22,15,5,1
2,2023-08-26,17:30,Matchweek 3,Sat,Home,L,1.0,3.0,West Ham,1.5,...,1.0,0,0,2024,Brighton and Hove Albion,1,21,17,5,0
3,2023-09-02,17:30,Matchweek 4,Sat,Home,W,3.0,1.0,Newcastle Utd,1.3,...,0.0,0,0,2024,Brighton and Hove Albion,1,16,17,5,1
4,2023-09-16,15:00,Matchweek 5,Sat,Away,W,3.0,1.0,Manchester Utd,1.7,...,0.0,0,0,2024,Brighton and Hove Albion,0,15,15,5,1
6,2023-09-24,14:00,Matchweek 6,Sun,Home,W,3.0,1.0,Bournemouth,1.9,...,0.0,0,0,2024,Brighton and Hove Albion,1,2,14,6,1
8,2023-09-30,12:30,Matchweek 7,Sat,Away,L,1.0,6.0,Aston Villa,1.7,...,0.0,0,0,2024,Brighton and Hove Albion,0,1,12,5,0
10,2023-10-08,14:00,Matchweek 8,Sun,Home,D,2.0,2.0,Liverpool,2.3,...,2.0,0,0,2024,Brighton and Hove Albion,1,12,14,6,0
11,2023-10-21,15:00,Matchweek 9,Sat,Away,L,1.0,2.0,Manchester City,0.8,...,0.0,0,0,2024,Brighton and Hove Albion,0,14,15,5,0
13,2023-10-29,14:00,Matchweek 10,Sun,Home,D,1.0,1.0,Fulham,1.4,...,1.0,0,0,2024,Brighton and Hove Albion,1,9,14,6,0


In [41]:
'''
Purpose: take in a group, a set of columns we want to compute rolling averages for, and take in a set
of new columns that we want to assign the rolling averages to
Parameters:
    group: matches played for a specific team
    cols: set of columns we want to compute the rolling averages for
    new_cols: set of new columns that we want to assign the rolling averages to
    
'''
def rolling_averages(group, cols, new_cols):
    # sort the matches by date, because we want to look at the last 3 matches
    group = group.sort_values("date")
    # calculate rolling averages for cols, need to set closed to left otherwise pandas will compute the
    # rolling average for the 3 weeks and assign that average to week 3, which involves using knowledge of the future
    # basically takes the current week out and computes a rolling average of the previous 3 games
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    # assign these rolling states back to the original dataframe but with new names
    group[new_cols] = rolling_stats
    # drop any missing values, if is week 2 or 3 and are trying to make a prediction, there aren't 3 previous games to analyze
    group = group.dropna(subset=new_cols)
    return group
    

In [42]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [43]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [44]:
# calculating rolling averages for Man city, group was set equal to Man City
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2023-09-02,15:00,Matchweek 4,Sat,Home,W,5.0,1.0,Fulham,2.2,...,5,1,2.0,0.333333,20.0,7.0,16.366667,0.666667,0.0,0.333333
6,2023-09-16,15:00,Matchweek 5,Sat,Away,W,3.0,1.0,West Ham,3.6,...,5,1,2.666667,0.666667,16.333333,5.666667,16.666667,0.666667,0.333333,0.666667
8,2023-09-23,15:00,Matchweek 6,Sat,Home,W,2.0,0.0,Nott'ham Forest,1.3,...,5,1,3.333333,1.0,21.333333,8.666667,16.166667,1.0,0.333333,0.666667
10,2023-09-30,15:00,Matchweek 7,Sat,Away,L,1.0,2.0,Wolves,0.9,...,5,0,3.333333,0.666667,14.0,7.0,16.133333,1.0,0.333333,0.333333
12,2023-10-08,16:30,Matchweek 8,Sun,Away,L,0.0,1.0,Arsenal,0.5,...,6,0,2.0,1.0,19.666667,8.333333,17.633333,1.333333,0.0,0.0
13,2023-10-21,15:00,Matchweek 9,Sat,Home,W,2.0,1.0,Brighton,0.8,...,5,1,1.0,1.0,11.333333,4.0,17.133333,1.0,0.0,0.0
15,2023-10-29,15:30,Matchweek 10,Sun,Away,W,3.0,0.0,Manchester Utd,4.0,...,6,1,1.0,1.333333,12.333333,4.333333,17.666667,0.666667,0.0,0.0
16,2023-11-04,15:00,Matchweek 11,Sat,Home,W,6.0,1.0,Bournemouth,1.9,...,5,1,1.666667,0.666667,11.333333,4.666667,16.966667,1.0,0.333333,0.333333
18,2023-11-12,16:30,Matchweek 12,Sun,Away,D,4.0,4.0,Chelsea,2.9,...,6,0,3.666667,0.666667,17.0,7.333333,17.466667,1.0,0.333333,0.333333
19,2023-11-25,12:30,Matchweek 13,Sat,Home,D,1.0,1.0,Liverpool,1.3,...,5,0,4.333333,1.666667,18.333333,8.666667,16.733333,1.0,0.666667,0.666667


In [53]:
#print(grouped_matches.groups)

{'Arsenal': [0, 1, 1, 2, 3, 4, 5, 7, 9, 11, 12, 14, 16, 18, 19, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34, 36, 37, 38, 40, 41, 42, 44, 46, 47, 48, 49, 50, 51], 'Aston Villa': [0, 1, 0, 1, 3, 5, 6, 8, 10, 12, 13, 15, 16, 18, 19, 21, 22, 23, 25, 26, 27, 28, 30, 32, 33, 35, 36, 37, 38, 40, 42, 43, 44, 45, 47, 49, 50, 52, 54, 55], 'Bournemouth': [0, 1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43], 'Brentford': [0, 1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41], 'Brighton and Hove Albion': [0, 1, 0, 1, 2, 3, 4, 6, 8, 10, 11, 13, 14, 16, 17, 19, 20, 21, 23, 24, 25, 26, 28, 30, 31, 32, 33, 34, 36, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'Burnley': [0, 1, 3, 4, 5, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 4

In [45]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

# SOLUTION for future versions of pandas:
# matches_rolling = matches.groupby("team", include_groups=False).apply(lambda x: rolling_averages(x, cols, new_cols))

# If I need the regular "team" column within my "apply" function, restroes the "team" column as a regular column
#matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x.reset_index(), cols, new_cols))


  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [46]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2023-09-03,16:30,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,2.3,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
Arsenal,5,2023-09-17,16:30,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,1.0,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
Arsenal,7,2023-09-24,14:00,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,1.8,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
Arsenal,9,2023-09-30,15:00,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,3.4,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
Arsenal,11,2023-10-08,16:30,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,0.4,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,42,2024-05-04,17:30,Matchweek 36,Sat,Away,L,1.0,5.0,Manchester City,0.3,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,43,2024-05-11,15:00,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,1.6,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,44,2024-05-19,16:00,Matchweek 38,Sun,Away,L,0.0,2.0,Liverpool,0.5,...,6,0,1.333333,3.000000,9.666667,4.666667,14.133333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,0,2024-08-17,15:00,Matchweek 1,Sat,Away,L,0.0,2.0,Arsenal,0.5,...,5,0,0.666667,3.333333,6.666667,3.666667,14.200000,0.333333,0.000000,0.000000


In [48]:
# We currently have each team name as a separate level to our pandas index, we do not need this
matches_rolling = matches_rolling.droplevel('team')
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2023-09-03,16:30,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,2.3,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
5,2023-09-17,16:30,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,1.0,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
7,2023-09-24,14:00,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,1.8,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
9,2023-09-30,15:00,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,3.4,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
11,2023-10-08,16:30,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,0.4,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2024-05-04,17:30,Matchweek 36,Sat,Away,L,1.0,5.0,Manchester City,0.3,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
43,2024-05-11,15:00,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,1.6,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000
44,2024-05-19,16:00,Matchweek 38,Sun,Away,L,0.0,2.0,Liverpool,0.5,...,6,0,1.333333,3.000000,9.666667,4.666667,14.133333,0.000000,0.000000,0.000000
0,2024-08-17,15:00,Matchweek 1,Sat,Away,L,0.0,2.0,Arsenal,0.5,...,5,0,0.666667,3.333333,6.666667,3.666667,14.200000,0.333333,0.000000,0.000000


In [49]:
# We have 734 rows, but are having index values repeated, but we want unique indices
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2023-09-03,16:30,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,2.3,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
1,2023-09-17,16:30,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,1.0,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
2,2023-09-24,14:00,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,1.8,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
3,2023-09-30,15:00,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,3.4,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
4,2023-10-08,16:30,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,0.4,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,2024-05-04,17:30,Matchweek 36,Sat,Away,L,1.0,5.0,Manchester City,0.3,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
730,2024-05-11,15:00,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,1.6,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000
731,2024-05-19,16:00,Matchweek 38,Sun,Away,L,0.0,2.0,Liverpool,0.5,...,6,0,1.333333,3.000000,9.666667,4.666667,14.133333,0.000000,0.000000,0.000000
732,2024-08-17,15:00,Matchweek 1,Sat,Away,L,0.0,2.0,Arsenal,0.5,...,5,0,0.666667,3.333333,6.666667,3.666667,14.200000,0.333333,0.000000,0.000000


In [53]:
# we now have a new set of predictors, so we can make a new set of predictions
def make_predictions(data, predictors):
    # trainig set will consist of all matches before 2024-01-01
    train = data[data["date"] < "2024-01-01"]
    # test set will consist of all matches after 2024-01-01 (what we are trying to predict)
    test = data[data["date"] > "2024-01-01"]
    # fit the random forest model to try and predict our target, using the training data and predictors
    rf.fit(train[predictors], train["target"])
    # obtaining predictions on the test data
    preds = rf.predict(test[predictors])
    # first create a dataframe, combine our actual values and predicted values
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    # calculate precision score of predictions against the test set
    precision = precision_score(test["target"], preds)
    return combined, precision

In [54]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [55]:
precision

np.float64(0.5182481751824818)

In [56]:
# observe where our missp redictions where, but can't tell us which team played in each match
combined

Unnamed: 0,actual,prediction
17,1,0
18,1,1
19,1,1
20,1,0
21,1,1
...,...,...
729,0,0
730,0,0
731,0,0
732,0,0


In [57]:
# can fix this by adding in the team and opponent
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
17,1,0,2024-01-20,Arsenal,Crystal Palace,W
18,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
19,1,1,2024-02-04,Arsenal,Liverpool,W
20,1,0,2024-02-11,Arsenal,West Ham,W
21,1,1,2024-02-17,Arsenal,Burnley,W
...,...,...,...,...,...,...
729,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L
730,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L
731,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L
732,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L


In [None]:
# There could be the case where for the same match we could predict that both teams will win

In [58]:
# Helps replace team names for instances where full name is used
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchested Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
# Creating an instance of MissingDict
mapping = MissingDict(**map_values)

In [59]:
mapping["Arsenal"]

'Arsenal'

In [60]:
mapping["West Ham United"]

'West Ham'

In [61]:
combined["new_team"] = combined["team"].map(mapping)

In [62]:
combined

Unnamed: 0,actual,prediction,date,team,opponent,result,new_team
17,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal
18,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal
19,1,1,2024-02-04,Arsenal,Liverpool,W,Arsenal
20,1,0,2024-02-11,Arsenal,West Ham,W,Arsenal
21,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal
...,...,...,...,...,...,...,...
729,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves
730,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves
731,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves
732,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L,Wolves


In [63]:
# merging the dataframe with itself, because for instance there is a row with Arsenal vs Liverpool and also a row with Liverpool vs Arsenal
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
# want to make sure the predictions match up
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nottingham Forest
2,1,1,2024-02-04,Arsenal,Liverpool,W,Arsenal,0,0,Liverpool,Arsenal,L,Liverpool
3,1,0,2024-02-11,Arsenal,West Ham,W,Arsenal,0,0,West Ham United,Arsenal,L,West Ham
4,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal,0,0,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,Manchester City,Wolves,W,Manchester City
333,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,0,Crystal Palace,Wolves,W,Crystal Palace
334,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,0,Liverpool,Wolves,W,Liverpool
335,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L,Wolves,1,0,Arsenal,Wolves,W,Arsenal


In [67]:
# Looking at cases where the model predicts team x wins and team y loses
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0)]["actual_x"].value_counts()

actual_x
1    51
0    36
Name: count, dtype: int64

In [68]:
# What our accuracy was:
51 / 87

0.5862068965517241

In [None]:
''' Next steps:
1. Could go ahead a scrape more data, maybe do 10 or 20 seasons rather than 1 to do predictions
2. Could use more of the columns to generate predictions (Example: venue, referee, captain)
3. Could try different algorithm instead of Random Forest (could use different ML algorith, such 
as a neural network, or any other algorithm that can pick up non-linear tendencies)
4. Could integrate some additional data from what we scraped (we isolated only premier league matches),
but could look at how team performed outside of premier league (could factor in if they are tired from 
other matches?)
5. Could also look at current team's record for the season and the opponents record for the season
'''