In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("eplmatches.csv", index_col=0)

In [3]:
matches.sample(8)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
10,2022-10-15,15:00,Premier League,Matchweek 11,Sat,Away,D,2.0,2.0,Fulham,...,Match Report,,10.0,6.0,13.0,0.0,0.0,0.0,2022,Bournemouth
35,2021-04-04,19:30,Premier League,Matchweek 30,Sun,Away,L,1.0,2.0,Manchester Utd,...,Match Report,,5.0,3.0,10.3,0.0,0.0,0.0,2020,Brighton and Hove Albion
24,2021-02-14,16:30,Premier League,Matchweek 24,Sun,Away,L,2.0,4.0,Arsenal,...,Match Report,,9.0,5.0,17.6,0.0,0.0,0.0,2020,Leeds United
24,2020-12-30,20:00,Premier League,Matchweek 16,Wed,Away,D,0.0,0.0,Newcastle Utd,...,Match Report,,11.0,4.0,18.3,0.0,0.0,0.0,2020,Liverpool
37,2022-02-19,17:30,Premier League,Matchweek 26,Sat,Away,W,3.0,2.0,Manchester City,...,Match Report,,6.0,5.0,15.0,0.0,0.0,0.0,2021,Tottenham Hotspur
28,2023-02-18,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,1.0,Bournemouth,...,Match Report,,15.0,4.0,13.8,0.0,0.0,0.0,2022,Wolverhampton Wanderers
55,2021-05-09,14:05,Premier League,Matchweek 35,Sun,Away,W,3.0,1.0,Aston Villa,...,Match Report,,17.0,6.0,21.5,0.0,1.0,1.0,2020,Manchester United
40,2022-05-07,17:30,Premier League,Matchweek 36,Sat,Home,W,4.0,0.0,Manchester Utd,...,Match Report,,17.0,5.0,15.9,0.0,0.0,0.0,2021,Brighton and Hove Albion


In [4]:
matches.shape

(1998, 27)

In [5]:
matches["team"].value_counts()

Tottenham Hotspur           101
Manchester City             101
Arsenal                     100
Chelsea                     100
Everton                     100
West Ham United             100
Leeds United                100
Aston Villa                 100
Southampton                 100
Crystal Palace              100
Leicester City              100
Wolverhampton Wanderers     100
Manchester United           100
Newcastle United             99
Liverpool                    99
Brighton and Hove Albion     98
Burnley                      76
Fulham                       63
Brentford                    61
West Bromwich Albion         38
Watford                      38
Norwich City                 38
Sheffield United             38
Nottingham Forest            24
Bournemouth                  24
Name: team, dtype: int64

In [6]:
matches["round"].value_counts()

Matchweek 21    60
Matchweek 18    60
Matchweek 10    60
Matchweek 13    60
Matchweek 2     60
Matchweek 9     60
Matchweek 5     60
Matchweek 20    60
Matchweek 4     60
Matchweek 23    60
Matchweek 19    60
Matchweek 1     60
Matchweek 6     60
Matchweek 16    60
Matchweek 15    60
Matchweek 3     60
Matchweek 22    60
Matchweek 14    60
Matchweek 24    60
Matchweek 12    60
Matchweek 11    60
Matchweek 17    60
Matchweek 25    56
Matchweek 8     56
Matchweek 7     46
Matchweek 37    40
Matchweek 36    40
Matchweek 31    40
Matchweek 38    40
Matchweek 32    40
Matchweek 33    40
Matchweek 26    40
Matchweek 34    40
Matchweek 28    40
Matchweek 29    40
Matchweek 27    40
Matchweek 35    40
Matchweek 30    40
Name: round, dtype: int64


### Some teams have fewer than 100 matches. This is okay and it is because we scraped data for the ongoing season when some teams have matches pending. Also 3 teams are relegated/promoted to the EPL every season.

### Next we will clean the data and prepare it for machine learning.


In [7]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [8]:
matches["date"] = pd.to_datetime(matches["date"])

In [9]:
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team                    object
dtype: object

#### Creating a 'coded column' of 1s and 0s converting strings into categories and the categories into numbers so the algorithm can actually use it.

In [10]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [11]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [12]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,10.0,2.0,14.6,1.0,0.0,0.0,2022,Arsenal,0,7
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,19.0,7.0,13.0,0.0,0.0,0.0,2022,Arsenal,1,11
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,14.0,6.0,14.8,0.0,0.0,0.0,2022,Arsenal,0,2
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,22.0,8.0,15.5,1.0,0.0,0.0,2022,Arsenal,1,9
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,22.0,8.0,16.3,1.0,0.0,0.0,2022,Arsenal,1,1


### Working on the time column, stripping it of some characters and converting an integer to feed to the model. This is to find if teams play better at certain times.

In [14]:
matches["hour"] = matches["time"].str.replace(":.+","", regex=True).astype("int")

In [17]:
matches.sample(7)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour
30,2022-02-12,15:00,Premier League,Matchweek 25,Sat,Home,D,0.0,0.0,Crystal Palace,...,2.0,12.9,0.0,0.0,0.0,2021,Brentford,1,7,15
14,2021-10-31,16:30,Premier League,Matchweek 10,Sun,Away,W,4.0,1.0,Aston Villa,...,8.0,16.5,3.0,0.0,0.0,2021,West Ham United,0,1,16
12,2021-11-20,15:00,Premier League,Matchweek 12,Sat,Away,D,3.0,3.0,Burnley,...,8.0,16.4,1.0,0.0,0.0,2021,Crystal Palace,0,5,15
5,2020-10-17,20:00,Premier League,Matchweek 5,Sat,Away,W,4.0,1.0,Newcastle Utd,...,11.0,20.5,0.0,0.0,1.0,2020,Manchester United,0,15,20
37,2021-02-21,16:30,Premier League,Matchweek 25,Sun,Away,W,1.0,0.0,Arsenal,...,3.0,16.2,0.0,0.0,0.0,2020,Manchester City,0,0,16
16,2022-12-26,15:00,Premier League,Matchweek 17,Mon,Away,W,3.0,0.0,Crystal Palace,...,10.0,13.4,1.0,0.0,0.0,2022,Fulham,0,7,15
34,2022-03-12,15:00,Premier League,Matchweek 29,Sat,Home,W,2.0,0.0,Burnley,...,2.0,21.5,0.0,1.0,1.0,2021,Brentford,1,5,15


### Creating a day_code columns from the dayofweek. There might be some information there.

In [18]:
matches["day_code"] = matches["date"].dt.dayofweek

In [20]:
matches.sample(5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
30,2021-02-22,20:00,Premier League,Matchweek 25,Mon,Home,L,1.0,2.0,Crystal Palace,...,14.5,0.0,0.0,0.0,2020,Brighton and Hove Albion,1,7,20,0
29,2022-02-26,15:00,Premier League,Matchweek 27,Sat,Home,D,1.0,1.0,Burnley,...,18.7,2.0,0.0,0.0,2021,Crystal Palace,1,5,15,5
3,2021-08-29,14:00,Premier League,Matchweek 3,Sun,Away,D,1.0,1.0,Burnley,...,16.1,0.0,0.0,0.0,2021,Leeds United,0,5,14,6
28,2023-02-04,12:30,Premier League,Matchweek 22,Sat,Away,L,0.0,1.0,Everton,...,17.6,0.0,0.0,0.0,2022,Arsenal,0,8,12,5
4,2020-09-28,17:45,Premier League,Matchweek 3,Mon,Home,L,0.0,3.0,Aston Villa,...,16.6,0.0,0.0,0.0,2020,Fulham,1,1,17,0


## Creating the actual ML model

In [24]:
# for our target we create a boolean column from "result" where 1 is a win(W) and 0 is a loss/draw(L/D)
matches["target"] = (matches["result"] == "W").astype("int")

In [27]:
matches.sample(5)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
28,2021-12-29,20:15,Premier League,Matchweek 20,Wed,Away,W,1.0,0.0,Brentford,...,0.0,0.0,0.0,2021,Manchester City,0,3,20,2,1
0,2020-09-13,14:00,Premier League,Matchweek 1,Sun,Away,W,3.0,0.0,West Brom,...,1.0,2.0,2.0,2020,Leicester City,0,22,14,6,1
21,2023-01-15,14:00,Premier League,Matchweek 20,Sun,Away,L,0.0,1.0,Newcastle Utd,...,0.0,0.0,1.0,2022,Fulham,0,15,14,6,0
34,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Away,D,0.0,0.0,Brighton,...,0.0,0.0,0.0,2021,Norwich City,0,4,15,5,0
36,2022-04-16,15:00,Premier League,Matchweek 33,Sat,Away,L,2.0,3.0,Manchester Utd,...,1.0,0.0,0.0,2021,Norwich City,0,14,15,5,0


### Now that we have target and predictors, we can train our model. We'll use a random forest classifier to make our initial predictions.

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [32]:
# we have to be careful when we split, cos in the real world we can't use date in the future to predict data in the past.
train = matches[matches["date"] < '2022-08-05']

In [33]:
test = matches[matches["date"] > '2022-08-05']

In [34]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [35]:
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [36]:
preds = rf.predict(test[predictors])

### We'll try different metric to determine the accuracy of the model

In [38]:
from sklearn.metrics import accuracy_score

In [39]:
acc = accuracy_score(test["target"], preds)

In [40]:
acc

0.5819327731092437

In [41]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [42]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,228,68
1,131,49


In [43]:
from sklearn.metrics import precision_score

In [44]:
precision_score(test["target"], preds)

0.4188034188034188

### Creating some more predictors to improve the accuracy of the model. To do this, we split the matches dataframe by teams

In [46]:
grouped_matches = matches.groupby("team")

In [51]:
group = grouped_matches.get_group("Manchester City")

In [52]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,1.0,1.0,1.0,2022,Manchester City,0,23,16,6,1
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.0,0.0,0.0,2022,Manchester City,1,2,15,5,1
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,1.0,0.0,0.0,2022,Manchester City,0,15,16,6,0
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.0,0.0,0.0,2022,Manchester City,1,7,15,5,1
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.0,0.0,0.0,2022,Manchester City,1,17,19,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,2021-05-01,12:30,Premier League,Matchweek 34,Sat,Away,W,2.0,0.0,Crystal Palace,...,1.0,0.0,0.0,2020,Manchester City,0,7,12,5,1
56,2021-05-08,17:30,Premier League,Matchweek 35,Sat,Home,L,1.0,2.0,Chelsea,...,0.0,0.0,1.0,2020,Manchester City,1,6,17,5,0
57,2021-05-14,20:00,Premier League,Matchweek 36,Fri,Away,W,4.0,3.0,Newcastle Utd,...,1.0,0.0,0.0,2020,Manchester City,0,15,20,4,1
58,2021-05-18,19:00,Premier League,Matchweek 37,Tue,Away,L,2.0,3.0,Brighton,...,1.0,0.0,0.0,2020,Manchester City,0,4,19,1,0


In [53]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [54]:
cols = ["gf", "ga", "sh","sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [55]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [56]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,5,1,2.000000,2.333333,17.333333,4.666667,19.700000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,5,0,1.333333,2.000000,17.333333,3.666667,18.566667,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,...,5,1,1.000000,0.666667,16.666667,4.333333,18.933333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,...,6,0,1.000000,0.333333,14.333333,6.666667,19.033333,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,...,5,0,1.000000,0.666667,12.000000,5.666667,20.000000,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,2023-02-05,16:30,Premier League,Matchweek 22,Sun,Away,L,0.0,1.0,Tottenham,...,6,0,2.666667,1.333333,11.000000,4.000000,16.766667,0.333333,0.333333,0.333333
33,2023-02-12,16:30,Premier League,Matchweek 23,Sun,Home,W,3.0,1.0,Aston Villa,...,6,1,2.333333,1.000000,14.333333,5.333333,16.266667,0.666667,0.333333,0.333333
34,2023-02-15,19:30,Premier League,Matchweek 12,Wed,Away,W,3.0,1.0,Arsenal,...,2,1,2.000000,0.666667,14.333333,5.666667,15.966667,0.666667,0.666667,0.666667
35,2023-02-18,15:00,Premier League,Matchweek 24,Sat,Away,D,1.0,1.0,Nott'ham Forest,...,5,0,2.000000,1.000000,13.333333,5.666667,15.200000,0.666667,0.333333,0.333333


In [65]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [66]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,25,2023-01-22,14:00,Premier League,Matchweek 21,Sun,Away,L,0.0,3.0,Manchester City,...,6,0,0.666667,0.666667,12.333333,3.333333,17.166667,0.666667,0.000000,0.000000
Wolverhampton Wanderers,26,2023-02-04,15:00,Premier League,Matchweek 22,Sat,Home,W,3.0,0.0,Liverpool,...,5,1,0.666667,1.333333,12.333333,2.666667,16.866667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,27,2023-02-11,15:00,Premier League,Matchweek 23,Sat,Away,W,2.0,1.0,Southampton,...,5,1,1.333333,1.000000,13.000000,3.333333,18.233333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,28,2023-02-18,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,1.0,Bournemouth,...,5,0,1.666667,1.333333,11.000000,2.333333,17.800000,0.000000,0.000000,0.000000


In [67]:
# removing the extra level of index with team names
matches_rolling = matches_rolling.droplevel("team")

In [68]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,2023-01-22,14:00,Premier League,Matchweek 21,Sun,Away,L,0.0,3.0,Manchester City,...,6,0,0.666667,0.666667,12.333333,3.333333,17.166667,0.666667,0.000000,0.000000
26,2023-02-04,15:00,Premier League,Matchweek 22,Sat,Home,W,3.0,0.0,Liverpool,...,5,1,0.666667,1.333333,12.333333,2.666667,16.866667,0.000000,0.000000,0.000000
27,2023-02-11,15:00,Premier League,Matchweek 23,Sat,Away,W,2.0,1.0,Southampton,...,5,1,1.333333,1.000000,13.000000,3.333333,18.233333,0.000000,0.000000,0.000000
28,2023-02-18,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,1.0,Bournemouth,...,5,0,1.666667,1.333333,11.000000,2.333333,17.800000,0.000000,0.000000,0.000000


In [69]:
# reordering the index numbering to correspong with the number of rows
matches_rolling.index = range(matches_rolling.shape[0])

In [70]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,2023-01-22,14:00,Premier League,Matchweek 21,Sun,Away,L,0.0,3.0,Manchester City,...,6,0,0.666667,0.666667,12.333333,3.333333,17.166667,0.666667,0.000000,0.000000
1916,2023-02-04,15:00,Premier League,Matchweek 22,Sat,Home,W,3.0,0.0,Liverpool,...,5,1,0.666667,1.333333,12.333333,2.666667,16.866667,0.000000,0.000000,0.000000
1917,2023-02-11,15:00,Premier League,Matchweek 23,Sat,Away,W,2.0,1.0,Southampton,...,5,1,1.333333,1.000000,13.000000,3.333333,18.233333,0.000000,0.000000,0.000000
1918,2023-02-18,15:00,Premier League,Matchweek 24,Sat,Home,L,0.0,1.0,Bournemouth,...,5,0,1.666667,1.333333,11.000000,2.333333,17.800000,0.000000,0.000000,0.000000


In [93]:
# creating a function so we can reiterate easily
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-08-05']
    test = data[data["date"] > '2022-08-05']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [94]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [95]:
precision

0.5

In [96]:
combined

Unnamed: 0,actual,predicted
74,1,0
75,1,1
76,1,0
77,1,1
78,0,1
...,...,...
1915,0,0
1916,1,0
1917,1,0
1918,0,0


In [97]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [98]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
74,1,0,2022-08-13,Arsenal,Leicester City,W
75,1,1,2022-08-20,Arsenal,Bournemouth,W
76,1,0,2022-08-27,Arsenal,Fulham,W
77,1,1,2022-08-31,Arsenal,Aston Villa,W
78,0,1,2022-09-04,Arsenal,Manchester Utd,L
...,...,...,...,...,...,...
1915,0,0,2023-01-22,Wolverhampton Wanderers,Manchester City,L
1916,1,0,2023-02-04,Wolverhampton Wanderers,Liverpool,W
1917,1,0,2023-02-11,Wolverhampton Wanderers,Southampton,W
1918,0,0,2023-02-18,Wolverhampton Wanderers,Bournemouth,L


In [112]:
# creating a dictionary so we can normalize the team name, where the name in the [team] column is different from name in 
# the [opponent] column.
class MissingDict(dict):
    __missing__ = lambda self, key: key
        
map_values = {
    "Brighton and Hove Albion": "Brighton", 
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd", 
    "Tottenham Hotspur": "Tottenham", 
    "West Ham United": "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

In [113]:
mapping["West Ham United"]

'West Ham'

In [107]:
combined["new_team"] = combined["team"].map(mapping)

In [108]:
combined.sample(10)

Unnamed: 0,actual,predicted,date,team,opponent,result,new team,new_team
1816,0,0,2023-01-04,West Ham United,Leeds United,D,West Ham,West Ham
535,0,1,2023-02-11,Chelsea,West Ham,D,Chelsea,Chelsea
719,0,1,2022-11-12,Everton,Bournemouth,L,Everton,Everton
524,0,0,2022-10-22,Chelsea,Manchester Utd,D,Chelsea,Chelsea
778,0,0,2022-11-13,Fulham,Manchester Utd,L,Fulham,Fulham
784,0,1,2023-01-23,Fulham,Tottenham,L,Fulham,Fulham
725,1,0,2023-02-04,Everton,Arsenal,W,Everton,Everton
788,0,0,2023-02-24,Fulham,Wolves,D,Fulham,Fulham
1273,1,1,2023-02-19,Manchester United,Leicester City,W,Manchester Utd,Manchester Utd
1538,1,0,2022-08-30,Southampton,Chelsea,W,Southampton,Southampton


In [111]:
#merging the dataset with itsself to crosscheck the outcome for 2-legged matches where team was home and the return leg where 
# team was away.
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [110]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new team_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new team_y,new_team_y
0,1,0,2022-08-13,Arsenal,Leicester City,W,Arsenal,Arsenal,0,0,Leicester City,Arsenal,L,Leicester City,Leicester City
1,1,0,2022-08-27,Arsenal,Fulham,W,Arsenal,Arsenal,0,0,Fulham,Arsenal,L,Fulham,Fulham
2,1,1,2022-08-31,Arsenal,Aston Villa,W,Arsenal,Arsenal,0,0,Aston Villa,Arsenal,L,Aston Villa,Aston Villa
3,0,1,2022-09-04,Arsenal,Manchester Utd,L,Arsenal,Arsenal,1,1,Manchester United,Arsenal,W,Manchester Utd,Manchester Utd
4,1,1,2022-09-18,Arsenal,Brentford,W,Arsenal,Arsenal,0,0,Brentford,Arsenal,L,Brentford,Brentford
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,0,0,2023-01-22,Wolverhampton Wanderers,Manchester City,L,Wolves,Wolves,1,1,Manchester City,Wolves,W,Manchester City,Manchester City
439,1,0,2023-02-04,Wolverhampton Wanderers,Liverpool,W,Wolves,Wolves,0,1,Liverpool,Wolves,L,Liverpool,Liverpool
440,1,0,2023-02-11,Wolverhampton Wanderers,Southampton,W,Wolves,Wolves,0,0,Southampton,Wolves,L,Southampton,Southampton
441,0,0,2023-02-18,Wolverhampton Wanderers,Bournemouth,L,Wolves,Wolves,1,0,Bournemouth,Wolves,W,Bournemouth,Bournemouth


In [114]:
merged[(merged["predicted_x"]== 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

1    42
0    41
Name: actual_x, dtype: int64

In [116]:
42/83

0.5060240963855421