In [39]:
import pandas as pd

In [40]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [41]:
matches.shape

(2316, 28)

In [42]:
matches["team"].value_counts()

team
Manchester City             118
Newcastle United            118
Liverpool                   118
Aston Villa                 118
Brighton and Hove Albion    118
Chelsea                     118
Brentford                   118
Arsenal                     118
West Ham United             114
Everton                     114
Wolverhampton Wanderers     114
Crystal Palace              114
Manchester United           114
Tottenham Hotspur           114
Nottingham Forest            80
Bournemouth                  76
Fulham                       76
Burnley                      76
Leicester City               76
Leeds United                 76
Southampton                  76
Luton Town                   38
Sheffield United             38
Watford                      38
Norwich City                 38
Name: count, dtype: int64

In [43]:
matches["round"].value_counts()

round
Matchweek 1     69
Matchweek 4     69
Matchweek 2     69
Matchweek 3     69
Matchweek 30    60
Matchweek 24    60
Matchweek 25    60
Matchweek 18    60
Matchweek 26    60
Matchweek 27    60
Matchweek 28    60
Matchweek 31    60
Matchweek 34    60
Matchweek 23    60
Matchweek 33    60
Matchweek 29    60
Matchweek 35    60
Matchweek 36    60
Matchweek 37    60
Matchweek 32    60
Matchweek 21    60
Matchweek 22    60
Matchweek 11    60
Matchweek 5     60
Matchweek 6     60
Matchweek 7     60
Matchweek 8     60
Matchweek 9     60
Matchweek 10    60
Matchweek 12    60
Matchweek 20    60
Matchweek 13    60
Matchweek 14    60
Matchweek 15    60
Matchweek 16    60
Matchweek 17    60
Matchweek 19    60
Matchweek 38    60
Name: count, dtype: int64

In [44]:
matches["date"] = pd.to_datetime(matches["date"])

In [45]:
matches.dtypes

date             datetime64[ns]
time                     object
comp                     object
round                    object
day                      object
venue                    object
result                   object
gf                      float64
ga                      float64
opponent                 object
xg                      float64
xga                     float64
poss                    float64
attendance              float64
captain                  object
formation                object
opp formation            object
referee                  object
match report             object
notes                   float64
sh                      float64
sot                     float64
dist                    float64
fk                      float64
pk                        int64
pkatt                     int64
season                    int64
team                     object
dtype: object

In [46]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [47]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [48]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex = True).astype("int")

In [49]:
matches["day_code"] = matches["date"].dt.dayofweek

In [50]:
matches["target"] = (matches["result"] == "W").astype("int")

In [51]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [53]:
train = matches[matches["date"] < "2023-01-01"] 

In [54]:
test = matches[matches["date"] > "2023-01-01"]

In [55]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [56]:
rf.fit(train[predictors], train["target"])

In [57]:
preds = rf.predict(test[predictors])

In [58]:
from sklearn.metrics import accuracy_score


In [59]:

acc = accuracy_score(test["target"], preds)

In [60]:
acc

0.6021241830065359

In [61]:
combined = pd.DataFrame(dict(actual=test["target"], prediction = preds))

In [62]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,580,157
1,330,157


In [63]:
from sklearn.metrics import precision_score


In [64]:
precision_score(test["target"], preds)

0.5

In [65]:
grouped_matches = matches.groupby("team")

In [66]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [67]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [68]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,5,1,3.333333,0.333333,19.666667,6.000000,16.866667,0.666667,0.000000,0.000000
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,5,0,3.666667,0.000000,22.000000,7.333333,15.866667,0.333333,0.000000,0.000000
8,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Away,W,1.0,0.0,Chelsea,...,5,1,2.000000,0.000000,22.000000,6.333333,15.166667,0.333333,0.000000,0.000000
10,2021-10-03,16:30,Premier League,Matchweek 7,Sun,Away,D,2.0,2.0,Liverpool,...,6,0,0.666667,0.000000,18.666667,4.000000,15.933333,0.333333,0.000000,0.000000
11,2021-10-16,15:00,Premier League,Matchweek 8,Sat,Home,W,2.0,0.0,Burnley,...,5,1,1.000000,0.666667,14.333333,2.333333,16.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,2024-05-19,16:00,Premier League,Matchweek 38,Sun,Home,W,3.0,1.0,West Ham,...,6,1,3.666667,0.333333,13.333333,7.333333,15.800000,0.000000,1.333333,1.333333
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,...,6,1,3.000000,0.333333,16.666667,8.000000,16.666667,0.666667,0.666667,0.666667
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,...,5,1,2.333333,0.333333,15.333333,7.000000,17.166667,0.666667,0.333333,0.333333
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,1.0,West Ham,...,5,1,3.000000,0.666667,17.333333,7.000000,18.266667,1.000000,0.333333,0.333333


In [69]:
matches_rolling = matches.groupby("team").apply(lambda x : rolling_averages(x, cols=cols, new_cols=new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x : rolling_averages(x, cols=cols, new_cols=new_cols))


In [71]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
Arsenal,5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
Arsenal,7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
Arsenal,8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
Arsenal,9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,40,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0.0,1.0,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
Wolverhampton Wanderers,41,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2.0,1.0,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,42,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1.0,5.0,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [72]:
matches_rolling = matches_rolling.droplevel('team')

In [73]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0.0,1.0,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
41,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2.0,1.0,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
42,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1.0,5.0,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [76]:
matches_rolling.index = range(matches_rolling.shape[0])

In [77]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
1,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
2,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
3,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
4,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0.0,1.0,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
2237,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2.0,1.0,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
2238,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1.0,5.0,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
2239,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1.0,3.0,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [80]:
def make_predictions(data, predictors):
    train = data[data["date"] < "2023-01-01"]
    test = data[data["date"] > "2023-01-01"]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual = test["target"], predicted = preds), index = test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [81]:
combined, precision = make_predictions(matches_rolling , predictors + new_cols)

In [82]:
precision

0.5594202898550724

In [83]:
combined

Unnamed: 0,actual,predicted
51,0,1
52,1,0
53,1,1
54,0,1
55,0,1
...,...,...
2236,0,0
2237,1,0
2238,0,0
2239,0,0


In [84]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [85]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
51,0,1,2023-01-03,Arsenal,Newcastle Utd,D
52,1,0,2023-01-15,Arsenal,Tottenham,W
53,1,1,2023-01-22,Arsenal,Manchester Utd,W
54,0,1,2023-02-04,Arsenal,Everton,L
55,0,1,2023-02-11,Arsenal,Brentford,D
...,...,...,...,...,...,...
2236,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L
2237,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W
2238,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L
2239,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L


In [None]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
   #"Newcastle United": 
}