In [1]:
import pandas as pd

In [2]:
# Load the match log; the CSV's first column is used as the row index.
matches = pd.read_csv(filepath_or_buffer="matches.csv", index_col=0)

In [3]:
# Size check of the loaded frame: (rows, columns).
matches.shape

(5320, 29)

In [4]:
# Convert the "date" column to datetimes so it can be sorted and compared
# against cutoff dates later on.
matches["date"] = matches["date"].pipe(pd.to_datetime)

In [5]:
# Integer-encode the venue via its categorical codes
# (in the displayed rows: Away -> 0, Home -> 1).
matches["venue_code"] = pd.Categorical(matches["venue"]).codes

In [6]:
# Integer-encode the opponent name via its categorical codes.
matches["opp_code"] = pd.Categorical(matches["opponent"]).codes

In [7]:
# Integer-encode the team name via its categorical codes.
matches["team_code"] = pd.Categorical(matches["team"]).codes

In [8]:
# Integer-encode the referee via its categorical codes
# (a missing referee is encoded as -1).
matches["ref_code"] = pd.Categorical(matches["referee"]).codes

In [9]:
# Keep only the digits of the formation string and cast to int (the displayed
# codes look like 4231, 4411). Dots are stripped as well, because a leftover
# "." would make the int cast fail.
matches["formation_code"] = matches["formation"].str.replace("[^0-9]", "", regex=True).astype("int")

In [10]:
# Kick-off hour: drop everything from the first ":" onward ("20:00" -> 20).
matches["hour"] = (
    matches["time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)

In [11]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6).
matches["day_code"] = matches["date"].dt.weekday

In [12]:
# "Matchweek 12" -> 12. regex=False makes the literal-prefix replacement
# explicit rather than relying on the pandas-version-dependent default, and
# matches the other cleaning cells, which pass `regex` explicitly.
matches["gameweek"] = matches["round"].str.replace("Matchweek ", "", regex=False).astype("int")

In [13]:
# Binary label: 1 when the result is "W", 0 for any other result.
matches["target"] = matches["result"].eq("W").astype("int")

In [14]:
# Display the frame to eyeball the newly added *_code, hour, gameweek and target columns.
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,team,venue_code,opp_code,team_code,ref_code,formation_code,hour,day_code,gameweek,target
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Manchester City,0,5,16,4,4231,20,4,1,1
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Manchester City,1,18,16,28,4231,20,5,2,1
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Manchester City,0,21,16,9,4231,14,6,3,1
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Manchester City,1,10,16,19,4231,15,5,4,1
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Manchester City,0,28,16,1,4231,15,5,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2018-04-15,16:00,Premier League,Matchweek 34,Sun,Away,W,1,0,Manchester Utd,...,West Bromwich Albion,0,17,27,25,4411,16,6,34,1
39,2018-04-21,12:30,Premier League,Matchweek 35,Sat,Home,D,2,2,Liverpool,...,West Bromwich Albion,1,14,27,34,4411,12,5,35,0
40,2018-04-28,15:00,Premier League,Matchweek 36,Sat,Away,W,1,0,Newcastle Utd,...,West Bromwich Albion,0,18,27,7,4411,15,5,36,1
41,2018-05-05,15:00,Premier League,Matchweek 37,Sat,Home,W,1,0,Tottenham,...,West Bromwich Albion,1,25,27,22,4411,15,5,37,1


In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Random forest shared by every experiment below; the fixed random_state keeps
# repeated runs reproducible.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [17]:
# Time-based split: every match strictly before the cutoff date is training data.
train = matches.loc[matches["date"] < '2023-03-01']

In [18]:
# Time-based split: matches on or after the cutoff date form the test set.
test = matches.loc[matches["date"] >= '2023-03-01']

In [19]:
# Baseline feature set: only the encoded / derived columns created above.
predictors = [
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
    "gameweek",
    "team_code",
    "ref_code",
    "formation_code",
]

In [20]:
# Fit the forest on the training rows using the baseline predictors.
rf.fit(X=train[predictors], y=train["target"])

In [21]:
# Predicted labels (1 = win) for the held-out rows.
preds = rf.predict(X=test[predictors])

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
# Share of test rows whose predicted label equals the actual label.
acc = accuracy_score(y_true=test["target"], y_pred=preds)

In [24]:
# Show the baseline accuracy on the test set.
acc

0.6343570057581573

In [25]:
# Actual vs. predicted labels side by side; the index comes from test["target"].
combined = pd.DataFrame({"actual": test["target"], "prediction": preds})

In [26]:
# Confusion table: rows are the actual label, columns the predicted label.
pd.crosstab(combined["actual"], combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,522,110
1,271,139


In [27]:
from sklearn.metrics import precision_score

In [28]:
# Precision for the positive class: of the rows predicted as wins, the share
# that actually were wins.
precision_score(y_true=test["target"], y_pred=preds)

np.float64(0.5582329317269076)

In [29]:
# One group of rows per team, used to prototype the rolling features.
grouped_matches = matches.groupby(by="team")

In [30]:
# A single team's rows, used as sample input for rolling_averages.
group = grouped_matches.get_group(name="Manchester City")

In [31]:
def rolling_averages(group, cols, new_cols, window=5):
    """Add trailing rolling-mean features to one group of matches.

    The rows are sorted by ``date`` and, for every column in ``cols``, the
    mean over the ``window`` preceding rows is stored under the positionally
    matching name in ``new_cols``. ``closed='left'`` keeps the current row out
    of its own window, so each feature is built only from earlier rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a ``date`` column and every column
        listed in ``cols``. The caller's frame is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Output column names, paired one-to-one with ``cols``.
    window : int, default 5
        Number of preceding rows averaged for each feature.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with ``new_cols`` added. Rows where any
        new column is NaN (e.g. the first ``window`` rows, which lack a full
        history) are dropped.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError("cols and new_cols must have the same length")

    # sort_values returns a new frame, so the assignments below never touch
    # the frame passed in by the caller.
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [32]:
# Per-match stat columns to average, and the "<stat>_rolling" names the
# averaged features are stored under.
cols = [
    "gf", "ga",
    "sh", "sot", "dist",
    "fk", "pk", "pkatt",
    "xg_x", "xga", "npxg",
    "poss",
]
new_cols = list(map(lambda c: f"{c}_rolling", cols))

In [33]:
# Try the helper on the single-team sample and inspect the result.
rolling_averages(group=group, cols=cols, new_cols=new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,xg_x_rolling,xga_rolling,npxg_rolling,poss_rolling
7,2017-09-23,15:00,Premier League,Matchweek 6,Sat,Home,W,5,0,Crystal Palace,...,18.4,7.4,17.38,0.8,0.2,0.2,2.02,0.52,1.86,68.8
9,2017-09-30,17:30,Premier League,Matchweek 7,Sat,Away,W,1,0,Chelsea,...,20.6,8.4,16.18,0.4,0.2,0.2,2.54,0.58,2.38,67.6
10,2017-10-14,15:00,Premier League,Matchweek 8,Sat,Home,W,7,2,Stoke City,...,20.2,8.2,15.78,0.4,0.2,0.2,2.50,0.56,2.34,67.0
12,2017-10-21,15:00,Premier League,Matchweek 9,Sat,Home,W,3,0,Burnley,...,20.4,8.8,15.76,0.6,0.2,0.2,2.98,0.56,2.82,68.8
14,2017-10-28,15:00,Premier League,Matchweek 10,Sat,Away,W,3,2,West Brom,...,20.6,8.6,15.96,0.6,0.4,0.4,3.12,0.46,2.90,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,2024-04-28,16:30,Premier League,Matchweek 35,Sun,Away,W,2,0,Nott'ham Forest,...,21.0,7.6,16.02,0.6,0.2,0.2,2.38,0.74,2.22,70.2
52,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Home,W,5,1,Wolves,...,20.8,8.4,16.92,0.8,0.2,0.2,2.36,0.98,2.20,69.0
53,2024-05-11,12:30,Premier League,Matchweek 37,Sat,Away,W,4,0,Fulham,...,19.4,8.2,16.80,0.4,0.6,0.6,2.42,0.88,1.94,67.2
54,2024-05-14,20:00,Premier League,Matchweek 34,Tue,Away,W,2,0,Tottenham,...,18.8,8.2,17.10,0.4,0.8,0.8,2.68,0.68,2.04,65.0


In [34]:
# Build the rolling features team by team. Iterating the groupby passes each
# team's complete sub-frame (including the "team" column) to rolling_averages,
# which avoids the pandas deprecation warning raised when groupby.apply
# operates on the grouping column. The dict keys become the outer "team" index
# level, so the result has a (team, original index) MultiIndex.
matches_rolling = pd.concat(
    {
        team: rolling_averages(team_matches, cols, new_cols)
        for team, team_matches in matches.groupby("team")
    },
    names=["team"],
)

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [35]:
# Drop the outer "team" index level; "team" is still available as a column.
matches_rolling = matches_rolling.reset_index(level='team', drop=True)

In [36]:
# Replace the remaining index with a fresh 0..n-1 range so every row has a
# unique label (index-based merges later rely on that).
matches_rolling = matches_rolling.reset_index(drop=True)

In [37]:
def make_predictions(data, predictors, cutoff='2023-03-01', model=None):
    """Train on rows before ``cutoff``, predict the rest, and score precision.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column comparable with ``cutoff``, a
        ``target`` column, and every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2023-03-01'
        Rows with ``date < cutoff`` are used for training; rows with
        ``date >= cutoff`` are used for testing.
    model : estimator with ``fit`` / ``predict``, optional
        Model to train. Defaults to the notebook-level ``rf``. The model is
        refitted in place, so its previous fit is overwritten.

    Returns
    -------
    combined : pandas.DataFrame
        ``actual`` and ``predicted`` columns, indexed like the test rows.
    precision : float
        ``precision_score`` of the predictions against the test targets.
    """
    if model is None:
        # Fall back to the shared notebook-level forest.
        model = rf

    train = data[data["date"] < cutoff]
    test = data[data["date"] >= cutoff]
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    # Keep the test index so callers can join predictions back to match details.
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [38]:
# Re-run the experiment with the rolling features appended to the baseline predictors.
combined, precision = make_predictions(matches_rolling, [*predictors, *new_cols])

In [39]:
# Show the precision obtained with the rolling features included.
precision

np.float64(0.6305220883534136)

In [40]:
# Attach match details to each prediction by joining on the shared row index.
combined = combined.merge(
    matches_rolling[["date", "team", "opponent", "result"]],
    left_index=True,
    right_index=True,
)

In [41]:
class MissingDict(dict):
    """dict whose lookups return the key itself when the key is absent."""

    def __missing__(self, key):
        # Invoked by dict.__getitem__ on a miss; nothing is inserted.
        return key

# Club names as spelled in the "team" column -> the spelling to compare against
# the "opponent" column. Names not listed here map to themselves via MissingDict.
map_values = dict([
    ("Brighton and Hove Albion", "Brighton"),
    ("Manchester United", "Manchester Utd"),
    ("Newcastle United", "Newcastle Utd"),
    ("Tottenham Hotspur", "Tottenham"),
    ("West Ham United", "West Ham"),
    ("Wolverhampton Wanderers", "Wolves"),
    ("Sheffield United", "Sheffield Utd"),
    ("Nottingham Forest", "Nott'ham Forest"),
])
mapping = MissingDict(map_values)

In [42]:
# Normalised team name, used to pair each row with the opponent's row of the same match.
combined = combined.assign(new_team=combined["team"].map(mapping))

In [43]:
# Self-join: pair each prediction with the row for the same date whose
# "opponent" equals this row's normalised team name. Columns present on both
# sides get the default _x (this team) / _y (other side) suffixes.
merged = combined.merge(
    combined,
    left_on=["date", "new_team"],
    right_on=["date", "opponent"],
)

In [44]:
# Outcomes for the pairs where the model predicts a win for one side (x) and
# no win for the other side (y).
merged.loc[
    (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0),
    "actual_x",
].value_counts()

actual_x
1    148
0     63
Name: count, dtype: int64

In [45]:
# Precision on the pairs where one side is predicted to win and the other is
# not, computed from `merged` instead of typing in counts read off an earlier
# output, so the number stays correct if the data or model changes.
# actual_x is a 0/1 label, so its mean is the share of predicted wins that
# really were wins.
consistent_picks = merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]
precision = float(consistent_picks["actual_x"].mean())
precision

0.7014218009478673