In [299]:
import pandas as pd

matches = pd.read_csv("matches.csv", index_col = 0)

matches.shape

(3040, 27)

In [300]:
matches.dtypes

matches["date"] = pd.to_datetime(matches["date"])

matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2024,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2024,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2024,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2024,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2024,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,18.2,0.0,0,0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,13.4,1.0,0,0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,18.5,0.0,0,0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,18.3,1.0,0,0,2021,Sheffield United


In [301]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

matches["opp_code"]= matches["opponent"].astype("category").cat.codes

matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype(int)

matches["day_code"] = matches["date"].dt.dayofweek

matches["target"] = (matches["result"] == "W").astype(int)

In [302]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

#rf = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', 
#                   alpha=0.0001, batch_size='auto', learning_rate='constant', 
#                   learning_rate_init=0.001, max_iter=200, random_state=1)

In [303]:
train = matches[matches["date"] < '2024-01-01']

test = matches[matches["date"] > '2024-01-01']

predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [304]:
rf.fit(train[predictors], train["target"])

In [305]:
preds = rf.predict(test[predictors])

In [306]:
from sklearn.metrics import accuracy_score

In [307]:
acc = accuracy_score(test["target"], preds)

acc

0.6612021857923497

In [308]:
combined = pd.DataFrame(dict(actual=test["target"], predictions=preds))

pd.crosstab(index=combined["actual"], columns=combined["predictions"])

predictions,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,184,45
1,79,58


In [309]:
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

0.5631067961165048

In [310]:
grouped_matches = matches.groupby("team")

group = grouped_matches.get_group("Arsenal")



In [311]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [312]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]


In [313]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

matches_rolling = matches_rolling.droplevel("team")

matches_rolling.index = range(matches_rolling.shape[0])

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [314]:
def make_predictions(data, predictors):
    date = '2024-01-01'
    train = data[data["date"] < date]
    test = data[data["date"] > date]
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision    

In [315]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

precision

0.6274509803921569

In [316]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)


In [317]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [318]:
combined["new_team"] = combined["team"].map(mapping)

In [319]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [320]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

actual_x
1    60
0    34
Name: count, dtype: int64