In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("league_matches.csv")

In [3]:
matches.head()

Unnamed: 0,DATE,TIME,COMP,ROUND,DAY,VENUE,RESULT,GF,GA,OPPONENT,...,CAPTAIN,REFEREE,MATCH REPORT,SH,SOT,DIST,FK,PK,SEASON,TEAM
0,8/13/2021,20:15,Premier League,Matchweek 2,Mon,Away,W,3,1,Wolves,...,Fernandinho,Andre Marriner,Match Report,13,8,20.3,2,1,2021,Manchester City
1,8/13/2021,16:30,Premier League,Matchweek 3,Sun,Home,L,2,5,Leicester City,...,Fernandinho,Michael Oliver,Match Report,16,5,18.8,1,0,2021,Manchester City
2,8/14/2021,17:30,Premier League,Matchweek 4,Sat,Away,D,1,1,Leeds United,...,Kevin De Bruyne,Mike Dean,Match Report,23,1,17.6,1,0,2021,Manchester City
3,8/14/2021,17:30,Premier League,Matchweek 5,Sat,Home,W,1,0,Arsenal,...,Raheem Sterling,Chris Kavanagh,Match Report,13,5,16.8,0,0,2021,Manchester City
4,8/14/2021,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,West Ham,...,Raheem Sterling,Anthony Taylor,Match Report,14,7,20.3,1,0,2021,Manchester City


In [4]:
matches.shape

(760, 24)

In [5]:
matches["DATE"] = pd.to_datetime(matches["DATE"])

In [6]:
matches["TARGET"] = (matches["RESULT"] == "W").astype("int")

In [7]:
matches

Unnamed: 0,DATE,TIME,COMP,ROUND,DAY,VENUE,RESULT,GF,GA,OPPONENT,...,REFEREE,MATCH REPORT,SH,SOT,DIST,FK,PK,SEASON,TEAM,TARGET
0,2021-08-13,20:15,Premier League,Matchweek 2,Mon,Away,W,3,1,Wolves,...,Andre Marriner,Match Report,13,8,20.3,2,1,2021,Manchester City,1
1,2021-08-13,16:30,Premier League,Matchweek 3,Sun,Home,L,2,5,Leicester City,...,Michael Oliver,Match Report,16,5,18.8,1,0,2021,Manchester City,0
2,2021-08-14,17:30,Premier League,Matchweek 4,Sat,Away,D,1,1,Leeds United,...,Mike Dean,Match Report,23,1,17.6,1,0,2021,Manchester City,0
3,2021-08-14,17:30,Premier League,Matchweek 5,Sat,Home,W,1,0,Arsenal,...,Chris Kavanagh,Match Report,13,5,16.8,0,0,2021,Manchester City,1
4,2021-08-14,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,West Ham,...,Anthony Taylor,Match Report,14,7,20.3,1,0,2021,Manchester City,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,2022-05-22,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Andre Marriner,Match Report,8,1,17.4,0,0,2021,Sheffield United,0
756,2022-05-22,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Simon Hooper,Match Report,7,0,11.4,1,0,2021,Sheffield United,0
757,2022-05-22,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Jonathan Moss,Match Report,10,3,17.0,0,0,2021,Sheffield United,1
758,2022-05-22,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Robert Jones,Match Report,11,1,16.0,1,0,2021,Sheffield United,0


In [8]:
matches["VENUE_CODE"] = matches["VENUE"].astype("category").cat.codes

In [9]:
matches["OPP_CODE"] = matches["OPPONENT"].astype("category").cat.codes

In [10]:
matches["HOUR"] = matches["TIME"].str.replace(":.+", "", regex=True).astype("int")

In [11]:
matches["DAY_CODE"] = matches["DATE"].dt.dayofweek

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
#from sklearn import linear_model
#model = linear_model.LogisticRegression()

In [17]:
model = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [18]:
train = matches[matches["DATE"] < '2022-01-01']
test = matches[matches["DATE"] > '2022-01-01']

In [19]:
predictors = ["VENUE_CODE", "OPP_CODE", "HOUR", "DAY_CODE"]

In [20]:
model.fit(train[predictors], train["TARGET"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [21]:
preds = model.predict(test[predictors])

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
accuracy = accuracy_score(test["TARGET"], preds)

In [24]:
accuracy

0.5283505154639175

In [25]:
prediction = pd.DataFrame(dict(actual=test["TARGET"], predicted=preds))

In [26]:
pd.crosstab(index=prediction["actual"], columns=prediction["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,136,149
1,34,69


In [27]:
grouped_matches = matches.groupby("TEAM")

In [28]:
group = grouped_matches.get_group("Manchester United").sort_values("DATE")

In [29]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("DATE")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [30]:
cols = ["GF", "GA", "SH", "SOT", "DIST", "FK", "PK"]
new_cols = [f"{c}_rolling" for c in cols]

#rolling_averages(group, cols, new_cols)

In [31]:
matches_rolling = matches.groupby("TEAM").apply(lambda x: rolling_averages(x, cols, new_cols))

In [32]:
matches_rolling.index = range(matches_rolling.shape[0])

In [33]:
def make_predictions(data, predictors):
    train = data[data["DATE"] < '2022-01-01']
    test = data[data["DATE"] > '2022-01-01']
    rf.fit(train[predictors], train["TARGET"])
    preds = rf.predict(test[predictors])
    prediction = pd.DataFrame(dict(actual=test["TARGET"], predicted=preds), index=test.index)
    error = precision_score(test["TARGET"], preds)
    return prediction, error

In [34]:
prediction = prediction.merge(matches_rolling[["DATE", "TEAM", "OPPONENT", "RESULT"]], left_index=True, right_index=True)

In [35]:
prediction[prediction["actual"] == prediction["predicted"]]

Unnamed: 0,actual,predicted,DATE,TEAM,OPPONENT,RESULT
373,1,1,2021-09-25,Liverpool,Sheffield Utd,W
374,0,0,2021-09-25,Liverpool,Chelsea,L
379,0,0,2021-09-25,Liverpool,Leeds United,D
380,1,1,2021-09-25,Liverpool,Newcastle Utd,D
381,1,1,2021-09-25,Liverpool,Southampton,W
...,...,...,...,...,...,...
686,0,0,2022-02-19,Wolverhampton Wanderers,Tottenham,D
689,0,0,2022-02-19,Wolverhampton Wanderers,Chelsea,D
693,1,1,2022-02-20,Wolverhampton Wanderers,Sheffield Utd,W
696,0,0,2022-02-23,Wolverhampton Wanderers,Everton,L
