In [3]:
#importing necessary Libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from xgboost import XGBClassifier

In [4]:
#Fetching the dataset
matches = pd.read_csv('/content/matches_data.csv',index_col = 0)

In [5]:
#Dsipalying the first 5 rows of the dataframe
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,Match Report,,17.0,8.0,13.9,0.0,0,0,2023,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,Match Report,,14.0,4.0,17.9,0.0,0,0,2023,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,Match Report,,29.0,9.0,17.3,2.0,0,1,2023,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,Match Report,,6.0,4.0,14.8,0.0,1,1,2023,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,Match Report,,29.0,13.0,16.4,1.0,0,0,2023,Manchester City


In [6]:
#Displaying the shape of the dataframe
matches.shape

(3800, 27)

In [7]:
#Counting total number of matches per team
matches["team"].value_counts()

team
Manchester City             190
Manchester United           190
Everton                     190
Arsenal                     190
Brighton and Hove Albion    190
Crystal Palace              190
West Ham United             190
Wolverhampton Wanderers     190
Newcastle United            190
Tottenham Hotspur           190
Aston Villa                 190
Liverpool                   190
Chelsea                     190
Burnley                     152
Leicester City              152
Southampton                 152
Bournemouth                 114
Fulham                      114
Brentford                   114
Sheffield United            114
Leeds United                114
Norwich City                 76
Nottingham Forest            76
Watford                      76
Luton Town                   38
West Bromwich Albion         38
Name: count, dtype: int64

In [8]:
#Counting total number of matches per round
matches["round"].value_counts()

round
Matchweek 1     100
Matchweek 30    100
Matchweek 23    100
Matchweek 24    100
Matchweek 25    100
Matchweek 18    100
Matchweek 26    100
Matchweek 27    100
Matchweek 28    100
Matchweek 31    100
Matchweek 2     100
Matchweek 32    100
Matchweek 33    100
Matchweek 29    100
Matchweek 35    100
Matchweek 36    100
Matchweek 37    100
Matchweek 34    100
Matchweek 22    100
Matchweek 21    100
Matchweek 20    100
Matchweek 19    100
Matchweek 3     100
Matchweek 4     100
Matchweek 5     100
Matchweek 6     100
Matchweek 7     100
Matchweek 8     100
Matchweek 9     100
Matchweek 10    100
Matchweek 11    100
Matchweek 12    100
Matchweek 13    100
Matchweek 14    100
Matchweek 15    100
Matchweek 16    100
Matchweek 17    100
Matchweek 38    100
Name: count, dtype: int64

In [9]:
#Displaying the datatypes
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [10]:
#Converting the date column into datetime datatype
matches["date"] = pd.to_datetime(matches["date"])

In [11]:
#Converting the venues into categorical data represented as integers
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [12]:
#Converting the opponents into categorical data represented as integers
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [13]:
#Remove the colon and minutes from the hour and store the hour as an int
matches["hour"] = matches["time"].str.replace(":.+","",regex=True).astype("int")

In [14]:
matches["day_code"] = matches["date"].dt.dayofweek

In [15]:
#Creating a target column
def result_to_target(result):
    if result == 'W':
        return 1
    else:
        return 0

# Apply the function to the 'result' column
matches['target'] = matches['result'].apply(result_to_target)

In [16]:
#Initialize the randomclassifier classifier
rf = RandomForestClassifier(n_estimators= 100, min_samples_split=10, random_state=42)

In [17]:
#Splitting our data into train and test set
train = matches[matches["date"] < "2023 - 08 - 01"]
test = matches[matches["date"] > "2023 - 08 - 01"]

In [18]:
#Shape of training dataset
train.shape

(3040, 32)

In [19]:
#Shape of test dataset
test.shape

(760, 32)

In [20]:
#Setting our predictors
predictors = ["venue_code","opp_code","hour","day_code"]

In [21]:
#Fitting the model
rf.fit(train[predictors],train["target"])

In [22]:
#Testing our model
preds = rf.predict(test[predictors])

In [23]:
#Calculating the accuracy of the model
acc = accuracy_score(test["target"],preds)
print("Acuuracy of the model:",acc)

Acuuracy of the model: 0.6039473684210527


In [24]:
combined = pd.DataFrame(dict(actual = test["target"],prediction = preds))

In [25]:
pd.crosstab(index=combined["actual"],columns = combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,370,92
1,209,89


In [26]:
#Calculating precision score using average as macro since it is a multi class classfication problem
precision = precision_score(test['target'], preds, average='macro')
print(f"Macro-average Precision: {precision:.2f}")

Macro-average Precision: 0.57


In [27]:
#Creating more predictors to improve accuracy
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Manchester United")

In [28]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-14,20:00,Premier League,Matchweek 1,Mon,Home,W,1,0,Wolves,...,0.0,0,0,2023,Manchester United,1,25,20,0,1
1,2023-08-19,17:30,Premier League,Matchweek 2,Sat,Away,L,0,2,Tottenham,...,1.0,0,0,2023,Manchester United,0,21,17,5,0
2,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,W,3,2,Nott'ham Forest,...,0.0,1,1,2023,Manchester United,1,18,15,5,1
3,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Away,L,1,3,Arsenal,...,0.0,0,0,2023,Manchester United,0,0,16,6,0
4,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Home,L,1,3,Brighton,...,1.0,0,0,2023,Manchester United,1,4,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2020-07-09,20:15,Premier League,Matchweek 34,Thu,Away,W,3,0,Aston Villa,...,1.0,1,1,2019,Manchester United,0,1,20,3,1
53,2020-07-13,20:00,Premier League,Matchweek 35,Mon,Home,D,2,2,Southampton,...,0.0,0,0,2019,Manchester United,1,20,20,0,0
54,2020-07-16,20:15,Premier League,Matchweek 36,Thu,Away,W,2,0,Crystal Palace,...,1.0,0,0,2019,Manchester United,0,7,20,3,1
56,2020-07-22,18:00,Premier League,Matchweek 37,Wed,Home,D,1,1,West Ham,...,0.0,0,0,2019,Manchester United,1,24,18,2,0


In [29]:
#Creating a fucntion to compute rolling averages
def rolling_average(group,cols,new_cols):
  group = group.sort_values("date")
  rolling_stats = group[cols].rolling(3,closed='left').mean()
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group

In [30]:
#Creating cols and new_cols variables
cols = ["gf","ga","sh","sot","dist","fk","pk","pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [31]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [32]:
#Compute rolling averages for every team
matches_rolling = matches.groupby("team").apply(lambda x: rolling_average(x,cols,new_cols))
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3,2019-09-01,16:30,Premier League,Matchweek 4,Sun,Home,D,2,2,Tottenham,...,6,0,1.333333,1.333333,10.666667,4.666667,17.200000,0.333333,0.000000,0.000000
Arsenal,4,2019-09-15,16:30,Premier League,Matchweek 5,Sun,Away,D,2,2,Watford,...,6,0,1.666667,2.000000,16.666667,6.666667,18.600000,1.000000,0.000000,0.000000
Arsenal,6,2019-09-22,16:30,Premier League,Matchweek 6,Sun,Home,W,3,2,Aston Villa,...,6,1,1.666667,2.333333,14.000000,5.000000,19.300000,1.000000,0.000000,0.000000
Arsenal,8,2019-09-30,20:00,Premier League,Matchweek 7,Mon,Away,D,1,1,Manchester Utd,...,0,0,2.333333,2.000000,17.666667,5.666667,18.600000,1.333333,0.333333,0.333333
Arsenal,10,2019-10-06,14:00,Premier League,Matchweek 8,Sun,Home,W,1,0,Bournemouth,...,6,1,2.000000,1.666667,12.333333,4.666667,18.100000,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,40,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
Wolverhampton Wanderers,41,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,42,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [33]:
#Dropping the extra index level
matches_rolling = matches_rolling.droplevel('team')
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2019-09-01,16:30,Premier League,Matchweek 4,Sun,Home,D,2,2,Tottenham,...,6,0,1.333333,1.333333,10.666667,4.666667,17.200000,0.333333,0.000000,0.000000
4,2019-09-15,16:30,Premier League,Matchweek 5,Sun,Away,D,2,2,Watford,...,6,0,1.666667,2.000000,16.666667,6.666667,18.600000,1.000000,0.000000,0.000000
6,2019-09-22,16:30,Premier League,Matchweek 6,Sun,Home,W,3,2,Aston Villa,...,6,1,1.666667,2.333333,14.000000,5.000000,19.300000,1.000000,0.000000,0.000000
8,2019-09-30,20:00,Premier League,Matchweek 7,Mon,Away,D,1,1,Manchester Utd,...,0,0,2.333333,2.000000,17.666667,5.666667,18.600000,1.333333,0.333333,0.333333
10,2019-10-06,14:00,Premier League,Matchweek 8,Sun,Home,W,1,0,Bournemouth,...,6,1,2.000000,1.666667,12.333333,4.666667,18.100000,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
41,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
42,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
43,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [34]:
#Making sure the index is unique
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2019-09-01,16:30,Premier League,Matchweek 4,Sun,Home,D,2,2,Tottenham,...,6,0,1.333333,1.333333,10.666667,4.666667,17.200000,0.333333,0.000000,0.000000
1,2019-09-15,16:30,Premier League,Matchweek 5,Sun,Away,D,2,2,Watford,...,6,0,1.666667,2.000000,16.666667,6.666667,18.600000,1.000000,0.000000,0.000000
2,2019-09-22,16:30,Premier League,Matchweek 6,Sun,Home,W,3,2,Aston Villa,...,6,1,1.666667,2.333333,14.000000,5.000000,19.300000,1.000000,0.000000,0.000000
3,2019-09-30,20:00,Premier League,Matchweek 7,Mon,Away,D,1,1,Manchester Utd,...,0,0,2.333333,2.000000,17.666667,5.666667,18.600000,1.333333,0.333333,0.333333
4,2019-10-06,14:00,Premier League,Matchweek 8,Sun,Home,W,1,0,Bournemouth,...,6,1,2.000000,1.666667,12.333333,4.666667,18.100000,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3714,2024-04-24,19:45,Premier League,Matchweek 29,Wed,Home,L,0,1,Bournemouth,...,2,0,1.000000,2.000000,9.666667,4.000000,20.533333,0.333333,0.333333,0.333333
3715,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Luton Town,...,5,1,0.666667,1.666667,10.333333,3.333333,18.766667,0.000000,0.000000,0.000000
3716,2024-05-04,17:30,Premier League,Matchweek 36,Sat,Away,L,1,5,Manchester City,...,5,0,0.666667,1.333333,11.000000,4.000000,19.666667,0.000000,0.000000,0.000000
3717,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Home,L,1,3,Crystal Palace,...,5,0,1.000000,2.333333,10.000000,3.333333,15.966667,0.000000,0.000000,0.000000


In [35]:
#Function for retraining the model
def make_predictions(data,predictors):
  train = data[data["date"] < "2023 - 08 - 01"]
  test = data[data["date"] > "2023 - 08 - 01"]
  rf.fit(train[predictors],train["target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual = test["target"],prediction = preds),index = test.index)
  precision = precision_score(test['target'], preds, average='macro')
  accuracy = accuracy_score(test['target'],preds)
  return combined,precision,accuracy

In [36]:
#Calculating the precision of the new model
combined,precision,accuracy = make_predictions(matches_rolling,predictors + new_cols)

In [37]:
#Displaying the new precision score
precision

0.6215013064175525

In [38]:
#Displaying the accuracy of the new model
accuracy

0.6446499339498019

In [39]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [40]:
#Mapping team names
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves","West Bromwich Albion":"West Brom","Nottingham Forest":"Nott'ham Forest",}
mapping = MissingDict(**map_values)

In [41]:
combined["new_team"] = combined['team'].map(mapping)

In [42]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [43]:
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2023-08-12,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nott'ham Forest
1,1,0,2023-08-21,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
2,0,1,2023-08-26,Arsenal,Fulham,D,Arsenal,0,0,Fulham,Arsenal,D,Fulham
3,1,1,2023-09-03,Arsenal,Manchester Utd,W,Arsenal,0,0,Manchester United,Arsenal,L,Manchester Utd
4,1,1,2023-09-17,Arsenal,Everton,W,Arsenal,0,0,Everton,Arsenal,L,Everton
...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves,1,0,Bournemouth,Wolves,W,Bournemouth
712,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves,0,0,Luton Town,Wolves,L,Luton Town
713,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,Manchester City,Wolves,W,Manchester City
714,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,0,Crystal Palace,Wolves,W,Crystal Palace


In [44]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] ==0)]["actual_x"].value_counts()

actual_x
1    104
0     68
Name: count, dtype: int64