In [1]:
import pandas as pd

### Load data

In [2]:
matches = pd.read_csv("matches.csv", index_col=0)

### convert column names into lower case

In [3]:
matches.columns=[c.lower() for c in matches.columns]

### drop unnecessary columns

In [4]:
matches.drop(columns=['comp','notes'],inplace=True)

In [5]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,referee,match report,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,...,Michael Oliver,Match Report,13.0,1.0,18.7,1.0,1,1,2023,Manchester City
2,2022-08-13,15:00,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,...,David Coote,Match Report,19.0,7.0,17.5,0.0,0,0,2023,Manchester City
3,2022-08-21,16:30,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,...,Jarred Gillett,Match Report,21.0,10.0,16.2,1.0,0,0,2023,Manchester City
4,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,...,Darren England,Match Report,18.0,5.0,14.1,0.0,0,0,2023,Manchester City
5,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,...,Paul Tierney,Match Report,17.0,9.0,14.8,0.0,0,0,2023,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Matchweek 34,Sun,Away,L,0,4,Tottenham,0.5,...,Andre Marriner,Match Report,8.0,1.0,18.2,0.0,0,0,2021,Sheffield United
39,2021-05-08,15:00,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,0.7,...,Simon Hooper,Match Report,7.0,0.0,13.4,1.0,0,0,2021,Sheffield United
40,2021-05-16,19:00,Matchweek 36,Sun,Away,W,1,0,Everton,1.2,...,Jonathan Moss,Match Report,10.0,3.0,18.5,0.0,0,0,2021,Sheffield United
41,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,0.9,...,Robert Jones,Match Report,11.0,1.0,18.3,1.0,0,0,2021,Sheffield United


### Preprocessing

#### w=1 & D,L=0

In [6]:
matches["target"] = (matches["result"] == "W").astype("int")

### index the rows

In [7]:
matches.index=range(matches.shape[0])

### convert venue and opponent into categorical data types

In [8]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [9]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [10]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

### convert date col into datetime 

In [11]:
matches["date"] = pd.to_datetime(matches["date"])

In [12]:
matches["day_code"] = matches["date"].dt.dayofweek

### convert time as int

In [13]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [14]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,target,venue_code,opp_code,hour,day_code
0,2022-08-07,16:30,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,...,1.0,1,1,2023,Manchester City,1,0,23,16,6
1,2022-08-13,15:00,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,...,0.0,0,0,2023,Manchester City,1,1,2,15,5
2,2022-08-21,16:30,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,...,1.0,0,0,2023,Manchester City,0,0,15,16,6
3,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,...,0.0,0,0,2023,Manchester City,1,1,7,15,5
4,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,...,0.0,0,0,2023,Manchester City,1,1,17,19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2275,2021-05-02,19:15,Matchweek 34,Sun,Away,L,0,4,Tottenham,0.5,...,0.0,0,0,2021,Sheffield United,0,0,20,19,6
2276,2021-05-08,15:00,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,0.7,...,1.0,0,0,2021,Sheffield United,0,1,7,15,5
2277,2021-05-16,19:00,Matchweek 36,Sun,Away,W,1,0,Everton,1.2,...,0.0,0,0,2021,Sheffield United,1,0,8,19,6
2278,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,0.9,...,1.0,0,0,2021,Sheffield United,0,0,15,18,2


### seprate training and testing datasets

In [15]:
train=matches[matches['date']<'2022-05-22']

In [16]:
test=matches[matches['date']>'2022-08-06']

In [17]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [18]:
X_train=train[predictors]
y_train=train["target"]



In [19]:
X_test=test[predictors]
y_test=test['target']


### initial ML model

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf=RandomForestClassifier(n_estimators=50,min_samples_split=10,random_state=2)

In [22]:
rf.fit(X_train,y_train)

In [23]:
y_pred=rf.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
accuracy_score(y_test,y_pred)

0.5884718498659517

In [26]:
combined=pd.DataFrame(dict(actual=y_test,predictions=y_pred))

In [27]:
combined

Unnamed: 0,actual,predictions
0,1,1
1,1,0
2,0,1
3,1,0
4,1,1
...,...,...
755,0,1
756,0,0
757,0,1
758,0,0


In [28]:
pd.crosstab(index=combined['actual'],columns=combined['predictions'])

predictions,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,369,90
1,217,70


In [29]:
from sklearn.metrics import precision_score

In [30]:
precision_score(y_test,y_pred)

0.4375

### Grouping

In [31]:
grouped_matches = matches.groupby("team")

In [32]:
group = grouped_matches.get_group("Manchester City")

In [33]:
group

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,target,venue_code,opp_code,hour,day_code
0,2022-08-07,16:30,Matchweek 1,Sun,Away,W,2,0,West Ham,2.2,...,1.0,1,1,2023,Manchester City,1,0,23,16,6
1,2022-08-13,15:00,Matchweek 2,Sat,Home,W,4,0,Bournemouth,1.7,...,0.0,0,0,2023,Manchester City,1,1,2,15,5
2,2022-08-21,16:30,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,2.1,...,1.0,0,0,2023,Manchester City,0,0,15,16,6
3,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,...,0.0,0,0,2023,Manchester City,1,1,7,15,5
4,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,...,0.0,0,0,2023,Manchester City,1,1,17,19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,2021-05-01,12:30,Matchweek 34,Sat,Away,W,2,0,Crystal Palace,1.7,...,1.0,0,0,2021,Manchester City,1,0,7,12,5
1554,2021-05-08,17:30,Matchweek 35,Sat,Home,L,1,2,Chelsea,1.8,...,0.0,0,1,2021,Manchester City,0,1,6,17,5
1555,2021-05-14,20:00,Matchweek 36,Fri,Away,W,4,3,Newcastle Utd,1.5,...,1.0,0,0,2021,Manchester City,1,0,15,20,4
1556,2021-05-18,19:00,Matchweek 37,Tue,Away,L,2,3,Brighton,0.9,...,1.0,0,0,2021,Manchester City,0,0,4,19,1


In [34]:
group.sort_values("date")

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,target,venue_code,opp_code,hour,day_code
1520,2020-09-21,20:15,Matchweek 2,Mon,Away,W,3,1,Wolves,1.9,...,2.0,1,1,2021,Manchester City,1,0,24,20,0
1521,2020-09-27,16:30,Matchweek 3,Sun,Home,L,2,5,Leicester City,0.9,...,1.0,0,0,2021,Manchester City,0,1,11,16,6
1522,2020-10-03,17:30,Matchweek 4,Sat,Away,D,1,1,Leeds United,1.2,...,1.0,0,0,2021,Manchester City,0,0,10,17,5
1523,2020-10-17,17:30,Matchweek 5,Sat,Home,W,1,0,Arsenal,1.3,...,0.0,0,0,2021,Manchester City,1,1,0,17,5
1524,2020-10-24,12:30,Matchweek 6,Sat,Away,D,1,1,West Ham,1.0,...,1.0,0,0,2021,Manchester City,0,0,23,12,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,2023-05-06,15:00,Matchweek 35,Sat,Home,W,2,1,Leeds United,2.6,...,2.0,0,1,2023,Manchester City,1,1,10,15,5
34,2023-05-14,14:00,Matchweek 36,Sun,Away,W,3,0,Everton,0.8,...,2.0,0,0,2023,Manchester City,1,0,8,14,6
35,2023-05-21,16:00,Matchweek 37,Sun,Home,W,1,0,Chelsea,1.2,...,0.0,0,0,2023,Manchester City,1,1,6,16,6
36,2023-05-24,20:00,Matchweek 32,Wed,Away,D,1,1,Brighton,1.8,...,0.0,0,0,2023,Manchester City,0,0,4,20,2


### columns for rolling averages

In [35]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [36]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [37]:
rolling_stats = group[cols].rolling(3, closed='left').mean()

In [38]:
group[new_cols] = rolling_stats

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[new_cols] = rolling_stats
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[new_cols] = rolling_stats
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[new_cols] = rolling_stats
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [39]:
group = group.dropna(subset=new_cols)
group

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,...,15,5,3.000000,1.000000,17.666667,6.000000,17.466667,0.666667,0.333333,0.333333
4,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,...,19,2,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.000000,0.000000
5,2022-09-03,17:30,Matchweek 6,Sat,Away,D,1,1,Aston Villa,2.1,...,17,5,4.333333,1.666667,18.666667,8.000000,15.033333,0.333333,0.000000,0.000000
6,2022-09-17,12:30,Matchweek 8,Sat,Away,W,3,0,Wolves,1.1,...,12,5,3.666667,1.000000,16.000000,6.000000,15.233333,0.333333,0.000000,0.000000
7,2022-10-02,14:00,Matchweek 9,Sun,Home,W,6,3,Manchester Utd,3.2,...,14,6,3.333333,0.333333,15.333333,6.666667,17.000000,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,2021-05-01,12:30,Matchweek 34,Sat,Away,W,2,0,Crystal Palace,1.7,...,12,5,1.666667,1.000000,17.666667,4.666667,17.566667,1.333333,0.000000,0.000000
1554,2021-05-08,17:30,Matchweek 35,Sat,Home,L,1,2,Chelsea,1.8,...,17,5,1.666667,1.000000,20.666667,4.333333,17.600000,1.333333,0.000000,0.000000
1555,2021-05-14,20:00,Matchweek 36,Fri,Away,W,4,3,Newcastle Utd,1.5,...,20,4,1.666667,1.000000,16.000000,3.000000,17.966667,1.000000,0.000000,0.333333
1556,2021-05-18,19:00,Matchweek 37,Tue,Away,L,2,3,Brighton,0.9,...,19,1,2.333333,1.666667,16.666667,4.000000,18.200000,0.666667,0.000000,0.333333


In [40]:
rolling_stats

Unnamed: 0,gf,ga,sh,sot,dist,fk,pk,pkatt
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,3.000000,1.000000,17.666667,6.000000,17.466667,0.666667,0.333333,0.333333
4,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...
1553,1.666667,1.000000,17.666667,4.666667,17.566667,1.333333,0.000000,0.000000
1554,1.666667,1.000000,20.666667,4.333333,17.600000,1.333333,0.000000,0.000000
1555,1.666667,1.000000,16.000000,3.000000,17.966667,1.000000,0.000000,0.333333
1556,2.333333,1.666667,16.666667,4.000000,18.200000,0.666667,0.000000,0.333333


In [41]:
group

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,2.2,...,15,5,3.000000,1.000000,17.666667,6.000000,17.466667,0.666667,0.333333,0.333333
4,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,3.3,...,19,2,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.000000,0.000000
5,2022-09-03,17:30,Matchweek 6,Sat,Away,D,1,1,Aston Villa,2.1,...,17,5,4.333333,1.666667,18.666667,8.000000,15.033333,0.333333,0.000000,0.000000
6,2022-09-17,12:30,Matchweek 8,Sat,Away,W,3,0,Wolves,1.1,...,12,5,3.666667,1.000000,16.000000,6.000000,15.233333,0.333333,0.000000,0.000000
7,2022-10-02,14:00,Matchweek 9,Sun,Home,W,6,3,Manchester Utd,3.2,...,14,6,3.333333,0.333333,15.333333,6.666667,17.000000,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553,2021-05-01,12:30,Matchweek 34,Sat,Away,W,2,0,Crystal Palace,1.7,...,12,5,1.666667,1.000000,17.666667,4.666667,17.566667,1.333333,0.000000,0.000000
1554,2021-05-08,17:30,Matchweek 35,Sat,Home,L,1,2,Chelsea,1.8,...,17,5,1.666667,1.000000,20.666667,4.333333,17.600000,1.333333,0.000000,0.000000
1555,2021-05-14,20:00,Matchweek 36,Fri,Away,W,4,3,Newcastle Utd,1.5,...,20,4,1.666667,1.000000,16.000000,3.000000,17.966667,1.000000,0.000000,0.333333
1556,2021-05-18,19:00,Matchweek 37,Tue,Away,L,2,3,Brighton,0.9,...,19,1,2.333333,1.666667,16.666667,4.000000,18.200000,0.666667,0.000000,0.333333


### create function for rolling avg

In [42]:
def rolling_averages(group,cols,new_cols):
    rolling_stats=group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [43]:
matches_rolling=matches.groupby('team').apply(lambda x:rolling_averages(x, cols, new_cols) )

In [44]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,41,2022-08-27,17:30,Matchweek 4,Sat,Home,W,2,1,Fulham,2.6,...,17,5,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.0,0.0
Arsenal,42,2022-08-31,19:30,Matchweek 5,Wed,Home,W,2,1,Aston Villa,2.4,...,19,2,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.0,0.0
Arsenal,43,2022-09-04,16:30,Matchweek 6,Sun,Away,L,1,3,Manchester Utd,1.3,...,16,6,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.0,0.0
Arsenal,44,2022-09-18,12:00,Matchweek 8,Sun,Away,W,3,0,Brentford,1.5,...,12,6,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.0,0.0
Arsenal,45,2022-10-01,12:30,Matchweek 9,Sat,Home,W,3,1,Tottenham,2.4,...,12,5,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,2009,2021-05-03,18:00,Matchweek 34,Mon,Away,D,1,1,West Brom,1.0,...,18,0,0.666667,1.333333,9.666667,3.000000,17.766667,0.666667,0.0,0.0
Wolverhampton Wanderers,2010,2021-05-09,12:00,Matchweek 35,Sun,Home,W,2,1,Brighton,1.1,...,12,6,0.666667,1.666667,14.000000,5.333333,19.133333,0.333333,0.0,0.0
Wolverhampton Wanderers,2011,2021-05-16,14:05,Matchweek 36,Sun,Away,L,0,2,Tottenham,0.9,...,14,6,1.000000,2.000000,16.000000,6.000000,21.433333,0.666667,0.0,0.0
Wolverhampton Wanderers,2012,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0,1,Everton,0.3,...,18,2,1.000000,1.333333,17.000000,6.333333,19.033333,0.333333,0.0,0.0


In [45]:
matches_rolling=matches_rolling.droplevel('team')

In [46]:
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
41,2022-08-27,17:30,Matchweek 4,Sat,Home,W,2,1,Fulham,2.6,...,17,5,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.0,0.0
42,2022-08-31,19:30,Matchweek 5,Wed,Home,W,2,1,Aston Villa,2.4,...,19,2,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.0,0.0
43,2022-09-04,16:30,Matchweek 6,Sun,Away,L,1,3,Manchester Utd,1.3,...,16,6,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.0,0.0
44,2022-09-18,12:00,Matchweek 8,Sun,Away,W,3,0,Brentford,1.5,...,12,6,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.0,0.0
45,2022-10-01,12:30,Matchweek 9,Sat,Home,W,3,1,Tottenham,2.4,...,12,5,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009,2021-05-03,18:00,Matchweek 34,Mon,Away,D,1,1,West Brom,1.0,...,18,0,0.666667,1.333333,9.666667,3.000000,17.766667,0.666667,0.0,0.0
2010,2021-05-09,12:00,Matchweek 35,Sun,Home,W,2,1,Brighton,1.1,...,12,6,0.666667,1.666667,14.000000,5.333333,19.133333,0.333333,0.0,0.0
2011,2021-05-16,14:05,Matchweek 36,Sun,Away,L,0,2,Tottenham,0.9,...,14,6,1.000000,2.000000,16.000000,6.000000,21.433333,0.666667,0.0,0.0
2012,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0,1,Everton,0.3,...,18,2,1.000000,1.333333,17.000000,6.333333,19.033333,0.333333,0.0,0.0


In [47]:
matches_rolling.isnull().sum()

date               0
time               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance       689
captain            0
formation          0
referee            0
match report       0
sh                 0
sot                0
dist               1
fk                 0
pk                 0
pkatt              0
season             0
team               0
target             0
venue_code         0
opp_code           0
hour               0
day_code           0
gf_rolling         0
ga_rolling         0
sh_rolling         0
sot_rolling        0
dist_rolling       0
fk_rolling         0
pk_rolling         0
pkatt_rolling      0
dtype: int64

In [48]:
train=matches_rolling[matches_rolling['date']<'2022-05-22']

In [49]:
test=matches_rolling[matches_rolling['date']>'2022-08-06']

In [50]:
X_train=train[predictors+new_cols]
y_train=train['target']

In [51]:
X_test=test[predictors+new_cols]
y_test=test['target']

In [52]:
rf=RandomForestClassifier(n_estimators=60,min_samples_split=15,random_state=1)

In [53]:
rf.fit(X_train,y_train)

In [54]:
y_pred=rf.predict(X_test)

In [55]:
precision_score(y_test,y_pred)

0.5496688741721855

### apply Gridsearchcv for parameter tuning

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
n_estimators=[20,40,60,80,100]
max_features=[0.2,0.4,0.6,0.8,1.0]
bootstrap=[True,False]
max_depth=[2,4,6,None]
max_samples=[0.3,0.5,0.7,1.0]
min_samples_split=[2,4,6,8,10,12]

In [58]:
param_grid={'n_estimators':n_estimators,
           'max_features':max_features,
           'max_depth':max_depth,
            'bootstrap':bootstrap,
           'max_samples':max_samples,
           'min_samples_split':min_samples_split}
print(param_grid)

{'n_estimators': [20, 40, 60, 80, 100], 'max_features': [0.2, 0.4, 0.6, 0.8, 1.0], 'max_depth': [2, 4, 6, None], 'bootstrap': [True, False], 'max_samples': [0.3, 0.5, 0.7, 1.0], 'min_samples_split': [2, 4, 6, 8, 10, 12]}


In [59]:
rf_grid=GridSearchCV(estimator=rf,
                     param_grid=param_grid,
                     cv=5,
                     verbose=2,
                     n_jobs=-1
                    )

In [60]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 4800 candidates, totalling 24000 fits


12000 fits failed out of a total of 24000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\prati\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\prati\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 397, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.



In [61]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 6,
 'max_features': 0.4,
 'max_samples': 0.7,
 'min_samples_split': 10,
 'n_estimators': 20}

In [62]:
rf_grid.best_score_

0.6329488579488579

### again fit RF

In [63]:
rf=RandomForestClassifier(max_depth=6,max_features=0.4,max_samples=0.7,min_samples_split=10,n_estimators=20)

In [64]:
rf.fit(X_train,y_train)

In [65]:
y_pred=rf.predict(X_test)

In [66]:
precision_score(y_test,y_pred)

0.6017699115044248

In [67]:
accuracy_score(y_test,y_pred)

0.6442857142857142

In [68]:
combined=pd.DataFrame(dict(actual=y_test,predicted=y_pred))

In [69]:
combined

Unnamed: 0,actual,predicted
41,1,0
42,1,1
43,0,1
44,1,0
45,1,0
...,...,...
489,0,0
490,1,0
491,0,0
492,0,0


### Predicted results for 2022-2023 season

In [70]:
combined.merge(matches_rolling[['date','team','venue','opponent','result']],left_index=True,right_index=True)

Unnamed: 0,actual,predicted,date,team,venue,opponent,result
41,1,0,2022-08-27,Arsenal,Home,Fulham,W
42,1,1,2022-08-31,Arsenal,Home,Aston Villa,W
43,0,1,2022-09-04,Arsenal,Away,Manchester Utd,L
44,1,0,2022-09-18,Arsenal,Away,Brentford,W
45,1,0,2022-10-01,Arsenal,Home,Tottenham,W
...,...,...,...,...,...,...,...
489,0,0,2023-04-29,Wolverhampton Wanderers,Away,Brighton,L
490,1,0,2023-05-06,Wolverhampton Wanderers,Home,Aston Villa,W
491,0,0,2023-05-13,Wolverhampton Wanderers,Away,Manchester Utd,L
492,0,0,2023-05-20,Wolverhampton Wanderers,Home,Everton,D
