# Premier League Prediction System using Machine Learning

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [2]:
# Reading the CSV File
df = pd.read_csv('matches.csv', index_col=0)

In [3]:
# Printing top 5 rows of the Dataset
df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [4]:
# Printing the last 5 rows of the Dataset
df.tail()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,Match Report,,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,Match Report,,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,Match Report,,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,Match Report,,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United
42,2021-05-23,16:00,Premier League,Matchweek 38,Sun,Home,W,1.0,0.0,Burnley,...,Match Report,,12.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United


In [5]:
# Shape of the Dataset
df.shape

(1389, 27)

In [6]:
# Checking for Null/Empty Value
df.isnull().sum()

date               0
time               0
comp               0
round              0
day                0
venue              0
result             0
gf                 0
ga                 0
opponent           0
xg                 0
xga                0
poss               0
attendance       696
captain            0
formation          0
referee            0
match report       0
notes           1389
sh                 0
sot                0
dist               1
fk                 0
pk                 0
pkatt              0
season             0
team               0
dtype: int64

In [7]:
# Checking for Duplicated Values
df.duplicated()

1     False
2     False
3     False
4     False
6     False
      ...  
38    False
39    False
40    False
41    False
42    False
Length: 1389, dtype: bool

In [8]:
# Printing the Value Counts of Team Column
df['team'].value_counts()

team
Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: count, dtype: int64

In [9]:
# Selected Team as Liverpool
df[df['team']== 'Liverpool']

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,W,4.0,3.0,Leeds United,...,Match Report,,20.0,4.0,17.0,0.0,2.0,2.0,2021,Liverpool
2,2020-09-20,16:30,Premier League,Matchweek 2,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,17.0,5.0,17.7,1.0,0.0,0.0,2021,Liverpool
4,2020-09-28,20:00,Premier League,Matchweek 3,Mon,Home,W,3.0,1.0,Arsenal,...,Match Report,,21.0,9.0,16.8,0.0,0.0,0.0,2021,Liverpool
6,2020-10-04,19:15,Premier League,Matchweek 4,Sun,Away,L,2.0,7.0,Aston Villa,...,Match Report,,14.0,8.0,15.8,1.0,0.0,0.0,2021,Liverpool
7,2020-10-17,12:30,Premier League,Matchweek 5,Sat,Away,D,2.0,2.0,Everton,...,Match Report,,22.0,8.0,15.0,1.0,0.0,0.0,2021,Liverpool
9,2020-10-24,20:00,Premier League,Matchweek 6,Sat,Home,W,2.0,1.0,Sheffield Utd,...,Match Report,,17.0,5.0,18.2,1.0,0.0,0.0,2021,Liverpool
11,2020-10-31,17:30,Premier League,Matchweek 7,Sat,Home,W,2.0,1.0,West Ham,...,Match Report,,8.0,2.0,18.6,1.0,1.0,1.0,2021,Liverpool
13,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Away,D,1.0,1.0,Manchester City,...,Match Report,,9.0,2.0,21.5,0.0,1.0,1.0,2021,Liverpool
14,2020-11-22,19:15,Premier League,Matchweek 9,Sun,Home,W,3.0,0.0,Leicester City,...,Match Report,,24.0,12.0,11.9,0.0,0.0,0.0,2021,Liverpool
16,2020-11-28,12:30,Premier League,Matchweek 10,Sat,Away,D,1.0,1.0,Brighton,...,Match Report,,6.0,2.0,20.9,0.0,0.0,0.0,2021,Liverpool


In [10]:
# Printing the Value Counts of Round Column
df['round'].value_counts()

round
Matchweek 1     39
Matchweek 16    39
Matchweek 34    39
Matchweek 32    39
Matchweek 31    39
Matchweek 29    39
Matchweek 28    39
Matchweek 26    39
Matchweek 25    39
Matchweek 24    39
Matchweek 23    39
Matchweek 2     39
Matchweek 19    39
Matchweek 17    39
Matchweek 20    39
Matchweek 15    39
Matchweek 5     39
Matchweek 3     39
Matchweek 13    39
Matchweek 12    39
Matchweek 4     39
Matchweek 11    39
Matchweek 10    39
Matchweek 9     39
Matchweek 8     39
Matchweek 14    39
Matchweek 7     39
Matchweek 6     39
Matchweek 30    37
Matchweek 27    37
Matchweek 22    37
Matchweek 21    37
Matchweek 18    37
Matchweek 33    32
Matchweek 35    20
Matchweek 36    20
Matchweek 37    20
Matchweek 38    20
Name: count, dtype: int64

In [11]:
# Data-types of the Dataset
df.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [12]:
# Changing the date dtype to Datetime
df['date'] = pd.to_datetime(df['date'])

In [13]:
# Checking for the change in Data-types
df.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team                    object
dtype: object

In [14]:
# Dropping the Columns from the Data-types
df = df.drop(columns = ['comp', 'notes'], axis=1)

In [15]:
# Printing the Value Counts of Result Column
df.result.value_counts()

result
L    548
W    526
D    315
Name: count, dtype: int64

In [16]:
# Mapping the String Values to Numerical Values
df['target'] = df['result'].map({'W': 1, 'L':0, 'D': 0})

In [17]:
df

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,match report,sh,sot,dist,fk,pk,pkatt,season,team,target
1,2021-08-15,16:30,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,1.9,...,Match Report,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City,0
2,2021-08-21,15:00,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.7,...,Match Report,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City,1
3,2021-08-28,12:30,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,3.8,...,Match Report,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City,1
4,2021-09-11,15:00,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.9,...,Match Report,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City,1
6,2021-09-18,15:00,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.1,...,Match Report,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,0.5,...,Match Report,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United,0
39,2021-05-08,15:00,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,0.7,...,Match Report,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United,0
40,2021-05-16,19:00,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,1.6,...,Match Report,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United,1
41,2021-05-19,18:00,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,0.8,...,Match Report,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United,0


In [18]:
# Printing the Value Counts of Venue Column
df.venue.value_counts()

venue
Away    695
Home    694
Name: count, dtype: int64

In [19]:
# Mapping the String Values to Numerical Values
df['venue_codes'] = df['venue'].map({'Away': 0, 'Home':1})

In [20]:
# Printing the Value Counts of Opponent Column
df.opponent.value_counts()

opponent
Southampton        71
Newcastle Utd      71
Liverpool          71
Tottenham          70
Wolves             70
Brighton           70
Manchester Utd     70
West Ham           70
Arsenal            69
Manchester City    69
Burnley            69
Crystal Palace     69
Leeds United       69
Aston Villa        69
Everton            68
Chelsea            68
Leicester City     68
Sheffield Utd      38
Fulham             38
West Brom          38
Brentford          32
Watford            31
Norwich City       31
Name: count, dtype: int64

In [21]:
# Printing random numerical values
df['opp_codes'] = df['opponent'].astype('category').cat.codes

In [22]:
# Distributing the Time to Hour
df['hour'] = df['time'].str.replace(':.+','',regex=True).astype('int')

In [23]:
# Numerical Values of the Day of the Week (Monday-0, Sunday-6)
df['day_code'] = df['date'].dt.dayofweek

In [24]:
df.head()

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,target,venue_codes,opp_codes,hour,day_code
1,2021-08-15,16:30,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,1.9,...,1.0,0.0,0.0,2022,Manchester City,0,0,18,16,6
2,2021-08-21,15:00,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,2.7,...,1.0,0.0,0.0,2022,Manchester City,1,1,15,15,5
3,2021-08-28,12:30,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,3.8,...,0.0,0.0,0.0,2022,Manchester City,1,1,0,12,5
4,2021-09-11,15:00,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,2.9,...,0.0,0.0,0.0,2022,Manchester City,1,0,10,15,5
6,2021-09-18,15:00,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,1.1,...,1.0,0.0,0.0,2022,Manchester City,0,1,17,15,5


### Model Building

In [25]:
# Random Forest Classifier Model
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)

In [26]:
# Splitting the Data as Train and Test datasets
train = df[df['date'] < '2022-01-01']
test = df[df['date'] > '2022-01-01']
predictors = ['venue_codes','opp_codes','hour','day_code']

In [27]:
# Fitting the Model to the Dataset
rf.fit(train[predictors],train['target'] )

In [28]:
preds = rf.predict(test[predictors])

In [29]:
# Accuracy Score
accuracy = accuracy_score(test['target'],preds)

In [30]:
accuracy

0.6123188405797102

In [31]:
combined = pd.DataFrame(dict(actual = test['target'], predictions = preds))

In [32]:
pd.crosstab(index=combined['actual'],columns = combined['predictions'] )

predictions,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


In [33]:
precision_score(test['target'], preds)

0.4745762711864407

In [34]:
grouped_matches = df.groupby('team')

In [35]:
group = grouped_matches.get_group('Chelsea').sort_values('date')

In [36]:
group

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,fk,pk,pkatt,season,team,target,venue_codes,opp_codes,hour,day_code
0,2020-09-14,20:15,Matchweek 1,Mon,Away,W,3.0,1.0,Brighton,1.2,...,0.0,1.0,1.0,2021,Chelsea,1,0,3,20,0
1,2020-09-20,16:30,Matchweek 2,Sun,Home,L,0.0,2.0,Liverpool,1.0,...,0.0,0.0,1.0,2021,Chelsea,0,1,11,16,6
3,2020-09-26,17:30,Matchweek 3,Sat,Away,D,3.0,3.0,West Brom,2.2,...,2.0,0.0,0.0,2021,Chelsea,0,0,20,17,5
5,2020-10-03,12:30,Matchweek 4,Sat,Home,W,4.0,0.0,Crystal Palace,2.5,...,0.0,2.0,2.0,2021,Chelsea,1,1,6,12,5
6,2020-10-17,15:00,Matchweek 5,Sat,Home,D,3.0,3.0,Southampton,2.0,...,0.0,0.0,0.0,2021,Chelsea,0,1,17,15,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-03-13,14:00,Matchweek 29,Sun,Home,W,1.0,0.0,Newcastle Utd,0.8,...,1.0,0.0,0.0,2022,Chelsea,1,1,14,14,6
47,2022-04-02,15:00,Matchweek 31,Sat,Home,L,1.0,4.0,Brentford,1.6,...,0.0,0.0,0.0,2022,Chelsea,0,1,2,15,5
49,2022-04-09,15:00,Matchweek 32,Sat,Away,W,6.0,0.0,Southampton,4.2,...,0.0,0.0,0.0,2022,Chelsea,1,0,17,15,5
52,2022-04-20,19:45,Matchweek 25,Wed,Home,L,2.0,4.0,Arsenal,0.7,...,0.0,0.0,0.0,2022,Chelsea,0,1,0,19,2


In [37]:
def rolling_averages(group,cols,new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [38]:
cols = ['gf','ga','sh','sot','dist','fk','pk','pkatt']
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group,cols,new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-03,12:30,Matchweek 4,Sat,Home,W,4.0,0.0,Crystal Palace,2.5,...,12,5,2.000000,2.000000,12.333333,5.666667,20.366667,0.666667,0.333333,0.666667
6,2020-10-17,15:00,Matchweek 5,Sat,Home,D,3.0,3.0,Southampton,2.0,...,15,5,2.333333,1.666667,14.333333,5.666667,18.933333,0.666667,0.666667,1.000000
8,2020-10-24,17:30,Matchweek 6,Sat,Away,D,0.0,0.0,Manchester Utd,0.2,...,17,5,3.333333,2.000000,17.000000,6.666667,15.300000,0.666667,0.666667,0.666667
10,2020-10-31,15:00,Matchweek 7,Sat,Away,W,3.0,0.0,Burnley,1.4,...,15,5,2.333333,1.000000,11.000000,3.333333,15.300000,0.000000,0.666667,0.666667
12,2020-11-07,17:30,Matchweek 8,Sat,Home,W,4.0,1.0,Sheffield Utd,2.7,...,17,5,2.000000,1.000000,10.666667,5.000000,15.733333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-03-13,14:00,Matchweek 29,Sun,Home,W,1.0,0.0,Newcastle Utd,0.8,...,14,6,2.666667,0.333333,12.000000,5.000000,15.600000,0.666667,0.000000,0.000000
47,2022-04-02,15:00,Matchweek 31,Sat,Home,L,1.0,4.0,Brentford,1.6,...,15,5,2.666667,0.333333,11.333333,5.000000,15.133333,0.666667,0.000000,0.000000
49,2022-04-09,15:00,Matchweek 32,Sat,Away,W,6.0,0.0,Southampton,4.2,...,15,5,1.666667,1.666667,14.666667,6.000000,16.100000,0.666667,0.000000,0.000000
52,2022-04-20,19:45,Matchweek 25,Wed,Home,L,2.0,4.0,Arsenal,0.7,...,19,2,2.666667,1.333333,17.666667,8.333333,17.000000,0.333333,0.000000,0.000000


In [39]:
df_rolling = df.groupby('team').apply(lambda x : rolling_averages(x,cols,new_cols))

In [40]:
df_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.4,...,14,6,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,17,5,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.9,...,19,6,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.1,...,16,6,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,19,6,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,32,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,14,6,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,33,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,20,4,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,34,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,15,5,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,35,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,20,4,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [41]:
df_rolling = df_rolling.droplevel('team')

In [42]:
df_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.4,...,14,6,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,17,5,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.9,...,19,6,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.1,...,16,6,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,19,6,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,14,6,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
33,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,20,4,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
34,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,15,5,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
35,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,20,4,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [43]:
df_rolling.index = range(df_rolling.shape[0])

In [44]:
df_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.4,...,14,6,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,17,5,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.9,...,19,6,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.1,...,16,6,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,19,6,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,14,6,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
1313,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,20,4,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
1314,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,15,5,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
1315,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,20,4,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [45]:
def make_prediction(data, prediction):
    train = data[data['date'] < '2022-01-01']
    test = data[data['date'] > '2022-01-01']
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual = test['target'], predictions = preds))
    error = precision_score(test['target'], preds)
    return combined, error

In [46]:
combined , error = make_prediction(df_rolling, predictors + new_cols)

In [47]:
error

0.4423076923076923

In [48]:
combined

Unnamed: 0,actual,predictions
55,0,1
56,1,0
57,1,0
58,1,0
59,1,0
...,...,...
1312,1,0
1313,0,0
1314,1,0
1315,0,0


In [49]:
combined = combined.merge(df_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [50]:
combined

Unnamed: 0,actual,predictions,date,team,opponent,result
55,0,1,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,0,2022-02-24,Arsenal,Wolves,W
59,1,0,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1312,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W
1313,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L
1314,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W
1315,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L


In [51]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [52]:
combined['new_team'] = combined['team'].map(mapping)

In [53]:
merged = combined.merge(combined, left_on =['date', 'new_team'], right_on = ['date','opponent'])

In [54]:
merged

Unnamed: 0,actual_x,predictions_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predictions_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,0,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,0,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W,Wolves,0,0,Everton,Wolves,L,Everton
258,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L,Wolves,1,0,Leeds United,Wolves,W,Leeds United
259,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,0,Aston Villa,Wolves,L,Aston Villa
260,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L,Wolves,1,0,Newcastle United,Wolves,W,Newcastle Utd


In [55]:
merged[(merged["predictions_x"] == 1) & (merged["predictions_y"] ==0)]["actual_x"].value_counts()

actual_x
0    25
1    19
Name: count, dtype: int64