# This notebook will use Random Forest to predict EPL matches

In [39]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [40]:
epl_df = pd.read_csv(Path('Resources/epl_match_data.csv'),
                    parse_dates = ['DateTime'],
                    infer_datetime_format = True)
epl_df.dtypes.value_counts()


float64                260
int64                    6
datetime64[ns, UTC]      1
dtype: int64

In [41]:
epl_df['DateTime'] = epl_df['DateTime'].apply(lambda x: x.toordinal())
epl_df.drop(columns = ['Unnamed: 0'], inplace = True)
display(epl_df)
display(epl_df.dtypes.value_counts())

Unnamed: 0,Season,DateTime,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,AS,...,Referee_l Mason,Referee_ A D'Urso,Referee_ A Wiley,Referee_ C Foy,Referee_ D Gallagher,Referee_ H Webb,Referee_ M Atkinson,Referee_ N Barry,Referee_ S Dunn,Referee_ U Rennie
0,2001,730351,4,0,2,2.0,0.0,2,17.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2001,730351,4,2,2,1.0,0.0,2,17.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2001,730351,1,3,0,1.0,1.0,1,6.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2001,730351,2,2,1,1.0,2.0,0,6.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2001,730351,2,0,2,2.0,0.0,2,17.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8284,2022,738254,0,4,0,0.0,1.0,0,9.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8285,2022,738255,2,0,2,0.0,0.0,1,15.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8286,2022,738255,2,1,2,2.0,0.0,2,12.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8287,2022,738255,2,0,2,1.0,0.0,2,17.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


float64    260
int64        6
dtype: int64

In [42]:
y = epl_df[['FTR']]
X = epl_df.drop(columns = ['FTR'])
display(y)
display(X)

Unnamed: 0,FTR
0,2
1,2
2,0
3,1
4,2
...,...
8284,0
8285,2
8286,2
8287,2


Unnamed: 0,Season,DateTime,FTHG,FTAG,HTHG,HTAG,HTR,HS,AS,HST,...,Referee_l Mason,Referee_ A D'Urso,Referee_ A Wiley,Referee_ C Foy,Referee_ D Gallagher,Referee_ H Webb,Referee_ M Atkinson,Referee_ N Barry,Referee_ S Dunn,Referee_ U Rennie
0,2001,730351,4,0,2.0,0.0,2,17.0,8.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2001,730351,4,2,1.0,0.0,2,17.0,12.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2001,730351,1,3,1.0,1.0,1,6.0,16.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2001,730351,2,2,1.0,2.0,0,6.0,13.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2001,730351,2,0,2.0,0.0,2,17.0,12.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8284,2022,738254,0,4,0.0,1.0,0,9.0,11.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8285,2022,738255,2,0,0.0,0.0,1,15.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8286,2022,738255,2,1,2.0,0.0,2,12.0,11.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8287,2022,738255,2,0,1.0,0.0,2,17.0,18.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [44]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.fit_transform(X_test)

In [45]:
rf_model = RandomForestClassifier()

In [46]:
rf_model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier()

In [47]:
predictions = rf_model.predict(X_test)

In [48]:
accuracy_score(y_test, predictions)

0.9339122045344911

In [49]:
confusion_matrix(y_test, predictions)

array([[602,  14,   6],
       [ 57, 399,  40],
       [ 13,   7, 935]])

In [50]:
classification_report(y_test, predictions)

'              precision    recall  f1-score   support\n\n           0       0.90      0.97      0.93       622\n           1       0.95      0.80      0.87       496\n           2       0.95      0.98      0.97       955\n\n    accuracy                           0.93      2073\n   macro avg       0.93      0.92      0.92      2073\nweighted avg       0.94      0.93      0.93      2073\n'

In [51]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1918421874769265, 'FTHG'),
 (0.17224281571621713, 'FTAG'),
 (0.07022165469357594, 'HTR'),
 (0.04490199928098066, 'HTHG'),
 (0.041567854743971275, 'HTAG'),
 (0.03246882376777133, 'HST'),
 (0.030570974517710344, 'AST'),
 (0.028177676905775797, 'AS'),
 (0.026743614088333683, 'DateTime'),
 (0.026695420914601326, 'HS'),
 (0.02227196345394243, 'AF'),
 (0.02161348248794989, 'HF'),
 (0.020329791458021954, 'Season'),
 (0.019911580653644404, 'AC'),
 (0.019740975267224528, 'HC'),
 (0.01610916888402685, 'AY'),
 (0.015422240861952834, 'HY'),
 (0.0043770475859364975, 'HR'),
 (0.004050664168227199, 'AwayTeam_Man United'),
 (0.0038400382823158496, 'AR'),
 (0.0035099787727301615, 'AwayTeam_Chelsea'),
 (0.0033456780976667522, 'Referee_M Dean'),
 (0.0031448094153857568, 'HomeTeam_Arsenal'),
 (0.0028185598299451595, 'HomeTeam_Everton'),
 (0.0027984908426732097, 'AwayTeam_Tottenham'),
 (0.002785759458989263, 'Referee_M Atkinson'),
 (0.0027460281525644886, 'AwayTeam_Everton'),
 (0.002601758592926133, 'H