# This notebook will use Random Forest to predict EPL matches

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
epl_df = pd.read_csv(Path('Resources/epl_match_data.csv'))
epl_df.dtypes.value_counts()


float64    263
int64        3
dtype: int64

In [5]:
epl_df.drop(columns = ['Unnamed: 0'], inplace = True)
display(epl_df)
display(epl_df.dtypes.value_counts())

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HTR,HS,AS,HST,AST,HC,...,Referee_l Mason,Referee_ A D'Urso,Referee_ A Wiley,Referee_ C Foy,Referee_ D Gallagher,Referee_ H Webb,Referee_ M Atkinson,Referee_ N Barry,Referee_ S Dunn,Referee_ U Rennie
0,2.30,0.75,0.95,0.30,1.45,16.85,6.60,9.70,3.15,9.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.40,0.75,1.00,0.30,1.45,17.00,6.35,9.65,2.95,9.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.20,0.65,0.95,0.20,1.50,17.25,6.35,9.80,2.95,9.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.25,0.75,0.90,0.25,1.40,17.25,6.50,9.70,3.10,9.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.30,0.95,0.90,0.35,1.30,18.20,6.30,10.05,3.10,9.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,1.30,2.30,0.65,1.10,0.90,13.05,15.00,7.10,9.60,6.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4701,1.25,2.30,0.55,1.00,0.90,13.10,15.20,7.30,9.65,6.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4702,1.25,2.30,0.55,0.95,0.90,13.00,15.25,7.25,9.70,6.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4703,1.20,2.35,0.50,1.00,0.80,12.75,15.35,7.30,9.90,6.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


float64    263
int64        2
dtype: int64

In [6]:
y = epl_df[['FTR']]
X = epl_df.drop(columns = ['FTR'])
display(y)
display(X)

Unnamed: 0,FTR
0,0
1,2
2,1
3,1
4,0
...,...
4700,0
4701,0
4702,0
4703,0


Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HTR,HS,AS,HST,AST,HC,...,Referee_l Mason,Referee_ A D'Urso,Referee_ A Wiley,Referee_ C Foy,Referee_ D Gallagher,Referee_ H Webb,Referee_ M Atkinson,Referee_ N Barry,Referee_ S Dunn,Referee_ U Rennie
0,2.30,0.75,0.95,0.30,1.45,16.85,6.60,9.70,3.15,9.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.40,0.75,1.00,0.30,1.45,17.00,6.35,9.65,2.95,9.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.20,0.65,0.95,0.20,1.50,17.25,6.35,9.80,2.95,9.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.25,0.75,0.90,0.25,1.40,17.25,6.50,9.70,3.10,9.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.30,0.95,0.90,0.35,1.30,18.20,6.30,10.05,3.10,9.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,1.30,2.30,0.65,1.10,0.90,13.05,15.00,7.10,9.60,6.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4701,1.25,2.30,0.55,1.00,0.90,13.10,15.20,7.30,9.65,6.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4702,1.25,2.30,0.55,0.95,0.90,13.00,15.25,7.25,9.70,6.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4703,1.20,2.35,0.50,1.00,0.80,12.75,15.35,7.30,9.90,6.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.fit_transform(X_test)

In [9]:
rf_model = RandomForestClassifier()

In [10]:
rf_model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier()

In [11]:
predictions = rf_model.predict(X_test)

In [12]:
accuracy_score(y_test, predictions)

0.4613423959218352

In [14]:
confusion_matrix(y_test, predictions)

array([[128,  45, 132],
       [ 84,  35, 185],
       [120,  68, 380]])

In [15]:
classification_report(y_test, predictions)

'              precision    recall  f1-score   support\n\n           0       0.39      0.42      0.40       305\n           1       0.24      0.12      0.15       304\n           2       0.55      0.67      0.60       568\n\n    accuracy                           0.46      1177\n   macro avg       0.39      0.40      0.39      1177\nweighted avg       0.42      0.46      0.43      1177\n'

In [16]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.049230817423688435, 'HST'),
 (0.04907311888439904, 'FTHG'),
 (0.04857791011528171, 'HS'),
 (0.0469519812788285, 'FTAG'),
 (0.04691136086299821, 'AS'),
 (0.04631446979608023, 'AST'),
 (0.04552804503274063, 'HC'),
 (0.04479798299455913, 'AC'),
 (0.044646960851135356, 'HF'),
 (0.04392933056606811, 'AF'),
 (0.040568840782840294, 'HTR'),
 (0.03981801406995933, 'HTHG'),
 (0.03787403415900893, 'AY'),
 (0.03720316874663483, 'HY'),
 (0.03616272765836725, 'HTAG'),
 (0.022523825891168924, 'Season'),
 (0.022130228291260608, 'AR'),
 (0.019298147557934672, 'HR'),
 (0.005350598004306744, 'AwayTeam_Chelsea'),
 (0.005292761363304111, 'Referee_M Atkinson'),
 (0.004865061976398768, 'Referee_M Dean'),
 (0.004772227930653689, 'AwayTeam_Arsenal'),
 (0.00469831186215988, 'AwayTeam_Everton'),
 (0.004431341158780088, 'Referee_A Taylor'),
 (0.004283288184422554, 'AwayTeam_Tottenham'),
 (0.004259705402759647, 'AwayTeam_Newcastle'),
 (0.004226995119423284, 'AwayTeam_Aston Villa'),
 (0.004220308348070596, 'Awa