In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  


  from numpy.core.umath_tests import inner1d


In [2]:
# Current season Roster Power Index(RPI) from PES Database.
# Each data is used to predict each season.
# For example, use pes14 to predict (train) 14/15 season.
# Merge PES 15 data with epl_table_1314 table in order to predict EPL 14/15 season.
pes14 = pd.read_csv('./Data/PES14.csv')
pes15 = pd.read_csv('./Data/PES15.csv')
pes16 = pd.read_csv('./Data/PES16.csv')
pes17 = pd.read_csv('./Data/PES17.csv')
pes18 = pd.read_csv('./Data/PES18.csv')
pes19 = pd.read_csv('./Data/PES19.csv')

In [3]:
# Past season RPIs and Match Power Index(MPI)from past season.
epl_table_1213 = pd.read_csv('./Data/epl_table_1213.csv')
epl_table_1314 = pd.read_csv('./Data/epl_table_1314.csv')
epl_table_1415 = pd.read_csv('./Data/epl_table_1415.csv')
epl_table_1516 = pd.read_csv('./Data/epl_table_1516.csv')
epl_table_1617 = pd.read_csv('./Data/epl_table_1617.csv')
epl_table_1718 = pd.read_csv('./Data/epl_table_1718.csv')

In [4]:
# Merge two RPIs.
# Merge PES 15 database and EPL 13/14 table.
# Train 5 merged data to predict 18/19 season.
pr_14 = pd.merge(pes14, epl_table_1213, on='Team', how='outer').sort_values(by='Team').dropna()
pr_15 = pd.merge(pes15, epl_table_1314, on='Team', how='outer').sort_values(by='Team').dropna()
pr_16 = pd.merge(pes16, epl_table_1415, on='Team', how='outer').sort_values(by='Team').dropna()
pr_17 = pd.merge(pes17, epl_table_1516, on='Team', how='outer').sort_values(by='Team').dropna()
pr_18 = pd.merge(pes18, epl_table_1617, on='Team', how='outer').sort_values(by='Team').dropna()
pr_19 = pd.merge(pes19, epl_table_1718, on='Team', how='outer').sort_values(by='Team').dropna()

In [5]:
pr_14.head(10)

Unnamed: 0,Team,Ovr,Def,Mid,Fwd,Phy,Spd,P,W,D,...,AGA,PTS,H_Att,A_Att,H_Def,A_Def,YC,RC,DIS,CS
3,Arsenal,84.0,84.0,85.0,82.0,78.0,80.0,38.0,21.0,10.0,...,14.0,73.0,1.587838,1.061571,0.976645,0.472973,40.0,5.0,0.789474,14.0
18,Aston Villa,79.0,79.0,79.0,81.0,78.0,79.0,38.0,10.0,11.0,...,41.0,41.0,0.777027,1.019108,1.18896,1.385135,70.0,3.0,1.078947,5.0
0,Chelsea,86.0,87.0,85.0,87.0,81.0,78.0,38.0,22.0,9.0,...,23.0,75.0,1.385135,1.443737,0.679406,0.777027,49.0,3.0,0.802632,14.0
10,Everton,80.0,81.0,83.0,79.0,79.0,77.0,38.0,16.0,15.0,...,23.0,63.0,1.114865,0.934183,0.721868,0.777027,57.0,3.0,0.907895,11.0
12,Fulham,80.0,80.0,80.0,84.0,78.0,76.0,38.0,11.0,10.0,...,30.0,43.0,0.945946,0.934183,1.273885,1.013514,48.0,3.0,0.789474,8.0
5,Liverpool,82.0,82.0,82.0,83.0,80.0,79.0,38.0,16.0,13.0,...,27.0,61.0,1.114865,1.613588,0.679406,0.912162,54.0,2.0,0.815789,16.0
1,Manchester City,86.0,85.0,86.0,92.0,82.0,79.0,38.0,23.0,9.0,...,19.0,78.0,1.385135,1.061571,0.636943,0.641892,63.0,3.0,0.986842,18.0
2,Manchester Utd,85.0,83.0,85.0,89.0,79.0,78.0,38.0,28.0,5.0,...,24.0,89.0,1.52027,1.740977,0.806794,0.810811,57.0,1.0,0.802632,13.0
7,Newcastle Utd,81.0,81.0,80.0,82.0,79.0,79.0,38.0,11.0,8.0,...,37.0,41.0,0.810811,0.89172,1.316348,1.25,69.0,4.0,1.118421,6.0
14,Norwich City,79.0,80.0,79.0,80.0,78.0,77.0,38.0,10.0,14.0,...,38.0,44.0,0.844595,0.679406,0.849257,1.283784,60.0,1.0,0.842105,10.0


In [6]:
# Load EPL fixtures.
epl_detail_1314 = pd.read_csv('./Data/epl14.csv')
epl_detail_1415 = pd.read_csv('./Data/epl15.csv')
epl_detail_1516 = pd.read_csv('./Data/epl16.csv')
epl_detail_1617 = pd.read_csv('./Data/epl17.csv')
epl_detail_1718 = pd.read_csv('./Data/epl18.csv')
epl_detail_1819 = pd.read_csv('./Data/epl19.csv')

In [7]:
# Create columns to merge with EPL fixture dataframes.
new_cols = ['HtOff', 'HtDef', 'AtOff', 'AtDef', 'HtPesOvr', 'AtPesOvr', 
            'HtPesDef', 'HtPesMid', 'HtPesOff', 'AtPesDef', 'AtPesMid', 'AtPesOff',
            'HtPesPhy', 'HtPesSpd', 'AtPesPhy', 'AtPesSpd', 'HtDis', 'AtDis',
            'HtYC', 'HtRC', 'HtCS', 'AtYC', 'AtRC', 'AtCS']

def detail_df(epl_detail_data):
    '''
    Take Home, away teams and the result from fixture dataframes.
    '''
    epl_detail_data = epl_detail_data[['HomeTeam', 'AwayTeam', 'FTR']]
    epl_detail_data = epl_detail_data.reindex(columns=epl_detail_data.columns.tolist() + new_cols)
    #epl_detail_data = epl_detail_data.replace(team_dict)
    
    return epl_detail_data

epl_detail_1314 = detail_df(epl_detail_1314)
epl_detail_1415 = detail_df(epl_detail_1415)
epl_detail_1516 = detail_df(epl_detail_1516)
epl_detail_1617 = detail_df(epl_detail_1617)
epl_detail_1718 = detail_df(epl_detail_1718)
epl_detail_1819 = detail_df(epl_detail_1819)

In [8]:
def fill_df(epl_detail_data, pes_data):
    '''
    Pass RPIs to fixture dataframe.
    IndexError will be ignored.
    The model ignores relegation and promotion of teams.
    '''
    for team in epl_detail_data['HomeTeam']:
        try:
            ht_off = pes_data.loc[pes_data['Team'] == team, 'H_Att'].values[0] 
            ht_def = pes_data.loc[pes_data['Team'] == team, 'H_Def'].values[0]
            at_off = pes_data.loc[pes_data['Team'] == team, 'A_Att'].values[0]
            at_def = pes_data.loc[pes_data['Team'] == team, 'A_Def'].values[0]
            
            pes_ovr = pes_data.loc[pes_data['Team'] == team, 'Ovr'].values[0]
            pes_off = pes_data.loc[pes_data['Team'] == team, 'Fwd'].values[0]
            pes_def = pes_data.loc[pes_data['Team'] == team, 'Def'].values[0]
            pes_mid = pes_data.loc[pes_data['Team'] == team, 'Mid'].values[0]
            pes_spd = pes_data.loc[pes_data['Team'] == team, 'Spd'].values[0]
            pes_phy = pes_data.loc[pes_data['Team'] == team, 'Phy'].values[0]
            
            cs = pes_data.loc[pes_data['Team'] == team, 'CS'].values[0]
            yc = pes_data.loc[pes_data['Team'] == team, 'YC'].values[0]
            rc = pes_data.loc[pes_data['Team'] == team, 'RC'].values[0]
            dis = pes_data.loc[pes_data['Team'] == team, 'DIS'].values[0]

            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtYC'] = yc
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtRC'] = rc
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtCS'] = cs
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtDis'] = dis
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtDis'] = dis
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtYC'] = yc
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtRC'] = rc
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtCS'] = cs
            
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtOff'] = ht_off
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtDef'] = ht_def
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtOff'] = at_off
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtDef'] = at_def
            
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesOvr'] = pes_ovr
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesOvr'] = pes_ovr
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesOff'] = pes_off
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesOff'] = pes_off
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesDef'] = pes_def
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesDef'] = pes_def
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesMid'] = pes_mid
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesMid'] = pes_mid
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesSpd'] = pes_spd
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesSpd'] = pes_spd
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesPhy'] = pes_phy
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesPhy'] = pes_phy
        
        except IndexError:
            pass
        
    return epl_detail_data
            
epl_concat_1314 = fill_df(epl_detail_1314, pr_14)
epl_concat_1415 = fill_df(epl_detail_1415, pr_15)
epl_concat_1516 = fill_df(epl_detail_1516, pr_16)
epl_concat_1617 = fill_df(epl_detail_1617, pr_17)
epl_concat_1718 = fill_df(epl_detail_1718, pr_18)
epl_concat_1819 = fill_df(epl_detail_1819, pr_19)

In [9]:
# Final model.
model_df = pd.concat([epl_concat_1314, epl_concat_1415, epl_concat_1516, epl_concat_1617, epl_concat_1718])

# Drop NAs from promotion/relegation.
model_df = model_df.dropna()

In [10]:
X = model_df.drop(columns=(['HomeTeam', 'AwayTeam', 'FTR']))
y = model_df['FTR']

In [11]:
scaler = StandardScaler()  
X = scaler.fit_transform(X)

knn = KNeighborsClassifier(n_neighbors=5)
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(2),solver='sgd',learning_rate_init= 0.01, random_state=42)
 
knn.fit(X, y)
lr.fit(X, y)
rf.fit(X, y)
mlp.fit(X, y);

In [12]:
print('---- Score on Train (Past 5 Seasons) ----')
print('KNN:' + str(knn.score(X, y)))
print('LR:' + str(lr.score(X, y)))
print('RF:' + str(rf.score(X, y)))
print('MLP:' + str(mlp.score(X, y)))

print('---- Score on Test (Current Season) ----')
print('KNN:' + str(knn.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))
print('LR:' + str(lr.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))
print('RF:' + str(rf.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))
print('MLP:' + str(mlp.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))

---- Score on Train (Past 5 Seasons) ----
KNN:0.6051470588235294
LR:0.5279411764705882
RF:0.9801470588235294
MLP:0.5286764705882353
---- Score on Test (Current Season) ----
KNN:0.5116279069767442
LR:0.5581395348837209
RF:0.4186046511627907
MLP:0.4186046511627907


In [13]:
epl_detail_1819.dropna(inplace=True)
predict_X = epl_detail_1819[new_cols]
compare = epl_detail_1819[['HomeTeam', 'AwayTeam']]
compare = compare.reindex(columns=compare.columns.tolist() + ['Actual', 'Predict'])
compare['Actual'] = epl_detail_1819['FTR']
compare['Predict'] = knn.predict(predict_X)

In [14]:
compare

Unnamed: 0,HomeTeam,AwayTeam,Actual,Predict
0,Manchester Utd,Leicester City,H,H
3,Huddersfield,Chelsea,A,A
4,Newcastle Utd,Tottenham,A,A
7,Arsenal,Manchester City,A,H
8,Liverpool,West Ham Utd,H,A
9,Southampton,Burnley,D,H
11,Chelsea,Arsenal,H,A
12,Everton,Southampton,H,H
15,West Ham Utd,Bournemouth,A,H
17,Burnley,Watford,A,A


In [15]:
compare[(compare['AwayTeam'] == 'Liverpool') | (compare['HomeTeam'] == 'Liverpool')]

Unnamed: 0,HomeTeam,AwayTeam,Actual,Predict
8,Liverpool,West Ham Utd,H,A
19,Crystal Palace,Liverpool,A,H
34,Leicester City,Liverpool,A,A
45,Tottenham,Liverpool,A,H
56,Liverpool,Southampton,H,H
61,Chelsea,Liverpool,D,H


In [16]:
compare[(compare['AwayTeam'] == 'Manchester Utd') | (compare['HomeTeam'] == 'Manchester Utd')]

Unnamed: 0,HomeTeam,AwayTeam,Actual,Predict
0,Manchester Utd,Leicester City,H,H
29,Manchester Utd,Tottenham,A,H
37,Burnley,Manchester Utd,A,A
46,Watford,Manchester Utd,A,A
66,West Ham Utd,Manchester Utd,H,A


In [17]:
compare[(compare['AwayTeam'] == 'Chelsea') | (compare['HomeTeam'] == 'Chelsea')]

Unnamed: 0,HomeTeam,AwayTeam,Actual,Predict
3,Huddersfield,Chelsea,A,A
11,Chelsea,Arsenal,H,A
27,Newcastle Utd,Chelsea,A,A
31,Chelsea,Bournemouth,H,A
59,West Ham Utd,Chelsea,D,H
61,Chelsea,Liverpool,D,H


In [18]:
compare[(compare['AwayTeam'] == 'West Ham Utd') | (compare['HomeTeam'] == 'West Ham Utd')]

Unnamed: 0,HomeTeam,AwayTeam,Actual,Predict
8,Liverpool,West Ham Utd,H,A
15,West Ham Utd,Bournemouth,A,H
20,Arsenal,West Ham Utd,H,H
47,Everton,West Ham Utd,A,A
59,West Ham Utd,Chelsea,D,H
66,West Ham Utd,Manchester Utd,H,A
