In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  


  from numpy.core.umath_tests import inner1d


In [2]:
# data from three different sources use different names for teams. 
# manually making a dictionary to rename teams.
team_dict = {
            'Arsenal FC': 'Arsenal',
            'Burnley FC': 'Burnley',
            'Liverpool FC': 'Liverpool',
            'Everton FC': 'Everton',
            'Fulham FC': 'Fulham',
            'Southampton FC': 'Southampton',
            'Chelsea FC': 'Chelsea',
            'West Bromwich Albion': 'West Bromwich',
            'Manchester United': 'Manchester Utd',
            'Man United': 'Manchester Utd',
            'Man City': 'Manchester City', 
            'Newcastle': 'Newcastle Utd',
            'Newcastle United': 'Newcastle Utd',
            'West Ham United': 'West Ham Utd', 
            'Tottenham Hotspur': 'Tottenham',
            'Queens Park Rangers': 'QP Rangers',
            'Watford FC': 'Watford',
            'AFC Bournemouth': 'Bournemouth',
            'Sunderland AFC': 'Sunderland',
            'Middlesbrough FC': 'Middlesbrough',
            'Brighton & Hove Albion': 'Brighton & Hove',
            'Huddersfield Town': 'Huddersfield',
            'QPR': 'QP Rangers',
            'Cardiff': 'Cardiff City',
            'Hull': 'Hull City',
            'Norwich': 'Norwich City',
            'Stoke': 'Stoke City',
            'Swansea': 'Swansea City',
            'West Brom': 'West Bromwich',
            'West Ham': 'West Ham Utd',
            'Reading FC': 'Reading',
            'Leicester': 'Leicester City'
            }

In [3]:
# Current season Roster Power Index(RPI) from PES Database.
# Each data is used to predict each season.
# For example, use pes14 to predict (train) 14/15 season.
# Merge PES 15 data with epl_table_1314 table in order to predict EPL 14/15 season.
pes14 = pd.read_csv('./Data/PES14.csv')
pes15 = pd.read_csv('./Data/PES15.csv')
pes16 = pd.read_csv('./Data/PES16.csv')
pes17 = pd.read_csv('./Data/PES17.csv')
pes18 = pd.read_csv('./Data/PES18.csv')
pes19 = pd.read_csv('./Data/PES19.csv')

In [4]:
# Past season RPIs and Match Power Index(MPI)from past season.
epl_table_1213 = pd.read_csv('./Data/epl_table_1213.csv')
epl_table_1314 = pd.read_csv('./Data/epl_table_1314.csv')
epl_table_1415 = pd.read_csv('./Data/epl_table_1415.csv')
epl_table_1516 = pd.read_csv('./Data/epl_table_1516.csv')
epl_table_1617 = pd.read_csv('./Data/epl_table_1617.csv')
epl_table_1718 = pd.read_csv('./Data/epl_table_1718.csv')

In [5]:
# Merge two RPIs.
# Merge PES 15 database and EPL 13/14 table to predict EPL 14/15
pr_14 = pd.merge(pes14, epl_table_1213, on='Team', how='outer').sort_values(by='Team').dropna()
pr_15 = pd.merge(pes15, epl_table_1314, on='Team', how='outer').sort_values(by='Team').dropna()
pr_16 = pd.merge(pes16, epl_table_1415, on='Team', how='outer').sort_values(by='Team').dropna()
pr_17 = pd.merge(pes17, epl_table_1516, on='Team', how='outer').sort_values(by='Team').dropna()
pr_18 = pd.merge(pes18, epl_table_1617, on='Team', how='outer').sort_values(by='Team').dropna()
pr_19 = pd.merge(pes19, epl_table_1718, on='Team', how='outer').sort_values(by='Team').dropna()

In [6]:
# Load EPL fixtures.
epl_detail_1314 = pd.read_csv('./Data/epl1314.csv')
epl_detail_1415 = pd.read_csv('./Data/epl1415.csv')
epl_detail_1516 = pd.read_csv('./Data/epl1516.csv')
epl_detail_1617 = pd.read_csv('./Data/epl1617.csv')
epl_detail_1718 = pd.read_csv('./Data/epl1718.csv')
epl_detail_1819 = pd.read_csv('./Data/epl1819.csv')

In [7]:
# Create columns to merge with EPL fixture dataframes.
new_cols = ['HtOff', 'HtDef', 'AtOff', 'AtDef', 'HtPesOvr', 'AtPesOvr', 
            'HtPesDef', 'HtPesMid', 'HtPesOff', 'AtPesDef', 'AtPesMid', 'AtPesOff',
            'HtPesPhy', 'HtPesSpd', 'AtPesPhy', 'AtPesSpd', 'HtDis', 'AtDis', 'HtCs', 'AtCs']

def detail_df(epl_detail_data):
    '''
    Take Home, away teams and the result from fixture dataframes.
    '''
    epl_detail_data = epl_detail_data[['HomeTeam', 'AwayTeam', 'FTR']]
    epl_detail_data = epl_detail_data.reindex(columns=epl_detail_data.columns.tolist() + new_cols)
    epl_detail_data = epl_detail_data.replace(team_dict)
    
    return epl_detail_data

epl_detail_1314 = detail_df(epl_detail_1314)
epl_detail_1415 = detail_df(epl_detail_1415)
epl_detail_1516 = detail_df(epl_detail_1516)
epl_detail_1617 = detail_df(epl_detail_1617)
epl_detail_1718 = detail_df(epl_detail_1718)
epl_detail_1819 = detail_df(epl_detail_1819)

In [8]:
def fill_df(epl_detail_data, pes_data):
    '''
    Pass RPIs to fixture dataframe.
    IndexError will be ignored.
    The model ignores relegation and promotion of teams.
    '''
    for team in epl_detail_data['HomeTeam']:
        try:
            ht_off = pes_data.loc[pes_data['Team'] == team, 'H_Att'].values[0] 
            ht_def = pes_data.loc[pes_data['Team'] == team, 'H_Def'].values[0]
            at_off = pes_data.loc[pes_data['Team'] == team, 'A_Att'].values[0]
            at_def = pes_data.loc[pes_data['Team'] == team, 'A_Def'].values[0]
            
            pes_ovr = pes_data.loc[pes_data['Team'] == team, 'Ovr'].values[0]
            pes_off = pes_data.loc[pes_data['Team'] == team, 'Fwd'].values[0]
            pes_def = pes_data.loc[pes_data['Team'] == team, 'Def'].values[0]
            pes_mid = pes_data.loc[pes_data['Team'] == team, 'Mid'].values[0]
            pes_spd = pes_data.loc[pes_data['Team'] == team, 'Spd'].values[0]
            pes_phy = pes_data.loc[pes_data['Team'] == team, 'Phy'].values[0]
            
            dis = pes_data.loc[pes_data['Team'] == team, 'DIS'].values[0]
            cs = pes_data.loc[pes_data['Team'] == team, 'CS'].values[0]

            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtDis'] = dis
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtCs'] = cs
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtDis'] = dis
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtCs'] = cs
            
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtOff'] = ht_off
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtDef'] = ht_def
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtOff'] = at_off
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtDef'] = at_def
            
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesOvr'] = pes_ovr
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesOvr'] = pes_ovr
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesOff'] = pes_off
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesOff'] = pes_off
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesDef'] = pes_def
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesDef'] = pes_def
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesMid'] = pes_mid
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesMid'] = pes_mid
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesSpd'] = pes_spd
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesSpd'] = pes_spd
            epl_detail_data.loc[epl_detail_data['HomeTeam'] == team, 'HtPesPhy'] = pes_phy
            epl_detail_data.loc[epl_detail_data['AwayTeam'] == team, 'AtPesPhy'] = pes_phy
        
        except IndexError:
            pass
        
    return epl_detail_data
            
epl_concat_1314 = fill_df(epl_detail_1314, pr_14)
epl_concat_1415 = fill_df(epl_detail_1415, pr_15)
epl_concat_1516 = fill_df(epl_detail_1516, pr_16)
epl_concat_1617 = fill_df(epl_detail_1617, pr_17)
epl_concat_1718 = fill_df(epl_detail_1718, pr_18)
epl_concat_1819 = fill_df(epl_detail_1819, pr_19)

In [16]:
epl_concat_1314.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,370,371,372,373,374,375,376,377,378,379
HomeTeam,Arsenal,Liverpool,Norwich City,Sunderland,Swansea City,West Bromwich,West Ham Utd,Chelsea,Crystal Palace,Manchester City,...,Cardiff City,Fulham,Hull City,Liverpool,Manchester City,Norwich City,Southampton,Sunderland,Tottenham,West Bromwich
AwayTeam,Aston Villa,Stoke City,Everton,Fulham,Manchester Utd,Southampton,Cardiff City,Hull City,Tottenham,Newcastle Utd,...,Chelsea,Crystal Palace,Everton,Newcastle Utd,West Ham Utd,Arsenal,Manchester Utd,Swansea City,Aston Villa,Stoke City
FTR,A,H,D,A,A,A,H,H,A,H,...,A,D,A,H,H,A,D,A,H,A
HtOff,1.58784,1.11486,0.844595,0.675676,0.945946,1.08108,1.14865,1.38514,,1.38514,...,,0.945946,,1.11486,1.38514,0.844595,0.878378,0.675676,0.97973,1.08108
HtDef,0.976645,0.679406,0.849257,0.806794,1.10403,1.06157,0.934183,0.679406,,0.636943,...,,1.27389,,0.679406,0.636943,0.849257,1.01911,0.806794,0.764331,1.06157
AtOff,1.01911,0.552017,0.934183,0.934183,1.74098,0.976645,,,1.57113,0.89172,...,1.44374,,0.934183,0.89172,0.467091,1.06157,1.74098,0.806794,1.01911,0.552017
AtDef,1.38514,0.777027,0.777027,1.01351,0.810811,1.21622,,,0.945946,1.25,...,0.777027,,0.777027,1.25,1.0473,0.472973,0.810811,0.844595,1.38514,0.777027
HtPesOvr,84,82,79,80,79,80,80,86,,86,...,,80,,82,86,79,80,80,83,80
AtPesOvr,79,81,80,80,85,80,,,83,81,...,86,,80,81,80,84,85,79,79,81
HtPesDef,84,82,80,79,78,79,80,87,,85,...,,80,,82,85,80,81,79,80,79


In [9]:
# Final model.
model_df = pd.concat([epl_concat_1314, epl_concat_1415, epl_concat_1516, epl_concat_1617, epl_concat_1718])

# Drop NAs from promotion/relegation.
model_df = model_df.dropna()

In [10]:
X = model_df[new_cols]
y = model_df['FTR']

In [11]:
scaler = StandardScaler()  
X = scaler.fit_transform(X)

knn = KNeighborsClassifier(n_neighbors=8)
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(2),solver='sgd',learning_rate_init= 0.01, random_state=42)
 
knn.fit(X, y)
lr.fit(X, y)
rf.fit(X, y)
mlp.fit(X, y);

In [12]:
print('---- Score on Train (Past 5 Seasons) ----')
print('KNN:' + str(knn.score(X, y)))
print('LR:' + str(lr.score(X, y)))
print('RF:' + str(rf.score(X, y)))
print('MLP:' + str(mlp.score(X, y)))

print('---- Score on Test (Current Season) ----')
print('KNN:' + str(knn.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))
print('LR:' + str(lr.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))
print('RF:' + str(rf.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))
print('MLP:' + str(mlp.score(epl_detail_1819.dropna()[new_cols], epl_detail_1819.dropna()['FTR'])))

---- Score on Train (Past 5 Seasons) ----
KNN:0.5698529411764706
LR:0.5308823529411765
RF:0.9801470588235294
MLP:0.5264705882352941
---- Score on Test (Current Season) ----
KNN:0.5813953488372093
LR:0.4418604651162791
RF:0.37209302325581395
MLP:0.4186046511627907


In [13]:
def get_result(df, data):
    if knn.predict_proba(data)[0] > 0.52:
        df['Predict'] = 'A'
    elif knn.predict_proba(data)[2] > 0.52:
        df['Predict'] = 'H'
    else:
        df['Predict'] = 'D'
test = pd.DataFrame(columns=['Predict'])
prob = knn.predict_proba(predict_X)
for i in range(len(prob)):
    if (prob[i][0] == prob[i][2]) | (abs(prob[i][0] - prob[i][2]) < 0.05):
        test.loc[i, 'Predict'] = 'D'
    elif prob[i].max() == prob[i][0]:
        test.loc[i, 'Predict'] = 'A'
    elif prob[i].max() == prob[i][2]:
        test.loc[i, 'Predict'] = 'H'
    else:
        test.loc[i, 'Predict'] = 'D'

NameError: name 'predict_X' is not defined

In [14]:
test
knn.predict_proba(predict_X).round(4)
#test['Predict'] == y['FTR']
#(y.reset_index()['FTR'] == test['Predict']).mean()
#(compare.reset_index()['Actual'] == test['Predict']).mean()
#test['Predict']
#(knn.predict(X)=='D').sum()
#(y=='D').sum() #339

NameError: name 'predict_X' is not defined

In [None]:
epl_detail_1819.dropna(inplace=True)
predict_X = epl_detail_1819[new_cols]
compare = epl_detail_1819[['HomeTeam', 'AwayTeam']]
compare = compare.reindex(columns=compare.columns.tolist() + ['Actual', 'Predict'])
compare['Actual'] = epl_detail_1819['FTR']
compare['Predict'] = knn.predict(predict_X)

In [None]:
compare

In [None]:
compare[(compare['AwayTeam'] == 'Liverpool') | (compare['HomeTeam'] == 'Liverpool')]

In [None]:
compare[(compare['AwayTeam'] == 'Manchester Utd') | (compare['HomeTeam'] == 'Manchester Utd')]

In [None]:
compare[(compare['AwayTeam'] == 'Chelsea') | (compare['HomeTeam'] == 'Chelsea')]