In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  


In [4]:
# The three data sources spell club names differently, so every variant is
# mapped by hand onto one canonical spelling.
# Lookups are exact-match; a name that is absent from this table is left as is.
team_dict = {
    # Variants carrying an "FC" / "AFC" marker.
    'Arsenal FC': 'Arsenal',
    'Burnley FC': 'Burnley',
    'Liverpool FC': 'Liverpool',
    'Everton FC': 'Everton',
    'Fulham FC': 'Fulham',
    'Southampton FC': 'Southampton',
    'Chelsea FC': 'Chelsea',
    'Watford FC': 'Watford',
    'Middlesbrough FC': 'Middlesbrough',
    'AFC Bournemouth': 'Bournemouth',
    'Sunderland AFC': 'Sunderland',
    # Long official names shortened to the canonical form.
    'West Bromwich Albion': 'West Bromwich',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'West Ham United': 'West Ham Utd',
    'Tottenham Hotspur': 'Tottenham',
    'Queens Park Rangers': 'QP Rangers',
    'Brighton & Hove Albion': 'Brighton & Hove',
    'Huddersfield Town': 'Huddersfield',
    # Abbreviated names expanded to the canonical form.
    'Man United': 'Manchester Utd',
    'Man City': 'Manchester City',
    'Newcastle': 'Newcastle Utd',
    'QPR': 'QP Rangers',
    'Cardiff': 'Cardiff City',
    'Hull': 'Hull City',
    'Norwich': 'Norwich City',
    'Stoke': 'Stoke City',
    'Swansea': 'Swansea City',
    'West Brom': 'West Bromwich',
    'West Ham': 'West Ham Utd',
}

In [5]:
# Current-season Roster Power Index (RPI) tables from the PES Database,
# one CSV per edition.
# Each edition is used for one season: e.g. pes14 is merged with
# epl_table_1314 in order to predict (train on) the EPL 14/15 season.
pes14, pes15, pes16, pes17, pes18, pes19 = map(pd.read_csv, (
    './Data/PES14.csv',
    './Data/PES15.csv',
    './Data/PES16.csv',
    './Data/PES17.csv',
    './Data/PES18.csv',
    './Data/PES19.csv',
))

In [6]:
# Past-season RPIs and Match Power Index (MPI) values, one table per season.
(epl_table_1314,
 epl_table_1415,
 epl_table_1516,
 epl_table_1617,
 epl_table_1718) = map(pd.read_csv, (
    './Data/epl_table_1314.csv',
    './Data/epl_table_1415.csv',
    './Data/epl_table_1516.csv',
    './Data/epl_table_1617.csv',
    './Data/epl_table_1718.csv',
))

In [7]:
# Join each PES table with its season table on the team name.
# The outer join keeps teams that appear in only one of the two tables
# (their missing side is NaN); rows are then ordered by team name.
pr_14 = pes14.merge(epl_table_1314, how='outer', on='Team').sort_values('Team')
pr_15 = pes15.merge(epl_table_1415, how='outer', on='Team').sort_values('Team')
pr_16 = pes16.merge(epl_table_1516, how='outer', on='Team').sort_values('Team')
pr_17 = pes17.merge(epl_table_1617, how='outer', on='Team').sort_values('Team')
pr_18 = pes18.merge(epl_table_1718, how='outer', on='Team').sort_values('Team')

In [8]:
# Per-season EPL fixture files (one row per match).
(epl_detail_1314,
 epl_detail_1415,
 epl_detail_1516,
 epl_detail_1617,
 epl_detail_1718) = map(pd.read_csv, (
    './Data/epl1314.csv',
    './Data/epl1415.csv',
    './Data/epl1516.csv',
    './Data/epl1617.csv',
    './Data/epl1718.csv',
))

In [9]:
# Feature columns appended (empty) to every fixture dataframe.
# "Ht"/"At" prefixes mark the home-team / away-team side of a fixture.
# The order below is also the column order of the model's feature matrix,
# so it must not be rearranged casually.
new_cols = [
    'HtOff', 'HtDef', 'AtOff', 'AtDef',
    'HtPesOvr', 'AtPesOvr',
    'HtPesDef', 'HtPesMid', 'HtPesOff',
    'AtPesDef', 'AtPesMid', 'AtPesOff',
    'HtPesPhy', 'HtPesSpd',
    'AtPesPhy', 'AtPesSpd',
]

def detail_df(epl_detail_data):
    '''
    Reduce a raw fixture dataframe to the modelling skeleton.

    Keeps only the home team, away team and result ('FTR') columns, appends
    the (still empty, all-NaN) feature columns listed in ``new_cols`` and
    rewrites team names to their canonical spelling via ``team_dict``.

    Returns a new dataframe; the input is not modified.
    '''
    kept_cols = ['HomeTeam', 'AwayTeam', 'FTR']
    skeleton = epl_detail_data[kept_cols].reindex(columns=kept_cols + new_cols)
    # Exact-value replacement applied over the whole frame.
    return skeleton.replace(team_dict)

# Rebind every season's fixture frame to its trimmed / renamed version.
(epl_detail_1314,
 epl_detail_1415,
 epl_detail_1516,
 epl_detail_1617,
 epl_detail_1718) = [
    detail_df(season_fixtures)
    for season_fixtures in (epl_detail_1314,
                            epl_detail_1415,
                            epl_detail_1516,
                            epl_detail_1617,
                            epl_detail_1718)
]


In [11]:
def fill_df(epl_detail_data, pes_data):
    '''
    Pass RPIs to fixture dataframe.

    Copies each team's ratings from ``pes_data`` into the home-side and
    away-side feature columns of ``epl_detail_data``. The fixture dataframe
    is modified in place and nothing is returned.

    Parameters
    ----------
    epl_detail_data : pandas.DataFrame
        Fixtures with 'HomeTeam' and 'AwayTeam' columns. The feature columns
        ('HtOff', 'AtOff', 'HtPesOvr', ...) are written to; a missing one is
        created when there is something to write.
    pes_data : pandas.DataFrame
        Ratings keyed by a 'Team' column, with the columns 'H_Att', 'H_Def',
        'A_Att', 'A_Def', 'Ovr', 'Fwd', 'Def', 'Mid', 'Spd' and 'Phy'.
        If a team occurs more than once, its first row is used.

    Notes
    -----
    Teams that are absent from ``pes_data`` are skipped: their cells keep
    whatever they held before (NaN for a fresh fixture frame). This is how
    the model ignores relegation and promotion of teams.

    The lookup is done once per column instead of once per fixture row, and
    a team is filled on the away side even if it never appears as a home team.
    '''
    # feature column -> ratings column, for the home side of a fixture.
    home_features = {
        'HtOff': 'H_Att',
        'HtDef': 'H_Def',
        'HtPesOvr': 'Ovr',
        'HtPesOff': 'Fwd',
        'HtPesDef': 'Def',
        'HtPesMid': 'Mid',
        'HtPesSpd': 'Spd',
        'HtPesPhy': 'Phy',
    }
    # Same for the away side; only the attack/defence indices differ by venue.
    away_features = {
        'AtOff': 'A_Att',
        'AtDef': 'A_Def',
        'AtPesOvr': 'Ovr',
        'AtPesOff': 'Fwd',
        'AtPesDef': 'Def',
        'AtPesMid': 'Mid',
        'AtPesSpd': 'Spd',
        'AtPesPhy': 'Phy',
    }

    # One row per team, indexed by name. Rows without a team name can never
    # match a fixture, and for repeated names the first row wins.
    ratings = (pes_data.dropna(subset=['Team'])
                       .drop_duplicates(subset='Team', keep='first')
                       .set_index('Team'))

    for side_col, features in (('HomeTeam', home_features),
                               ('AwayTeam', away_features)):
        teams = epl_detail_data[side_col]
        known = teams.isin(ratings.index)
        if not known.any():
            # No team on this side has ratings: leave the frame untouched.
            continue
        known_teams = teams[known]
        for feature_col, rating_col in features.items():
            if feature_col not in epl_detail_data.columns:
                epl_detail_data[feature_col] = float('nan')
            # Positional assignment (.values) avoids index alignment, so it
            # also works when the fixture frame has a non-unique index.
            epl_detail_data.loc[known, feature_col] = (
                known_teams.map(ratings[rating_col]).values
            )
            
# Fill every season's fixtures (in place) from its merged ratings table.
# NOTE(review): the comments in the loading cell describe pes14 +
# epl_table_1314 as the inputs for the 14/15 season, yet pr_14 is applied to
# the 13/14 fixtures here -- confirm that this season pairing is intended.
for season_fixtures, season_ratings in (
    (epl_detail_1314, pr_14),
    (epl_detail_1415, pr_15),
    (epl_detail_1516, pr_16),
    (epl_detail_1617, pr_17),
    (epl_detail_1718, pr_18),
):
    fill_df(season_fixtures, season_ratings)

In [13]:
# Final modelling table: all seasons stacked, keeping only complete rows.
# Rows with a missing value come from teams without ratings
# (promotion / relegation) and are discarded.
season_frames = [
    epl_detail_1314,
    epl_detail_1415,
    epl_detail_1516,
    epl_detail_1617,
    epl_detail_1718,
]
model_df = pd.concat(season_frames).dropna()

In [24]:
# Features are the filled rating columns; the target is the match result.
X, y = model_df[new_cols], model_df['FTR']

In [25]:
# Hold out 33% of the fixtures for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [26]:
# Feature standardisation is currently switched off (kept for reference):
#   scaler = StandardScaler()
#   scaler.fit(X_train)
#   X_train = scaler.transform(X_train)
#   X_test = scaler.transform(X_test)
# NOTE(review): KNN, logistic regression and the MLP are usually sensitive to
# feature scale -- confirm that leaving the scaler disabled is deliberate.

# Four classifiers with library defaults; seeds fixed where one is accepted.
knn = KNeighborsClassifier()
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
clf = MLPClassifier(random_state=42)

# Fit all of them on the same training split, in the order defined above.
for estimator in (knn, lr, rf, clf):
    estimator.fit(X_train, y_train)

In [27]:
# Score each fitted model on the held-out split; one line per model,
# printed in the order knn, lr, rf, clf.
for fitted_model in (knn, lr, rf, clf):
    print(fitted_model.score(X_test, y_test))

0.504424778761062
0.5734513274336284
0.49557522123893805
0.431858407079646
