In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, make_scorer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit, KFold

In [31]:
data = pd.read_csv('soccer18m.csv')
data['GameID'] = data.index
data['HomeWin'] = 1*(data['FTHG'] > data['FTAG'])
data['GDiffHome'] = data['FTHG'] - data['FTAG']
data['GDiffAway'] = data['FTAG'] - data['FTHG']
data.head()

Unnamed: 0,Div,Date,Y,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,...,AST,pH,pD,pA,home_xG,away_xG,GameID,HomeWin,GDiffHome,GDiffAway
0,EPL,2014-08-16,14,Arsenal,Crystal Palace,2,1,1,1,14,...,2,0.778038,0.151989,0.069973,1.55411,0.158151,0,1,1,-1
1,EPL,2014-08-16,14,Leicester,Everton,2,2,1,2,11,...,3,0.311972,0.28982,0.398208,1.2783,0.613273,1,0,0,0
2,EPL,2014-08-16,14,Man United,Swansea,1,2,0,1,14,...,4,0.715373,0.192169,0.092459,1.16635,0.278076,2,0,-1,1
3,EPL,2014-08-16,14,QPR,Hull,0,1,0,0,19,...,4,0.395113,0.300577,0.304311,1.90067,1.11757,3,0,-1,1
4,EPL,2014-08-16,14,Stoke,Aston Villa,0,1,0,0,12,...,2,0.502378,0.282316,0.215305,0.423368,0.909774,4,0,-1,1


In [32]:
teams = data.HomeTeam.unique()

In [43]:
data['WDL_Home'] = np.where(data.GDiffHome > 0, 1, np.where(data.GDiffHome < 0, 0, 1/2))
data['WDL_Away'] = 1 - data.WDL_Home
data['eloH'] = ''
data['eloA'] = ''

In [44]:
def calc_elo(abs_dG, W, eloF, eloA, isHome, K = 40, HFA = 100):
    G = np.where(abs_dG <= 1, 1, np.where(abs_dG == 2, 3/2, (11 + abs_dG)/8))
    dr = (eloF + np.where(isHome == 1, HFA, 0)) - (eloA + np.where(isHome == 0, HFA, 0))
    #dr = (eloF - eloA) + np.where(isHome == 1, HFA, -HFA)
    We = 1/(10**(-dr/400) + 1)
    return eloF + K*G*(W - We)

In [45]:
elos = dict(zip(teams, [1000]*len(teams)))
data17 = data[data.Y < 18]

In [46]:
for idx, game in data17.iterrows():
    home = game.HomeTeam
    away = game.AwayTeam
    elos[home] = calc_elo(abs(game.GDiffHome), game.WDL_Home, elos[home], elos[away], 1)
    elos[away] = calc_elo(abs(game.GDiffAway), game.WDL_Away, elos[away], elos[home], 0)
    
    data17.loc[idx, 'eloH'] = elos[home]
    data17.loc[idx, 'eloA'] = elos[away]

In [212]:
teams17 = datamaster17[datamaster17.Y == 17].HomeTeam.unique()

elos17 = pd.DataFrame.from_dict(elos, orient = 'index', columns = ['Elo Score'])
elos17.reset_index(inplace = True)
elos17 = elos17[elos17['index'].isin(teams17)]

elos17 = elos17.merge(datamaster17[['Div', 'HomeTeam']].drop_duplicates(subset=['HomeTeam']), how = 'left', left_on = 'index', right_on = 'HomeTeam')
elos17.groupby('Div')['Elo Score','index'].apply(lambda x : x.nlargest(3, ['Elo Score']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Elo Score,index
Div,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bundesliga,31,1344.96056,Bayern Munich
Bundesliga,46,1152.044801,Schalke 04
Bundesliga,49,1138.180094,Hoffenheim
EPL,28,1417.518046,Man City
EPL,24,1276.533689,Tottenham
EPL,19,1252.162521,Man United
La_Liga,51,1409.654295,Barcelona
La_Liga,61,1302.795304,Real Madrid
La_Liga,62,1219.067183,Ath Madrid
Ligue_1,0,1347.910532,Paris SG
