In [1]:
import pandas as pd

In [2]:
# Load the raw odds workbook. header=None treats the first sheet row as data,
# so the columns are labelled with integers (0, 1, 2, ...).
odds = pd.read_excel('raw/oddsportal-odds.xlsx', header=None)

In [3]:
# Derive each row's season from column 0: take the text before the first
# ' - ', keep its last space-separated token, then forward-fill that value
# onto the following rows whose column 0 holds no string.
# The .str accessor yields NaN for non-string cells, so no explicit NaN
# sentinel is needed (`pd.np` was removed in pandas 2.0), and .ffill()
# replaces the deprecated fillna(method='ffill').
odds['season'] = (
    odds[0]
    .str.split(' - ').str[0]
    .str.split(' ').str[-1]
    .ffill()
)

In [4]:
# Keep only the rows that have a value in column 1, and only columns
# 1, 3, 4, 5 plus the derived season.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
odds = odds.loc[odds[1].notnull(), [1, 3, 4, 5, 'season']].copy()

In [5]:
# Replace the positional column labels with descriptive names, in the same
# order as the five columns currently held by `odds`.
odds.columns = ['tie','oddshome','oddsdraw','oddsaway','season']

In [6]:
# Split "<home> - <away>" once and reuse the pieces for both team columns.
tie_parts = odds['tie'].str.split(' - ')
odds['hometeam'] = tie_parts.apply(lambda parts: parts[0]).str.strip()
odds['awayteam'] = tie_parts.apply(lambda parts: parts[1]).str.strip()

In [7]:
# Preview the first rows of the cleaned odds table.
odds.head()

Unnamed: 0,tie,oddshome,oddsdraw,oddsaway,season,hometeam,awayteam
2,AS Roma - Liverpool,165,305,149,2018,AS Roma,Liverpool
5,Real Madrid - Bayern Munich,139,309,174,2018,Real Madrid,Bayern Munich
8,Bayern Munich - Real Madrid,100,308,256,2018,Bayern Munich,Real Madrid
11,Liverpool - AS Roma,-192,362,568,2018,Liverpool,AS Roma
14,Bayern Munich - Sevilla,-370,581,952,2018,Bayern Munich,Sevilla


In [8]:
def oddstopct(N):
    """Convert a signed odds quote into its implied (unadjusted) probability.

    The formula matches the American (moneyline) convention: a negative quote
    ``-k`` maps to ``k / (k + 100)`` and a non-negative quote ``k`` maps to
    ``100 / (k + 100)``.

    Parameters
    ----------
    N : str or number
        The odds quote. The literal string ``'-'`` marks a missing quote.

    Returns
    -------
    float
        The implied probability, or NaN when ``N`` is ``'-'``.

    Raises
    ------
    ValueError
        If ``N`` is any other string that ``float()`` cannot parse.
    """
    if N == '-':
        # `pd.np` was removed in pandas 2.0; a plain float NaN is equivalent
        # and needs no extra import.
        return float('nan')
    n = float(N)
    if n < 0:
        return (-n) / (-n + 100)
    return 100 / (n + 100)

In [9]:
# Convert each outcome's quote into an unadjusted implied probability.
for outcome in ('home', 'draw', 'away'):
    odds['pct' + outcome + 'unadj'] = odds['odds' + outcome].apply(oddstopct)

In [10]:
def adjust(row):
    """Add normalised outcome probabilities to ``row`` and return it.

    Reads ``pcthomeunadj``, ``pctdrawunadj`` and ``pctawayunadj`` from ``row``
    and writes ``pcthome``, ``pctdraw`` and ``pctaway``, each being the
    unadjusted value divided by the sum of the three. ``row`` is modified in
    place and the same object is returned.
    """
    total = row['pcthomeunadj'] + row['pctdrawunadj'] + row['pctawayunadj']
    for outcome in ('home', 'draw', 'away'):
        row['pct' + outcome] = row['pct' + outcome + 'unadj'] / total
    return row

In [11]:
# Run adjust on every row (axis=1); the result carries the original columns
# plus the normalised pcthome / pctdraw / pctaway columns.
oddsadj = odds.apply(adjust, axis=1)

In [12]:
# Preview the adjusted odds; pcthome + pctdraw + pctaway should sum to 1 per row.
oddsadj.head()

Unnamed: 0,tie,oddshome,oddsdraw,oddsaway,season,hometeam,awayteam,pcthomeunadj,pctdrawunadj,pctawayunadj,pcthome,pctdraw,pctaway
2,AS Roma - Liverpool,165,305,149,2018,AS Roma,Liverpool,0.377358,0.246914,0.401606,0.367839,0.240685,0.391476
5,Real Madrid - Bayern Munich,139,309,174,2018,Real Madrid,Bayern Munich,0.41841,0.244499,0.364964,0.407064,0.237869,0.355067
8,Bayern Munich - Real Madrid,100,308,256,2018,Bayern Munich,Real Madrid,0.5,0.245098,0.280899,0.487331,0.238888,0.273781
11,Liverpool - AS Roma,-192,362,568,2018,Liverpool,AS Roma,0.657534,0.21645,0.149701,0.642321,0.211442,0.146237
14,Bayern Munich - Sevilla,-370,581,952,2018,Bayern Munich,Sevilla,0.787234,0.146843,0.095057,0.764948,0.142686,0.092366


In [13]:
# Persist the adjusted odds; index=False leaves the row index out of the file.
oddsadj.to_csv('processed/odds.csv', index=False)

In [14]:
# Load the processed results; dtype=str reads every column as text.
results = pd.read_csv('processed/results.csv', dtype=str)

In [15]:
# Preview the first rows of the results table.
results.head()

Unnamed: 0,season,round,tie,winner,pk,agr,aet,result
0,2008,first,arsenal-milan,arsenal,False,False,False,arsenal (2-0) milan
1,2008,first,celtic-barcelona,barcelona,False,False,False,celtic (2-4) barcelona
2,2008,first,fenerbahce-sevilla,fenerbahce,True,False,True,"fenerbahce (5-5 aet) sevilla, fenerbahce won o..."
3,2008,first,liverpool-inter,liverpool,False,False,False,liverpool (3-0) inter
4,2008,first,lyon-united,united,False,False,False,lyon (1-2) united


In [16]:
# Build the team-name lookup from the pipe-delimited crosswalk file:
# second field -> first field. The first line is skipped (presumably a header
# row -- confirm against the file).
with open('raw/teamcrosswalk.psv') as f:
    lines = f.readlines()

xwalk = {}
for line in lines[1:]:
    # A blank line (e.g. a trailing newline at end of file) has no '|' and
    # would otherwise raise IndexError below.
    if not line.strip():
        continue
    fields = line.split('|')  # split once instead of once per field
    xwalk[fields[1]] = fields[0]

In [17]:
# Display the lookup built from the crosswalk file (second field -> first field).
xwalk

{'milan': 'AC Milan',
 'apoel': 'APOEL',
 'roma': 'AS Roma',
 'ajax': 'Ajax',
 'arsenal': 'Arsenal',
 'atletico': 'Atl. Madrid',
 'barcelona': 'Barcelona',
 'basel': 'Basel',
 'leverkusen': 'Bayer Leverkusen',
 'bayern': 'Bayern Munich',
 'benfica': 'Benfica',
 'besiktas': 'Besiktas',
 'bordeaux': 'Bordeaux',
 'cska': 'CSKA Moscow',
 'celtic': 'Celtic',
 'chelsea': 'Chelsea',
 'dortmund': 'Dortmund',
 'kyiv': 'Dyn. Kyiv',
 'copenhagen': 'FC Copenhagen',
 'porto': 'FC Porto',
 'fenerbahce': 'Fenerbahce',
 'fiorentina': 'Fiorentina',
 'galatasaray': 'Galatasaray',
 'gent': 'Gent',
 'inter': 'Inter',
 'juventus': 'Juventus',
 'leicester': 'Leicester',
 'liverpool': 'Liverpool',
 'lyon': 'Lyon',
 'malaga': 'Malaga',
 'city': 'Manchester City',
 'united': 'Manchester Utd',
 'marseille': 'Marseille',
 'monaco': 'Monaco',
 'napoli': 'Napoli',
 'olympiacos': 'Olympiakos Piraeus',
 'psv': 'PSV',
 'panathinaikos': 'Panathinaikos',
 'psg': 'Paris SG',
 'madrid': 'Real Madrid',
 'schalke': 'Schalk

In [18]:
# Expand every two-legged tie into two fixture records: game 1 is hosted by the
# first team named in the tie key, game 2 by the second. Team keys are mapped
# through xwalk.
final = []
for fixture in results.to_dict(orient='records'):
    sides = fixture['tie'].split('-')
    first_side = xwalk[sides[0]]
    second_side = xwalk[sides[1]]
    legs = [(first_side, second_side), (second_side, first_side)]
    for leg_number, (host, guest) in enumerate(legs, start=1):
        final.append({
            'season': fixture['season'],
            'tie': fixture['tie'],
            'hometeam': host,
            'awayteam': guest,
            'game': leg_number,
        })

In [19]:
# Show the first five fixture records as a spot check.
final[:5]

[{'season': '2008',
  'tie': 'arsenal-milan',
  'hometeam': 'Arsenal',
  'awayteam': 'AC Milan',
  'game': 1},
 {'season': '2008',
  'tie': 'arsenal-milan',
  'hometeam': 'AC Milan',
  'awayteam': 'Arsenal',
  'game': 2},
 {'season': '2008',
  'tie': 'celtic-barcelona',
  'hometeam': 'Celtic',
  'awayteam': 'Barcelona',
  'game': 1},
 {'season': '2008',
  'tie': 'celtic-barcelona',
  'hometeam': 'Barcelona',
  'awayteam': 'Celtic',
  'game': 2},
 {'season': '2008',
  'tie': 'fenerbahce-sevilla',
  'hometeam': 'Fenerbahce',
  'awayteam': 'Sevilla',
  'game': 1}]

In [25]:
# Attach the adjusted probabilities to every fixture.
# The join keys are spelled out rather than inferred from shared column names,
# and how='left' keeps fixtures that have no matching odds row: the default
# inner join would drop them silently, before the missing-odds check and
# imputation that follow.
merge_keys = ['season', 'hometeam', 'awayteam']
pct_columns = ['pcthome', 'pctdraw', 'pctaway']
gameoddsxwalk = pd.DataFrame(final).merge(
    oddsadj[merge_keys + pct_columns],
    on=merge_keys,
    how='left',
)
gameoddsxwalk = gameoddsxwalk[
    ['season', 'tie', 'game', 'hometeam', 'awayteam'] + pct_columns
]

In [26]:
# Preview the fixture-to-odds table.
gameoddsxwalk.head()

Unnamed: 0,season,tie,game,hometeam,awayteam,pcthome,pctdraw,pctaway
0,2008,arsenal-milan,1,Arsenal,AC Milan,0.452474,0.314634,0.232891
1,2008,arsenal-milan,2,AC Milan,Arsenal,0.428566,0.317081,0.254352
2,2008,celtic-barcelona,1,Celtic,Barcelona,0.234004,0.302405,0.463592
3,2008,celtic-barcelona,2,Barcelona,Celtic,0.821256,0.146369,0.032374
4,2008,fenerbahce-sevilla,1,Fenerbahce,Sevilla,0.3421,0.309465,0.348435


In [27]:
# Show every fixture that has at least one missing value.
gameoddsxwalk[gameoddsxwalk.isnull().any(axis=1)]

Unnamed: 0,season,tie,game,hometeam,awayteam,pcthome,pctdraw,pctaway
228,2016,gent-wolfsburg,1,Gent,Wolfsburg,,,
235,2016,psg-chelsea,2,Chelsea,Paris SG,,,


In [28]:
# Impute missing probabilities with a flat 0.33 for each outcome.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained form warns on pandas 2.x and leaves the frame
# unchanged under Copy-on-Write.
# NOTE(review): 3 * 0.33 = 0.99, so imputed rows do not sum to 1 like the
# normalised rows do -- confirm whether 1/3 was intended.
pct_cols = ['pcthome', 'pctdraw', 'pctaway']
gameoddsxwalk[pct_cols] = gameoddsxwalk[pct_cols].fillna(0.33)

In [29]:
# Persist the fixture-to-odds table; index=False leaves the row index out of the file.
gameoddsxwalk.to_csv('processed/games-odds-xwalk.csv', index=False)