In [105]:
import pandas as pd
import datetime as dt
import numpy as np
from utils import ranked_probability_loss

pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.options.mode.chained_assignment = None

In [106]:
 # Reading the data

bets = pd.read_csv("data/bets.zip")
booking = pd.read_csv("data/booking.zip")
goals = pd.read_csv("data/goals.zip")
matches = pd.read_csv("data/matches.zip")
stats = pd.read_csv("data/stats.zip")

In [107]:
# Converting epoch column to datetime
matches['timestamp'] = matches['epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))
bets['timestamp'] = bets['odd_epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))

In [108]:
def week_converter(timestamp):
  """
  year is 2019 for dates between 2019-07 and 2020-06, 
  22nd week just random splitter, 
  there might be better representation
  
  is_national is True for Friday, Saturday, Sunday, Monday 
  False otherwise
  """
  # year = (timestamp - dt.timedelta(1)).dt.strftime('%Y')
  year, week, day = (timestamp - dt.timedelta(1)).isocalendar()
  year = year - 1 if week < 22 else year
  is_national = day >= 4
  return [year, week, is_national]

In [109]:
matches[['year','week', 'is_national']] = pd.DataFrame(matches.timestamp.apply(week_converter).values.tolist(), 
                                                       index=matches.index)


In [110]:
start_date = '2019-11-28'
league_id = 148

In [111]:
test_matches = matches[(matches['week'] == 48) & (matches['year'] == 2019)]
matches = matches[matches['timestamp'] < start_date]
print(len(test_matches), len(matches))
if league_id:
  test_matches = test_matches[test_matches['league_id'] == league_id]
  print(len(test_matches), len(matches))
matches = matches.dropna(subset=['match_status', 'match_hometeam_score', 'match_awayteam_score'])
print(len(test_matches), len(matches))

100 5570
10 5570
10 5570


In [112]:
# subsetting bets to odd1 oddx odd2 only
# odd values should be more than 
bets = bets[bets['value'] > 1]
bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

In [113]:
# pivoting bets data to see the changes with time easily and 
# see the odds in a single row for each match - bookmaker - timestamp

bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                        columns='variable',
                        values='value').reset_index()

# reordering columns
bets = bets[['match_id', 'odd_bookmakers', 'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

In [114]:
# bets.groupby(['match_id', 'odd_bookmakers'], as_index=False).agg(['first', 'last'])

In [115]:
# Since bets are changing by time, I will use final odds announced by bookmakers
# by assuming they are correcting their odds somehow

final_bets = bets.groupby(['match_id', 'odd_bookmakers'], as_index=False).last()

In [116]:
# Calculating implied naive probabilities and creating new prob_odd_1(x,2) columns
for cols in ['odd_1', 'odd_x', 'odd_2']:
  final_bets['prob_'+cols] = 1 / final_bets[cols]

# Summing all naive probabilities for each bookmaker & match (this will give us 1 + margin of bookmaker)
final_bets['total'] = final_bets['prob_odd_1'] + final_bets['prob_odd_x'] + final_bets['prob_odd_2']

# normalizin odd by removing margin share from each of them
for cols in ['odd_1', 'odd_x', 'odd_2']:
  final_bets['norm_prob_'+cols] = final_bets['prob_'+cols] / final_bets['total']

In [117]:
# creates a result column 1, 0 or 2 for home win, draw, away win accordingly

matches['result'] = np.where(matches.match_hometeam_score > matches.match_awayteam_score, 
                             1, 0)
# if away > home, then returns 2. otherwise returns the previous result value 
# (which is 1 if home > away and 0 otherwise)

matches['result'] = np.where(matches.match_hometeam_score < matches.match_awayteam_score, 
                             2, matches.result)

# joining result info into the final bets table

final_bets_test = final_bets.merge(test_matches[['match_id']], 
                              on='match_id')
final_bets = final_bets.merge(matches[['match_id', 'result', 'year', 'week', 'is_national']], 
                              on='match_id')

In [118]:
set(test_matches.match_id.unique()) -  set(final_bets_test.match_id.unique())


{273232, 273235, 273239, 273240}

In [119]:
final_bets['rps'] = ranked_probability_loss(final_bets['result'], 
                                            final_bets[['norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2']])

In [120]:
final_bets

Unnamed: 0,match_id,odd_bookmakers,odd_1,odd_x,odd_2,timestamp,prob_odd_1,prob_odd_x,prob_odd_2,total,norm_prob_odd_1,norm_prob_odd_x,norm_prob_odd_2,result,year,week,is_national,rps
0,145899,10Bet,6.75,5.25,1.29,2017-03-12 11:03:01,0.148148,0.190476,0.775194,1.113818,0.133009,0.171012,0.695979,2,2018,44,False,0.05506
1,145899,188BET,4.95,4.60,1.43,2017-03-12 11:03:01,0.202020,0.217391,0.699301,1.118712,0.180583,0.194323,0.625094,2,2018,44,False,0.08658
2,145899,18bet,5.35,4.30,1.43,2017-03-12 11:03:01,0.186916,0.232558,0.699301,1.118775,0.167072,0.207869,0.625059,2,2018,44,False,0.08425
3,145899,1xBet,6.45,4.76,1.49,2017-03-12 11:03:01,0.155039,0.210084,0.671141,1.036264,0.149613,0.202732,0.647655,2,2018,44,False,0.07327
4,145899,888sport,7.00,5.00,1.35,2017-03-12 11:03:01,0.142857,0.200000,0.740741,1.083598,0.131836,0.184570,0.683594,2,2018,44,False,0.05875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140420,320397,bet365.it,2.00,3.60,3.20,2019-10-30 05:42:21,0.500000,0.277778,0.312500,1.090278,0.458599,0.254777,0.286624,1,2019,44,False,0.18763
140421,320397,bwin,2.15,3.50,3.00,2019-10-30 05:42:21,0.465116,0.285714,0.333333,1.084164,0.429009,0.263534,0.307457,1,2019,44,False,0.21028
140422,320397,bwin.es,2.15,3.50,3.00,2019-10-30 05:42:21,0.465116,0.285714,0.333333,1.084164,0.429009,0.263534,0.307457,1,2019,44,False,0.21028
140423,320397,iFortuna.cz,2.17,3.70,2.86,2019-10-30 05:42:21,0.460829,0.270270,0.349650,1.080750,0.426398,0.250077,0.323526,1,2019,44,False,0.21684


In [121]:
bookmaker_list = final_bets.groupby(['odd_bookmakers', 'year', 'is_national']).agg({'rps': 'mean',
                                  'result': 'count'}).sort_values('rps').reset_index().reset_index()
bookmaker_list['index'] = bookmaker_list['index'] + 1
bookmaker_list.columns = ['rank', 'odd_bookmakers', 'year', 'is_national', 'mean_rps', 'count']
bookmaker_list.head(10000)

Unnamed: 0,rank,odd_bookmakers,year,is_national,mean_rps,count
0,1,betcart,2018,False,0.114997,3
1,2,Guts,2018,False,0.121430,3
2,3,KTO,2019,True,0.156200,1
3,4,Sportium,2019,True,0.158151,8
4,5,Superbahis,2019,False,0.175860,3
...,...,...,...,...,...,...
361,362,Winline,2018,True,0.314483,36
362,363,BetOnline,2019,False,0.326780,1
363,364,PaddyPower.it,2019,False,0.356814,5
364,365,Bovada,2019,False,0.359890,5


In [122]:
eng_matches = matches[(matches['year'] >= 2019) & (matches['league_id'] == 148)]['match_id']
eng_final_bets = final_bets[final_bets['match_id'].isin(eng_matches.tolist())]

bookmaker_list = eng_final_bets.groupby(['odd_bookmakers', 'year', 'is_national']).agg({'rps': 'mean',
                                  'result': 'count'}).sort_values('rps').reset_index().reset_index()
bookmaker_list['index'] = bookmaker_list['index'] + 1
bookmaker_list.columns = ['rank', 'odd_bookmakers', 'year', 'is_national', 'mean_rps', 'count']
bookmaker_list[bookmaker_list['is_national'] == True].head(10000)

Unnamed: 0,rank,odd_bookmakers,year,is_national,mean_rps,count
0,1,Coral,2019,True,0.151191,13
1,2,KTO,2019,True,0.1562,1
2,3,mybet,2019,True,0.171538,29
3,4,Ladbrokes,2019,True,0.172307,23
7,8,Sportium.es,2019,True,0.194341,124
9,10,bwin.fr,2019,True,0.19462,126
10,11,STS.pl,2019,True,0.194764,128
11,12,SBOBET,2019,True,0.195332,129
12,13,bet365,2019,True,0.195376,120
13,14,1xBet,2019,True,0.195718,128


In [123]:
final_bets_test = final_bets_test.merge(bookmaker_list[['rank', 'odd_bookmakers']], on='odd_bookmakers')

In [124]:
final_bets_test['updated_ranking'] = final_bets_test.groupby('match_id')["rank"].rank(method="first", ascending=True)

In [126]:
output = final_bets_test[final_bets_test['updated_ranking'] <= 5].sort_values(['match_id', 'updated_ranking'])
output

Unnamed: 0,match_id,odd_bookmakers,odd_1,odd_x,odd_2,timestamp,prob_odd_1,prob_odd_x,prob_odd_2,total,norm_prob_odd_1,norm_prob_odd_x,norm_prob_odd_2,rank,updated_ranking
282,273231,Interwetten.es,2.3,3.2,3.25,2019-11-26 01:25:16,0.434783,0.3125,0.307692,1.054975,0.412126,0.296216,0.291658,6,1.0
546,273231,bet365.it,2.25,3.2,3.25,2019-11-26 01:25:16,0.444444,0.3125,0.307692,1.064637,0.417461,0.293527,0.289012,7,2.0
426,273231,Sportium.es,2.3,3.1,3.2,2019-11-26 01:25:16,0.434783,0.322581,0.3125,1.069863,0.406391,0.301516,0.292093,8,3.0
534,273231,bet365,2.25,3.2,3.25,2019-11-26 01:25:16,0.444444,0.3125,0.307692,1.064637,0.417461,0.293527,0.289012,9,4.0
582,273231,bwin.fr,2.2,3.1,3.1,2019-11-26 01:25:16,0.454545,0.322581,0.322581,1.099707,0.413333,0.293333,0.293333,10,5.0
284,273233,Interwetten.es,1.65,3.95,5.2,2019-11-28 15:57:15,0.606061,0.253165,0.192308,1.051533,0.576359,0.240758,0.182883,6,1.0
548,273233,bet365.it,1.61,4.0,5.25,2019-11-28 15:57:15,0.621118,0.25,0.190476,1.061594,0.58508,0.235495,0.179425,7,2.0
428,273233,Sportium.es,1.62,3.8,5.25,2019-11-28 15:57:15,0.617284,0.263158,0.190476,1.070918,0.576406,0.245731,0.177863,8,3.0
536,273233,bet365,1.61,4.0,5.25,2019-11-28 15:57:15,0.621118,0.25,0.190476,1.061594,0.58508,0.235495,0.179425,9,4.0
584,273233,bwin.fr,1.6,3.75,4.75,2019-11-28 15:57:15,0.625,0.266667,0.210526,1.102193,0.567051,0.241942,0.191007,10,5.0


In [132]:
test_matches

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,timestamp,year,week,is_national
5581,2626,2630,273236,1575113400,,0,Newcastle,Manchester City,,,,,,,,,148,2019-11-30 14:30:00,2019,48,True
5590,2613,2621,273234,1575122400,,0,Liverpool,Brighton,,,,,,,,,148,2019-11-30 17:00:00,2019,48,True
5591,2615,2628,273239,1575122400,,0,Tottenham,Bournemouth,,,,,,,,,148,2019-11-30 17:00:00,2019,48,True
5592,2619,2629,273231,1575122400,,0,Burnley,Crystal Palace,,,,,,,,,148,2019-11-30 17:00:00,2019,48,True
5593,2620,2616,273232,1575122400,,0,Chelsea,West Ham,,,,,,,,,148,2019-11-30 17:00:00,2019,48,True
5608,2623,2614,273238,1575131400,,0,Southampton,Watford,,,,,,,,,148,2019-11-30 19:30:00,2019,48,True
5631,2617,2641,273237,1575205200,,0,Norwich,Arsenal,,,,,,,,,148,2019-12-01 16:00:00,2019,48,True
5632,2654,2646,273240,1575205200,,0,Wolves,Sheffield Utd,,,,,,,,,148,2019-12-01 16:00:00,2019,48,True
5644,2612,2611,273233,1575214200,,0,Leicester,Everton,,,,,,,,,148,2019-12-01 18:30:00,2019,48,True
5645,2632,2627,273235,1575214200,,0,Manchester Utd,Aston Villa,,,,,,,,,148,2019-12-01 18:30:00,2019,48,True


In [135]:
output = output.groupby('match_id', as_index=False).agg(
  {'norm_prob_odd_1': 'mean',
   'norm_prob_odd_x': 'mean',
   'norm_prob_odd_2': 'mean'})
output = output.merge(test_matches[['match_id', 'match_hometeam_name', 'match_awayteam_name']], on='match_id')
output

Unnamed: 0,match_id,norm_prob_odd_1,norm_prob_odd_x,norm_prob_odd_2,match_hometeam_name,match_awayteam_name
0,273231,0.413354,0.295624,0.291022,Burnley,Crystal Palace
1,273233,0.577996,0.239884,0.18212,Leicester,Everton
2,273234,0.7785,0.148216,0.073284,Liverpool,Brighton
3,273236,0.060554,0.126454,0.812993,Newcastle,Manchester City
4,273237,0.240872,0.239687,0.519441,Norwich,Arsenal
5,273238,0.44934,0.276498,0.274162,Southampton,Watford


In [139]:
from bs4 import BeautifulSoup

ModuleNotFoundError: No module named 'bs4'