In [1]:
import pandas as pd
import datetime as dt
import numpy as np
from utils import ranked_probability_loss

# Notebook-wide pandas settings: show up to 200 rows / 100 columns when a
# frame is displayed, and silence SettingWithCopyWarning (filtered frames are
# assigned to further down).
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)

In [2]:
 # Reading the data

# Each table is stored as a zipped CSV; read_csv infers the compression from
# the file extension. The files are read in the order listed.
bets, booking, goals, matches, stats = (
    pd.read_csv(zip_path)
    for zip_path in (
        "data/bets.zip",
        "data/booking.zip",
        "data/goals.zip",
        "data/matches.zip",
        "data/stats.zip",
    )
)

In [3]:
# Turn the Unix-epoch columns into datetimes.
# NOTE: datetime.fromtimestamp() without a tz argument returns *local* time,
# so the resulting timestamps (and every date-based split below) depend on the
# timezone of the machine running the notebook.
to_local_datetime = dt.datetime.fromtimestamp
matches['timestamp'] = matches['epoch'].apply(to_local_datetime)
bets['timestamp'] = bets['odd_epoch'].apply(to_local_datetime)

In [4]:
def week_converter(timestamp, season_start_week=22, national_from_day=4):
  """
  Map a kickoff time to [season year, ISO week, is_national].

  The timestamp is moved back by one day before its ISO calendar is taken,
  so a Monday is counted with the preceding week (its ISO day becomes 7).

  Parameters
  ----------
  timestamp : datetime.datetime or pandas.Timestamp
    Kickoff time; must support `- timedelta` and `.isocalendar()`.
  season_start_week : int, default 22
    ISO weeks below this value are attributed to the previous year, so that
    e.g. spring 2020 belongs to season 2019. The default is an arbitrary
    splitter inside the summer break; a better representation may exist.
  national_from_day : int, default 4
    Minimum ISO weekday (after the one-day shift) flagged as national.
    With the default 4, Friday, Saturday, Sunday and Monday kickoffs are
    True and Tuesday to Thursday are False.

  Returns
  -------
  list
    [year, week, is_national]
  """
  shifted = timestamp - dt.timedelta(days=1)
  iso_year, iso_week, iso_day = shifted.isocalendar()
  # Early-year weeks still belong to the season that started the year before.
  season_year = iso_year - 1 if iso_week < season_start_week else iso_year
  is_national = iso_day >= national_from_day
  return [season_year, iso_week, is_national]

In [5]:
# week_converter returns a 3-item list per match; expand those lists into a
# frame aligned on matches.index and store them as three new columns.
calendar_rows = matches['timestamp'].apply(week_converter).tolist()
calendar_features = pd.DataFrame(calendar_rows, index=matches.index)
matches[['year', 'week', 'is_national']] = calendar_features


In [6]:
# Hold-out window: matches before start_date are used for ranking bookmakers,
# matches after it (and before end_date, when set) are the test matches.
start_date, end_date = '2019-11-22', '2019-11-26'
# Test matches are restricted to this league when the value is truthy.
league_id = 148

In [7]:
# Split matches into a hold-out set (kickoff on/after start_date) and the
# historical set (kickoff before start_date). The test side uses >= so that a
# match kicking off exactly at start_date is not dropped from both sets.
# NOTE: `matches` is overwritten here, so re-running this cell without
# re-running the cells above leaves test_matches empty.
test_matches = matches[matches['timestamp'] >= start_date]
matches = matches[matches['timestamp'] < start_date]
print(len(test_matches), len(matches))
if end_date:
  # Optional upper bound of the hold-out window (exclusive).
  test_matches = test_matches[test_matches['timestamp'] < end_date]
  print(len(test_matches), len(matches))
if league_id:
  # Optional restriction of the hold-out set to a single league.
  test_matches = test_matches[test_matches['league_id'] == league_id]
  print(len(test_matches), len(matches))
# Historical matches need a status and both scores to derive a result;
# test matches are left untouched.
matches = matches.dropna(subset=['match_status', 'match_hometeam_score', 'match_awayteam_score'])
print(len(test_matches), len(matches))

116 5470
41 5470
2 5470
2 5470


In [8]:
# Keep only the odd_1 / odd_x / odd_2 quotes.
# Odd values must be greater than 1: 1 / odd is used as an implied
# probability further down.
result_odds = ['odd_1', 'odd_x', 'odd_2']
has_valid_value = bets['value'] > 1
is_result_odd = bets['variable'].isin(result_odds)
bets = bets[has_valid_value & is_result_odd]

In [9]:
# Reshape bets from long to wide: one row per match / bookmaker / timestamp
# with odd_1, odd_x and odd_2 side by side, which makes changes over time easy
# to follow. pivot_table averages duplicate entries (default aggfunc).
quote_keys = ['match_id', 'odd_bookmakers', 'timestamp']
odds_wide = bets.pivot_table(index=quote_keys,
                             columns='variable',
                             values='value').reset_index()

# Fix the column order and drop quotes where any of the three odds is missing.
bets = odds_wide[['match_id', 'odd_bookmakers', 'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

In [10]:
# Odds move over time; use the last quote of every bookmaker for every match,
# assuming bookmakers correct their odds as kickoff approaches.
# NOTE(review): .last() takes the last row in the current row order, so this
# relies on bets being ordered by timestamp within each group (expected from
# the pivot above, whose index includes timestamp) - confirm if that changes.
quote_owner = ['match_id', 'odd_bookmakers']
final_bets = bets.groupby(quote_owner, as_index=False).last()

In [11]:
odd_columns = ['odd_1', 'odd_x', 'odd_2']

# Naive implied probability of each outcome: prob_odd_* = 1 / odd_*.
for odd_col in odd_columns:
  final_bets['prob_' + odd_col] = final_bets[odd_col].rdiv(1)

# The naive probabilities of one bookmaker & match sum to 1 + the bookmaker's margin.
final_bets['total'] = (final_bets['prob_odd_1']
                       .add(final_bets['prob_odd_x'])
                       .add(final_bets['prob_odd_2']))

# Normalizing: divide by the total so the three probabilities sum to 1,
# which removes the margin share from each of them.
for odd_col in odd_columns:
  final_bets['norm_prob_' + odd_col] = final_bets['prob_' + odd_col].div(final_bets['total'])

In [12]:
# Encode the outcome of every historical match:
# 1 = home win, 2 = away win, 0 = anything else (equal scores, i.e. a draw).
home_score = matches['match_hometeam_score']
away_score = matches['match_awayteam_score']
matches['result'] = np.select([home_score > away_score, home_score < away_score],
                              [1, 2],
                              default=0)

# Test quotes: final odds of the hold-out matches only (no result attached).
# This must run before final_bets is overwritten below.
final_bets_test = final_bets.merge(test_matches[['match_id']],
                                   how='inner',
                                   on='match_id')

# Historical quotes: attach the result and calendar info of each match.
match_info = matches[['match_id', 'result', 'year', 'week', 'is_national']]
final_bets = final_bets.merge(match_info, how='inner', on='match_id')

In [30]:
# Spot check: show the historical record of a single match.
matches.loc[matches['match_id'].eq(145899)]

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,match_hometeam_halftime_score,match_awayteam_halftime_score,match_hometeam_extra_score,match_awayteam_extra_score,match_hometeam_penalty_score,match_awayteam_penalty_score,league_id,timestamp,year,week,is_national,result
2536,3031,3034,145899,1540917900,Finished,0,Montpellier,Nantes,0.0,3.0,0.0,0.0,0.0,0.0,,,176,2018-10-30 19:45:00,2018,44,False,2


In [13]:
# Score every historical quote with the ranked probability loss helper from
# utils, using the margin-free probabilities in odd_1, odd_x, odd_2 order.
# NOTE(review): result is coded 1 / 0 / 2 - confirm that the helper expects
# this coding together with this column order.
norm_prob_columns = ['norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2']
final_bets['rps'] = ranked_probability_loss(final_bets['result'],
                                            final_bets[norm_prob_columns])

In [14]:
# Display the scored historical quotes: one row per match and bookmaker.
final_bets

Unnamed: 0,match_id,odd_bookmakers,odd_1,odd_x,odd_2,timestamp,prob_odd_1,prob_odd_x,prob_odd_2,total,norm_prob_odd_1,norm_prob_odd_x,norm_prob_odd_2,result,year,week,is_national,rps
0,145899,10Bet,6.75,5.25,1.29,2017-03-12 11:03:01,0.148148,0.190476,0.775194,1.113818,0.133009,0.171012,0.695979,2,2018,44,False,0.05506
1,145899,188BET,4.95,4.60,1.43,2017-03-12 11:03:01,0.202020,0.217391,0.699301,1.118712,0.180583,0.194323,0.625094,2,2018,44,False,0.08658
2,145899,18bet,5.35,4.30,1.43,2017-03-12 11:03:01,0.186916,0.232558,0.699301,1.118775,0.167072,0.207869,0.625059,2,2018,44,False,0.08425
3,145899,1xBet,6.45,4.76,1.49,2017-03-12 11:03:01,0.155039,0.210084,0.671141,1.036264,0.149613,0.202732,0.647655,2,2018,44,False,0.07327
4,145899,888sport,7.00,5.00,1.35,2017-03-12 11:03:01,0.142857,0.200000,0.740741,1.083598,0.131836,0.184570,0.683594,2,2018,44,False,0.05875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135798,320397,bet365.it,2.00,3.60,3.20,2019-10-30 05:42:21,0.500000,0.277778,0.312500,1.090278,0.458599,0.254777,0.286624,1,2019,44,False,0.18763
135799,320397,bwin,2.15,3.50,3.00,2019-10-30 05:42:21,0.465116,0.285714,0.333333,1.084164,0.429009,0.263534,0.307457,1,2019,44,False,0.21028
135800,320397,bwin.es,2.15,3.50,3.00,2019-10-30 05:42:21,0.465116,0.285714,0.333333,1.084164,0.429009,0.263534,0.307457,1,2019,44,False,0.21028
135801,320397,iFortuna.cz,2.17,3.70,2.86,2019-10-30 05:42:21,0.460829,0.270270,0.349650,1.080750,0.426398,0.250077,0.323526,1,2019,44,False,0.21684


In [28]:
# Rank bookmakers per season year and day type by their mean RPS
# (rank 1 = lowest mean RPS); `count` is the number of scored quotes in the group.
# NOTE: groups with very few quotes can end up at the top of the ranking.
ranking_keys = ['odd_bookmakers', 'year', 'is_national']
rps_summary = final_bets.groupby(ranking_keys).agg({'rps': 'mean', 'result': 'count'})
bookmaker_list = rps_summary.sort_values('rps').reset_index()
bookmaker_list = bookmaker_list.rename(columns={'rps': 'mean_rps', 'result': 'count'})
# After reset_index the positional index follows the sort order, so index + 1 is the rank.
bookmaker_list.insert(0, 'rank', bookmaker_list.index + 1)
bookmaker_list.head(10000)

Unnamed: 0,rank,odd_bookmakers,year,is_national,mean_rps,count
0,1,betcart,2018,False,0.114997,3
1,2,Guts,2018,False,0.121430,3
2,3,GGBET,2019,True,0.155233,14
3,4,KTO,2019,True,0.156200,1
4,5,Sportium,2019,True,0.158151,8
...,...,...,...,...,...,...
360,361,Winline,2018,True,0.314483,36
361,362,BetOnline,2019,False,0.326780,1
362,363,PaddyPower.it,2019,False,0.356814,5
363,364,Bovada,2019,False,0.359890,5


In [27]:
# Spot check: all ranking rows of a single bookmaker.
bookmaker_list.loc[bookmaker_list['odd_bookmakers'].eq('bwin')]

Unnamed: 0,rank,odd_bookmakers,year,is_national,mean_rps,count
59,60,bwin,2018,False,0.198136,256
111,112,bwin,2019,False,0.204145,128
122,123,bwin,2018,True,0.204772,1133
213,214,bwin,2019,True,0.209067,902


In [25]:
# Number of ranking rows (year / is_national combinations) per bookmaker,
# fewest first. groupby() needs an explicit key - calling it without one
# raises a TypeError.
bookmaker_list.groupby('odd_bookmakers').count().sort_values('rank')

Unnamed: 0_level_0,rank,year,is_national,mean_rps,count
odd_bookmakers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GGBET,1,1,1,1,1
Luxbet,2,2,2,2,2
Tipico.it,2,2,2,2,2
DOXXbet,2,2,2,2,2
Tipbet,2,2,2,2,2
Winline.ru,2,2,2,2,2
Betago,2,2,2,2,2
Winner,2,2,2,2,2
Guts,2,2,2,2,2
Sekabet,2,2,2,2,2


In [16]:
# Attach a bookmaker rank to every test quote.
# bookmaker_list holds one row per (odd_bookmakers, year, is_national), so a
# join on odd_bookmakers alone would repeat each quote once per combination.
# Look up the year / is_national of each test match and join on all three
# keys; validate guards against the fan-out coming back.
rank_keys = ['odd_bookmakers', 'year', 'is_national']
final_bets_test = (
    final_bets_test
    .merge(test_matches[['match_id', 'year', 'is_national']], on='match_id')
    .merge(bookmaker_list[['rank'] + rank_keys], on=rank_keys, validate='many_to_one')
    # The helper keys were only needed for the join; keep the original columns + rank.
    .drop(columns=['year', 'is_national'])
)

In [17]:
# Re-rank within each test match: 1.0 is the best-ranked row among those
# quoting that match; ties are broken by row order (method="first").
rank_by_match = final_bets_test.groupby('match_id')["rank"]
final_bets_test['updated_ranking'] = rank_by_match.rank(method="first", ascending=True)

In [18]:
# Display the test quotes together with their global rank and per-match rank.
final_bets_test

Unnamed: 0,match_id,odd_bookmakers,odd_1,odd_x,odd_2,timestamp,prob_odd_1,prob_odd_x,prob_odd_2,total,norm_prob_odd_1,norm_prob_odd_x,norm_prob_odd_2,rank,updated_ranking
0,273222,10Bet,2.05,3.4,3.45,2019-11-22 21:45:31,0.487805,0.294118,0.289855,1.071778,0.455136,0.27442,0.270443,35,30.0
1,273228,10Bet,3.4,3.15,2.15,2019-11-21 16:35:23,0.294118,0.31746,0.465116,1.076694,0.273167,0.294847,0.431985,35,30.0
2,273222,188BET,2.12,3.5,3.55,2019-11-22 21:45:31,0.471698,0.285714,0.28169,1.039103,0.453948,0.274963,0.27109,33,28.0
3,273228,188BET,3.6,3.2,2.2,2019-11-21 16:35:23,0.277778,0.3125,0.454545,1.044823,0.265861,0.299094,0.435045,33,28.0
4,273222,18bet,2.1,3.5,3.6,2019-11-22 21:45:31,0.47619,0.285714,0.277778,1.039683,0.458015,0.274809,0.267176,18,13.0
5,273228,18bet,3.7,3.35,2.15,2019-11-23 05:29:29,0.27027,0.298507,0.465116,1.033894,0.26141,0.288722,0.449868,18,13.0
6,273222,1xBet,2.14,3.58,3.66,2019-11-22 21:45:31,0.46729,0.27933,0.273224,1.019843,0.458198,0.273895,0.267908,37,32.0
7,273228,1xBet,3.58,3.46,2.21,2019-11-23 05:29:29,0.27933,0.289017,0.452489,1.020836,0.273628,0.283118,0.443253,37,32.0
8,273222,888sport,2.08,3.55,3.6,2019-11-22 21:45:31,0.480769,0.28169,0.277778,1.040237,0.462173,0.270794,0.267033,27,22.0
9,273228,888sport,3.6,3.45,2.12,2019-11-23 05:29:29,0.277778,0.289855,0.471698,1.039331,0.267266,0.278886,0.453848,27,22.0


In [19]:
# Distinct test matches that have at least one ranked quote.
final_bets_test['match_id'].unique()

array([273222, 273228])

In [20]:
# For every test match, the quote with the best per-match rank.
final_bets_test.loc[final_bets_test['updated_ranking'].eq(1)]

Unnamed: 0,match_id,odd_bookmakers,odd_1,odd_x,odd_2,timestamp,prob_odd_1,prob_odd_x,prob_odd_2,total,norm_prob_odd_1,norm_prob_odd_x,norm_prob_odd_2,rank,updated_ranking
44,273222,GGBET,2.13,3.6,3.35,2019-11-22 21:45:31,0.469484,0.277778,0.298507,1.045769,0.448936,0.265621,0.285443,2,1.0
45,273228,GGBET,3.67,3.35,2.11,2019-11-21 16:35:23,0.27248,0.298507,0.473934,1.044921,0.260766,0.285675,0.453559,2,1.0
