In [65]:
import pandas as pd
import datetime as dt
import numpy as np
from utils import ranked_probability_loss

pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.options.mode.chained_assignment = None

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer
from sklearn.utils.validation import check_array

def ranked_probability_loss_metric(obs, preds):
  """
  >>> y_true = [1, 1]
  >>> y_prob = [[0.5, 0.3, 0.2], [0.5, 0.2, 0.3]]
  >>> ranked_probability_loss(y_true, y_prob) # array([0.145, 0.17 ])

  >>> y_true = [1]
  >>> y_prob = [[0.7, 0.3, 0]]
  >>> ranked_probability_loss(y_true, y_prob) # array([0.045])
  """
  result_mapping = {1: [1, 0, 0],
                  0: [0, 1, 0],
                  2: [0, 0, 1]}

  obs = check_array(obs, ensure_2d=False)
  preds = check_array(preds, ensure_2d=False)
  obs = np.array([result_mapping[i] for i in obs])

  cum_diff = np.cumsum(preds, axis=1) - np.cumsum(obs, axis=1)
  result = np.sum(np.square(cum_diff), axis=1)/2
  return np.round(result, 5).mean()

In [66]:
# Reading the data

bets = pd.read_csv("data/bets.zip")
booking = pd.read_csv("data/booking.zip")
goals = pd.read_csv("data/goals.zip")
matches = pd.read_csv("data/matches.zip")
stats = pd.read_csv("data/stats.zip")

In [67]:
# Converting epoch column to datetime
matches['timestamp'] = matches['epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))
bets['timestamp'] = bets['odd_epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))


In [68]:
def week_converter(timestamp):
  """
  year is 2019 for dates between 2019-07 and 2020-06, 
  22nd week just random splitter, 
  there might be better representation
  
  is_national is True for Friday, Saturday, Sunday, Monday 
  False otherwise
  """
  # year = (timestamp - dt.timedelta(1)).dt.strftime('%Y')
  year, week, day = (timestamp - dt.timedelta(1)).isocalendar()
  year = year - 1 if week < 22 else year
  is_national = day >= 4
  return [year, week, is_national]

In [69]:
matches[['year','week', 'is_national']] = pd.DataFrame(matches.timestamp.apply(week_converter).values.tolist(), 
                                                       index=matches.index)

In [70]:
start_date = '2019-12-05'
end_date = '2020-01-01'
league_id = 148

In [71]:
matches = matches[matches["league_id"] == 148]
test_matches = matches[matches['timestamp'] > start_date]
matches = matches[matches['timestamp'] < start_date]
print(len(test_matches), len(matches))

if end_date:
  test_matches = test_matches[test_matches['timestamp'] < end_date]
  print(len(test_matches), len(matches))
    
if league_id:
  test_matches = test_matches[test_matches['league_id'] == league_id]
  print(len(test_matches), len(matches))
    
matches = matches.dropna(subset=['match_status', 'match_hometeam_score', 'match_awayteam_score'])
print(len(test_matches), len(matches))

12 690
12 690
12 690
12 690


In [72]:
# subsetting bets to odd1 oddx odd2 only
# odd values should be more than 
bets = bets[bets['value'] > 1]
bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                        columns='variable',
                        values='value').reset_index()

# pivoting bets data to see the changes with time easily and 
# see the odds in a single row for each match - bookmaker - timestamp

bets = bets[['match_id', 'odd_bookmakers', 'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

# Since bets are changing by time, I will use final odds announced by bookmakers
# by assuming they are correcting their odds somehow

final_bets = bets.groupby(['match_id', 'odd_bookmakers'], as_index=False).last()
first_bets = bets.groupby(['match_id', 'odd_bookmakers'], as_index=False).first()

final_bets["BetType"] = "Final"
first_bets["BetType"] = "First"

final_bets = pd.concat([first_bets, final_bets])

In [73]:
# Calculating implied naive probabilities and creating new prob_odd_1(x,2) columns
for cols in ['odd_1', 'odd_x', 'odd_2']:
  final_bets['prob_'+cols] = 1 / final_bets[cols]

# Summing all naive probabilities for each bookmaker & match (this will give us 1 + margin of bookmaker)
final_bets['total'] = final_bets['prob_odd_1'] + final_bets['prob_odd_x'] + final_bets['prob_odd_2']

# normalizin odd by removing margin share from each of them
for cols in ['odd_1', 'odd_x', 'odd_2']:
  final_bets['norm_prob_'+cols] = final_bets['prob_'+cols] / final_bets['total']

In [74]:
# creates a result column 1, 0 or 2 for home win, draw, away win accordingly

matches['result'] = np.where(matches.match_hometeam_score > matches.match_awayteam_score, 
                             1, 0)
# if away > home, then returns 2. otherwise returns the previous result value 
# (which is 1 if home > away and 0 otherwise)

matches['result'] = np.where(matches.match_hometeam_score < matches.match_awayteam_score, 
                             2, matches.result)

# joining result info into the final bets table

final_bets = final_bets.merge(matches[['match_id', 'result', 'year', 'week', 'is_national']], 
                              on='match_id')

In [24]:
final_bets['rps'] = ranked_probability_loss(final_bets['result'], 
                                            final_bets[['norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2']])

In [75]:
bookmaker_list = final_bets.groupby(['odd_bookmakers']).agg({'result': 'count'})
bookmaker_list = bookmaker_list[bookmaker_list["result"] > 800]

In [76]:
final_bets.columns

Index(['match_id', 'odd_bookmakers', 'odd_1', 'odd_x', 'odd_2', 'timestamp',
       'BetType', 'prob_odd_1', 'prob_odd_x', 'prob_odd_2', 'total',
       'norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2', 'result',
       'year', 'week', 'is_national'],
      dtype='object')

In [77]:
final_bets_input = final_bets[["match_id", "timestamp", "norm_prob_odd_1", "norm_prob_odd_x", "norm_prob_odd_2", "BetType", "odd_bookmakers", "result"]]

final_bets_input = final_bets_input.pivot_table(index=["result", "match_id", 'timestamp'],
                             columns= ["odd_bookmakers", "BetType"],
                             values=['norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2']).reset_index()

final_bets_input.columns = final_bets_input.columns.map('{0[1]}_{0[0]}_{0[2]}'.format)

final_bets_input.rename(columns={'_timestamp_':'timestamp',
                          '_match_id_':'match_id',
                                "_result_":"result"}, 
                 inplace=True)

final_bets_input["Year"] = final_bets_input["timestamp"].map(lambda x: x.year)

In [78]:
final_bets_input = final_bets_input.fillna(1/3)

In [79]:
final_bets_input = final_bets_input.drop("timestamp", axis=1)

In [106]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression


logistic_model = LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=1000)

ranked_probability_loss_metric(final_bets_input["result"], ridge_model.predict_proba(X))

In [85]:
model = XGBClassifier(needs_proba=True)
kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=7)

label_encoded_y = LabelEncoder().fit_transform(final_bets_input["result"])
X = final_bets_input.drop('result', axis=1)

scorer = make_scorer(ranked_probability_loss_metric, greater_is_better=False, needs_proba=True)


parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['multi:softprob'],
              'learning_rate': [0.01, 0.05], #so called `eta` value
              'max_depth': [2, 4],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.6, 0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [100, 250], #number of trees, change it to 1000 for better results
              'seed': [1337]}

clf = GridSearchCV(model, parameters, n_jobs=-1, 
                   cv=kfold, 
                   scoring= scorer,
                   verbose=2, refit=True)

clf.fit(X, label_encoded_y)


Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   11.8s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   11.9s finished


GridSearchCV(cv=StratifiedKFold(n_splits=8, random_state=7, shuffle=True),
             error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     needs_proba=True, nt...
             param_grid={'colsample_bytree': [0.7], 'learning_rate': [0.01],
                         'max_depth': [2], 'min_child_weight': [11],
                         'n_estimators': [100], 'nthread': [4],
                         'objective': ['multi:softprob'], 'seed': [1337],
                         'silent': [1], 'subsample': [0.8]},
             pre_dispatch='2*n_jobs

In [86]:
y_pred = clf.predict_proba(X)
ranked_probability_loss_metric(final_bets_input["result"], y_pred)

0.24995947194719476

In [109]:
match_list = matches[(matches["league_id"] == 148) & (matches["timestamp"] > '2019-11-20')]
match_list = match_list["match_id"].values.tolist()

test = final_bets_input[final_bets_input["match_id"].isin(match_list)]
y_pred = clf.predict_proba(test.drop('result', axis=1))

ranked_probability_loss_metric(test['result'], y_pred)

0.293065

In [93]:
#y_pred, test["result"]

test = final_bets_input[final_bets_input["match_id"].isin(match_list)]
y_pred = ridge_model.predict_proba(test.drop('result', axis=1))

ranked_probability_loss_metric(test['result'], y_pred)

ridge_params = {'alpha':[200, 230, 250,265, 270, 275, 290, 300, 500]}

models2 = GridSearchCV(RidgeClassifier(), 
                               param_grid=ridge_params).fit(X, label_encoded_y).best_estimator_

test = final_bets_input[final_bets_input["match_id"].isin(match_list)]
y_pred = models2.predict_proba(test.drop('result', axis=1))

ranked_probability_loss_metric(test['result'], y_pred)




AttributeError: 'RidgeClassifier' object has no attribute 'predict_proba'

In [94]:
ridge_params = {'alpha':[200, 230, 250,265, 270, 275, 290, 300, 500]}

models2 = GridSearchCV(RidgeClassifier(), 
                               param_grid=ridge_params).fit(X, label_encoded_y).best_estimator_

test = final_bets_input[final_bets_input["match_id"].isin(match_list)]
y_pred = models2.predict_proba(test.drop('result', axis=1))

ranked_probability_loss_metric(test['result'], y_pred)




AttributeError: 'RidgeClassifier' object has no attribute 'predict_proba'

In [None]:
models2

In [None]:
final_bets_input[final_bets_input["match_id"].isin(match_list)]

In [None]:
y_pred