In [69]:
import pandas as pd
import datetime as dt
import numpy as np
from utils import ranked_probability_loss

pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.options.mode.chained_assignment = None

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer
from sklearn.utils.validation import check_array


In [70]:
# Reading the data

bets = pd.read_csv("data/bets.zip")
booking = pd.read_csv("data/booking.zip")
goals = pd.read_csv("data/goals.zip")
matches = pd.read_csv("data/matches.zip")
stats = pd.read_csv("data/stats.zip")

In [71]:
# Converting epoch column to datetime
matches['timestamp'] = matches['epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))
bets['timestamp'] = bets['odd_epoch'].apply(lambda x: dt.datetime.fromtimestamp(x))


In [72]:
def week_converter(timestamp):
  """
  year is 2019 for dates between 2019-07 and 2020-06, 
  22nd week just random splitter, 
  there might be better representation
  
  is_national is True for Friday, Saturday, Sunday, Monday 
  False otherwise
  """
  # year = (timestamp - dt.timedelta(1)).dt.strftime('%Y')
  s = '2016-03-14 17:24:55'  
  year, week, day = (timestamp - dt.timedelta(1)).isocalendar()
  season = year - 1 if week < 27 else year
  is_weekend = day >= 4
  date = timestamp - dt.timedelta(1)
  hour = timestamp.fromisoformat(s).hour
  return [date, season, year, week, is_weekend, hour]

In [73]:
matches[['date', 'season', 'year', 'week', 'is_weekend', "hour"]] = pd.DataFrame(matches.timestamp.apply(week_converter).values.tolist(), 
                                                       index=matches.index)

In [74]:
teams = matches.groupby(["match_awayteam_id", "match_awayteam_name"]).size().reset_index()

In [75]:
matches = matches.sort_values("date")

In [76]:
team_match = pd.DataFrame()
    
for index, row in matches.iterrows():
    away_team = row["match_awayteam_id"]
    home_team = row["match_hometeam_id"]
    
    dt = [[away_team,
          row["match_awayteam_name"],
          row["match_id"],
          "Away",
          row["season"],
          row["date"],
          row["hour"],
          row["match_live"],
          row["match_awayteam_score"],
          row["match_hometeam_score"],
         ]]
    
    df = pd.DataFrame(dt, columns = ['TeamId', 'TeamName', "MatchId", "HomeAway", "Season", "Date", "Hour", "Live", "Scored", "Conceded"]) 

    team_match = team_match.append(df)
    
    dt = [[home_team,
          row["match_hometeam_name"],
          row["match_id"],
          "Home",
          row["season"],
          row["date"],
          row["hour"],
          row["match_live"],
          row["match_hometeam_score"],
          row["match_awayteam_score"]
         ]]
    
    df = pd.DataFrame(dt, columns = ['TeamId', 'TeamName', "MatchId", "HomeAway", "Season", "Date", "Hour", "Live", "Scored", "Conceded"]) 

    team_match = team_match.append(df)

In [82]:
def point(row):
   if row['Scored'] > row["Conceded"]:
      return 3
   if row['Scored'] < row["Conceded"]:
      return 0
   if row['Scored'] == row["Conceded"]:
      return 1

   return np.nan

team_match['Point'] = team_match.apply (lambda row: point(row), axis=1)

In [104]:
team_match = team_match.sort_values("Date")

In [113]:
team_match["Point1"] = team_match.groupby(["Season", "TeamId"]).Point.transform(roll1)

In [106]:
roll1 = lambda x: x.rolling(1).sum().shift()
roll5 = lambda x: x.rolling(5).sum().shift()
historic = lambda x: x.expanding().mean().shift()

In [115]:
team_match["Point1"] = team_match.groupby(["Season", "TeamId"]).Point.transform(roll1)
team_match["GoalScored1"] = team_match.groupby(["Season", "TeamId"]).Scored.transform(roll1)
team_match["GoalConceded1"] = team_match.groupby(["Season", "TeamId"]).Conceded.transform(roll1)

team_match["Point5"] = team_match.groupby(["Season", "TeamId"]).Point.transform(roll5)
team_match["GoalScored5"] = team_match.groupby(["Season", "TeamId"]).Scored.transform(roll5)
team_match["GoalConceded5"] = team_match.groupby(["Season", "TeamId"]).Conceded.transform(roll5)

team_match["Point1Pos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Point.transform(roll1)
team_match["GoalScored1Pos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Scored.transform(roll1)
team_match["GoalConceded1Pos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Conceded.transform(roll1)

In [126]:
team_match[team_match["TeamId"] == 7109]

Unnamed: 0,TeamId,TeamName,MatchId,HomeAway,Season,Date,Hour,Live,Scored,Conceded,Point,Point1,GoalScored1,GoalConceded1,Point5,GoalScored5,GoalConceded5,Point1Pos,GoalScored1Pos,GoalConceded1Pos,PerformanceSeason,PerformanceHistoric,DrawRatio,WinRatio,LostRatio,DrawRatioSeason,WinRatioSeason,LostRatioSeason,DrawRatioPos,WinRatioPos,LostRatioPos,DrawRatioSeasonPos,WinRatioSeasonPos,LostRatioSeasonPos
0,7109,Valencia,41196,Away,2017,2017-09-15 14:00:00,17,0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,
0,7109,Valencia,41169,Away,2017,2017-09-23 21:45:00,17,0,3.0,2.0,3.0,1.0,1.0,1.0,,,,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,7109,Valencia,41159,Home,2017,2017-09-30 19:30:00,17,0,3.0,2.0,3.0,3.0,3.0,2.0,,,,,,,2.0,2.0,0.5,0.5,0.5,0.5,0.5,0.5,,,,,,
0,7109,Valencia,41149,Away,2017,2017-10-14 21:45:00,17,0,6.0,3.0,3.0,3.0,3.0,2.0,,,,3.0,3.0,2.0,2.333333,2.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.5,0.5,0.5
0,7109,Valencia,41145,Home,2017,2017-10-20 19:30:00,17,0,4.0,0.0,3.0,3.0,6.0,3.0,,,,3.0,3.0,2.0,2.5,2.5,0.75,0.75,0.75,0.75,0.75,0.75,1.0,1.0,1.0,1.0,1.0,1.0
0,7109,Valencia,41137,Away,2017,2017-10-27 14:00:00,17,0,2.0,1.0,3.0,3.0,4.0,0.0,13.0,17.0,8.0,3.0,6.0,3.0,2.6,2.6,0.8,0.8,0.8,0.8,0.8,0.8,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667
0,7109,Valencia,55688,Home,2017,2017-11-03 14:00:00,17,0,3.0,0.0,3.0,3.0,2.0,1.0,15.0,18.0,8.0,3.0,4.0,0.0,2.666667,2.666667,0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,1.0,1.0,1.0,1.0,1.0,1.0
0,7109,Valencia,59104,Away,2017,2017-11-18 17:15:00,17,0,2.0,0.0,3.0,3.0,3.0,0.0,15.0,18.0,6.0,3.0,2.0,1.0,2.714286,2.714286,0.857143,0.857143,0.857143,0.857143,0.857143,0.857143,0.75,0.75,0.75,0.75,0.75,0.75
0,7109,Valencia,63020,Home,2017,2017-11-25 21:45:00,17,0,1.0,1.0,1.0,3.0,2.0,0.0,15.0,17.0,4.0,3.0,3.0,0.0,2.75,2.75,0.875,0.875,0.875,0.875,0.875,0.875,1.0,1.0,1.0,1.0,1.0,1.0
0,7109,Valencia,64448,Away,2017,2017-12-02 17:15:00,17,0,0.0,1.0,0.0,1.0,1.0,1.0,13.0,12.0,2.0,3.0,2.0,0.0,2.555556,2.555556,0.777778,0.777778,0.777778,0.777778,0.777778,0.777778,0.8,0.8,0.8,0.8,0.8,0.8


In [117]:
team_match["PerformanceSeason"] = team_match.groupby(["Season", "TeamId"]).Point.transform(historic)
team_match["PerformanceHistoric"] = team_match.groupby(["TeamId"]).Point.transform(historic)


In [118]:
team_match["Won"] = [1 if x == 3 else 0 for x in team_match['Point']]
team_match["Draw"] = [1 if x == 3 else 0 for x in team_match['Point']]
team_match["Lost"] = [1 if x == 3 else 0 for x in team_match['Point']]

In [119]:
team_match["DrawRatio"] = team_match.groupby(["TeamId"]).Draw.transform(historic)
team_match["WinRatio"] = team_match.groupby(["TeamId"]).Won.transform(historic)
team_match["LostRatio"] = team_match.groupby(["TeamId"]).Lost.transform(historic)

team_match["DrawRatioSeason"] = team_match.groupby(["Season", "TeamId"]).Draw.transform(historic)
team_match["WinRatioSeason"] = team_match.groupby(["Season", "TeamId"]).Won.transform(historic)
team_match["LostRatioSeason"] = team_match.groupby(["Season", "TeamId"]).Lost.transform(historic)

team_match["DrawRatioPos"] = team_match.groupby(["TeamId", "HomeAway"]).Draw.transform(historic)
team_match["WinRatioPos"] = team_match.groupby(["TeamId", "HomeAway"]).Won.transform(historic)
team_match["LostRatioPos"] = team_match.groupby(["TeamId", "HomeAway"]).Lost.transform(historic)

team_match["DrawRatioSeasonPos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Draw.transform(historic)
team_match["WinRatioSeasonPos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Won.transform(historic)
team_match["LostRatioSeasonPos"] = team_match.groupby(["Season", "TeamId", "HomeAway"]).Lost.transform(historic)


In [120]:
team_match = team_match.drop(["Draw", "Won", "Lost"], axis = 1)


KeyError: "['index'] not found in axis"

In [123]:
cols = list(range(2,3)) + list(range(11, 34))

home = team_match[team_match["HomeAway"] == 'Home'].iloc[:, cols]
away = team_match[team_match["HomeAway"] == 'Away'].iloc[:, cols]

In [125]:
home

Unnamed: 0,MatchId,Point1,GoalScored1,GoalConceded1,Point5,GoalScored5,GoalConceded5,Point1Pos,GoalScored1Pos,GoalConceded1Pos,PerformanceSeason,PerformanceHistoric,DrawRatio,WinRatio,LostRatio,DrawRatioSeason,WinRatioSeason,LostRatioSeason,DrawRatioPos,WinRatioPos,LostRatioPos,DrawRatioSeasonPos,WinRatioSeasonPos,LostRatioSeasonPos
0,41196,,,,,,,,,,,,,,,,,,,,,,,
0,13331,,,,,,,,,,,,,,,,,,,,,,,
0,17684,,,,,,,,,,,,,,,,,,,,,,,
0,17682,,,,,,,,,,,,,,,,,,,,,,,
0,17683,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,245306,,,,,,,0.0,1.0,3.0,0.470588,0.555556,0.142857,0.142857,0.142857,0.111111,0.111111,0.111111,0.250000,0.250000,0.250000,0.125000,0.125000,0.125000
0,243298,,,,,,,1.0,1.0,1.0,1.500000,1.397059,0.347826,0.347826,0.347826,0.400000,0.400000,0.400000,0.393939,0.393939,0.393939,0.428571,0.428571,0.428571
0,278861,,,,,,,,,,1.066667,1.246914,0.292683,0.292683,0.292683,0.250000,0.250000,0.250000,0.302326,0.302326,0.302326,0.250000,0.250000,0.250000
0,242383,,,,,,,3.0,1.0,0.0,1.000000,1.524390,0.397590,0.397590,0.397590,0.181818,0.181818,0.181818,0.463415,0.463415,0.463415,0.363636,0.363636,0.363636


In [127]:
team_stats = home.merge(away, on='MatchId', how='inner', suffixes=('_Home', '_Away'))

In [68]:
home

Unnamed: 0,MatchId,GoalScored1,GoalConceded1,Point5,GoalScored5,GoalConceded5,Point1Pos,GoalScored1Pos,GoalConceded1Pos,PerformanceSeason,PerformanceHistoric,DrawRatio,WinRatio,LostRatio,DrawRatioSeason,WinRatioSeason,LostRatioSeason,DrawRatioPos,WinRatioPos,LostRatioPos,DrawRatioSeasonPos,WinRatioSeasonPos,LostRatioSeasonPos
0,41196,,,,,,,,,,,,,,,,,,,,,,
0,13331,,,,,,,,,,,,,,,,,,,,,,
0,17683,,,,,,,,,,,,,,,,,,,,,,
0,17684,,,,,,,,,,,,,,,,,,,,,,
0,17682,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,245306,,,,,,,,,,,,,,,,,,,,,,
0,243298,,,,,,,,,,,,,,,,,,,,,,
0,278861,,,,,,,,,,,,,,,,,,,,,,
0,242383,,,,,,,,,,,,,,,,,,,,,,


In [49]:
import pandas as pd
import numpy as np
import datetime as dt
import logging
from utils import week_converter

logger = logging.getLogger('myLogger')
level = logging.getLevelName('INFO')
logger.setLevel(level)

def prepare(start_date, end_date):
    bets = pd.read_csv("data/bets.zip")
    matches = pd.read_csv("data/matches.zip")

    # Converting epoch column to datetime
    matches['timestamp'] = matches['epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))
    bets['timestamp'] = bets['odd_epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))

    matches[['year','week', 'is_weekend']] = pd.DataFrame(
        matches.timestamp.apply(week_converter).values.tolist(), 
        index=matches.index)

    start_date2 = dt.datetime.strptime(
        start_date, '%Y-%m-%d') - dt.timedelta(1)
    start_date2 = dt.datetime.strftime(start_date2, '%Y-%m-%d')
    end_date = dt.datetime.strptime(end_date, '%Y-%m-%d') + dt.timedelta(1)
    end_date = dt.datetime.strftime(end_date, '%Y-%m-%d')

    test_matches = matches[(matches['timestamp'] > start_date2) &
                        (matches['timestamp'] < end_date) &
                        (matches['league_id'] == 148)]
    test_matches = test_matches.sort_values('match_id')
    matches = matches[matches['timestamp'] < start_date]
    print('Number of test and train matches are {} and {}'
          .format(len(test_matches), len(matches)))
    matches = matches.dropna(
        subset=['match_status', 'match_hometeam_score', 
                'match_awayteam_score'])

    match_ids = list(test_matches.match_id.append(matches.match_id))
    bets = bets[bets['match_id'].isin(match_ids)]
    bets = bets[bets['value'] > 1]
    bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

    bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                            columns='variable',
                            values='value').reset_index()
    bets = bets[['match_id', 'odd_bookmakers',
                'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

    final_bets = bets.groupby(['match_id', 'odd_bookmakers'],
                              as_index=False).last()
    
    for cols in ['odd_1', 'odd_x', 'odd_2']:
        final_bets['prob_'+cols] = 1 / final_bets[cols]

    final_bets['total'] = (final_bets['prob_odd_1'] + \
                          final_bets['prob_odd_x'] + \
                          final_bets['prob_odd_2'])

    for cols in ['odd_1', 'odd_x', 'odd_2']:
        final_bets['norm_prob_'+cols] = (final_bets['prob_'+cols] / 
                                         final_bets['total'])
    
    matches['result'] = np.where(
        matches.match_hometeam_score > matches.match_awayteam_score, 1, 0)
    matches['result'] = np.where(
        matches.match_hometeam_score < matches.match_awayteam_score, 
        2, matches.result)

    final_bets = final_bets.merge(matches[['match_id', 'result']], 
                              on='match_id', how='left')
    df = final_bets[
        ['match_id', 'odd_bookmakers', 'norm_prob_odd_1', 
         'norm_prob_odd_x', 'norm_prob_odd_2', 'result']]
    test = df[df.match_id.isin(test_matches.match_id)]
    train = df[df.match_id.isin(matches.match_id)]
    
    pivot_df = pd.pivot_table(train, 
               values=['norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2'],
               columns=['odd_bookmakers'],
               index=['match_id', 'result'])
    pivot_df = pivot_df.reset_index()
    y = pivot_df.result
    X = pivot_df.drop(['result'], axis=1)
    
    pivot_df = pd.pivot_table(test, 
               values=['norm_prob_odd_1', 'norm_prob_odd_x', 'norm_prob_odd_2'],
               columns=['odd_bookmakers'],
               index=['match_id'])
    pivot_df = pivot_df.reset_index()
    X_test = pivot_df
    #X = X[X_test.drop(['match_id'], axis=1).columns]
    print('Shape of X, y and X_test respectively is '
          .format(X.shape, y.shape, X_test.shape))
    # X= X.fillna(1/3)
    # X_test= X_test.fillna(1/3)
    
    return X, y, X_test, matches, test_matches, bets, final_bets

In [53]:
X, y, X_test, matches, test_matches, bets, final_bets = prepare('2019-11-29', '2019-12-02')
results = find_results(X_test.match_id)

Number of test and train matches are 10 and 5570
Shape of X, y and X_test respectively is 


In [54]:
def find_bookies_to_keep(start_date, end_date, ratio):
    bets = pd.read_csv("data/bets.zip")
    matches = pd.read_csv("data/matches.zip")

    # Converting epoch column to datetime
    matches['timestamp'] = matches['epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))
    bets['timestamp'] = bets['odd_epoch'].apply(
        lambda x: dt.datetime.fromtimestamp(x))

    matches = matches[(matches['timestamp'] > start_date) &
                      (matches['timestamp'] < end_date) &
                      (matches['league_id'] == 148)]

    matches = matches.dropna(
        subset=['match_status', 'match_hometeam_score',
                'match_awayteam_score'])

    match_ids = list(matches.match_id)
    bets = bets[bets['match_id'].isin(match_ids)]
    bets = bets[bets['value'] > 1]
    bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

    bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                            columns='variable',
                            values='value').reset_index()
    bets = bets[['match_id', 'odd_bookmakers',
                 'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

    final_bets = bets.groupby(['match_id', 'odd_bookmakers'],
                              as_index=False).last()

    bookies = final_bets.groupby('odd_bookmakers').count()[['match_id']].reset_index()
    bookies['total_matches'] = final_bets.match_id.nunique()
    bookies['ratio'] = bookies['match_id'] / bookies['total_matches']
    bookies.sort_values('ratio', ascending=False, inplace=True)
    bookies.reset_index(drop=True, inplace=True)
    bookies_to_keep = bookies[bookies['ratio'] > ratio]
    return list(bookies_to_keep.odd_bookmakers)


bookies_to_keep = find_bookies_to_keep('2018-01-01', '2019-12-01', 0.975)

bets = pd.read_csv("data/bets.zip")
bets = bets[bets["odd_bookmakers"].isin(bookies_to_keep)]

bets['timestamp'] = bets['odd_epoch'].apply(
    lambda x: dt.datetime.fromtimestamp(x))

bets = bets[bets['value'] > 1]
bets = bets[bets['variable'].isin(['odd_1', 'odd_x', 'odd_2'])]

bets = bets.pivot_table(index=['match_id', 'odd_bookmakers', 'timestamp'],
                        columns='variable',
                        values='value').reset_index()
bets = bets[['match_id', 'odd_bookmakers',
            'odd_1', 'odd_x', 'odd_2', 'timestamp']].dropna()

for cols in ['odd_1', 'odd_x', 'odd_2']:
    bets['prob_'+cols] = 1 / bets[cols]

bets['total'] = (bets['prob_odd_1'] + bets['prob_odd_x'] + bets['prob_odd_2'])

for cols in ['odd_1', 'odd_x', 'odd_2']:
    bets['norm_prob_'+cols] = (bets['prob_'+cols] / bets['total'])

bets = bets.sort_values(
    ['timestamp', 'match_id', 'odd_bookmakers']).reset_index(drop=True)

bets = bets[["match_id", "odd_bookmakers", "norm_prob_odd_1", 
             "norm_prob_odd_x", "norm_prob_odd_2"]]

bets.rename({"norm_prob_odd_1": "odd_1",
             "norm_prob_odd_x": "odd_x",
             "norm_prob_odd_2": "odd_2"}, axis=1, inplace=True)

bets_features = bets.groupby(['match_id', 'odd_bookmakers']).agg(
    {'odd_1': ['min', 'max', 'first', 'last', 'var', 'mean'],
     'odd_x': ['min', 'max', 'first', 'last', 'var', 'mean'],
     'odd_2': ['min', 'max', 'first', 'last', 'var', 'mean', 'size']})

bets_features.columns = bets_features.columns.map('{0[0]}_{0[1]}'.format)
bets_features.rename({"odd_2_size": "size"}, axis=1, inplace=True)
bets_features.fillna(0, inplace=True)
mean_bets_features = bets_features.groupby('match_id').mean()

bets_features_pivoted = bets_features.pivot_table(
    index=["match_id"],
    columns= ["odd_bookmakers"])

bets_features_pivoted.columns = bets_features_pivoted.columns.map('{0[1]}_{0[0]}'.format)

for col in bets_features.columns:
    selected_cols = [cols for cols in bets_features_pivoted.columns if col in cols]
    selected = bets_features_pivoted[selected_cols]
    bets_features_pivoted[selected_cols] = \
        bets_features_pivoted[selected_cols].fillna(value=selected.mean(axis=1), axis=0)


In [36]:
def find_results(match_ids):
    matches = pd.read_csv('data/matches.zip')
    matches = matches[matches['match_id'].isin(match_ids)]
    matches['result'] = np.where(matches.match_hometeam_score > matches.match_awayteam_score, 
                             1, 0)
    matches['result'] = np.where(matches.match_hometeam_score < matches.match_awayteam_score, 
                             2, matches.result)
    return matches[['match_id', 'result']]

In [128]:
team_bets = team_stats.merge(bets_features_pivoted, left_on='MatchId', right_on = "match_id", how='inner')


In [132]:
main_data = team_bets[team_bets["GoalScored5_Home"].notna()]

In [136]:
results = find_results(main_data["MatchId"].tolist())

In [137]:
main_data = main_data.merge(results, left_on = "MatchId", right_on = "match_id", how = "inner")

In [138]:
main_data

Unnamed: 0,MatchId,Point1_Home,GoalScored1_Home,GoalConceded1_Home,Point5_Home,GoalScored5_Home,GoalConceded5_Home,Point1Pos_Home,GoalScored1Pos_Home,GoalConceded1Pos_Home,PerformanceSeason_Home,PerformanceHistoric_Home,DrawRatio_Home,WinRatio_Home,LostRatio_Home,DrawRatioSeason_Home,WinRatioSeason_Home,LostRatioSeason_Home,DrawRatioPos_Home,WinRatioPos_Home,LostRatioPos_Home,DrawRatioSeasonPos_Home,WinRatioSeasonPos_Home,LostRatioSeasonPos_Home,Point1_Away,GoalScored1_Away,GoalConceded1_Away,Point5_Away,GoalScored5_Away,GoalConceded5_Away,Point1Pos_Away,GoalScored1Pos_Away,GoalConceded1Pos_Away,PerformanceSeason_Away,PerformanceHistoric_Away,DrawRatio_Away,WinRatio_Away,LostRatio_Away,DrawRatioSeason_Away,WinRatioSeason_Away,LostRatioSeason_Away,DrawRatioPos_Away,WinRatioPos_Away,LostRatioPos_Away,DrawRatioSeasonPos_Away,WinRatioSeasonPos_Away,LostRatioSeasonPos_Away,match_id_x,18bet_odd_1_first,1xBet_odd_1_first,...,888sport_odd_x_max,BetVictor_odd_x_max,Chance.cz_odd_x_max,SBOBET_odd_x_max,Unibet_odd_x_max,Unibet.it_odd_x_max,William Hill_odd_x_max,bwin_odd_x_max,18bet_odd_x_mean,1xBet_odd_x_mean,888sport_odd_x_mean,BetVictor_odd_x_mean,Chance.cz_odd_x_mean,SBOBET_odd_x_mean,Unibet_odd_x_mean,Unibet.it_odd_x_mean,William Hill_odd_x_mean,bwin_odd_x_mean,18bet_odd_x_min,1xBet_odd_x_min,888sport_odd_x_min,BetVictor_odd_x_min,Chance.cz_odd_x_min,SBOBET_odd_x_min,Unibet_odd_x_min,Unibet.it_odd_x_min,William Hill_odd_x_min,bwin_odd_x_min,18bet_odd_x_var,1xBet_odd_x_var,888sport_odd_x_var,BetVictor_odd_x_var,Chance.cz_odd_x_var,SBOBET_odd_x_var,Unibet_odd_x_var,Unibet.it_odd_x_var,William Hill_odd_x_var,bwin_odd_x_var,18bet_size,1xBet_size,888sport_size,BetVictor_size,Chance.cz_size,SBOBET_size,Unibet_size,Unibet.it_size,William Hill_size,bwin_size,match_id_y,result
0,145899,3.0,3.0,0.0,13.0,10.0,1.0,3.0,2.0,0.0,2.500000,1.631579,0.421053,0.421053,0.421053,0.750000,0.750000,0.750000,0.380952,0.380952,0.380952,0.600000,0.600000,0.600000,3.0,2.0,1.0,7.0,8.0,7.0,3.0,2.0,1.0,1.090909,1.184211,0.289474,0.289474,0.289474,0.272727,0.272727,0.272727,0.227273,0.227273,0.227273,0.250000,0.250000,0.250000,145899,0.167072,0.149613,...,0.184570,0.204082,0.211570,0.206872,0.184570,0.184570,0.197451,0.180788,0.207869,0.202732,0.184570,0.204082,0.211570,0.206872,0.184570,0.184570,0.197451,0.180788,0.207869,0.202732,0.184570,0.204082,0.211570,0.206872,0.184570,0.184570,0.197451,0.180788,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,145899,2
1,145900,3.0,2.0,1.0,10.0,12.0,7.0,3.0,2.0,1.0,1.454545,1.150000,0.275000,0.275000,0.275000,0.363636,0.363636,0.363636,0.434783,0.434783,0.434783,0.571429,0.571429,0.571429,3.0,2.0,1.0,12.0,10.0,4.0,3.0,2.0,1.0,2.111111,1.277778,0.388889,0.388889,0.388889,0.666667,0.666667,0.666667,0.300000,0.300000,0.300000,0.500000,0.500000,0.500000,145900,0.434269,0.435524,...,0.263653,0.245142,0.265631,0.266028,0.263653,0.263653,0.242105,0.238016,0.248499,0.258895,0.263653,0.245142,0.265631,0.266028,0.263653,0.263653,0.242105,0.238016,0.248499,0.258895,0.263653,0.245142,0.265631,0.266028,0.263653,0.263653,0.242105,0.238016,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,145900,1
2,147997,0.0,0.0,4.0,4.0,5.0,11.0,1.0,1.0,1.0,1.363636,1.131579,0.315789,0.315789,0.315789,0.363636,0.363636,0.363636,0.388889,0.388889,0.388889,0.400000,0.400000,0.400000,3.0,2.0,1.0,10.0,6.0,4.0,3.0,1.0,0.0,1.636364,1.250000,0.375000,0.375000,0.375000,0.454545,0.454545,0.454545,0.263158,0.263158,0.263158,0.400000,0.400000,0.400000,147997,0.378735,0.382349,...,,0.273973,,,0.278059,,,0.287530,0.294175,0.278661,,0.273973,,,0.278059,,,0.287530,0.294175,0.278661,,0.273973,,,0.278059,,,0.287530,0.000000e+00,0.000000,,0.000000e+00,,,0.000000,,,0.000000,1.0,1.0,,1.0,,,1.0,,,1.0,147997,1
3,147993,3.0,3.0,2.0,4.0,7.0,13.0,0.0,0.0,4.0,0.777778,1.142857,0.285714,0.285714,0.285714,0.111111,0.111111,0.111111,0.263158,0.263158,0.263158,0.000000,0.000000,0.000000,1.0,1.0,1.0,2.0,5.0,13.0,0.0,0.0,6.0,1.000000,1.285714,0.357143,0.357143,0.357143,0.272727,0.272727,0.272727,0.500000,0.500000,0.500000,0.428571,0.428571,0.428571,147993,0.105892,0.106101,...,,0.201536,,,0.201903,,,0.190141,0.229135,0.203872,,0.201536,,,0.201903,,,0.190141,0.229135,0.203872,,0.201536,,,0.201903,,,0.190141,0.000000e+00,0.000000,,0.000000e+00,,,0.000000,,,0.000000,1.0,1.0,,1.0,,,1.0,,,1.0,147993,0
4,147995,3.0,1.0,0.0,7.0,5.0,5.0,3.0,2.0,1.0,1.500000,1.368421,0.342105,0.342105,0.342105,0.375000,0.375000,0.375000,0.473684,0.473684,0.473684,0.500000,0.500000,0.500000,0.0,1.0,5.0,5.0,4.0,9.0,0.0,1.0,5.0,1.000000,1.057143,0.257143,0.257143,0.257143,0.250000,0.250000,0.250000,0.052632,0.052632,0.052632,0.000000,0.000000,0.000000,147995,0.322615,0.333147,...,,0.274435,,,0.278623,,,0.271267,0.292370,0.272407,,0.274435,,,0.278623,,,0.271267,0.292370,0.272407,,0.274435,,,0.278623,,,0.271267,0.000000e+00,0.000000,,0.000000e+00,,,0.000000,,,0.000000,1.0,1.0,,1.0,,,1.0,,,1.0,147995,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2452,242833,0.0,1.0,3.0,7.0,9.0,8.0,3.0,4.0,1.0,1.473684,1.494118,0.411765,0.411765,0.411765,0.421053,0.421053,0.421053,0.431818,0.431818,0.431818,0.333333,0.333333,0.333333,3.0,3.0,1.0,12.0,10.0,4.0,3.0,3.0,1.0,2.470588,2.443182,0.784091,0.784091,0.784091,0.823529,0.823529,0.823529,0.659091,0.659091,0.659091,0.750000,0.750000,0.750000,242833,0.138095,0.132241,...,0.188247,0.194954,0.196243,0.196405,0.188247,0.188247,0.189041,0.211543,0.187004,0.188189,0.188247,0.194954,0.196243,0.196405,0.188247,0.188247,0.189041,0.211543,0.187004,0.188189,0.188247,0.194954,0.196243,0.196405,0.188247,0.188247,0.189041,0.211543,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,242833,0
2453,271226,3.0,3.0,1.0,10.0,8.0,3.0,1.0,1.0,1.0,1.857143,1.603175,0.428571,0.428571,0.428571,0.500000,0.500000,0.500000,0.562500,0.562500,0.562500,0.571429,0.571429,0.571429,1.0,1.0,1.0,7.0,5.0,5.0,3.0,2.0,1.0,1.266667,1.392857,0.392857,0.392857,0.392857,0.333333,0.333333,0.333333,0.400000,0.400000,0.400000,0.375000,0.375000,0.375000,271226,0.576290,0.585487,...,0.239791,0.225602,0.240001,0.234098,0.238253,0.238253,0.223456,0.228180,0.224856,0.225162,0.235333,0.225598,0.234970,0.232227,0.234307,0.234307,0.223045,0.228180,0.223964,0.221576,0.226415,0.225597,0.231220,0.231291,0.226415,0.226415,0.222222,0.228180,1.591561e-06,0.000019,5.964261e-05,8.585188e-12,2.050915e-05,0.000003,0.000047,0.000047,5.071853e-07,0.000000,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,271226,0
2454,278854,1.0,2.0,2.0,11.0,15.0,10.0,3.0,2.0,1.0,2.000000,1.129870,0.298701,0.298701,0.298701,0.562500,0.562500,0.562500,0.375000,0.375000,0.375000,0.666667,0.666667,0.666667,3.0,3.0,1.0,15.0,14.0,5.0,3.0,2.0,1.0,2.200000,1.790123,0.518519,0.518519,0.518519,0.666667,0.666667,0.666667,0.511628,0.511628,0.511628,0.571429,0.571429,0.571429,278854,0.249750,0.230516,...,0.238305,0.252653,0.242184,0.242877,0.238121,0.238121,0.235450,0.249332,0.230307,0.244050,0.237397,0.252653,0.241645,0.241795,0.236315,0.236315,0.235437,0.247400,0.229844,0.238520,0.236943,0.252653,0.240567,0.239632,0.235412,0.235412,0.235412,0.243537,1.607729e-07,0.000023,6.180168e-07,0.000000e+00,8.720046e-07,0.000004,0.000002,0.000002,4.627841e-10,0.000011,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,278854,0
2455,273264,1.0,0.0,0.0,7.0,4.0,4.0,3.0,1.0,0.0,1.375000,1.128205,0.294872,0.294872,0.294872,0.375000,0.375000,0.375000,0.307692,0.307692,0.307692,0.375000,0.375000,0.375000,1.0,2.0,2.0,4.0,6.0,10.0,3.0,2.0,1.0,1.117647,1.025316,0.253165,0.253165,0.253165,0.294118,0.294118,0.294118,0.157895,0.157895,0.157895,0.250000,0.250000,0.250000,273264,0.374150,0.375660,...,0.299433,0.297765,0.297467,0.297084,0.300000,0.300000,0.297765,0.287770,0.298456,0.296403,0.299433,0.297765,0.297033,0.297084,0.300000,0.300000,0.297765,0.287770,0.295865,0.295323,0.299433,0.297765,0.296487,0.297084,0.300000,0.300000,0.297765,0.287770,2.984443e-06,0.000002,0.000000e+00,0.000000e+00,2.595628e-07,0.000000,0.000000,0.000000,0.000000e+00,0.000000,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,273264,0


In [133]:
def ranked_probability_loss_metric(obs, preds):
  """
  >>> y_true = [1, 1]
  >>> y_prob = [[0.5, 0.3, 0.2], [0.5, 0.2, 0.3]]
  >>> ranked_probability_loss(y_true, y_prob) # array([0.145, 0.17 ])

  >>> y_true = [1]
  >>> y_prob = [[0.7, 0.3, 0]]
  >>> ranked_probability_loss(y_true, y_prob) # array([0.045])
  """
  result_mapping = {1: [1, 0, 0],
                  0: [0, 1, 0],
                  2: [0, 0, 1]}

  obs = check_array(obs, ensure_2d=False)
  preds = check_array(preds, ensure_2d=False)
  obs = np.array([result_mapping[i] for i in obs])

  cum_diff = np.cumsum(preds, axis=1) - np.cumsum(obs, axis=1)
  result = np.sum(np.square(cum_diff), axis=1)/2
  return np.round(result, 5).mean()

In [166]:
model = XGBClassifier(needs_proba=True)
kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=7)

label_encoded_y = LabelEncoder().fit_transform(main_data["result"])
X = main_data.drop('result', axis=1)

scorer = make_scorer(ranked_probability_loss_metric, greater_is_better=False, needs_proba=True)


parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['multi:softprob'],
              'learning_rate': [0.01, 0.05], #so called `eta` value
              'max_depth': [2, 4],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.6, 0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [100, 250], #number of trees, change it to 1000 for better results
              'seed': [1337]}

clf = GridSearchCV(model, parameters, n_jobs=-1, 
                   cv=kfold, 
                   scoring= scorer,
                   verbose=2, refit=True)

clf.fit(X, label_encoded_y)


Fitting 6 folds for each of 16 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 15.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=6, random_state=7, shuffle=True),
             error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     needs_proba=True, nt...
                         'learning_rate': [0.01, 0.05], 'max_depth': [2, 4],
                         'min_child_weight': [11], 'n_estimators': [100, 250],
                         'nthread': [4], 'objective': ['multi:softprob'],
                         'seed': [1337], 'silent': [1],
                         'subsample': [0.6, 0.8]},
             pre_dispatch='2*n_jobs', refit

In [167]:
y_pred = clf.predict_proba(X)
ranked_probability_loss_metric(main_data["result"], y_pred)

0.23156038665038667

In [169]:
match_list = matches[(matches["league_id"] == 148) & (matches["timestamp"] > '2019-12-08')]
match_list = match_list["match_id"].values.tolist()

test = main_data[main_data["MatchId"].isin(match_list)]
y_pred = clf.predict_proba(test.drop('result', axis=1))

ranked_probability_loss_metric(test['result'], y_pred)

0.13570454545454547

In [170]:
print(test['result'])
print(y_pred)

2355    2
2358    2
2366    0
2375    2
2390    0
2399    0
2400    0
2413    0
2437    0
2446    0
2455    0
Name: result, dtype: int32
[[0.31313932 0.34443274 0.34242797]
 [0.34479585 0.25173292 0.40347117]
 [0.39479896 0.30444792 0.30075312]
 [0.3097511  0.267032   0.42321688]
 [0.25431204 0.5391887  0.20649931]
 [0.27802888 0.50459254 0.21737856]
 [0.33094007 0.37703732 0.29202262]
 [0.33253777 0.4026761  0.2647861 ]
 [0.32205734 0.46530133 0.21264131]
 [0.32739207 0.20870855 0.46389937]
 [0.36558342 0.32944667 0.30496988]]
