Here, we'll try to fit our very first model on TennisExplorer data

In [1]:
from tennis_new.fetch.tennis_explorer.combiner import read_joined

# Load the joined TennisExplorer match data; the filtering cell below derives `rel` from it.
# NOTE(review): presumably a pandas DataFrame with one row per match -- confirm in
# tennis_new.fetch.tennis_explorer.combiner.
jd = read_joined()

  if (yield from self.run_code(code, result)):


In [2]:
from tennis_new.model.utils.filters import (
    MissingPIDFilter,
    MissingScoreFilter,
    PossibleWalkoverFilter,
    RetirementFilter,
    TrainingFilter
)

#### Back to Fitting

In [7]:
# Restrict the joined data to the rows used for fitting and evaluation.
# Open question from the original author: should we fit only when we have the unique
# identifier for both players?
# NOTE(review): TrainingFilter presumably combines the other filters imported above
# (missing player id / missing score / walkover / retirement) -- confirm in
# tennis_new.model.utils.filters before relying on that.
rel = TrainingFilter.filter(jd)

In [8]:
from tennis_new.ml.elo import ELOModel

# Match-level ELO model: it is fitted below without a `ys` argument.
# NOTE(review): the effect of winner_mod=True is not visible in this notebook -- see ELOModel.
match_elo = ELOModel(winner_mod=True)

In [9]:
%pdb
match_elo.fit_and_backfill(
    rel['p1_link'],
    rel['p2_link'],
    rel['match_link']
)

Automatic pdb calling has been turned OFF


In [10]:
import pandas as pd

def get_test_set(df, test_min='2011-01-01', test_max='2015-01-01', test_surface=None, filter_walkovers=True,
                 surface_col='surface'):
    """Return the rows of ``df`` that fall inside the evaluation window.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with a ``'date'`` column comparable to ``test_min`` / ``test_max``.
    test_min, test_max :
        Half-open date window: rows with ``test_min <= date < test_max`` are kept.
    test_surface : optional
        When given, only rows whose ``surface_col`` equals this value are kept.
        ``None`` (the default) applies no surface restriction.
    filter_walkovers : bool
        When true, rows flagged by ``possible_walkover(df)`` are dropped.
    surface_col : str
        Name of the column compared against ``test_surface``.
        NOTE(review): the default ``'surface'`` is an assumption about the joined
        TennisExplorer schema -- confirm the real column name.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` matching every active condition.
    """
    cond = (df['date'] >= test_min) & (df['date'] < test_max)
    if test_surface is not None:
        # Previously no surface condition was ever built for this branch, so any
        # non-None test_surface crashed with an unbound local variable.
        cond = cond & (df[surface_col] == test_surface)
    if filter_walkovers:
        # NOTE(review): possible_walkover is not defined in the visible cells of this
        # notebook; it must be in scope (e.g. from an earlier cell) for this branch to run.
        cond = cond & ~possible_walkover(df)
    return df[cond]

def eval_mod(mod, df, test_min='2011-01-01', test_max='2015-01-01', test_surface=None, filter_walkovers=False):
    """Score a fitted model against the evaluation window of ``df``.

    The window is selected with ``get_test_set`` (walkover filtering is off unless
    ``filter_walkovers`` is set) and joined to ``mod.history`` on
    ``match_link == match_id``; matches missing from either side drop out of the join.

    Returns a dict with:
      * ``overall_accuracy``    -- share of joined matches with ``elo_match_prediction > 0.5``
      * ``odds_accuracy``       -- share of priced matches with ``p1_odds < p2_odds``
      * ``model_odds_accuracy`` -- share of priced matches with ``elo_match_prediction > 0.5``
      * ``n_w_odds``            -- number of priced matches

    "Priced" means both odds are present and differ from each other.
    NOTE(review): all three accuracies count a pick of p1 as correct, so they presumably
    rely on rows being oriented with the winner as p1 -- confirm upstream.
    """
    predictions = pd.DataFrame(mod.history)
    window = get_test_set(
        df,
        test_min=test_min,
        test_max=test_max,
        test_surface=test_surface,
        filter_walkovers=filter_walkovers,
    )
    scored = window.merge(predictions, left_on='match_link', right_on='match_id')

    # Keep only matches where the bookmaker prices exist and actually favour one side.
    both_priced = scored['p1_odds'].notnull() & scored['p2_odds'].notnull()
    has_favourite = scored['p1_odds'] != scored['p2_odds']
    priced = scored[both_priced & has_favourite]

    results = {}
    results['overall_accuracy'] = (scored['elo_match_prediction'] > 0.5).mean()
    results['odds_accuracy'] = (priced['p1_odds'] < priced['p2_odds']).mean()
    results['model_odds_accuracy'] = (priced['elo_match_prediction'] > 0.5).mean()
    results['n_w_odds'] = priced.shape[0]
    return results

In [12]:
# Score the match-level model on eval_mod's default window (2011-01-01 <= date < 2015-01-01).
match_eval = eval_mod(match_elo, rel)
match_eval

224662


{'overall_accuracy': 0.7275996830794705,
 'odds_accuracy': 0.7200428690759507,
 'model_odds_accuracy': 0.7074658387051017,
 'n_w_odds': 63449}

#### Try Initial Set Model

Note that for this model we will definitely have to tune the ELO parameters.

In [13]:
from tennis_new.ml.elo import ELOModel

# Same constructor arguments as match_elo; the difference is in the fit below, which
# additionally passes the per-player set counts as `ys`.
set_elo = ELOModel(winner_mod=True)

In [14]:
# Same player / match identifiers as the match-level fit, plus `ys`: a two-column array
# holding the sets won by p1 and by p2 for each row of `rel`.
set_elo.fit_and_backfill(
    rel['p1_link'],
    rel['p2_link'],
    rel['match_link'],
    ys=rel[['p1_sets_won', 'p2_sets_won']].values
)

In [16]:
# Same default evaluation window as match_eval, so the two result dicts are directly comparable.
eval_mod(set_elo, rel)

224662


{'overall_accuracy': 0.7358520800135314,
 'odds_accuracy': 0.7200428690759507,
 'model_odds_accuracy': 0.7080332235338619,
 'n_w_odds': 63449}

The set model is working pretty well!  We should tune the ELO parameters again

#### Tune ELO Parameters

In [17]:
from tennis_new.ml.sobol import generate_sobol_seq, get_range_values

# Search bounds for the three ELOModel parameters c, o and s (they are passed as
# ELOModel(c=..., o=..., s=...) in the tuning loop, and the plotting cell draws
# c / (x + o) ** s for them).
MIN_C = 100
MAX_C = 500
MIN_O = 0
MAX_O = 50
MIN_S = 0
MAX_S = 2


# NOTE(review): presumably 100 quasi-random points in 3 dimensions, with the last argument
# acting as a seed/skip -- confirm against tennis_new.ml.sobol.generate_sobol_seq.
sobol_vals = generate_sobol_seq(3, 100, 1)
# One Sobol column per parameter, mapped into its [MIN, MAX] range by get_range_values.
cs = get_range_values(MIN_C, MAX_C, sobol_vals[:, 0])
# NOTE(review): `os` shadows the standard-library module name; harmless only as long as
# this notebook never imports `os`.
os = get_range_values(MIN_O, MAX_O, sobol_vals[:, 1])
ss = get_range_values(MIN_S, MAX_S, sobol_vals[:, 2])

In [None]:
from tqdm.autonotebook import tqdm

# `out` collects metrics on the default (2011-2015) window, `test_out` the same metrics on
# the later 2015-2021 window; both carry the c/o/s values that produced them.
out = []
test_out = []
# zip() has no length, so pass the number of parameter triples explicitly; without
# `total` tqdm can only show an indeterminate bar with no progress or ETA.
for c, o, s in tqdm(zip(cs, os, ss), total=len(cs)):
    cur_elo = ELOModel(c=c, o=o, s=s, winner_mod=True)
    # Set-level fit, identical to the set_elo fit apart from the tuned parameters.
    cur_elo.fit_and_backfill(
        rel['p1_link'],
        rel['p2_link'],
        rel['match_link'],
        ys=rel[['p1_sets_won', 'p2_sets_won']].values
    )
    cur_eval = eval_mod(cur_elo, rel)
    test_eval = eval_mod(cur_elo, rel, test_min='2015-01-01', test_max='2021-01-01')
    cur_eval.update({'c': c, 'o': o, 's': s})
    test_eval.update({'c': c, 'o': o, 's': s})
    out.append(cur_eval)
    test_out.append(test_eval)



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149
224662
282149


In [None]:
# One row per evaluated parameter triple: the eval_mod metrics on the default window plus
# the c / o / s values that produced them.
tune_df = pd.DataFrame(out)

In [None]:
# Rank the parameter triples by model accuracy on the matches that have odds, best first.
tune_df = tune_df.sort_values(by='model_odds_accuracy', ascending=False)
tune_df.head()

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np

def _plot_params(c, o, s):
    """Plot the curve ``c / (x + o) ** s`` for x = 0..99 on the current pyplot axes.

    The legend label records the three parameter values to two decimals.
    NOTE(review): x presumably stands for the number of matches a player has played
    -- confirm against ELOModel.
    """
    xs = np.arange(100)
    denominator = (xs + o) ** s
    legend_text = 'c:%0.2f, o: %0.2f, s:%0.2f' % (c, o, s)
    plt.plot(xs, c / denominator, label=legend_text)
        

def _plot_row(row):
    """Plot the curve for one row that exposes its parameters under 'c', 'o' and 's'."""
    c, o, s = (row[key] for key in ('c', 'o', 's'))
    _plot_params(c, o, s)

# Overlay the first five rows of tune_df (the best-ranked triples once it has been sorted).
# head(5) tolerates a tune_df with fewer than five rows -- e.g. after an interrupted
# tuning run -- where positional .iloc[i] over range(5) would raise IndexError.
for _, row in tune_df.head(5).iterrows():
    _plot_row(row)

# Reference curve: an ELOModel built with no arguments, i.e. its default c, o and s.
_default_mod = ELOModel()
_plot_params(_default_mod.c, _default_mod.o, _default_mod.s)

# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend();

Above, it looks like the default parameters (those suggested by ESPN) are better than what we've found through tuning.