Here, we'll try to fit our very first model on TennisExplorer data.

TODO: 
* Figure out walkover / retirement handling!
    * Try 2x2 training and evaluation
* Try making formulaic match predictions for set model, e.g.:


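$$
\begin{aligned}
    \mathbb{P}_2 &= \hat{p}^2\left(1 + 2(1 - \hat{p})\right) \\
    \mathbb{P}_3 &= \hat{p}^3\left(1 + 3(1 - \hat{p}) + \binom{4}{2}(1 - \hat{p})^2\right)
\end{aligned}
$$

where $\hat{p}$ is the set prediction, $\mathbb{P}_2$ is the probability of winning a match that requires two sets (best-of-three), and $\mathbb{P}_3$ is the probability of winning a match that requires three sets (best-of-five).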

Note that the last TODO above (formulaic match predictions from the set model) will not affect accuracy results, but may help with the AUC metrics and with later calibration and betting performance.

In [1]:
from tennis_new.fetch.tennis_explorer.combiner import read_joined

jd = read_joined()

  if (yield from self.run_code(code, result)):


In [2]:
jd['surface'].isnull().value_counts()

True     665861
False    299699
Name: surface, dtype: int64

In [3]:
jd['tourney_name'][jd['surface'].isnull()].value_counts()

Futures 2015     46948
Futures 2014     46432
Futures 2016     45695
Futures 2017     44736
Futures 2013     43887
Futures 2018     41356
Futures 2012     41300
Futures 2011     37793
Futures 08       37775
Futures 07       35896
Futures 09       35598
Futures 2010     34891
Futures 06       33621
Futures 05       30047
Futures 2019     29786
Futures 04       26785
Futures 03       25134
Futures 02       22285
Futures 2020      5834
African Games       62
Name: tourney_name, dtype: int64

In [4]:
w_odds = jd[jd['p1_odds'].notnull()]

In [5]:
w_odds['surface'].isnull().value_counts()

False    142028
True     134326
Name: surface, dtype: int64

In [6]:
jd[jd['surface'].isnull()].iloc[-1]

comment                                      NaN
date                                  2020-09-02
match_link             /match-detail/?id=1898021
match_time                                 09:00
p1_link                     /player/orlov-d8e3f/
p1_name                                 Orlov V.
p1_odds                                     1.12
p1_seed                                      NaN
p1_set1                                        6
p1_set2                                        6
p1_set3                                      NaN
p1_set4                                      NaN
p1_set5                                      NaN
p1_sets_won                                    2
p2_link                       /player/zhu-b5945/
p2_name                                   Zhu M.
p2_odds                                     5.03
p2_seed                                      NaN
p2_set1                                        2
p2_set2                                        1
p2_set3             

In [7]:
from tennis_new.model.utils.filters import (
    MissingPIDFilter,
    MissingScoreFilter,
    PossibleWalkoverFilter,
    RetirementFilter,
    TrainingFilter
)

#### Back to Fitting

In [8]:
# Fit only when we have the unique identifier for both players?
rel = TrainingFilter.filter(jd)

In [20]:
rel['p1_sets_won'].value_counts()

2.0    913374
1.0     36528
3.0     15579
4.0        12
Name: p1_sets_won, dtype: int64

In [24]:
one_set = rel[rel['p1_sets_won'] == 1]
one_set['p1_set1'].isnull().mean()

0.20855234340779677

In [29]:
one_set['p1_set1'].isnull().sum()

7618

There are ~7.6k matches (7,618) we could possibly rule out of training and evaluation if we take out cases with missing scores and p1_sets_won == 1.  TODO: Figure out whether this is worth doing!

In [28]:
one_set['p2_sets_won'].value_counts()

0.0    36528
Name: p2_sets_won, dtype: int64

In [27]:
one_set[
    one_set['p1_set1'].isnull()
].tail(10).T

Unnamed: 0,963426,963532,963553,964346,964807,964830,965113,965359,965363,965533
comment,,,,,,,,,,
date,2020-07-30,2020-07-31,2020-07-31,2020-08-19,2020-08-24,2020-08-24,2020-08-28,2020-08-31,2020-08-31,2020-09-02
match_link,/match-detail/?id=1892194,/match-detail/?id=1892252,/match-detail/?id=1892273,/match-detail/?id=1894757,/match-detail/?id=1897374,/match-detail/?id=1896320,/match-detail/?id=1897099,/match-detail/?id=1897911,/match-detail/?id=1897999,/match-detail/?id=1898091
match_time,10:10,11:40,10:40,10:30,--:--,13:00,12:30,13:10,12:00,13:30
p1_link,/player/hampel-7e857/,/player/tepavac/,/player/zielinski-9d753/,/player/vrbensky/,/player/virtanen-7d161/,/player/alcalde/,/player/karatsev/,/player/obert/,/player/shyla/,/player/basic/
p1_name,Hampel L.,Tepavac M.,Zielinski J.,Vrbensky M.,Virtanen O.,Alcalde J.,Karatsev A.,Obert A.,Shyla Y.,Basic M.
p1_odds,1.61,2.61,1.64,3.2,,6.8,4.13,1.38,,
p1_seed,,,(3),,(4),,,,,
p1_set1,,,,,,,,,,
p1_set2,,,,,,,,,,


In [9]:
from tennis_new.ml.elo import ELOModel

match_elo = ELOModel(winner_mod=True)

In [10]:
%pdb
match_elo.fit_and_backfill(
    rel['p1_link'],
    rel['p2_link'],
    rel['match_link']
)

Automatic pdb calling has been turned ON


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def get_test_set(df, test_min='2011-01-01', test_max='2015-01-01', test_surface=None, filter_walkovers=True):
    """Select the rows of ``df`` that make up an evaluation set.

    Rows are kept when their ``date`` falls in the half-open window
    ``[test_min, test_max)``. The selection can additionally be restricted
    to a single surface and can exclude rows flagged by ``possible_walkover``.

    Args:
        df: DataFrame with a ``date`` column (and a ``surface`` column when
            ``test_surface`` is given).
        test_min: Inclusive lower bound compared against ``df['date']``.
        test_max: Exclusive upper bound compared against ``df['date']``.
        test_surface: When not None, keep only rows whose ``surface`` equals
            this value. None (the default) applies no surface restriction.
        filter_walkovers: When True, drop rows for which
            ``possible_walkover(df)`` is truthy.

    Returns:
        The subset of ``df`` satisfying every active condition.
    """
    cond = (df['date'] >= test_min) & (df['date'] < test_max)
    if test_surface is not None:
        # This branch used to leave the surface mask unbound, so passing any
        # surface raised NameError instead of filtering.
        cond &= (df['surface'] == test_surface)
    if filter_walkovers:
        # NOTE(review): ``possible_walkover`` is not defined or imported in the
        # visible notebook cells (only ``PossibleWalkoverFilter`` is imported)
        # -- confirm it is in scope before calling with filter_walkovers=True.
        cond &= (~possible_walkover(df))
    return df[cond]

def dupe_auc(y_hat_ones):
    """Compute an AUC that does not depend on how matches are labelled.

    The AUC changes with which side of a match is labelled 1 or 0, so every
    prediction is included twice: once as ``p`` with label 1 and once as
    ``1 - p`` with label 0.

    Args:
        y_hat_ones: Sequence or array-like (list, ndarray, pandas Series) of
            the predicted likelihood that the winner won.

    Returns:
        The value of ``roc_auc_score`` on the duplicated sample.
    """
    # Coerce once so plain Python lists work too; ``1. - <list>`` would raise
    # TypeError.
    y_hat_ones = np.asarray(y_hat_ones, dtype=float)
    n = len(y_hat_ones)
    y = np.concatenate([np.ones(n), np.zeros(n)])
    y_hat = np.concatenate([y_hat_ones, 1. - y_hat_ones])
    return roc_auc_score(y, y_hat)

def eval_mod(mod, df, test_min='2011-01-01', test_max='2015-01-01', test_surface=None, filter_walkovers=False):
    """Score a fitted model's recorded predictions on a date window of ``df``.

    ``mod.history`` is turned into a DataFrame and inner-joined onto the
    evaluation rows (``match_link`` on the data side, ``match_id`` on the
    history side). A row counts as correctly predicted when its
    ``elo_match_prediction`` exceeds 0.5 -- presumably because p1 is always
    the match winner in this data; verify against the upstream filters.

    Args:
        mod: Fitted model exposing a ``history`` attribute that
            ``pd.DataFrame`` can consume and that yields ``match_id`` and
            ``elo_match_prediction`` columns.
        df: Match DataFrame with ``match_link``, ``date``, ``p1_odds`` and
            ``p2_odds`` columns.
        test_min, test_max, test_surface, filter_walkovers: Forwarded
            unchanged to ``get_test_set``. Walkover filtering is off by
            default here.

    Returns:
        dict with accuracy/AUC over all evaluation rows (``overall_*``) and,
        restricted to rows where both odds are present and differ, the
        bookmaker's accuracy/AUC (``odds_*``), the model's accuracy/AUC
        (``model_odds_accuracy``, ``w_odds_auc``) and the row count
        (``n_w_odds``).
    """
    predictions = pd.DataFrame(mod.history)
    window = get_test_set(
        df,
        test_min=test_min,
        test_max=test_max,
        test_surface=test_surface,
        filter_walkovers=filter_walkovers,
    )
    scored = pd.merge(window, predictions, left_on='match_link', right_on='match_id')

    # Metrics over every evaluation row that has a recorded prediction.
    all_preds = scored['elo_match_prediction']
    overall_accuracy = (all_preds > 0.5).mean()
    overall_auc = dupe_auc(all_preds)

    # Bookmaker comparison: only rows with both prices present and unequal.
    has_p1_price = scored['p1_odds'].notnull()
    has_p2_price = scored['p2_odds'].notnull()
    prices_differ = scored['p1_odds'] != scored['p2_odds']
    priced = scored[has_p1_price & has_p2_price & prices_differ]
    priced_preds = priced['elo_match_prediction']

    n_priced = priced.shape[0]
    bookmaker_accuracy = (priced['p1_odds'] <= priced['p2_odds']).mean()
    model_accuracy_on_priced = (priced_preds > 0.5).mean()
    model_auc_on_priced = dupe_auc(priced_preds)

    # Inverse odds, rescaled so the two sides of a match sum to one.
    inv_p1 = 1. / priced['p1_odds']
    inv_p2 = 1. / priced['p2_odds']
    implied_p1 = inv_p1 / (inv_p1 + inv_p2)
    bookmaker_auc = dupe_auc(implied_p1)

    return {
        'overall_accuracy': overall_accuracy,
        'overall_auc': overall_auc,
        'odds_accuracy': bookmaker_accuracy,
        'model_odds_accuracy': model_accuracy_on_priced,
        'n_w_odds': n_priced,
        'odds_auc': bookmaker_auc,
        'w_odds_auc': model_auc_on_priced,
    }

In [12]:
match_eval = eval_mod(match_elo, rel)
match_eval

{'overall_accuracy': 0.7275996830794705,
 'overall_auc': 0.8088814270063787,
 'odds_accuracy': 0.7200428690759507,
 'model_odds_accuracy': 0.7074658387051017,
 'n_w_odds': 63449,
 'odds_auc': 0.7955596676090044,
 'w_odds_auc': 0.7849735388169738}

In [13]:
match_eval_test = eval_mod(match_elo, rel, test_min='2015-01-01', test_max='2021-01-01')
match_eval_test

{'overall_accuracy': 0.7251851424655582,
 'overall_auc': 0.8055511725054558,
 'odds_accuracy': 0.7252324973282703,
 'model_odds_accuracy': 0.7135962618522477,
 'n_w_odds': 175916,
 'odds_auc': 0.8036791328396125,
 'w_odds_auc': 0.7909297106109022}

Is there some weird relationship between AUC and accuracy for these ranking-based models?  Probably.  The AUC equivalent should really just be telling us how good our ELO parameters are -- accuracy already measures pairwise comparisons between ELO parameters.  AUC additionally measures, roughly speaking, comparisons between those comparisons of ELO parameters.

#### Try Initial Set Model

Note that for this model we will definitely have to tune the ELO parameters.

In [15]:
from tennis_new.ml.elo import ELOModel

set_elo = ELOModel(winner_mod=True)

In [16]:
set_elo.fit_and_backfill(
    rel['p1_link'],
    rel['p2_link'],
    rel['match_link'],
    ys=rel[['p1_sets_won', 'p2_sets_won']].values
)

In [17]:
set_eval = eval_mod(set_elo, rel)
set_eval

{'overall_accuracy': 0.7358520800135314,
 'overall_auc': 0.8187031847302881,
 'odds_accuracy': 0.7200428690759507,
 'model_odds_accuracy': 0.7080332235338619,
 'n_w_odds': 63449,
 'odds_auc': 0.7955596676090044,
 'w_odds_auc': 0.786068092124641}

In [18]:
set_eval_test = eval_mod(set_elo, rel, test_min='2015-01-01', test_max='2021-01-01')
set_eval_test

{'overall_accuracy': 0.732224457463879,
 'overall_auc': 0.8146880623686511,
 'odds_accuracy': 0.7252324973282703,
 'model_odds_accuracy': 0.7148298051342686,
 'n_w_odds': 175916,
 'odds_auc': 0.8036791328396125,
 'w_odds_auc': 0.7938482895413103}

The set model is working pretty well (at least on the overall population)!  We should tune the ELO parameters again.

#### Tune ELO Parameters

In [40]:
from tennis_new.ml.sobol import generate_sobol_seq, get_range_values

MIN_C = 100
MAX_C = 500
MIN_O = 0
MAX_O = 50
MIN_S = 0
MAX_S = 2


sobol_vals = generate_sobol_seq(3, 100, 1)
cs = get_range_values(MIN_C, MAX_C, sobol_vals[:, 0])
os = get_range_values(MIN_O, MAX_O, sobol_vals[:, 1])
ss = get_range_values(MIN_S, MAX_S, sobol_vals[:, 2])

In [41]:
from tqdm.autonotebook import tqdm

out = []
test_out = []
# zip() has no length, so tqdm cannot infer a total and the bar shows neither
# progress nor an ETA; pass it explicitly.
# NOTE(review): assumes sobol_vals has one row per parameter sample, so
# len(sobol_vals) equals the number of (c, o, s) triples -- confirm.
for c, o, s in tqdm(zip(cs, os, ss), total=len(sobol_vals)):
    # Refit the set-level ELO model from scratch with this parameter triple.
    cur_elo = ELOModel(c=c, o=o, s=s, winner_mod=True)
    cur_elo.fit_and_backfill(
        rel['p1_link'],
        rel['p2_link'],
        rel['match_link'],
        ys=rel[['p1_sets_won', 'p2_sets_won']].values
    )
    # Default window (2011-2015) is used for tuning; 2015-2021 is held out.
    cur_eval = eval_mod(cur_elo, rel)
    test_eval = eval_mod(cur_elo, rel, test_min='2015-01-01', test_max='2021-01-01')
    # Tag both result rows with the parameters that produced them.
    cur_eval.update({'c': c, 'o': o, 's': s})
    test_eval.update({'c': c, 'o': o, 's': s})
    out.append(cur_eval)
    test_out.append(test_eval)



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [42]:
tune_df = pd.DataFrame(out)

In [43]:
test_df = pd.DataFrame(test_out)

In [57]:
tune_df.sort_values('w_odds_auc', ascending=False).head(10)

Unnamed: 0,c,model_odds_accuracy,n_w_odds,o,odds_accuracy,odds_auc,overall_accuracy,overall_auc,s,w_odds_auc
35,268.75,0.708254,63449,7.03125,0.720043,0.79556,0.733106,0.815806,0.46875,0.787358
77,334.375,0.708033,63449,4.296875,0.720043,0.79556,0.732029,0.813338,0.515625,0.787179
25,462.5,0.707923,63449,20.3125,0.720043,0.79556,0.733417,0.816437,0.5625,0.787138
22,187.5,0.707828,63449,42.1875,0.720043,0.79556,0.72958,0.812727,0.4375,0.787011
46,143.75,0.708128,63449,35.15625,0.720043,0.79556,0.732848,0.816485,0.34375,0.786883
66,209.375,0.70838,63449,32.421875,0.720043,0.79556,0.73473,0.818286,0.390625,0.7868
40,393.75,0.707623,63449,28.90625,0.720043,0.79556,0.728494,0.81096,0.59375,0.786511
2,200.0,0.707277,63449,37.5,0.720043,0.79556,0.725094,0.807268,0.5,0.785784
68,459.375,0.707907,63449,26.171875,0.720043,0.79556,0.727168,0.808539,0.640625,0.785706
99,265.625,0.706221,63449,12.890625,0.720043,0.79556,0.719944,0.799095,0.609375,0.782946


In [49]:
set_eval

{'overall_accuracy': 0.7358520800135314,
 'overall_auc': 0.8187031847302881,
 'odds_accuracy': 0.7200428690759507,
 'model_odds_accuracy': 0.7080332235338619,
 'n_w_odds': 63449,
 'odds_auc': 0.7955596676090044,
 'w_odds_auc': 0.786068092124641}

In [50]:
(tune_df['w_odds_auc'] > set_eval['w_odds_auc']).mean()

0.07

In [58]:
test_df.sort_values('w_odds_auc', ascending=False, inplace=True)
test_df.head(10)

Unnamed: 0,c,model_odds_accuracy,n_w_odds,o,odds_accuracy,odds_auc,overall_accuracy,overall_auc,s,w_odds_auc
35,268.75,0.715648,175916,7.03125,0.725232,0.803679,0.73062,0.812795,0.46875,0.794669
25,462.5,0.715216,175916,20.3125,0.725232,0.803679,0.73044,0.813099,0.5625,0.794607
77,334.375,0.715216,175916,4.296875,0.725232,0.803679,0.729816,0.811136,0.515625,0.794241
66,209.375,0.715341,175916,32.421875,0.725232,0.803679,0.730789,0.813259,0.390625,0.793946
40,393.75,0.715125,175916,28.90625,0.725232,0.803679,0.727058,0.808482,0.59375,0.793555
46,143.75,0.715154,175916,35.15625,0.725232,0.803679,0.729371,0.811426,0.34375,0.793518
22,187.5,0.715125,175916,42.1875,0.725232,0.803679,0.727187,0.808663,0.4375,0.793221
68,459.375,0.714801,175916,26.171875,0.725232,0.803679,0.725853,0.806771,0.640625,0.792929
2,200.0,0.713676,175916,37.5,0.725232,0.803679,0.723313,0.804483,0.5,0.791811
27,212.5,0.712812,175916,14.0625,0.725232,0.803679,0.731872,0.814734,0.3125,0.791237


In [53]:
# How correlated are AUC and accuracy?
# (The original comment line was garbled with the import below pasted into it.)
from scipy.stats import spearmanr

spearmanr(tune_df['w_odds_auc'], tune_df['model_odds_accuracy'])

SpearmanrResult(correlation=0.9991839183918391, pvalue=2.1003771096175376e-138)

In [None]:
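# Extremely correlated: AUC and accuracy rank the parameter settings almost identically (Spearman correlation of about 0.999).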

In [54]:
tune_df.iloc[0]

c                        300.000000
model_odds_accuracy        0.635881
n_w_odds               63449.000000
o                         25.000000
odds_accuracy              0.720043
odds_auc                   0.795560
overall_accuracy           0.643674
overall_auc                0.706161
s                          1.000000
w_odds_auc                 0.698537
Name: 0, dtype: float64

In [55]:
test_df.iloc[0]

c                         268.750000
model_odds_accuracy         0.715648
n_w_odds               175916.000000
o                           7.031250
odds_accuracy               0.725232
odds_auc                    0.803679
overall_accuracy            0.730620
overall_auc                 0.812795
s                           0.468750
w_odds_auc                  0.794669
Name: 35, dtype: float64

In [None]:
test_eval = eval_mod(set_elo, rel, test_min='2015-01-01', test_max='2021-01-01')
(test_df['model_odds_accuracy'] > test_eval['model_odds_accuracy']).mean()

In [56]:
from scipy.stats import spearmanr

tune_df.sort_values(['c', 'o', 's'], ascending=True, inplace=True)
test_df.sort_values(['c', 'o', 's'], ascending=True, inplace=True)
(
    spearmanr(tune_df['model_odds_accuracy'], test_df['model_odds_accuracy']),
    spearmanr(tune_df['overall_accuracy'], test_df['overall_accuracy'])
)

(SpearmanrResult(correlation=0.9990098950668611, pvalue=2.7161452464675175e-134),
 SpearmanrResult(correlation=0.9985118511851184, pvalue=1.2593731225722673e-125))

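Above, it looks like the performance of each ELO parameter setting correlates well between the tuning and testing sets, so settings that do well on the tuning window also do well on the test window.  We get a modest improvement from retuning these.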

In [34]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np

def _plot_params(c, o, s):
    """Draw the curve ``c / (x + o) ** s`` for integer x in [0, 100).

    The line is added to the current pyplot figure and labelled with the
    three parameter values so that several settings can be compared in one
    legend.
    """
    xs = np.arange(100)
    shifted = xs + o
    ys = c / shifted ** s
    legend_text = 'c:%0.2f, o: %0.2f, s:%0.2f' % (c, o, s)
    plt.plot(xs, ys, label=legend_text)
        

def _plot_row(row):
    """Plot the curve for the ``c``, ``o`` and ``s`` entries of ``row``."""
    c, o, s = (row[key] for key in ('c', 'o', 's'))
    _plot_params(c, o, s)

# Sort once rather than once per iteration, then overlay the five settings
# with the best model accuracy on the matches that have odds.
_top_rows = tune_df.sort_values('model_odds_accuracy', ascending=False).head(5)
for _, _row in _top_rows.iterrows():
    _plot_row(_row)

_default_mod = ELOModel()
_plot_params(_default_mod.c, _default_mod.o, _default_mod.s)

plt.legend()

NameError: name 'tune_df' is not defined

> [0;32m<ipython-input-34-c59b1b2396a8>[0m(15)[0;36m<module>[0;34m()[0m
[0;32m     13 [0;31m[0;34m[0m[0m
[0m[0;32m     14 [0;31m[0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m5[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 15 [0;31m    [0m_plot_row[0m[0;34m([0m[0mtune_df[0m[0;34m.[0m[0msort_values[0m[0;34m([0m[0;34m'model_odds_accuracy'[0m[0;34m,[0m [0mascending[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0mi[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m[0;34m[0m[0m
[0m[0;32m     17 [0;31m[0m_default_mod[0m [0;34m=[0m [0mELOModel[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> exit


Above, it looks like the default parameters (those suggested by ESPN) are almost as good as what we've found through tuning.