In this notebook, we build a very simple workflow for producing predictions for a bet.  The specific problem we're solving is that the real casinos we'll be betting with do not share a player identifier with TennisExplorer, the website from which we scraped our historical data.  As such, we need to do name matching.  Below, we provide an interface for "exact substring matching": enter two incomplete names, and the cell will display two drop-down menus, each listing the players whose TennisExplorer identifier contains the corresponding substring.

We also produce some useful plots and data about prospective bets.

In [1]:
from tennis_new.fetch.tennis_explorer.combiner import read_joined

# Load the joined match data produced by the TennisExplorer combiner module.
# Later cells read per-set score columns (p1_set1..p2_set5), odds columns
# (p1_odds, p2_odds), player links and match links from this frame.
jd = read_joined()

  if (yield from self.run_code(code, result)):


In [2]:
# TODO: Add these to the jd extraction in the first place?
# Adding a couple useful columns to jd

# Build one "<p1 games>-<p2 games>" string per set (missing values become
# empty strings), then join the five sets with commas, e.g. "6.0-3.0,6.0-2.0,-,-,-".
_per_set_scores = [
    jd['p1_set%d' % set_no].fillna('').astype(str)
    + '-'
    + jd['p2_set%d' % set_no].fillna('').astype(str)
    for set_no in range(1, 6)
]
_joined_score = _per_set_scores[0]
for _set_score in _per_set_scores[1:]:
    _joined_score = _joined_score + ',' + _set_score
jd['match_score'] = _joined_score

# Sum of the two implied probabilities (1 / decimal odds); the amount by
# which this exceeds 1 is the bookmaker's margin.
_p1_implied = 1. / jd['p1_odds']
_p2_implied = 1. / jd['p2_odds']
jd['casino_margin'] = _p1_implied + _p2_implied

In [3]:
from tennis_new.model.config.elo.global_set_elo import SetELO

# Instantiate the SetELO model configuration imported above.
set_elo = SetELO()
# Run it over the joined match data. Later cells read set_elo.history_df and
# set_elo.predictor -- presumably populated by this call (TODO confirm).
set_elo.run(jd)
# Last expression in the cell, so the validation metrics dict is displayed.
set_elo.validation_evaluation

{'DummyFilter_prediction_AUCMetric': 0.8187031847302881,
 'DummyFilter_prediction_AccuracyMetric': 0.7358520800135314,
 'DummyFilter_prediction_LogLikelihoodMetric': -0.5226366377611569,
 'HasOddsFilter_prediction_AUCMetric': 0.7839029874196454,
 'HasOddsFilter_prediction_AccuracyMetric': 0.7056423354253945,
 'HasOddsFilter_prediction_LogLikelihoodMetric': -0.5594758958654537,
 'DummyFilter_odds_implied_probability_AUCMetric': None,
 'DummyFilter_odds_implied_probability_AccuracyMetric': None,
 'DummyFilter_odds_implied_probability_LogLikelihoodMetric': None,
 'HasOddsFilter_odds_implied_probability_AUCMetric': 0.7937506478103871,
 'HasOddsFilter_odds_implied_probability_AccuracyMetric': 0.7114980299325661,
 'HasOddsFilter_odds_implied_probability_LogLikelihoodMetric': -0.5501844612492598}

In [4]:
import pandas as pd

# Match-level columns from jd to attach to the model's per-match history.
match_columns = [
    'match_link',
    'date',
    'tourney_link',
    'match_score',
    'p1_name',
    'p2_name',
    'p1_odds',
    'p2_odds',
    'casino_margin'
]

# Inner join (the merge default): history rows keyed by 'match_id' are paired
# with jd rows whose 'match_link' has the same value.
history_df = set_elo.history_df.copy().merge(
    jd[match_columns],
    how='inner',
    left_on='match_id',
    right_on='match_link'
)

In [5]:
# Preview the first rows of the merged history (ELO columns plus match metadata).
history_df.head()

Unnamed: 0,elo1,elo2,match_id,p1_id,p2_id,prediction,match_link,date,tourney_link,match_score,p1_name,p2_name,p1_odds,p2_odds,casino_margin
0,1500.0,1500.0,/match-detail/?id=13857,/player/arazi/,/player/mcenroe/,0.5,/match-detail/?id=13857,1997-01-01,/doha/1997/atp-men/,"2.0-6.0,7.0-5.0,7.0-5.0,-,-",Arazi H.,McEnroe P.,,,
1,1500.0,1500.0,/match-detail/?id=13855,/player/gustafsson/,/player/hrbaty/,0.5,/match-detail/?id=13855,1997-01-01,/doha/1997/atp-men/,"6.0-2.0,2.0-6.0,6.0-3.0,-,-",Gustafsson M.,Hrbaty D.,,,
2,1500.0,1500.0,/match-detail/?id=13856,/player/henman/,/player/el-sawy/,0.5,/match-detail/?id=13856,1997-01-01,/doha/1997/atp-men/,"6.0-3.0,6.0-2.0,-,-,-",Henman T.,El Sawy T.,,,
3,1500.0,1500.0,/match-detail/?id=13858,/player/larsson/,/player/fredriksson/,0.5,/match-detail/?id=13858,1997-01-01,/doha/1997/atp-men/,"6.0-4.0,7.0-6.0,-,-,-",Larsson M.,Fredriksson P.,,,
4,1500.0,1500.0,/match-detail/?id=13797,/player/tillstrom/,/player/black/,0.5,/match-detail/?id=13797,1997-01-01,/adelaide/1997/atp-men/,"-,-,-,-,-",Tillstrom M.,Black B.,,,


In [6]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Get list of players ordered by the last time they played (most recent first).
# NOTE(review): keep='last' on jd assumes jd rows are in chronological order, and
# the last_win / last_loss names assume p1 is the match winner -- confirm both
# against read_joined().
last_win = jd.drop_duplicates('p1_link', keep='last')[['p1_link', 'date']].rename(
    columns={'p1_link': 'pid'}
)
last_loss = jd.drop_duplicates('p2_link', keep='last')[['p2_link', 'date']].rename(
    columns={'p2_link': 'pid'}
)
PLAYER_LIST = (
    pd.concat([last_win, last_loss])
    .sort_values('date', ascending=False)
    # A player can appear twice (last win and last loss). After the descending
    # sort the first occurrence is the more recent one, so keep='first' ranks
    # each player by their latest match; keep='last' would rank them by the
    # older of the two dates.
    .drop_duplicates('pid', keep='first')['pid']
    .dropna()
    .tolist()
)

In [7]:
%matplotlib inline
%pdb
from datetime import datetime
from matplotlib import pyplot as plt
import webbrowser 
import numpy as np
from IPython.display import display


def _process_row(row, pid):
    out = {}
    out['date'] = row['date']
    out['match_score'] = row['match_score']
    out['match_link'] = row['match_link']
    if pid == row['p1_id']:
        out['elo'] = row['elo1']
        out['opponent'] = row['p2_id']
        out['result'] = 'win'
        out['opponent_elo'] = row['elo2']
        out['opponent_name'] = row['p2_name']
        out['prediction'] = row['prediction']
    else:
        out['elo'] = row['elo2']
        out['opponent'] = row['p1_id']
        out['result'] = 'loss'
        out['opponent_elo'] = row['elo1']
        out['opponent_name'] = row['p1_name']
        out['prediction'] = 1. - row['prediction']
    return out
        
def summarize_player(pid, n_matches=10):
    """Plot and tabulate a player's recent form and betting history.

    Two figures are shown:

    1. The player's ELO before each of their last ``n_matches`` matches in
       ``history_df``, followed by their current rating from
       ``set_elo.predictor.beta`` (labelled "today").
    2. Cumulative winnings from staking one unit on the player in every match
       that has odds for both players, using odds with the casino margin
       removed.

    Parameters
    ----------
    pid : str
        Player id as it appears in ``history_df['p1_id']`` / ``['p2_id']``.
    n_matches : int, default 10
        Number of most recent matches to include in the ELO plot and table.

    Returns
    -------
    pandas.DataFrame
        One row per recent match (see ``_process_row``) plus a final "today"
        row carrying the current rating.
    """
    # TODO: Finish this method and add to predict_match below
    involves_player = (history_df['p1_id'] == pid) | (history_df['p2_id'] == pid)
    player_history = history_df[involves_player]

    # Build the records with an explicit loop rather than
    # DataFrame.apply(axis=1).tolist(): on an empty frame apply returns a
    # DataFrame (which has no .tolist()), so a player without any history
    # would raise AttributeError.
    records = [
        _process_row(row, pid)
        for _, row in player_history.tail(n_matches).iterrows()
    ]
    records.append({
        # NOTE(review): assumes beta can be indexed by every pid offered in
        # the dropdown -- confirm how the predictor handles unseen players.
        'elo': set_elo.predictor.beta[pid],
        'date': datetime.today().strftime('%Y-%m-%d'),
        'opponent_name': 'today',
        'result': ''
    })
    recent_matches = pd.DataFrame(records)

    # Plot recent history
    _x = range(len(recent_matches))
    plt.plot(_x, recent_matches['elo'])
    plt.xticks(
        ticks=_x,
        labels=recent_matches['opponent_name'] + '_' + recent_matches['result'],
        # Numeric angle: recent matplotlib releases reject the string '90'.
        rotation=90
    )
    plt.title("Recent ELO: %s" % pid)
    plt.show()
    plt.close()

    # Plot "0-Margin Casino" betting history
    w_odds = player_history[
        player_history['p1_odds'].notnull() & player_history['p2_odds'].notnull()
    ]
    # Get what odds would be with no casino margin: multiplying by
    # casino_margin (= 1/p1_odds + 1/p2_odds) rescales the implied
    # probabilities so that they sum to 1.
    adjusted_odds = w_odds['p1_odds'] * w_odds['casino_margin']
    # A bet on the player pays out only in rows where they are p1 (rows that
    # _process_row labels 'win'); every bet costs one unit.
    winning_bets = w_odds['p1_id'] == pid
    _n_bets = np.arange(w_odds.shape[0]) + 1
    winnings_over_time = (winning_bets.astype(int) * adjusted_odds).cumsum() - _n_bets
    plt.plot(pd.to_datetime(w_odds['date']), winnings_over_time)
    plt.xlabel("Date")
    plt.ylabel("Winnings From 0-Margin Casino of Always Betting on Player")
    plt.title("Casino Bias")
    plt.show()
    plt.close()
    return recent_matches
    
def _predict_match(p1_id, p2_id):
    """Show everything we know about a prospective match and return the model's prediction.

    Side effects: opens both players' TennisExplorer profile pages in the web
    browser, prints and displays a summary for each player (see
    ``summarize_player``), and displays the rows of ``history_df`` in which
    both p1 and p2 are one of the two players.

    Returns whatever ``set_elo.predictor.predict(p1_id, p2_id)`` returns.
    """
    rule = '-' * 20
    both_players = [p1_id, p2_id]

    # Open up the TennisExplorer player profiles
    for player_id in both_players:
        webbrowser.open("https://www.tennisexplorer.com" + player_id)

    # Print our prediction
    for player_id in both_players:
        print(rule + "SUMMARIZING %s" % player_id + rule)
        display(summarize_player(player_id))

    print(rule + "HEAD-TO-HEAD" + rule)
    p1_is_either = history_df['p1_id'].isin(both_players)
    p2_is_either = history_df['p2_id'].isin(both_players)
    display(history_df[p1_is_either & p2_is_either])

    return set_elo.predictor.predict(p1_id, p2_id)

def predict_match(p1_name, p2_name):
    """Pick two players by (partial) name and show the interactive match prediction.

    Each name fragment is matched case-insensitively as a substring of the
    player ids in ``PLAYER_LIST``. The matches populate two dropdowns; picking
    a value in either one runs ``_predict_match`` on the selected pair.

    Parameters
    ----------
    p1_name, p2_name : str
        Partial player names, e.g. ``"Dhoe"``.

    Returns
    -------
    None. If either fragment matches no player, a message is printed and no
    widget is created.
    """
    def _candidates(fragment):
        needle = fragment.lower()
        # The ids seen in this notebook are URL slugs such as
        # '/player/el-sawy/', so also try the fragment with whitespace runs
        # replaced by hyphens. NOTE(review): slug format inferred from the
        # sample rows -- confirm.
        slug = '-'.join(needle.split())
        return [pid for pid in PLAYER_LIST if needle in pid or slug in pid]

    p1_players = _candidates(p1_name)
    p2_players = _candidates(p2_name)

    # An empty dropdown would call _predict_match with None and fail on the
    # URL concatenation; report the problem instead.
    unmatched = [
        name for name, players in ((p1_name, p1_players), (p2_name, p2_players))
        if not players
    ]
    if unmatched:
        print("No players found matching: %s" % ', '.join(repr(name) for name in unmatched))
        return

    interact(_predict_match, p1_id=p1_players, p2_id=p2_players)

Automatic pdb calling has been turned ON


In [8]:
# Example: look up both players by partial surname and pick the exact ids from the dropdowns.
predict_match("Dhoe", "Harris")

interactive(children=(Dropdown(description='p1_id', options=('/player/dhoe/',), value='/player/dhoe/'), Dropdo…