use this notebook to see today's bet at the beginning of a day

In [11]:
import pandas as pd, numpy as np
import statsapi
from statsapi import player_stat_data
import requests
from datetime import datetime, timedelta
import numpy as np
import math, os
import meteostat
import pprint
import pickle
import copy

import pycaret
from pycaret import classification
import model.common
import importlib

In [2]:
from static_data.load_static_data import *

In [114]:
date_today_str = datetime.today().strftime("%Y-%m-%d")
date_yesterday_str = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
date_today_str, date_yesterday_str

('2023-06-15', '2023-06-14')

In [4]:
importlib.reload(model.common)

<module 'model.common' from '/home/junlim/projects/mlb-props/model/common.py'>

In [115]:
collect_data_Base_dir = 'collect_data'
df_game_matchup_total = pd.read_pickle(f'{collect_data_Base_dir}/df_game_matchup_total.pkl')
df_game_matchup_2023 = pd.read_pickle(f'{collect_data_Base_dir}/df_game_matchup_2023.pkl')
print(f'df_game_matchup_total: {len(df_game_matchup_total)}, df_game_matchup_2023: {len(df_game_matchup_2023)}')

df_game_matchup_total: 322963, df_game_matchup_2023: 1701


In [124]:
# live matchup does not have the result
df_live_game_matchup = pd.read_pickle(f'collect_data/df_live_game_matchup_{date_today_str}.pkl')
df_live_odds_hits = pd.read_pickle(f"odds_data/df_odds_hits_{date_today_str}.pkl")

In [118]:
date_today = datetime(*datetime.today().timetuple()[:3])
t = datetime(*datetime(2023, 5, 29, 15, 0, 0, 0).timetuple()[:3])
t_since_20230529 = []
while t < date_today:
    t_since_20230529.append(t)
    t += timedelta(days=1)

file_names = [f"odds_data/df_odds_hits_{t.strftime('%Y-%m-%d')}.pkl" for t in t_since_20230529]
file_names = [f for f in file_names if os.path.exists(f)]
df_odds_hits_since_20230529 = pd.concat([pd.read_pickle(f) for f in file_names])

# live prediction

In [15]:
regression_model = pycaret.classification.load_model(model.common.model_file_name)

Transformation Pipeline and Model Successfully Loaded


In [135]:
live_bet_columns = model.common.features + ["batting_hit_recorded", 'prediction_label', 'prediction_score', 'theo_odds']

def get_df_prediction_odd(df_matchup, regression_model):
    df_live_prediction = pycaret.classification.predict_model(data = df_matchup, estimator = regression_model)
    df_live_prediction = pd.merge(df_live_prediction, df_player_team_positions[['player_id','player_team_name']], left_on='batting_id', right_on='player_id', how='left')
    df_live_prediction["theo_odds"] = df_live_prediction["prediction_score"].apply(model.common.odds_calculator)
    return df_live_prediction

def get_df_prediction_hits_odds(df_prediction, df_hits_odds):
    df_hits_odds = copy.copy(df_hits_odds)
    df_hits_odds["over_prob"] = df_hits_odds["over_odds"].apply(model.common.odds_to_probability)
    df_prediction_odds = df_prediction[live_bet_columns + ['game_id']].set_index(['game_id', 'batting_name']).join(\
        df_hits_odds.rename(columns={'player_name': 'batting_name'}).set_index(['game_id', 'batting_name']), lsuffix='', rsuffix='_odds').reset_index()
    df_prediction_odds = df_prediction_odds[df_prediction_odds.over_line < 1.0]
    return df_prediction_odds

def get_df_confident_prediction_odds(df_prediction_odds, score_threshold = 0.70):
    df_confident_prediction_odds = df_prediction_odds.drop_duplicates("batting_name")
    df_confident_prediction_odds = df_confident_prediction_odds[df_confident_prediction_odds["prediction_score"] >= score_threshold]
    # for some reason, the prediction_label should be separatedly checked. higher score does not always lead to prediction label. (maybe the score stands for both labels).
    df_confident_prediction_odds = df_confident_prediction_odds[df_confident_prediction_odds["prediction_label"] == 1]
    hits = df_confident_prediction_odds.batting_hit_recorded.sum().values[0]
    l = len(df_confident_prediction_odds)
    print(f'hit recorded ratio: {1.0 * hits / l} ({hits} out of {l})')
    
    return df_confident_prediction_odds[['game_date', 'team_away', 'team_home', 'batting_name', "batting_hit_recorded", "prediction_score", "theo_odds", 'over_prob', 'over_odds', 'over_line']]

def get_df_advantageous_prediction_odds(df_prediction_odds, prediction_diff_threshold = 0.05, score_threshold = 0.60):
    df_prediction_odds_  = copy.copy(df_prediction_odds)
    df_prediction_odds_  = df_prediction_odds_.drop_duplicates("batting_name")
    df_prediction_odds_['prediction_diff'] = df_prediction_odds_['prediction_score'] - df_prediction_odds_['over_prob']
    df_advantageous_prediction_odds = df_prediction_odds_
    df_advantageous_prediction_odds = df_advantageous_prediction_odds[df_advantageous_prediction_odds["prediction_score"] >= score_threshold]
    df_advantageous_prediction_odds = df_advantageous_prediction_odds[df_advantageous_prediction_odds["prediction_label"] == 1]
    df_advantageous_prediction_odds = df_advantageous_prediction_odds[df_advantageous_prediction_odds["prediction_diff"] >= prediction_diff_threshold]
    hits = df_advantageous_prediction_odds.batting_hit_recorded.sum().values[0]
    l = len(df_advantageous_prediction_odds)
    print(f'hit recorded ratio: {1.0 * hits / l} ({hits} out of {l})')
    
    return df_advantageous_prediction_odds[['game_date', 'team_away', 'team_home', 'batting_name', "prediction_diff", "batting_hit_recorded", "prediction_score", "theo_odds", 'over_prob', 'over_odds', 'over_line']]

In [17]:
df_live_prediction = get_df_prediction_odd(df_live_game_matchup, regression_model)
df_live_prediction_hits_odds = get_df_prediction_hits_odds(df_live_prediction, df_live_odds_hits)

In [27]:
df_live_prediction.columns

Index(['pitching_name', 'pitching_id', 'pitching_gamesPlayed',
       'pitching_gamesStarted', 'pitching_groundOuts', 'pitching_airOuts',
       'pitching_runs', 'pitching_doubles', 'pitching_triples',
       'pitching_homeRuns',
       ...
       'game_date', 'game_datetime', 'temp', 'game_year',
       'batting_hit_recorded', 'prediction_label', 'prediction_score',
       'player_id', 'player_team_name', 'theo_odds'],
      dtype='object', length=121)

In [28]:
df_live_prediction_hits_odds.columns

Index(['game_id', 'batting_name', 'pitching_gamesPlayed',
       'pitching_runs_per_game', 'pitching_strikeOuts_per_game',
       'pitching_hits_per_game', 'pitching_id', 'batting_gamesPlayed',
       'batting_runs_per_game', 'batting_strikeOuts_per_game',
       'batting_hits_per_game', 'batting_rbi', 'batting_id', 'pitching_name',
       'batting_hit_recorded', 'pitching_cur_season_runs_per_game',
       'pitching_cur_season_strikeOuts_per_game',
       'pitching_cur_season_hits_per_game', 'batting_cur_season_runs_per_game',
       'batting_cur_season_strikeOuts_per_game',
       'batting_cur_season_hits_per_game', 'temp', 'game_venue', 'game_date',
       'game_year', 'batting_hit_recorded', 'prediction_label',
       'prediction_score', 'theo_odds', 'game_date_odds', 'team_away',
       'team_home', 'over_odds', 'over_line', 'under_odds', 'under_line',
       'over_prob'],
      dtype='object')

In [138]:
df_live_confident_prediction_odds = get_df_confident_prediction_odds(df_live_prediction_hits_odds, score_threshold = 0.70)
df_live_confident_prediction_odds

hit recorded ratio: 0.7692307692307693 (20 out of 26)


Unnamed: 0,game_date,team_away,team_home,batting_name,batting_hit_recorded,batting_hit_recorded.1,prediction_score,theo_odds,over_prob,over_odds,over_line
60,2023-06-11,Chicago Cubs,San Francisco Giants,Thairo Estrada,1,1,0.77,-335,0.71831,-255,0.5
62,2023-06-11,Chicago Cubs,San Francisco Giants,Trey Mancini,1,1,0.74,-285,0.62963,-170,0.5
81,2023-06-11,Cincinnati Reds,St. Louis Cardinals,Spencer Steer,1,1,0.81,-426,0.71831,-255,0.5
86,2023-06-11,Cincinnati Reds,St. Louis Cardinals,TJ Friedl,1,1,0.77,-335,0.714286,-250,0.5
102,2023-06-11,San Diego Padres,Colorado Rockies,Ha-Seong Kim,0,0,0.73,-270,0.672131,-205,0.5
135,2023-06-11,Seattle Mariners,Los Angeles Angels,Hunter Renfroe,1,1,0.73,-270,0.636364,-175,0.5
143,2023-06-11,Seattle Mariners,Los Angeles Angels,Shohei Ohtani,1,1,0.87,-669,0.71831,-255,0.5
151,2023-06-11,Oakland Athletics,Milwaukee Brewers,Esteury Ruiz,1,1,0.71,-245,0.672131,-205,0.5
174,2023-06-11,Oakland Athletics,Milwaukee Brewers,William Contreras,0,0,0.72,-257,0.677419,-210,0.5
184,2023-06-11,Texas Rangers,Tampa Bay Rays,Corey Seager,1,1,0.85,-567,0.71831,-255,0.5


In [139]:
df_live_advantageous_prediction_odds = get_df_advantageous_prediction_odds(df_live_prediction_hits_odds, prediction_diff_threshold = 0.05, score_threshold = 0.60)
df_live_advantageous_prediction_odds

hit recorded ratio: 0.7727272727272727 (17 out of 22)


Unnamed: 0,game_date,team_away,team_home,batting_name,prediction_diff,batting_hit_recorded,batting_hit_recorded.1,prediction_score,theo_odds,over_prob,over_odds,over_line
6,2023-06-11,Miami Marlins,Chicago White Sox,Clint Frazier,0.134545,1,1,0.68,-213,0.545455,-120,0.5
8,2023-06-11,Miami Marlins,Chicago White Sox,Elvis Andrus,0.088163,0,0,0.68,-213,0.591837,-145,0.5
60,2023-06-11,Chicago Cubs,San Francisco Giants,Thairo Estrada,0.05169,1,1,0.77,-335,0.71831,-255,0.5
62,2023-06-11,Chicago Cubs,San Francisco Giants,Trey Mancini,0.11037,1,1,0.74,-285,0.62963,-170,0.5
78,2023-06-11,Cincinnati Reds,St. Louis Cardinals,Nolan Gorman,0.054444,0,0,0.61,-156,0.555556,-125,0.5
81,2023-06-11,Cincinnati Reds,St. Louis Cardinals,Spencer Steer,0.09169,1,1,0.81,-426,0.71831,-255,0.5
86,2023-06-11,Cincinnati Reds,St. Louis Cardinals,TJ Friedl,0.055714,1,1,0.77,-335,0.714286,-250,0.5
102,2023-06-11,San Diego Padres,Colorado Rockies,Ha-Seong Kim,0.057869,0,0,0.73,-270,0.672131,-205,0.5
135,2023-06-11,Seattle Mariners,Los Angeles Angels,Hunter Renfroe,0.093636,1,1,0.73,-270,0.636364,-175,0.5
143,2023-06-11,Seattle Mariners,Los Angeles Angels,Shohei Ohtani,0.15169,1,1,0.87,-669,0.71831,-255,0.5


# past prediction

## past predictions and odds since 2023 start

In [140]:
df_live_prediction_since_2023_start = get_df_prediction_odd(df_game_matchup_2023[df_game_matchup_2023.game_date >= '2023-04-01'].reset_index(drop=True), regression_model)
df_live_prediction_hits_odds_since_2023_start = get_df_prediction_hits_odds(df_live_prediction_since_2023_start, df_odds_hits_since_20230529)
df_live_prediction_hits_odds_since_2023_start = df_live_prediction_hits_odds_since_2023_start.drop_duplicates()
df_live_prediction_hits_odds_since_2023_start.batting_hit_recorded = df_live_prediction_hits_odds_since_2023_start.batting_hit_recorded.astype(np.int)

In [121]:
def get_eval_profile(df_prediction, score_threshold):
    confident_prediction = df_prediction.drop_duplicates("batting_name")
    confident_prediction = confident_prediction[confident_prediction["prediction_score"] >= score_threshold]
    # for some reason, the prediction_label should be separatedly checked. higher score does not always lead to prediction label. (maybe the score stands for both labels).
    confident_prediction = confident_prediction[confident_prediction["prediction_label"] == 1]
    l = len(confident_prediction)
    return l, confident_prediction.batting_hit_recorded.sum() / l

In [141]:
df_live_confident_prediction_odds_since_2023_start = get_df_confident_prediction_odds(df_live_prediction_hits_odds_since_2023_start, score_threshold = 0.80)
df_live_confident_prediction_odds_since_2023_start = df_live_confident_prediction_odds_since_2023_start.loc[:,~df_live_confident_prediction_odds_since_2023_start.columns.duplicated()]
df_live_confident_prediction_odds_since_2023_start

hit recorded ratio: 1.0 (13 out of 13)


Unnamed: 0,game_date,team_away,team_home,batting_name,batting_hit_recorded,prediction_score,theo_odds,over_prob,over_odds,over_line
228,2023-06-14,Los Angeles Angels,Texas Rangers,Shohei Ohtani,1,0.87,-669,0.672131,-205,0.5
236,2023-06-14,New York Yankees,New York Mets,Anthony Rizzo,1,0.84,-525,0.62963,-170,0.5
322,2023-06-14,Colorado Rockies,Boston Red Sox,Randal Grichuk,1,0.9,-900,0.701493,-235,0.5
517,2023-06-13,Chicago White Sox,Los Angeles Dodgers,Will Smith,1,0.8,-400,0.6875,-220,0.5
558,2023-06-13,Washington Nationals,Houston Astros,Keibert Ruiz,1,0.82,-456,0.661017,-195,0.5
619,2023-06-13,Los Angeles Angels,Texas Rangers,Nathaniel Lowe,1,0.8,-400,0.736842,-280,0.5
697,2023-06-13,Milwaukee Brewers,Minnesota Twins,Carlos Correa,1,0.84,-525,0.666667,-200,0.5
838,2023-06-13,Colorado Rockies,Boston Red Sox,Rafael Devers,1,0.86,-614,0.74026,-285,0.5
1199,2023-06-11,Texas Rangers,Tampa Bay Rays,Corey Seager,1,0.85,-567,0.71831,-255,0.5
1898,2023-06-08,Boston Red Sox,Cleveland Guardians,Reese McGuire,1,0.81,-426,0.622642,-165,0.5


In [150]:
df_live_advantageous_prediction_odds_since_2023_start = get_df_advantageous_prediction_odds(df_live_prediction_hits_odds_since_2023_start, prediction_diff_threshold = 0.05, score_threshold = 0.70)
df_live_advantageous_prediction_odds_since_2023_start = df_live_advantageous_prediction_odds_since_2023_start.loc[:,~df_live_advantageous_prediction_odds_since_2023_start.columns.duplicated()]
df_live_advantageous_prediction_odds_since_2023_start

hit recorded ratio: 0.8666666666666667 (26 out of 30)


Unnamed: 0,game_date,team_away,team_home,batting_name,prediction_diff,batting_hit_recorded,prediction_score,theo_odds,over_prob,over_odds,over_line
4,2023-06-14,Philadelphia Phillies,Arizona Diamondbacks,Bryson Stott,0.062581,1,0.74,-285,0.677419,-210,0.5
41,2023-06-14,Cleveland Guardians,San Diego Padres,Ha-Seong Kim,0.176667,0,0.76,-317,0.583333,-140,0.5
110,2023-06-14,Chicago White Sox,Los Angeles Dodgers,J.D. Martinez,0.08746,0,0.77,-335,0.68254,-215,0.5
209,2023-06-14,Los Angeles Angels,Texas Rangers,Hunter Renfroe,0.0525,1,0.74,-285,0.6875,-220,0.5
228,2023-06-14,Los Angeles Angels,Texas Rangers,Shohei Ohtani,0.197869,1,0.87,-669,0.672131,-205,0.5
236,2023-06-14,New York Yankees,New York Mets,Anthony Rizzo,0.21037,1,0.84,-525,0.62963,-170,0.5
247,2023-06-14,New York Yankees,New York Mets,Francisco Lindor,0.063636,1,0.7,-233,0.636364,-175,0.5
252,2023-06-14,New York Yankees,New York Mets,Jeff McNeil,0.053973,1,0.78,-355,0.726027,-265,0.5
322,2023-06-14,Colorado Rockies,Boston Red Sox,Randal Grichuk,0.198507,1,0.9,-900,0.701493,-235,0.5
341,2023-06-14,Toronto Blue Jays,Baltimore Orioles,Daulton Varsho,0.077143,0,0.72,-257,0.642857,-180,0.5


## past predictions and odds since 2023/05/29

In [147]:
df_live_prediction_since_20230529 = get_df_prediction_odd(df_game_matchup_2023[df_game_matchup_2023.game_date >= '2023-05-29'].reset_index(drop=True), regression_model)
df_live_prediction_since_20230529.batting_hit_recorded = df_live_prediction_since_20230529.batting_hit_recorded.astype(np.int32)
df_live_prediction_hits_odds_since_20230529 = get_df_prediction_hits_odds(df_live_prediction_since_20230529, df_odds_hits_since_20230529)

In [148]:
get_df_confident_prediction_odds(df_live_prediction_hits_odds_since_20230529, score_threshold = 0.75)

hit recorded ratio: 0.88 (22 out of 25)


Unnamed: 0,game_date,team_away,team_home,batting_name,batting_hit_recorded,batting_hit_recorded.1,prediction_score,theo_odds,over_prob,over_odds,over_line
2,2023-06-14,Philadelphia Phillies,Arizona Diamondbacks,Bryce Harper,1,1,0.77,-335,0.733333,-275,0.5
41,2023-06-14,Cleveland Guardians,San Diego Padres,Ha-Seong Kim,0,0,0.76,-317,0.583333,-140,0.5
110,2023-06-14,Chicago White Sox,Los Angeles Dodgers,J.D. Martinez,0,0,0.77,-335,0.68254,-215,0.5
228,2023-06-14,Los Angeles Angels,Texas Rangers,Shohei Ohtani,1,1,0.87,-669,0.672131,-205,0.5
236,2023-06-14,New York Yankees,New York Mets,Anthony Rizzo,1,1,0.84,-525,0.62963,-170,0.5
252,2023-06-14,New York Yankees,New York Mets,Jeff McNeil,1,1,0.78,-355,0.726027,-265,0.5
322,2023-06-14,Colorado Rockies,Boston Red Sox,Randal Grichuk,1,1,0.9,-900,0.701493,-235,0.5
391,2023-06-14,Milwaukee Brewers,Minnesota Twins,William Contreras,0,0,0.75,-300,0.672131,-205,0.5
482,2023-06-13,Philadelphia Phillies,Arizona Diamondbacks,Josh Rojas,1,1,0.77,-335,0.622642,-165,0.5
491,2023-06-13,Chicago White Sox,Los Angeles Dodgers,Andrew Benintendi,1,1,0.76,-317,0.733333,-275,0.5


In [149]:
get_df_advantageous_prediction_odds(df_live_prediction_hits_odds_since_20230529, prediction_diff_threshold = 0.07, score_threshold = 0.70)

hit recorded ratio: 0.8181818181818182 (18 out of 22)


Unnamed: 0,game_date,team_away,team_home,batting_name,prediction_diff,batting_hit_recorded,batting_hit_recorded.1,prediction_score,theo_odds,over_prob,over_odds,over_line
41,2023-06-14,Cleveland Guardians,San Diego Padres,Ha-Seong Kim,0.176667,0,0,0.76,-317,0.583333,-140,0.5
110,2023-06-14,Chicago White Sox,Los Angeles Dodgers,J.D. Martinez,0.08746,0,0,0.77,-335,0.68254,-215,0.5
228,2023-06-14,Los Angeles Angels,Texas Rangers,Shohei Ohtani,0.197869,1,1,0.87,-669,0.672131,-205,0.5
236,2023-06-14,New York Yankees,New York Mets,Anthony Rizzo,0.21037,1,1,0.84,-525,0.62963,-170,0.5
322,2023-06-14,Colorado Rockies,Boston Red Sox,Randal Grichuk,0.198507,1,1,0.9,-900,0.701493,-235,0.5
341,2023-06-14,Toronto Blue Jays,Baltimore Orioles,Daulton Varsho,0.077143,0,0,0.72,-257,0.642857,-180,0.5
378,2023-06-14,Milwaukee Brewers,Minnesota Twins,Brian Anderson,0.07037,1,1,0.7,-233,0.62963,-170,0.5
391,2023-06-14,Milwaukee Brewers,Minnesota Twins,William Contreras,0.077869,0,0,0.75,-300,0.672131,-205,0.5
482,2023-06-13,Philadelphia Phillies,Arizona Diamondbacks,Josh Rojas,0.147358,1,1,0.77,-335,0.622642,-165,0.5
517,2023-06-13,Chicago White Sox,Los Angeles Dodgers,Will Smith,0.1125,1,1,0.8,-400,0.6875,-220,0.5
