**Import Packages**

In [1]:
import pycaret
import pandas as pd
import numpy as np
import sys

from pycaret import classification
from datetime import datetime
import model.common
import importlib

**Import Data**

In [2]:
from static_data.load_static_data import *

In [3]:
collect_data_Base_dir = 'collect_data'

In [4]:
df_game_matchup_total = pd.read_pickle(f'{collect_data_Base_dir}/df_game_matchup_total.pkl')
print(len(df_game_matchup_total))

322963


**Load Model**

In [5]:
regression_model = pycaret.classification.load_model(model.common.model_file_name)

Transformation Pipeline and Model Successfully Loaded


In [6]:
print(regression_model)

Pipeline(memory=FastMemory(location=/tmp/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['pitching_gamesPlayed',
                                             'pitching_runs_per_game',
                                             'pitching_strikeOuts_per_game',
                                             'pitching_hits_per_game',
                                             'batting_gamesPlayed',
                                             'batting_runs_per_game',
                                             'batting_strikeOuts_per_game',
                                             'batting_hits_per_game',
                                             'batting_rbi',
                                             'pitching_cur_season_runs_p...
                                                      class_weight=None,
                                                      classifier=RandomForestClassifier(n_jobs=-1,
                                

**Get Most Confident Predictions**

In [7]:
train_data = df_game_matchup_total[(df_game_matchup_total.game_date > "2012-04-01") & (df_game_matchup_total.game_date < "2022-12-01")][model.common.features]
test_data = df_game_matchup_total[(df_game_matchup_total.game_date > "2022-12-01")][model.common.features]

In [8]:
test_prediction = pycaret.classification.predict_model(data = test_data, estimator = regression_model)
test_prediction = pd.merge(test_prediction, df_player_team_positions[['player_id','player_team_name']], left_on='batting_id', right_on='player_id', how='left')
test_prediction["theo_odds"] = test_prediction["prediction_score"].apply(model.common.odds_calculator)

In [9]:
test_prediction[model.common.features + ['batting_hit_recorded', 'prediction_label', 'prediction_score', 'theo_odds']]

Unnamed: 0,pitching_gamesPlayed,pitching_runs_per_game,pitching_strikeOuts_per_game,pitching_hits_per_game,pitching_id,batting_name,batting_gamesPlayed,batting_runs_per_game,batting_strikeOuts_per_game,batting_hits_per_game,...,batting_cur_season_strikeOuts_per_game,batting_cur_season_hits_per_game,temp,game_venue,game_date,game_year,batting_hit_recorded,prediction_label,prediction_score,theo_odds
0,30,3.066667,5.90,5.70000,608337,Kyle Tucker,150,0.473333,0.633333,0.933333,...,0.632653,0.979592,84.199997,Minute Maid Park,2023-04-01,2023,1,0,0.52,-108
1,30,3.066667,5.90,5.70000,608337,Alex Bregman,155,0.600000,0.496774,0.916129,...,0.448980,0.836735,84.199997,Minute Maid Park,2023-04-01,2023,1,0,0.53,-113
2,30,3.066667,5.90,5.70000,608337,Jose Abreu,157,0.541401,0.700637,1.165605,...,0.979167,0.854167,84.199997,Minute Maid Park,2023-04-01,2023,1,1,0.66,-194
3,30,3.066667,5.90,5.70000,608337,Jose Abreu,157,0.541401,0.700637,1.165605,...,0.979167,0.854167,84.199997,Minute Maid Park,2023-04-01,2023,1,1,0.66,-194
4,30,3.066667,5.90,5.70000,608337,David Hensley,16,0.437500,0.375000,0.625000,...,1.230769,0.384615,84.199997,Minute Maid Park,2023-04-01,2023,0,0,0.70,-233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26558,32,0.687500,1.75,1.53125,656557,Josh Lowe,52,0.461538,1.269231,0.769231,...,0.900000,1.075000,53.060001,Fenway Park,2023-06-04,2023,1,1,0.71,-245
26559,32,0.687500,1.75,1.53125,656557,Yandy Diaz,137,0.518248,0.437956,1.021898,...,0.642857,1.166667,53.060001,Fenway Park,2023-06-04,2023,0,1,0.60,-150
26560,32,0.687500,1.75,1.53125,656557,Yandy Diaz,137,0.518248,0.437956,1.021898,...,0.642857,1.166667,53.060001,Fenway Park,2023-06-04,2023,0,1,0.60,-150
26561,32,0.687500,1.75,1.53125,656557,Luke Raley,22,0.318182,1.090909,0.545455,...,1.052632,0.736842,53.060001,Fenway Park,2023-06-04,2023,1,0,0.63,-170


In [10]:
score_threshold = 0.75
confident_test_prediction = test_prediction[(test_prediction["prediction_score"] >= score_threshold) & (test_prediction["prediction_label"] == 1)].sort_values(by = "prediction_score", ascending = False).drop_duplicates("batting_name")
confident_test_prediction[['game_date', "batting_name", "batting_hit_recorded",	"prediction_score", "player_team_name", "theo_odds"]]

Unnamed: 0,game_date,batting_name,batting_hit_recorded,prediction_score,player_team_name,theo_odds
23933,2023-05-25,Randal Grichuk,1,0.91,Colorado Rockies,-1011
16151,2023-05-07,Freddie Freeman,1,0.90,Atlanta Braves,-900
7494,2023-04-17,Shohei Ohtani,1,0.88,Los Angeles Angels,-733
9918,2023-04-22,Rafael Devers,1,0.88,Boston Red Sox,-733
25906,2023-06-03,Nico Hoerner,1,0.88,Chicago Cubs,-733
...,...,...,...,...,...,...
22095,2023-05-23,Anthony Santander,1,0.75,Baltimore Orioles,-300
17950,2023-05-12,Marcus Semien,1,0.75,Oakland Athletics,-300
15338,2023-05-06,C.J. Cron,1,0.75,Colorado Rockies,-300
11867,2023-04-27,Trey Mancini,1,0.75,Chicago Cubs,-300


In [11]:
confident_test_prediction.batting_hit_recorded.sum() / len(confident_test_prediction)

0.8360655737704918