**Import Packages**

In [1]:
import pycaret
import pandas as pd
import numpy as np
import sys

from pycaret import classification
from datetime import datetime
import model.common
import importlib

**Import Data**

In [2]:
from static_data.load_static_data import *

In [3]:
collect_data_Base_dir = 'collect_data'

In [4]:
df_game_matchup_total = pd.read_pickle(f'{collect_data_Base_dir}/df_game_matchup_total.pkl')
print(len(df_game_matchup_total))

320540


**Load Model**

In [19]:
regression_model = pycaret.classification.load_model(model.common.model_file_name)

Transformation Pipeline and Model Successfully Loaded


In [15]:
print(regression_model)

Pipeline(memory=FastMemory(location=/tmp/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['pitching_gamesPlayed',
                                             'pitching_runs',
                                             'pitching_strikeOuts',
                                             'pitching_hits',
                                             'batting_gamesPlayed',
                                             'batting_runs',
                                             'batting_strikeOuts',
                                             'batting_hits', 'batting_rbi',
                                             'temp'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(i...
                                                              handle_missing='return_nan'))),
                ('actual_estimator',
                 CustomProbabilityThresholdCla

**Get Most Confident Predictions**

In [43]:
train_data = df_game_matchup_total[(df_game_matchup_total.game_date > "2012-04-01") & (df_game_matchup_total.game_date < "2022-12-01")][model.common.features]
test_data = df_game_matchup_total[(df_game_matchup_total.game_date > "2022-12-01")][model.common.features]

In [45]:
test_prediction = pycaret.classification.predict_model(data = test_data, estimator = regression_model)
test_prediction = pd.merge(test_prediction, df_player_team_positions[['player_id','player_team_name']], left_on='batting_id', right_on='player_id', how='left')
test_prediction["theo_odds"] = test_prediction["prediction_score"].apply(model.common.odds_calculator)

In [46]:
test_prediction[model.common.features + ['batting_hit_recorded', 'prediction_label', 'prediction_score', 'theo_odds']]

Unnamed: 0,pitching_gamesPlayed,pitching_runs,pitching_strikeOuts,pitching_hits,pitching_id,batting_gamesPlayed,batting_runs,batting_strikeOuts,batting_hits,batting_rbi,...,temp,game_venue,game_date,game_year,batting_name,pitching_name,batting_hit_recorded,prediction_label,prediction_score,theo_odds
0,30,92,177,171,608337,150,71,95,140,107,...,84.199997,Minute Maid Park,2023-04-01,2023,Kyle Tucker,Lucas Giolito,1,1,0.6469,-183
1,30,92,177,171,608337,155,93,77,142,93,...,84.199997,Minute Maid Park,2023-04-01,2023,Alex Bregman,Lucas Giolito,0,1,0.6626,-196
2,30,92,177,171,608337,157,85,110,183,75,...,84.199997,Minute Maid Park,2023-04-01,2023,Jose Abreu,Lucas Giolito,1,1,0.7310,-272
3,30,92,177,171,608337,157,85,110,183,75,...,84.199997,Minute Maid Park,2023-04-01,2023,Jose Abreu,Lucas Giolito,1,1,0.7310,-272
4,30,92,177,171,608337,16,7,6,10,5,...,84.199997,Minute Maid Park,2023-04-01,2023,David Hensley,Lucas Giolito,0,1,0.5412,-118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22011,32,97,103,194,594835,92,35,53,74,38,...,62.060001,T-Mobile Park,2023-05-23,2023,Aledmys Diaz,Marco Gonzales,0,1,0.6383,-176
22012,32,97,103,194,594835,92,35,53,74,38,...,62.060001,T-Mobile Park,2023-05-23,2023,Aledmys Diaz,Marco Gonzales,0,1,0.6383,-176
22013,32,97,103,194,594835,92,35,53,74,38,...,62.060001,T-Mobile Park,2023-05-23,2023,Aledmys Diaz,Marco Gonzales,0,1,0.6383,-176
22014,32,97,103,194,594835,92,35,53,74,38,...,62.060001,T-Mobile Park,2023-05-23,2023,Aledmys Diaz,Marco Gonzales,0,1,0.6383,-176


In [47]:
score_threshold = 0.75
confident_test_prediction = test_prediction[(test_prediction["prediction_score"] >= score_threshold) & (test_prediction["prediction_label"] == 1)].sort_values(by = "prediction_score", ascending = False).drop_duplicates("batting_name")
confident_test_prediction[['game_date', "batting_name", "batting_hit_recorded",	"prediction_score", "player_team_name", "theo_odds"]]

Unnamed: 0,game_date,batting_name,batting_hit_recorded,prediction_score,player_team_name,theo_odds
11068,2023-04-25,Trea Turner,0,0.8132,Los Angeles Dodgers,-435
13122,2023-04-30,Bo Bichette,1,0.7911,Toronto Blue Jays,-379
16398,2023-05-08,Luis Robert Jr.,1,0.7768,Chicago White Sox,-348
13132,2023-04-30,Vladimir Guerrero Jr.,0,0.7702,Toronto Blue Jays,-335
15767,2023-05-06,Jose Abreu,0,0.7692,Chicago White Sox,-333
3136,2023-04-07,Mookie Betts,1,0.7669,Los Angeles Dodgers,-329
3125,2023-04-07,Freddie Freeman,1,0.7663,Los Angeles Dodgers,-328
16392,2023-05-08,Tim Anderson,1,0.7635,Chicago White Sox,-323
3134,2023-04-07,J.D. Martinez,1,0.7596,Detroit Tigers,-316
19853,2023-05-17,Charlie Blackmon,0,0.7546,Colorado Rockies,-307


In [34]:
confident_test_prediction.batting_hit_recorded.sum() / len(confident_test_prediction)

0.7333333333333333

In [None]:
Daily_Hit_Prediction.to_csv("labeled-"+file_string)
files.download("labeled-"+file_string)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>