**Import Packages**

In [1]:
import pycaret
import pandas as pd
import numpy as np
import sys

from pycaret import classification
from datetime import datetime
import model.common
import importlib

**Import Data**

In [2]:
from static_data.load_static_data import *

In [3]:
# Relative directory that holds the collected data files; used as the prefix
# when building the path of the matchup pickle loaded in this notebook.
collect_data_Base_dir = 'collect_data'

In [86]:
# Load the full matchup table from the collected-data directory and report its row count.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced yourself.
df_game_matchup_total = pd.read_pickle(collect_data_Base_dir + '/df_game_matchup_total.pkl')
print(df_game_matchup_total.shape[0])

326057


In [91]:
# Chronological split on game_date. Both comparisons are strict, so a row dated
# exactly 2012-04-01 or 2022-12-01 lands in neither frame.
df_game_matchup_train = df_game_matchup_total.loc[
    (df_game_matchup_total["game_date"] > "2012-04-01")
    & (df_game_matchup_total["game_date"] < "2022-12-01")
]
df_game_matchup_test = df_game_matchup_total.loc[
    df_game_matchup_total["game_date"] > "2022-12-01"
]

**Load Model**

In [None]:
# Pick the "1hits" configuration (feature list, target column, saved-model path)
# exported by model.common.
feature_columns = model.common.features_1hits_recorded
target_column = model.common.target_1hits_recorded
model_file_name = model.common.model_1hits_file_name

In [52]:
# Three parallel lists: position i of each list describes the same model
# (its feature columns, its target column, and its saved-model file name).
# Keep the order identical across all three when adding or removing entries.
feature_columns_list = [
    # hits
    model.common.features_1hits_recorded,
    model.common.features_2hits_recorded,
    # home runs
    model.common.features_1homeruns_recorded,
    # strikeouts
    model.common.features_1hstrikeouts_recorded,
    model.common.features_2hstrikeouts_recorded,
    # runs
    model.common.features_1runs_recorded,
    model.common.features_2runs_recorded,
    # stolen bases
    model.common.features_1stolenbases_recorded,
    model.common.features_2stolenbases_recorded,
]
target_column_list = [
    # hits
    model.common.target_1hits_recorded,
    model.common.target_2hits_recorded,
    # home runs
    model.common.target_1homeruns_recorded,
    # strikeouts
    model.common.target_1hstrikeouts_recorded,
    model.common.target_2hstrikeouts_recorded,
    # runs
    model.common.target_1runs_recorded,
    model.common.target_2runs_recorded,
    # stolen bases
    model.common.target_1stolenbases_recorded,
    model.common.target_2stolenbases_recorded,
]
model_file_name_list = [
    # hits
    model.common.model_1hits_file_name,
    model.common.model_2hits_file_name,
    # home runs
    model.common.model_1homeruns_file_name,
    # strikeouts
    model.common.model_1hstrikeouts_file_name,
    model.common.model_2hstrikeouts_file_name,
    # runs
    model.common.model_1runs_file_name,
    model.common.model_2runs_file_name,
    # stolen bases
    model.common.model_1stolenbases_file_name,
    model.common.model_2stolenbases_file_name,
]

In [104]:
# List every column whose name contains the substring 'stolen'.
list(filter(lambda column_name: 'stolen' in column_name, df_game_matchup_total.columns))

['pitching_stolenBases',
 'pitching_stolenBasePercentage',
 'batting_boxscore_stolenBases',
 'batting_1stolenBases_recorded',
 'batting_2stolenBases_recorded',
 'batting_stolenBases',
 'batting_stolenBasePercentage']

In [105]:
# Eyeball the raw box-score counts next to the derived *_recorded flag columns.
df_game_matchup_total[
    [
        'pitching_name',
        'batting_name',
        'batting_boxscore_hits',
        'batting_1hits_recorded',
        'batting_2hits_recorded',
        'batting_boxscore_runs',
        'batting_1runs_recorded',
        'batting_1stolenBases_recorded',
        'batting_2stolenBases_recorded',
    ]
]

Unnamed: 0,pitching_name,batting_name,batting_boxscore_hits,batting_1hits_recorded,batting_2hits_recorded,batting_boxscore_runs,batting_1runs_recorded,batting_1stolenBases_recorded,batting_2stolenBases_recorded
0,Brett Myers,Wilson Valdez,2,1,1,0,0,0,0
1,Brett Myers,Ryan Howard,2,1,1,1,1,0,0
2,Brett Myers,Shane Victorino,0,0,0,0,0,0,0
3,Brett Myers,John Mayberry Jr.,1,1,0,0,0,0,0
4,Brett Myers,Jimmy Rollins,2,1,1,2,1,1,0
...,...,...,...,...,...,...,...,...,...
326052,Luke Weaver,Dominic Smith,1,1,0,0,0,0,0
326053,Luke Weaver,Keibert Ruiz,1,1,0,0,0,0,0
326054,Luke Weaver,Lane Thomas,2,1,1,0,0,0,0
326055,Luke Weaver,Derek Hill,1,1,0,0,0,0,0


In [115]:
def get_df_confident_prediction(df_prediction, score_threshold = 0.70, target = None):
    """Keep confident positive predictions, print their hit rate, and return them.

    Args:
        df_prediction: prediction frame containing 'prediction_score',
            'prediction_label', 'game_date', 'batting_name', 'player_team_name',
            'theo_odds' and the target column.
        score_threshold: inclusive lower bound on 'prediction_score'.
        target: name of the 0/1 outcome column. When omitted, the notebook-level
            ``target_column`` is used so two-argument calls keep working.

    Returns:
        The selected rows sorted by ascending 'prediction_score', restricted to
        game_date, batting_name, player_team_name, the target, prediction_score
        and theo_odds.
    """
    target_name = target_column if target is None else target

    # Repeated column names would make column selection return a frame instead
    # of a series, so keep only the first occurrence of each name.
    confident = df_prediction.loc[:, ~df_prediction.columns.duplicated()]

    # Per PyCaret's predict_model docs, prediction_score is the probability of the
    # *predicted* label, so a high score can belong to a confident label-0 row.
    # The label therefore has to be checked separately from the score.
    keep = (confident["prediction_score"] >= score_threshold) & (confident["prediction_label"] == 1)
    confident = confident[keep].sort_values('prediction_score')

    n_selected = len(confident)
    correct_predictions = confident[target_name].sum()
    # Nothing passed the filters: report nan explicitly instead of dividing 0 by 0.
    ratio = round(1.0 * correct_predictions / n_selected, 2) if n_selected else float('nan')
    print(f'correct prediction recorded ratio: {ratio} ({correct_predictions} out of {n_selected})')
    return confident[['game_date', 'batting_name', 'player_team_name', target_name, "prediction_score", "theo_odds"]]


# zip() silently truncates to the shortest input, so fail loudly if the three
# parallel configuration lists ever drift apart.
assert len(feature_columns_list) == len(target_column_list) == len(model_file_name_list)

# The date windows do not depend on the model; compute the masks once.
# Both boundaries are strict, so 2012-04-01 and 2022-12-01 themselves are excluded.
game_dates = df_game_matchup_total.game_date
train_period_mask = (game_dates > "2012-04-01") & (game_dates < "2022-12-01")
test_period_mask = game_dates > "2022-12-01"

for feature_columns, target_column, model_file_name in zip(feature_columns_list, target_column_list, model_file_name_list):
    print('\n')
    print(feature_columns[-2:], target_column, model_file_name)
    # Despite the variable name, the pipeline is loaded through pycaret.classification.
    regression_model = pycaret.classification.load_model(model_file_name)

    train_data = df_game_matchup_total.loc[train_period_mask, feature_columns]
    test_data = df_game_matchup_total.loc[test_period_mask, feature_columns]

    # Row count per target value in the training window.
    print(train_data.groupby([target_column]).count()['batting_name'])

    test_prediction = model.common.predict_and_odds(test_data, regression_model)

    # Only the printed ratio matters here; the returned frame is discarded.
    get_df_confident_prediction(test_prediction, 0.75, target_column)



['game_year', 'batting_1hits_recorded'] batting_1hits_recorded model/batter_1hits_regression_model
Transformation Pipeline and Model Successfully Loaded
batting_1hits_recorded
0    122085
1    163005
Name: batting_name, dtype: int64
correct prediction recorded ratio: 0.78 (2606 out of 3333)


['game_year', 'batting_2hits_recorded'] batting_2hits_recorded model/batter_2hits_regression_model
Transformation Pipeline and Model Successfully Loaded
batting_2hits_recorded
0    224320
1     60770
Name: batting_name, dtype: int64
correct prediction recorded ratio: 0.7 (71 out of 102)


['game_year', 'batting_1homeRuns_recorded'] batting_1homeRuns_recorded model/batter_1homeruns_regression_model
Transformation Pipeline and Model Successfully Loaded
batting_1homeRuns_recorded
0    254580
1     30510
Name: batting_name, dtype: int64
correct prediction recorded ratio: nan (0 out of 0)


['game_year', 'batting_1strikeOuts_recorded'] batting_1strikeOuts_recorded model/batter_1hstrikeouts_regression

In [5]:
# Pick the "1hits" configuration (feature list, target column, saved-model path)
# exported by model.common for the single-model walkthrough.
feature_columns = model.common.features_1hits_recorded
target_column = model.common.target_1hits_recorded
model_file_name = model.common.model_1hits_file_name

In [19]:
# Load the saved pipeline for the selected model file. Despite the variable
# name, it is loaded through PyCaret's classification module.
regression_model = classification.load_model(model_file_name)

Transformation Pipeline and Model Successfully Loaded


**Get Most Confident Predictions**

In [22]:
# Restrict the matchup table to the selected feature columns for each date window.
# Both boundaries are strict, so 2012-04-01 and 2022-12-01 themselves are excluded.
train_data = df_game_matchup_total.loc[
    (df_game_matchup_total["game_date"] > "2012-04-01")
    & (df_game_matchup_total["game_date"] < "2022-12-01"),
    feature_columns,
]
test_data = df_game_matchup_total.loc[
    df_game_matchup_total["game_date"] > "2022-12-01",
    feature_columns,
]

In [23]:
# Score the held-out rows with the loaded pipeline.
# NOTE(review): predict_and_odds is defined in model.common (not visible here); downstream
# code reads 'prediction_score', 'prediction_label' and 'theo_odds' from its result -- confirm there.
test_prediction = model.common.predict_and_odds(test_data, regression_model)

In [47]:
def get_df_confident_prediction(df_prediction, score_threshold = 0.70, target = None):
    """Keep confident positive predictions, print their hit rate, and return them.

    Args:
        df_prediction: prediction frame containing 'prediction_score',
            'prediction_label', 'game_date', 'batting_name', 'player_team_name',
            'theo_odds' and the target column.
        score_threshold: inclusive lower bound on 'prediction_score'.
        target: name of the 0/1 outcome column. When omitted, the notebook-level
            ``target_column`` is used so existing two-argument calls keep working.

    Returns:
        The selected rows sorted by ascending 'prediction_score', restricted to
        game_date, batting_name, player_team_name, the target, prediction_score
        and theo_odds.
    """
    target_name = target_column if target is None else target

    # Repeated column names would make column selection return a frame instead
    # of a series, so keep only the first occurrence of each name.
    confident = df_prediction.loc[:, ~df_prediction.columns.duplicated()]

    # Per PyCaret's predict_model docs, prediction_score is the probability of the
    # *predicted* label, so a high score can belong to a confident label-0 row.
    # The label therefore has to be checked separately from the score.
    keep = (confident["prediction_score"] >= score_threshold) & (confident["prediction_label"] == 1)
    confident = confident[keep].sort_values('prediction_score')

    n_selected = len(confident)
    correct_predictions = confident[target_name].sum()
    # Nothing passed the filters: report nan explicitly instead of dividing 0 by 0.
    ratio = round(1.0 * correct_predictions / n_selected, 2) if n_selected else float('nan')
    print(f'correct prediction recorded ratio: {ratio} ({correct_predictions} out of {n_selected})')
    return confident[['game_date', 'batting_name', 'player_team_name', target_name, "prediction_score", "theo_odds"]]

In [49]:
# Show the label-1 predictions scoring at least 0.80; the call prints the
# correct-prediction ratio and the returned frame is rendered as the cell output.
get_df_confident_prediction(test_prediction, 0.80)

correct prediction recorded ratio: 0.81 (1102 out of 1360)


Unnamed: 0,game_date,batting_name,player_team_name,batting_1hits_recorded,prediction_score,theo_odds
28152,2023-06-10,Nolan Jones,Cleveland Guardians,1,0.8000,-400
498,2023-04-02,Brandon Lowe,Tampa Bay Rays,1,0.8001,-400
5755,2023-04-13,Brandon Marsh,Philadelphia Phillies,0,0.8002,-401
5754,2023-04-13,Brandon Marsh,Los Angeles Angels,0,0.8002,-401
6715,2023-04-15,Sean Murphy,Atlanta Braves,1,0.8003,-401
...,...,...,...,...,...,...
1084,2023-04-03,Dansby Swanson,Chicago Cubs,1,0.9799,-4875
710,2023-04-02,Paul Goldschmidt,Arizona Diamondbacks,1,0.9825,-5614
711,2023-04-02,Paul Goldschmidt,St. Louis Cardinals,1,0.9825,-5614
1277,2023-04-03,Paul Goldschmidt,St. Louis Cardinals,1,0.9856,-6844
