In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split

# from baseball savant, parse out the pitcher csv file downloaded.

pitching_stats = pd.read_csv('../../datasets/baseball savant/pitching stats.csv').dropna()
hitting_stats = pd.read_csv('../../datasets/baseball savant/hitting stats.csv')

# remove unnamed columns then rows with nan
hitting_stats = hitting_stats.dropna(how='all', axis='columns').dropna()

# we need to derive ALOT of normalized variables, each w.r.t. the amount of PA's

percent_cols = [col for col in hitting_stats.columns if 'percent' in col]

# percents that are actually already in decimal: slg%, ob%, so just fix everything else.

percent_cols = [col for col in percent_cols if col not in ['slg_percent', 'on_base_percent']]

# add in barrel_batted_rate as well
percent_cols.append('barrel_batted_rate')

hitting_stats[percent_cols] = hitting_stats[percent_cols].apply(lambda x: x/100)

# what avg's are we working with (yeah i could just look at the csv, but that shit makes me dizzy)

avg_cols = [col for col in hitting_stats.columns if 'avg' in col and 'batting_avg' not in col]

# define our predictor variable, which is runs/PA

hitting_stats['runs/PA'] =  hitting_stats['r_run'] / hitting_stats['b_total_pa']

# define the set of features we want to model for.

# features = ['b_k_percent', 'b_bb_percent', 'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent',
#             'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'hard_hit_percent', 'z_swing_percent',
#             'z_swing_miss_percent', 'oz_swing_percent','oz_swing_miss_percent', 'oz_contact_percent', 'meatball_swing_percent', 'iz_contact_percent',
#             'whiff_percent', 'swing_percent', 'pull_percent', 'straightaway_percent', 'opposite_percent', 'f_strike_percent', 'groundballs_percent', 
#             'flyballs_percent', 'linedrives_percent', 'popups_percent', 'sprint_speed', 'runs/PA']

features = ['b_k_percent', 'b_bb_percent', 'exit_velocity_avg', 'launch_angle_avg', 'solidcontact_percent',
            'sprint_speed','whiff_percent', 'oz_swing_percent','meatball_swing_percent','iz_contact_percent', 'linedrives_percent', 'woba']

dataset = hitting_stats[features]

train_dataset = dataset.sample(frac=0.9, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('woba')
test_labels = test_features.pop('woba')

# ok, normalize dis hoe (run me that z-score bruh).

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

def build_and_compile_model(norm):
  model = tf.keras.Sequential([
      norm,
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(1)
  ])

  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.0005))
  return model

dnn_model = build_and_compile_model(normalizer)

history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

def plot_loss(history):
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  plt.xlabel('Epoch')
  plt.ylabel('Error [wOBA]')
  plt.legend()
  plt.grid(True)

plot_loss(history)

# test_predictions = dnn_model.predict(test_features).flatten()

# a = plt.axes(aspect='equal')
# plt.scatter(test_labels, test_predictions)
# plt.xlabel('True Values (runs/PA)')
# plt.ylabel('Predictions (runs/PA)')

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

def parse_game_logs():
    
    split_headers = {
        0:'Platoon Splits',
        1:'Monthly Splits',
        2:'Base Runner Splits',
        3:'Game Type Splits',
        4:'Out Splits',
        5:'Inning Splits',
    } 

    response = requests.get('https://baseballsavant.mlb.com/probable-pitchers')
    soup = BeautifulSoup(response.text, 'html.parser')

    matchup_strings = [(i.text, i.get('href').split('=')[-1]) for i in soup.find_all("a", {"class": "matchup-link"}) if 'src=' not in str(i)]
    pitchers        = {tup[0]:None for tup in matchup_strings}
    for tup in matchup_strings:
        # structure the pitcher string for query.
        pitcher       = tup[0].lower().replace(' ','-') + tup[1]     
        storage      = {}
        tables       = pd.read_html(f'https://baseballsavant.mlb.com/savant-player/{pitcher}?stats=gamelogs-r-pitching-mlb&season=2022')
        
        count = 0
        # create references for each table.
        for i in range(len(tables)):
            _t = tables[i]

            # need to determine which tables are splits.
            try:
                
                if 'Team' in _t.columns and _t.iloc[0,1] == 'MLB':
                    # convert each column to float if possible.
                    for col in _t:
                        try:
                            _t[col] = _t[col].astype(float)
                        except ValueError as err:
                            pass
                    
                    storage[split_headers[count]] = _t
                    count += 1

            except Exception as err:
                pass

        pitchers[tup[0]] = storage
        
    with open('../datasets/baseball savant/probable_pitchers.pkl', 'wb') as handle:
        pickle.dump(pitchers, handle, protocol=pickle.HIGHEST_PROTOCOL)

parse_game_logs()

In [None]:
import matplotlib.pyplot as plt

def generate_inning_lines(surpress_visuals = False):
    with open('../datasets/baseball savant/probable_pitchers.pkl', 'rb') as handle:
        data = pickle.load(handle)

    # generate table of 1st inning performance for all pitchers.
    table        = {}
    cols_to_edit = ['BF', 'H','R', 'ER', 'HR', 'BB', 'SO']
    for pitcher, stats in data.items():
    
        try:
            inning_splits            = stats['Inning Splits']
            inning_splits.insert(0, 'Pitcher', pitcher)
            
            # fix the fact that baseball savant records fractional innings as decimals 
            # i.e. convert 17.2 IP -> 17.67 IP to represent 17 innings and 2 outs.

            inning_splits['adj_IP'] = inning_splits['IP'].astype(int) + (inning_splits['IP'] - inning_splits['IP'].astype(int)) * 3.3
            for col in cols_to_edit:

                inning_splits[f'{col}/inning'] = round(inning_splits[col] / (inning_splits['adj_IP']),3)
            
            inning_splits.style.set_table_attributes("style='display:inline'").set_caption(pitcher + 'inning splits.')
            inning_splits = inning_splits.fillna("")

            # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            #     display(inning_splits)

            table[pitcher] = inning_splits
            print(inning_splits)
        except Exception as err:
            pass

    with open('../datasets/baseball savant/probable_pitchers_inning_lines.pkl', 'wb') as handle:
        pickle.dump(table, handle, protocol=pickle.HIGHEST_PROTOCOL)

generate_inning_lines()

In [171]:
def generate_inning_lines(innings: list):

    with open('../datasets/baseball savant/probable_pitchers_inning_lines.pkl', 'rb') as handle:
        data = pickle.load(handle)
        
    _innings      = [i-1 for i in innings]
    table        = pd.DataFrame()
    cum_ks       = {}
    for pitcher, stats in data.items():
        _stats = []
        try:
            if table.empty:
                table = stats.iloc[_innings,:]
                _stats.append(round(table['SO/inning'].sum(),3))
                _stats.append(round(table['H/inning'].sum(),3))
                _stats.append(round(table['R/inning'].sum(),3))
                _stats.append(round(table['ER/inning'].sum(),3))
                _stats.append(round(table['HR/inning'].sum(),3))
                _stats.append(round(table['BB/inning'].sum(),3))
                
            else:
                table2 = stats.iloc[_innings,:]
                _stats.append(round(table2['SO/inning'].sum(),3))
                _stats.append(round(table2['H/inning'].sum(),3))
                _stats.append(round(table2['R/inning'].sum(),3))
                _stats.append(round(table2['ER/inning'].sum(),3))
                _stats.append(round(table2['HR/inning'].sum(),3))
                _stats.append(round(table2['BB/inning'].sum(),3))
                table = pd.concat([table,table2], ignore_index=True)

            cum_ks[pitcher] = _stats

        except Exception as err:
            print(err)
        
    per_inning_df = pd.DataFrame(table)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(per_inning_df)

    # display cumulative k's
    
    cum_ks_df = pd.DataFrame.from_dict(cum_ks, orient='index', columns=[f'avg. cum. k\'s in {innings} innings', f'avg. cum. Hs in {innings} innings',
                                                                        f'avg. cum. Rs in {innings} innings', f'avg. cum. ERs in {innings} innings', f'avg. cum. HRs in {innings} innings',
                                                                        f'avg. cum. BBs in {innings} innings',])

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(cum_ks_df)

generate_inning_lines([2,3])

Unnamed: 0,Pitcher,Team,L,Type,W,L.1,ERA,G,GS,SV,IP,BF,H,R,ER,HR,BB,SO,WHIP,adj_IP,BF/inning,H/inning,R/inning,ER/inning,HR/inning,BB/inning,SO/inning
0,Yusei Kikuchi,Toronto Blue Jays,MLB,Second Inning,,,2.65,17.0,,,17.0,73.0,13.0,10.0,5.0,5.0,9.0,23.0,1.29,17.0,4.294,0.765,0.588,0.294,0.294,0.529,1.353
1,Yusei Kikuchi,Toronto Blue Jays,MLB,Third Inning,,,8.4,17.0,,,15.0,68.0,13.0,14.0,14.0,5.0,7.0,19.0,1.33,15.0,4.533,0.867,0.933,0.933,0.333,0.467,1.267
2,Jordan Lyles,Baltimore Orioles,MLB,Second Inning,,,4.91,22.0,,,22.0,102.0,26.0,13.0,12.0,2.0,11.0,22.0,1.68,22.0,4.636,1.182,0.591,0.545,0.091,0.5,1.0
3,Jordan Lyles,Baltimore Orioles,MLB,Third Inning,,,7.48,22.0,,,21.2,109.0,32.0,19.0,18.0,4.0,12.0,18.0,2.03,21.66,5.032,1.477,0.877,0.831,0.185,0.554,0.831
4,Justin Dunn,Seattle Mariners,MLB,Second Inning,,,4.09,11.0,,,11.0,48.0,11.0,5.0,5.0,2.0,6.0,12.0,1.55,11.0,4.364,1.0,0.455,0.455,0.182,0.545,1.091
5,Justin Dunn,Seattle Mariners,MLB,Third Inning,,,2.7,10.0,,,10.0,40.0,7.0,3.0,3.0,2.0,2.0,6.0,0.9,10.0,4.0,0.7,0.3,0.3,0.2,0.2,0.6
6,Chris Bassitt,New York Mets,MLB,Second Inning,,,3.15,20.0,,,20.0,76.0,16.0,7.0,7.0,4.0,1.0,21.0,0.85,20.0,3.8,0.8,0.35,0.35,0.2,0.05,1.05
7,Chris Bassitt,New York Mets,MLB,Third Inning,,,2.25,20.0,,,20.0,82.0,13.0,5.0,5.0,2.0,10.0,21.0,1.15,20.0,4.1,0.65,0.25,0.25,0.1,0.5,1.05
8,Anibal Sanchez,Washington Nationals,MLB,Second Inning,,,2.25,4.0,,,4.0,15.0,2.0,1.0,1.0,1.0,1.0,3.0,0.75,4.0,3.75,0.5,0.25,0.25,0.25,0.25,0.75
9,Anibal Sanchez,Washington Nationals,MLB,Third Inning,,,11.25,4.0,,,4.0,19.0,6.0,5.0,5.0,1.0,2.0,3.0,2.0,4.0,4.75,1.5,1.25,1.25,0.25,0.5,0.75


Unnamed: 0,"avg. cum. k's in [2, 3] innings","avg. cum. Hs in [2, 3] innings","avg. cum. Rs in [2, 3] innings","avg. cum. ERs in [2, 3] innings","avg. cum. HRs in [2, 3] innings","avg. cum. BBs in [2, 3] innings"
Yusei Kikuchi,2.62,1.632,1.521,1.227,0.627,0.996
Jordan Lyles,1.831,2.659,1.468,1.376,0.276,1.054
Justin Dunn,1.691,1.7,0.755,0.755,0.382,0.745
Chris Bassitt,2.1,1.45,0.6,0.6,0.3,0.55
Anibal Sanchez,1.5,2.0,1.5,1.5,0.5,0.75
Keegan Thompson,1.649,2.124,1.066,1.066,0.271,0.808
Jose Suarez,1.5,1.5,1.166,1.166,0.333,0.917
Cole Irvin,1.21,1.421,0.474,0.421,0.158,0.579
Tyler Beede,0.0,6.061,4.545,4.545,0.0,0.0
Zac Gallen,1.959,1.34,0.614,0.46,0.102,0.41
