In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split

# from baseball savant, parse out the pitcher csv file downloaded.

pitching_stats = pd.read_csv('../../datasets/baseball savant/pitching stats.csv').dropna()
hitting_stats = pd.read_csv('../../datasets/baseball savant/hitting stats.csv')

# remove unnamed columns then rows with nan
hitting_stats = hitting_stats.dropna(how='all', axis='columns').dropna()

# we need to derive ALOT of normalized variables, each w.r.t. the amount of PA's

percent_cols = [col for col in hitting_stats.columns if 'percent' in col]

# percents that are actually already in decimal: slg%, ob%, so just fix everything else.

percent_cols = [col for col in percent_cols if col not in ['slg_percent', 'on_base_percent']]

# add in barrel_batted_rate as well
percent_cols.append('barrel_batted_rate')

hitting_stats[percent_cols] = hitting_stats[percent_cols].apply(lambda x: x/100)

# what avg's are we working with (yeah i could just look at the csv, but that shit makes me dizzy)

avg_cols = [col for col in hitting_stats.columns if 'avg' in col and 'batting_avg' not in col]

# define our predictor variable, which is runs/PA

hitting_stats['runs/PA'] =  hitting_stats['r_run'] / hitting_stats['b_total_pa']

# define the set of features we want to model for.

# features = ['b_k_percent', 'b_bb_percent', 'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent',
#             'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'hard_hit_percent', 'z_swing_percent',
#             'z_swing_miss_percent', 'oz_swing_percent','oz_swing_miss_percent', 'oz_contact_percent', 'meatball_swing_percent', 'iz_contact_percent',
#             'whiff_percent', 'swing_percent', 'pull_percent', 'straightaway_percent', 'opposite_percent', 'f_strike_percent', 'groundballs_percent', 
#             'flyballs_percent', 'linedrives_percent', 'popups_percent', 'sprint_speed', 'runs/PA']

features = ['b_k_percent', 'b_bb_percent', 'exit_velocity_avg', 'launch_angle_avg', 'solidcontact_percent',
            'sprint_speed','whiff_percent', 'oz_swing_percent','meatball_swing_percent','iz_contact_percent', 'linedrives_percent', 'woba']

dataset = hitting_stats[features]

train_dataset = dataset.sample(frac=0.9, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('woba')
test_labels = test_features.pop('woba')

# ok, normalize dis hoe (run me that z-score bruh).

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

def build_and_compile_model(norm):
  model = tf.keras.Sequential([
      norm,
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(1)
  ])

  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.0005))
  return model

dnn_model = build_and_compile_model(normalizer)

history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

def plot_loss(history):
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  plt.xlabel('Epoch')
  plt.ylabel('Error [wOBA]')
  plt.legend()
  plt.grid(True)

plot_loss(history)

# test_predictions = dnn_model.predict(test_features).flatten()

# a = plt.axes(aspect='equal')
# plt.scatter(test_labels, test_predictions)
# plt.xlabel('True Values (runs/PA)')
# plt.ylabel('Predictions (runs/PA)')

In [45]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

def parse_game_logs():
    
    split_headers = {
        0:'Platoon Splits',
        1:'Monthly Splits',
        2:'Base Runner Splits',
        3:'Game Type Splits',
        4:'Out Splits',
        5:'Inning Splits',
    } 

    response = requests.get('https://baseballsavant.mlb.com/probable-pitchers')
    soup = BeautifulSoup(response.text, 'html.parser')

    matchup_strings = [(i.text, i.get('href').split('=')[-1]) for i in soup.find_all("a", {"class": "matchup-link"}) if 'src=' not in str(i)]
    pitchers        = {tup[0]:None for tup in matchup_strings}
    for tup in matchup_strings:
        # structure the pitcher string for query.
        pitcher       = tup[0].lower().replace(' ','-') + tup[1]     
        storage      = {}
        tables       = pd.read_html(f'https://baseballsavant.mlb.com/savant-player/{pitcher}?stats=gamelogs-r-pitching-mlb&season=2022')[21:27]
        
        # create references for each table.
        for i in range(len(tables)):
            _t = tables[i]
            # convert each column to float if possible.
            for col in _t:
                try:
                    _t[col] = _t[col].astype(float)
                except ValueError as err:
                    pass
            
            storage[split_headers[i]] = _t

        pitchers[tup[0]] = storage
        
    with open('../datasets/baseball savant/probable_pitchers.pkl', 'wb') as handle:
        pickle.dump(pitcher, handle, protocol=pickle.HIGHEST_PROTOCOL)

parse_game_logs()

In [58]:
import matplotlib.pyplot as plt

def generate_inning_lines(surpress_visuals = False):
    with open('../datasets/baseball savant/probable_pitchers.pkl', 'rb') as handle:
        data = pickle.load(handle)

    # generate table of 1st inning performance for all pitchers.
    table        = {}
    cols_to_edit = ['BF', 'H','R', 'R', 'ER', 'HR', 'BB', 'SO']
    for pitcher, stats in data.items():
    
        inning_splits = stats['Inning Splits']
        
        # fix the fact that baseball savant records fractional innings as decimals 
        # i.e. convert 17.2 IP -> 17.67 IP to represent 17 innings and 2 outs.

        inning_splits['adj_IP'] = inning_splits['IP'].astype(int) + ((inning_splits['IP'] - inning_splits['IP'].astype(int)) * 33)
        for col in cols_to_edit:

            inning_splits[f'{col}/inning'] = inning_splits[col] / (inning_splits['adj_IP'])
        
        inning_splits.style.set_table_attributes("style='display:inline'").set_caption(pitcher + 'inning splits.')
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            display(inning_splits)

        table[pitcher] = inning_splits

        break

    return table

generate_inning_lines()

Unnamed: 0,Team,L,Type,W,L.1,ERA,G,GS,SV,IP,BF,H,R,ER,HR,BB,SO,WHIP,adj_IP,BF/inning,H/inning,R/inning,ER/inning,HR/inning,BB/inning,SO/inning
0,Toronto Blue Jays,MLB,First Inning,,,5.6,18.0,,,17.2,86.0,17.0,11.0,11.0,2.0,19.0,19.0,2.04,23.6,3.644068,0.720339,0.466102,0.466102,0.084746,0.805085,0.805085
1,Toronto Blue Jays,MLB,Second Inning,,,2.65,17.0,,,17.0,73.0,13.0,10.0,5.0,5.0,9.0,23.0,1.29,17.0,4.294118,0.764706,0.588235,0.294118,0.294118,0.529412,1.352941
2,Toronto Blue Jays,MLB,Third Inning,,,8.4,17.0,,,15.0,68.0,13.0,14.0,14.0,5.0,7.0,19.0,1.33,15.0,4.533333,0.866667,0.933333,0.933333,0.333333,0.466667,1.266667
3,Toronto Blue Jays,MLB,Fourth Inning,,,3.46,14.0,,,13.0,55.0,11.0,6.0,5.0,2.0,5.0,12.0,1.23,13.0,4.230769,0.846154,0.461538,0.384615,0.153846,0.384615,0.923077
4,Toronto Blue Jays,MLB,Fifth Inning,,,3.38,10.0,,,8.0,33.0,7.0,3.0,3.0,1.0,2.0,9.0,1.13,8.0,4.125,0.875,0.375,0.375,0.125,0.25,1.125
5,Toronto Blue Jays,MLB,Sixth Inning,,,5.4,4.0,,,3.1,15.0,5.0,2.0,2.0,0.0,1.0,2.0,1.8,6.3,2.380952,0.793651,0.31746,0.31746,0.0,0.15873,0.31746


{'Yusei Kikuchi':                 Team    L           Type   W  L.1   ERA     G  GS  SV    IP  \
 0  Toronto Blue Jays  MLB   First Inning NaN  NaN  5.60  18.0 NaN NaN  17.2   
 1  Toronto Blue Jays  MLB  Second Inning NaN  NaN  2.65  17.0 NaN NaN  17.0   
 2  Toronto Blue Jays  MLB   Third Inning NaN  NaN  8.40  17.0 NaN NaN  15.0   
 3  Toronto Blue Jays  MLB  Fourth Inning NaN  NaN  3.46  14.0 NaN NaN  13.0   
 4  Toronto Blue Jays  MLB   Fifth Inning NaN  NaN  3.38  10.0 NaN NaN   8.0   
 5  Toronto Blue Jays  MLB   Sixth Inning NaN  NaN  5.40   4.0 NaN NaN   3.1   
 
    ...    SO  WHIP  adj_IP  BF/inning  H/inning  R/inning  ER/inning  \
 0  ...  19.0  2.04    23.6   3.644068  0.720339  0.466102   0.466102   
 1  ...  23.0  1.29    17.0   4.294118  0.764706  0.588235   0.294118   
 2  ...  19.0  1.33    15.0   4.533333  0.866667  0.933333   0.933333   
 3  ...  12.0  1.23    13.0   4.230769  0.846154  0.461538   0.384615   
 4  ...   9.0  1.13     8.0   4.125000  0.875000  0.3750