In [132]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle

def parse_game_logs(season = 2022, output_path = '../datasets/baseball savant/probable_pitchers.pkl'):
    """Scrape split tables for every probable pitcher and pickle them.

    Fetches the Baseball Savant probable-pitchers page, collects every
    ``a.matchup-link`` anchor that does not contain ``src=``, and for each one
    downloads the pitcher's game-log page with ``pd.read_html``.  A table is
    kept when its columns include ``'Team'`` and the cell in its first row,
    second column equals ``'MLB'``.  Kept tables are labelled by order of
    appearance using ``split_headers``
    (assumes the page lists the splits in that order -- TODO confirm).

    Parameters
    ----------
    season : int, default 2022
        Season placed in the ``season=`` query parameter of the game-log URL.
    output_path : str
        Where the ``{pitcher name: {split name: DataFrame}}`` dict is pickled.

    Raises
    ------
    requests.HTTPError
        If the probable-pitchers page answers with an error status, so that an
        error page is not silently turned into an empty pickle.
    """
    split_headers = {
        0:'Platoon Splits',
        1:'Monthly Splits',
        2:'Base Runner Splits',
        3:'Game Type Splits',
        4:'Out Splits',
        5:'Inning Splits',
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get('https://baseballsavant.mlb.com/probable-pitchers', timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # (link text, text after the last '=' in the link's href)
    matchup_strings = [(i.text, i.get('href').split('=')[-1]) for i in soup.find_all("a", {"class": "matchup-link"}) if 'src=' not in str(i)]
    pitchers        = {name: None for name, _ in matchup_strings}
    for name, href_suffix in matchup_strings:
        # structure the pitcher string for query.
        pitcher = name.lower().replace(' ','-') + href_suffix
        tables  = pd.read_html(f'https://baseballsavant.mlb.com/savant-player/{pitcher}?stats=gamelogs-r-pitching-mlb&season={season}')

        storage = {}
        for _t in tables:
            # Every known split already has a table; any further match has no label.
            if len(storage) == len(split_headers):
                break

            # The split check reads cell [0, 1], so the table needs a row and two columns.
            if _t.shape[0] == 0 or _t.shape[1] < 2:
                continue
            if 'Team' not in _t.columns or _t.iloc[0,1] != 'MLB':
                continue

            # convert each column to float if possible; text columns stay as they are.
            for col in _t:
                try:
                    _t[col] = _t[col].astype(float)
                except (ValueError, TypeError):
                    pass

            storage[split_headers[len(storage)]] = _t

        pitchers[name] = storage

    with open(output_path, 'wb') as handle:
        pickle.dump(pitchers, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Run the scrape; this writes probable_pitchers.pkl, which generate_inning_lines_a reads.
parse_game_logs()

In [133]:
import matplotlib.pyplot as plt

def generate_inning_lines_a(surpress_visuals = False,
                            input_path = '../datasets/baseball savant/probable_pitchers.pkl',
                            output_path = '../datasets/baseball savant/probable_pitchers_inning_lines.pkl'):
    """Add per-inning rate columns to every pitcher's 'Inning Splits' table.

    Loads the ``{pitcher: {split name: DataFrame}}`` pickle, and for each
    pitcher that has an ``'Inning Splits'`` table:

    * inserts a leading ``'Pitcher'`` column,
    * adds ``adj_IP``, the innings pitched as a true decimal,
    * adds ``'<col>/inning'`` (rounded to 3 places) for each counting column,
    * replaces NaN with ``""``.

    The result, ``{pitcher: DataFrame}``, is pickled to ``output_path``.

    Parameters
    ----------
    surpress_visuals : bool
        Unused; kept so existing calls keep working.
    input_path, output_path : str
        Pickle to read and pickle to write.

    Pitchers whose table is missing or cannot be processed are skipped and
    reported on stdout instead of being dropped silently.
    """
    # NOTE: pickle.load can execute arbitrary code -- only open files this notebook wrote.
    with open(input_path, 'rb') as handle:
        data = pickle.load(handle)

    # generate table of per-inning performance for all pitchers.
    table        = {}
    cols_to_edit = ['BF', 'H','R', 'ER', 'HR', 'BB', 'SO']
    for pitcher, stats in data.items():

        if not stats or 'Inning Splits' not in stats:
            print(f'{pitcher}: no inning splits available, skipping.')
            continue

        try:
            # Work on a copy so the loaded table is not modified in place.
            inning_splits = stats['Inning Splits'].copy()
            inning_splits.insert(0, 'Pitcher', pitcher)

            # Baseball notation writes outs after the point: 17.2 IP means
            # 17 innings and 2 outs, i.e. 17 + 2/3 = 17.67 innings.
            # Rounding recovers the out count from float noise (0.1999... -> 2).
            whole_innings = inning_splits['IP'].astype(int)
            outs          = ((inning_splits['IP'] - whole_innings) * 10).round()
            inning_splits['adj_IP'] = whole_innings + outs / 3

            for col in cols_to_edit:
                inning_splits[f'{col}/inning'] = (inning_splits[col] / inning_splits['adj_IP']).round(3)

            table[pitcher] = inning_splits.fillna("")
        except Exception as err:
            # Best effort: one malformed table must not abort the whole run.
            print(f'{pitcher}: could not build inning lines ({err}), skipping.')

    with open(output_path, 'wb') as handle:
        pickle.dump(table, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Build the per-inning rate tables; this writes probable_pitchers_inning_lines.pkl.
generate_inning_lines_a()

In [134]:
import pickle
import pandas as pd

def generate_inning_lines_b(innings: list, display_df = False,
                            input_path = '../datasets/baseball savant/probable_pitchers_inning_lines.pkl'):
    """Display each pitcher's summed per-inning rates over the chosen innings.

    Loads the ``{pitcher: DataFrame}`` pickle and, for every pitcher, keeps the
    rows whose ``'Type'`` names one of the requested innings.

    * If there are more rows than requested innings, the counting columns and
      ``adj_IP`` are summed per inning and the rates are recomputed from those
      sums.
    * Otherwise the stored ``'<col>/inning'`` values are used as they are.

    The six rates (SO, H, R, ER, HR, BB) are then summed across the innings,
    rounded to 3 places and shown with ``display`` (the Jupyter built-in).

    Parameters
    ----------
    innings : list of int
        Inning numbers 1-9; any other value raises ``KeyError``.
    display_df : bool
        When true, also display the concatenated per-inning rows.
    input_path : str
        Pickle to read.

    Returns
    -------
    None.  Pitchers that cannot be processed are reported on stdout and left
    out of the summary.
    """
    # NOTE: pickle.load can execute arbitrary code -- only open files this notebook wrote.
    with open(input_path, 'rb') as handle:
        data = pickle.load(handle)

    inning_mapping = {
        1:'First Inning',
        2:'Second Inning',
        3:'Third Inning',
        4:'Fourth Inning',
        5:'Fifth Inning',
        6:'Sixth Inning',
        7:'Seventh Inning',
        8:'Eighth Inning',
        9:'Ninth Inning'
    }

    _innings     = [inning_mapping[i] for i in innings]
    cols_to_edit = ['BF', 'H','R', 'ER', 'HR', 'BB', 'SO']
    rate_cols    = ['SO/inning','H/inning','R/inning','ER/inning','HR/inning','BB/inning']
    frames       = []
    cum_ks       = {}
    for pitcher, stats in data.items():
        try:
            df_to_add = stats.loc[stats['Type'].isin(_innings)]

            if df_to_add.shape[0] > len(_innings):
                # Sum only the columns the rates need: text columns and
                # ""-filled rate columns cannot be summed.
                df_grouped = df_to_add.groupby(by=['Type'])[cols_to_edit + ['adj_IP']].sum()
                for col in cols_to_edit:
                    df_grouped[f'{col}/inning'] = (df_grouped[col] / df_grouped['adj_IP']).round(3)
                rates = df_grouped[rate_cols]
            else:
                rates = df_to_add[rate_cols]

            _t = list(rates.astype(float).sum().round(3))

        except Exception as err:
            # Name the pitcher so a skipped row in the summary can be traced.
            print(f'{pitcher}: {err}')
            continue

        if not df_to_add.empty:
            frames.append(df_to_add)
        cum_ks[pitcher] = _t

    # One concat at the end instead of growing a frame inside the loop.
    per_inning_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if display_df:
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            display(per_inning_df)

    # display cumulative k's

    cum_ks_df = pd.DataFrame.from_dict(cum_ks, orient='index', columns=[f'avg. cum. k\'s in {innings} innings', f'avg. cum. Hs in {innings} innings',
                                                                        f'avg. cum. Rs in {innings} innings', f'avg. cum. ERs in {innings} innings', f'avg. cum. HRs in {innings} innings',
                                                                        f'avg. cum. BBs in {innings} innings',])

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(cum_ks_df)

# Summed per-inning rates over innings 1 and 2; the per-inning row table is not displayed.
generate_inning_lines_b([1,2], display_df = False)

Unnamed: 0,"avg. cum. k's in [1, 2] innings","avg. cum. Hs in [1, 2] innings","avg. cum. Rs in [1, 2] innings","avg. cum. ERs in [1, 2] innings","avg. cum. HRs in [1, 2] innings","avg. cum. BBs in [1, 2] innings"
Michael Kopech,1.783,1.309,0.886,0.748,0.092,1.309
Daniel Lynch,1.842,1.947,1.105,0.948,0.211,0.842
Luis Cessa,0.0,5.0,3.0,3.0,0.0,0.0
Noah Syndergaard,1.386,2.251,1.316,1.202,0.17,0.459
Jake Odorizzi,1.737,2.507,1.523,1.319,0.35,0.552
Roansy Contreras,1.828,2.153,1.242,1.242,0.414,1.442
Max Scherzer,2.823,1.353,0.47,0.353,0.118,0.294
Domingo German,1.333,2.5,1.333,1.333,0.666,0.666
Tucker Davidson,1.4,1.2,1.2,1.2,0.2,1.4
Jeffrey Springs,2.1,1.734,0.681,0.62,0.184,0.37


In [135]:
import requests
import pandas as pd

def get_strikeout_percents():
    """Fetch ESPN's MLB team batting table and rank teams by strikeouts per at-bat.

    Reads every HTML table on the ESPN team-stats page, joins the first table
    (minus its first column) with the second one side by side, adds
    ``'SO/AB' = SO / AB`` rounded to 3 places, sorts by it in descending
    order, displays the full frame and returns it.

    Returns
    -------
    pandas.DataFrame
        The merged table sorted by ``'SO/AB'``, highest first.

    Raises
    ------
    requests.HTTPError
        If ESPN answers with an error status.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get("https://www.espn.com/mlb/stats/team", timeout=30)
    response.raise_for_status()

    # Passing literal HTML to read_html is deprecated; wrap it in a file-like object.
    tables   = pd.read_html(StringIO(response.text))

    # The first table's leading column is dropped before joining
    # (presumably a rank column -- verify against the page).
    merged   = pd.concat([tables[0].iloc[:,1:], tables[1]], axis=1, ignore_index=False)

    # create new columns
    merged['SO/AB'] = (merged['SO'] / merged['AB']).round(3)
    merged          = merged.sort_values(by='SO/AB', ascending=False)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(merged)

    return merged
# Fetch and display the team table sorted by SO/AB; the frame is kept in `table`.
table = get_strikeout_percents()

Unnamed: 0,Team,GP,AB,R,H,2B,3B,HR,RBI,TB,BB,SO,SB,AVG,OBP,SLG,OPS,SO/AB
26,Los Angeles Angels,120,3985,459,904,155,24,133,440,1506,351,1153,63,0.227,0.295,0.378,0.673,0.289
28,Pittsburgh Pirates,120,3948,435,872,162,21,118,406,1430,354,1114,62,0.221,0.289,0.362,0.652,0.282
8,Atlanta Braves,122,4161,596,1049,229,9,185,568,1851,354,1128,70,0.252,0.317,0.445,0.761,0.271
18,Cincinnati Reds,118,3919,500,941,179,10,111,479,1473,327,1059,46,0.24,0.308,0.376,0.684,0.27
27,Detroit Tigers,122,4019,388,910,173,17,71,371,1330,283,1068,36,0.226,0.282,0.331,0.613,0.266
21,Milwaukee Brewers,119,3985,539,935,177,15,164,526,1634,427,1060,71,0.235,0.316,0.41,0.726,0.266
20,Tampa Bay Rays,119,3984,492,948,209,14,106,467,1503,369,1053,68,0.238,0.307,0.377,0.684,0.264
13,Chicago Cubs,119,4035,501,984,199,22,124,473,1599,381,1061,81,0.244,0.317,0.396,0.713,0.263
23,Miami Marlins,120,4000,441,931,184,17,107,416,1470,326,1047,101,0.233,0.297,0.368,0.665,0.262
22,San Francisco Giants,120,3988,538,933,198,12,140,514,1575,431,1046,54,0.234,0.317,0.395,0.712,0.262


In [193]:
from msilib.schema import Error
from tokenize import String
import requests
import pandas as pd
import re
import json

HEADERS  = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

def get_last_10_games(headers, max_teams = 1):
    """Display StatMuse 'before the third inning' game logs for ESPN-listed teams.

    Team names come from the ``'Team'`` column of the first table on ESPN's
    team-stats page.  For each team the matching StatMuse page is fetched and
    the HTML-escaped payload inside its ``visual-answer`` element is picked
    apart with regular expressions into one dict per row; the rows are then
    displayed as a DataFrame with the columns ordered as in ``col_ord``.

    Parameters
    ----------
    headers : dict
        HTTP headers sent with the StatMuse request.
    max_teams : int or None, default 1
        How many teams to process.  The default keeps the earlier behaviour of
        stopping after the first team; ``None`` processes every team.

    Returns
    -------
    None.  Each team's frame is shown with ``display`` (the Jupyter built-in).

    Raises
    ------
    requests.HTTPError
        If ESPN answers with an error status.
    AttributeError
        If a StatMuse page does not contain the expected markup.
    """
    from io import StringIO  # local import keeps this cell self-contained

    espn = requests.get("https://www.espn.com/mlb/stats/team", timeout=30)
    espn.raise_for_status()
    # Passing literal HTML to read_html is deprecated; wrap it in a file-like object.
    teams    = pd.read_html(StringIO(espn.text))[0]['Team']
    if max_teams is not None:
        teams = teams.iloc[:max_teams]

    # Column order of the displayed frame (loop-invariant).
    col_ord = ['TEAM', 'DATE', 'TM', 'H/A', 'OPP', 'PA', 'AB', 'H', '2B','3B', 'HR', 'RBI','BB','IBB', 'HBP','SO', 'SH','SF', 'TB', 'XBH', 'AVG', 'OBP', 'SLG', 'OPS']

    for team in teams:
        team  = team.lower().replace(' ','-')

        # Use the caller's headers; a module-level constant is not assumed to exist.
        query   = requests.get(f'https://www.statmuse.com/mlb/ask/{team}-before-the-third-inning-game-log', headers=headers, timeout=30).text

        # Everything between the answer attribute and the closing tag.
        string  = re.search(r'visual-answer answer="(.*?)/visual-answer>',query).group(1)

        # Text of the columns array up to its first ']', cut into one chunk per '}'.
        col_str = re.search(r'columns&quot;:(.*?)]', string).group(1)[1:].split('}')[:-1]

        # Third ';...&' token of each chunk (presumably the column's key -- verify).
        columns = [re.findall(r';(.*?)&', col)[2] for col in col_str]

        # Row payload split on '}},'; the final piece is discarded
        # (NOTE(review): confirm that piece is trailing markup and not a real row).
        rows    = re.findall(r'rows&quot;:\[{&quot(.*?)}}"><', string)[0].split('}},')[:-1]

        _rows   = []
        for row in rows:

            cols = row.split('},')
            storage = {}
            for col in cols:

                key = re.search(r';(.*?)&quot;', col).group(1)
                if key in columns:

                    val = col.split(';value&quot;:')[-1].replace('&quot;','')

                    # These cells have no ';value' part in the chunk, so their
                    # display text is taken instead.
                    if key in ['DATE', 'TEAM', 'TM', 'OPP']:
                        val = re.search(rf'{key}:{{display:(.*?),', val).group(1)

                    # check last index XBH: if the text is not a plain number,
                    # keep only its first three characters
                    # (presumably trailing markup -- verify).
                    if key == 'XBH':
                        try:
                            val = float(val)
                        except Exception as err:
                            val = val[:3]

                    storage[key] = val

            _rows.append(storage)

        df = pd.DataFrame(_rows).rename(columns={'ALIGNMENT':'H/A'})
        df = df[col_ord]
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            display(df)
# Run the game-log scrape, passing the browser-like User-Agent headers defined above.
get_last_10_games(HEADERS)

Unnamed: 0,TEAM,DATE,TM,H/A,OPP,PA,AB,H,2B,3B,HR,RBI,BB,IBB,HBP,SO,SH,SF,TB,XBH,AVG,OBP,SLG,OPS
0,Colorado Rockies,8/21/2022,COL,vs,SFG,8.0,8.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0,0.25,0.25,0.375,0.625
1,Colorado Rockies,8/20/2022,COL,vs,SFG,8.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.25,0.25,0.25,0.5
2,Colorado Rockies,8/19/2022,COL,vs,SFG,10.0,9.0,3.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,6.0,1.0,0.3333,0.4,0.6667,1.0667
3,Colorado Rockies,8/18/2022,COL,@,STL,7.0,7.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.2857,0.2857,0.2857,0.5714
4,Colorado Rockies,8/17/2022,COL,@,STL,7.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,1.0,0.1429,0.1429,0.2857,0.4286
5,Colorado Rockies,8/16/2022,COL,@,STL,7.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.1429,0.0,0.1429
6,Colorado Rockies,8/14/2022,COL,vs,ARI,8.0,7.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.1429,0.25,0.2857,0.5357
7,Colorado Rockies,8/13/2022,COL,vs,ARI,7.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.1429,0.1429,0.1429,0.2857
8,Colorado Rockies,8/12/2022,COL,vs,ARI,7.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1429,0.0,0.1429
9,Colorado Rockies,8/11/2022,COL,vs,STL,10.0,7.0,2.0,1.0,0.0,0.0,2.0,3.0,0.0,0.0,3.0,0.0,0.0,3.0,1.0,0.2857,0.5,0.4286,0.9286


In [137]:
t = ';2B&quot;:{&quot;display&quot;:&quot;2&quot;,&quot;value&quot;:2.0},&quot;3B&quot;:{&quot;display&quot;:&quot;0&quot;,&quot;value&quot;:0.0},&quot;AB&quot;:{&quot;display&quot;:&quot;11&quot;,&quot;value&quot;:11.0},&quot;ALIGNMENT&quot;:{&quot;display&quot;:&quot;@&quot;,&quot;value&quot;:&quot;@&quot;},&quot;AVG&quot;:{&quot;display&quot;:&quot;.455&quot;,&quot;value&quot;:0.4545},&quot;BB&quot;:{&quot;display&quot;:&quot;1&quot;,&quot;value&quot;:1.0},&quot;DATE&quot;:{&quot;display&quot;:&quot;8/18/2022&quot;,&quot;entity&quot;:{&quot;baseResourcePath&quot;:&quot;mlb/games/226441&quot;,&quot;display&quot;:&quot;8/18/2022 TOR @ NYY&quot;,&quot;domain&quot;:&quot;MLB&quot;,&quot;id&quot;:&quot;226441&quot;,&quot;type&quot;:&quot;game&quot;},&quot;value&quot;:&quot;2022-08-18T00:00:00&quot;},&quot;H&quot;:{&quot;display&quot;:&quot;5&quot;,&quot;value&quot;:5.0},&quot;HBP&quot;:{&quot;display&quot;:&quot;0&quot;,&quot;value&quot;:0.0},&quot;HR&quot;:{&quot;display&quot;:&quot;1&quot;,&quot;value&quot;:1.0},&quot;IBB&quot;:{&quot;display&quot;:&quot;0&quot;,&quot;value&quot;:0.0},&quot;OBP&quot;:{&quot;display&quot;:&quot;.500&quot;,&quot;value&quot;:0.5},&quot;OPP&quot;:{&quot;display&quot;:&quot;NYY&quot;,&quot;entity&quot;:{&quot;baseResourcePath&quot;:&quot;mlb/teams/76/2022&quot;,&quot;display&quot;:&quot;2022 New York 
Yankees&quot;,&quot;domain&quot;:&quot;MLB&quot;,&quot;id&quot;:&quot;76/2022&quot;,&quot;type&quot;:&quot;teamSeason&quot;},&quot;imageUrl&quot;:&quot;https://cdn.statmuse.com/img/mlb/teams/mlb_new_york_yankees_secondary.png&quot;,&quot;value&quot;:&quot;NYY&quot;},&quot;OPS&quot;:{&quot;display&quot;:&quot;1.409&quot;,&quot;value&quot;:1.4091},&quot;PA&quot;:{&quot;display&quot;:&quot;12&quot;,&quot;value&quot;:12.0},&quot;RBI&quot;:{&quot;display&quot;:&quot;5&quot;,&quot;value&quot;:5.0},&quot;SF&quot;:{&quot;display&quot;:&quot;0&quot;,&quot;value&quot;:0.0},&quot;SH&quot;:{&quot;display&quot;:&quot;0&quot;,&quot;value&quot;:0.0},&quot;SLG&quot;:{&quot;display&quot;:&quot;.909&quot;,&quot;value&quot;:0.9091},&quot;SO&quot;:{&quot;display&quot;:&quot;2&quot;,&quot;value&quot;:2.0},&quot;TB&quot;:{&quot;display&quot;:&quot;10&quot;,&quot;value&quot;:10.0},&quot;TEAM&quot;:{&quot;display&quot;:&quot;Toronto Blue Jays&quot;,&quot;entity&quot;:{&quot;baseResourcePath&quot;:&quot;mlb/teams/94/2022&quot;,&quot;display&quot;:&quot;2022 Toronto Blue Jays&quot;,&quot;domain&quot;:&quot;MLB&quot;,&quot;id&quot;:&quot;94/2022&quot;,&quot;type&quot;:&quot;teamSeason&quot;},&quot;imageUrl&quot;:&quot;https://cdn.statmuse.com/img/mlb/teams/mlb_toronto_blue_jays_secondary.png&quot;,&quot;value&quot;:&quot;Toronto Blue Jays&quot;},&quot;TM&quot;:{&quot;display&quot;:&quot;TOR&quot;,&quot;entity&quot;:{&quot;baseResourcePath&quot;:&quot;mlb/teams/94/2022&quot;,&quot;display&quot;:&quot;2022 Toronto Blue Jays&quot;,&quot;domain&quot;:&quot;MLB&quot;,&quot;id&quot;:&quot;94/2022&quot;,&quot;type&quot;:&quot;teamSeason&quot;},&quot;imageUrl&quot;:&quot;https://cdn.statmuse.com/img/mlb/teams/mlb_toronto_blue_jays_secondary.png&quot;,&quot;value&quot;:&quot;TOR&quot;},&quot;XBH&quot;:{&quot;display&quot;:&quot;3&quot;,&quot;value&quot;:3.0'
# Scratch check of an alternative row parser on the sample payload `t`:
# strip the JSON punctuation and HTML-entity noise, split on commas, then
# recover each stat from the token that starts with its key.
replace_list = ['{', '&quot', ';', 'display',':','value', '}']
found = re.sub(r'|'.join(map(re.escape, replace_list)), '', t).split(',')
substring_list = ['TEAM', 'DATE', 'TM', 'ALIGNMENT', 'OPP', 'PA', 'AB', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'IBB', 'HBP', 'SO', 'SH', 'SF', 'TB', 'XBH', 'AVG', 'OBP', 'SLG', 'OPS']

# Try the longest keys first and match on the token's prefix, so that a token
# such as 'HBP0' is claimed by 'HBP' and never by 'H', whatever order the keys
# arrive in.
keys_longest_first = sorted(substring_list, key=len, reverse=True)

matches = {}
for string in found:
    # print(string)
    for substring in keys_longest_first:
        if string.startswith(substring):
            # The first token seen for a key wins; later look-alikes are ignored.
            matches.setdefault(substring, string[len(substring):])
            break

print(sorted(substring_list))
print(matches) 

['2B', '3B', 'AB', 'ALIGNMENT', 'AVG', 'BB', 'DATE', 'H', 'HBP', 'HR', 'IBB', 'OBP', 'OPP', 'OPS', 'PA', 'RBI', 'SF', 'SH', 'SLG', 'SO', 'TB', 'TEAM', 'TM', 'XBH']
{'2B': '2', '3B': '0', 'AB': '11', 'ALIGNMENT': '@', 'AVG': '.455', 'BB': '1', 'DATE': '8/18/2022', 'H': '5', 'HBP': '0', 'HR': '1', 'IBB': '0', 'OBP': '.500', 'OPP': 'NYY', 'OPS': '1.409', 'PA': '12', 'RBI': '5', 'SF': '0', 'SH': '0', 'SLG': '.909', 'SO': '2', 'TB': '10', 'TEAM': 'Toronto Blue Jays', 'TM': 'TOR', 'XBH': '3'}
