In [38]:
import requests
from bs4 import BeautifulSoup
import re
import string
import json

def get_pitchers(url, timeout=30):
    """
    Download a leaderboard page and return its embedded data as JSON text.

    The page is expected to contain a JavaScript assignment of the form
    ``var leaderboardData = [...];``.  The text between the brackets is
    extracted and every trailing ``"href"`` entry is removed from each record
    (players can be referenced by ``player_id`` instead).

    :param url: address of the leaderboard page to download.
    :type url: str
    :param timeout: seconds to wait for the server before giving up; passed
        straight to ``requests.get``.
    :type timeout: float
    :return: the leaderboard array, re-wrapped in ``[`` ... ``]``, as a string.
    :rtype: str
    :raises ValueError: if the page has no ``var leaderboardData = [...];``.
    """
    # Without a timeout a stalled connection would block this call forever.
    response = requests.get(url, timeout=timeout)

    # find the leaderboard, and its accompanying data.
    match = re.search(r'var leaderboardData = \[(.*?)\];', response.text)
    if match is None:
        raise ValueError(
            "no 'var leaderboardData = [...]' found at %s (HTTP status %s)"
            % (url, response.status_code)
        )

    # remove the hrefs since we can reference by player_id.  The lazy match
    # stops at the first '}' so only the current record's href is dropped.
    records = re.sub(r',"href":.*?\}', '}', match.group(1))

    return '[' + records + ']'

def get_var(text, var):
    """
    Extract the object literal assigned by ``var <var> = {`` and parse it.

    ``text`` is scanned line by line.  Once the declaration line is found,
    every following non-blank line is treated as a ``key: value`` entry until
    a line starting with ``};`` closes the object.  Keys are wrapped in double
    quotes when they are not already quoted, a trailing comma before the
    closing brace is dropped, and the result is handed to ``json.loads``.

    :param text: JavaScript source, one object entry per line.
    :type text: str
    :param var: name of the JavaScript variable to extract.
    :type var: str
    :return: the parsed object.
    :rtype: dict
    :raises ValueError: if no ``var <var> = {`` line exists, or (as
        ``json.JSONDecodeError``) if the collected text is not valid JSON.
    """
    # Escape the name so characters such as '$' are matched literally, and
    # allow "var x={" as well as "var x = {".
    declaration = re.compile(r'var\s+' + re.escape(var.rstrip()) + r'\s*=\s*\{')

    parts = None  # None until the declaration line has been seen
    closed = False
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        if parts is None:
            if declaration.match(line):
                parts = []
            continue
        if line.startswith('};'):
            closed = True
            break

        key, colon, value = line.partition(':')
        if not colon:
            # No "key:" prefix (e.g. a lone "}," closing a nested object):
            # keep the line unchanged rather than failing on it.
            parts.append(line)
            continue

        # Strip so that '"key" : value' is recognised as already quoted.
        key = key.strip()
        if not key.startswith('"'):
            key = '"' + key
        if not key.endswith('"'):
            key += '"'
        parts.append(key + ':' + value)

    if parts is None:
        raise ValueError("no 'var %s = {' declaration found" % var.rstrip())

    block = '{' + ''.join(parts)
    if closed:
        block += '}'

    # JSON does not allow a trailing comma before the closing brace.
    if block.endswith(',}'):
        block = block[:-2] + '}'

    # edge case: names such as O'Hearn arrive as O"Hearn when the caller has
    # turned every apostrophe into a double quote.  A double quote sitting
    # directly between two letters can never be valid JSON, so restore it to
    # an apostrophe.  Values that merely end in a letter ("SFO",) are left
    # untouched.
    block = re.sub(r'(?<=[^\W\d_])"(?=[^\W\d_])', "'", block)

    return json.loads(block)

def parse_bs_pitcher_script(url, timeout=30):
    """
    Download a pitcher's Baseball Savant profile page and parse its stats.

    The page's ``<script>`` tags are searched for the first one containing
    ``var serverVals =``; that script is normalised and passed to ``get_var``
    to obtain the ``serverVals`` object.

    :param url: address of the player's profile page.
    :type url: str
    :param timeout: seconds to wait for the server before giving up; passed
        straight to ``requests.get``.
    :type timeout: float
    :return: whatever ``get_var`` returns for ``serverVals``.
    :raises ValueError: if no script on the page contains ``var serverVals =``.
    """
    # Without a timeout a stalled connection would block this call forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # find the javascript section based on unique identifier: var serverVals =
    marker = 'var serverVals ='
    script = next(
        (tag.text for tag in soup.find_all('script') if marker in tag.text),
        None,
    )
    if script is None:
        raise ValueError(
            "no script containing %r found at %s (HTTP status %s)"
            % (marker, url, response.status_code)
        )

    # Make the JavaScript look like JSON: single quotes become double quotes
    # and null becomes 0.
    # NOTE(review): both replacements are global, so they also rewrite
    # apostrophes and the text "null" inside string values - confirm that is
    # acceptable for the fields being used.
    script = script.replace("'", '"').replace("null", "0")

    data_dict = get_var(script, 'serverVals')

    return data_dict
if __name__ == '__main__':

    # Fetch the pitch-arsenal leaderboard and keep its href-free JSON text.
    pitcher_hrefs = get_pitchers(
        url='https://baseballsavant.mlb.com/leaderboard/pitch-arsenal-stats',
    )
    # Example of parsing a single player's profile page (left disabled):
    # _ = parse_bs_pitcher_script('https://baseballsavant.mlb.com/savant-player/gerrit-cole-543037?stats=statcast-r-pitching-mlb')

In [30]:
# Scratch check: strip the "href" entry from one leaderboard record with a
# regex, then parse what is left. Players can still be looked up through the
# player_id used in the URL endpoint, so the href is redundant.
t = '{"player_type":"pitcher","year":"2022","player_name":"Adon, Joan","player_id":672851,"team_id":"120","team_name":"Nationals","team_name_alt":"WSH","pitch_type":"FF","pitch_name":"4-Seamer","pa":"94","pitches":"340","ba":".286","est_ba":".311","bacon":".386","est_bacon":".421","babip":".34","est_babip":".382","obp":"0.415","slg":".558","est_obp":"0.436","est_slg":".705","iso":"0.273","est_iso":"0.393","woba":".418","est_woba":".469","wobacon":".48","est_wobacon":".564","fg_woba":"0.427","fg_est_woba":"0.468","fg_wobacon":"0.496","fg_est_wobacon":"0.564","k_percent":21.3,"whiff_percent":15.6,"put_away":28.6,"launch_angle_avg":"14.8","exit_velocity_avg":"92.3","hard_hit_percent":43.9,"sweet_spot_percent":42.1,"barrel_batted_rate":21.1,"run_value_unformatted":7.497213569926811,"run_value":7,"run_value_per_100":2.2,"total_pitches":"506","pitch_usage":67.2,"rowId":"672851_FF","href":"<a href=\"/savant-player/672851\">Adon, Joan</a>"}'
# Step 1: cut everything from ,"href": up to the closing brace.
t = re.sub(r',\"href\":(.*)\}', "}", t)
# Step 2: the remainder is plain JSON.
t = json.loads(t)
t

{'player_type': 'pitcher',
 'year': '2022',
 'player_name': 'Adon, Joan',
 'player_id': 672851,
 'team_id': '120',
 'team_name': 'Nationals',
 'team_name_alt': 'WSH',
 'pitch_type': 'FF',
 'pitch_name': '4-Seamer',
 'pa': '94',
 'pitches': '340',
 'ba': '.286',
 'est_ba': '.311',
 'bacon': '.386',
 'est_bacon': '.421',
 'babip': '.34',
 'est_babip': '.382',
 'obp': '0.415',
 'slg': '.558',
 'est_obp': '0.436',
 'est_slg': '.705',
 'iso': '0.273',
 'est_iso': '0.393',
 'woba': '.418',
 'est_woba': '.469',
 'wobacon': '.48',
 'est_wobacon': '.564',
 'fg_woba': '0.427',
 'fg_est_woba': '0.468',
 'fg_wobacon': '0.496',
 'fg_est_wobacon': '0.564',
 'k_percent': 21.3,
 'whiff_percent': 15.6,
 'put_away': 28.6,
 'launch_angle_avg': '14.8',
 'exit_velocity_avg': '92.3',
 'hard_hit_percent': 43.9,
 'sweet_spot_percent': 42.1,
 'barrel_batted_rate': 21.1,
 'run_value_unformatted': 7.497213569926811,
 'run_value': 7,
 'run_value_per_100': 2.2,
 'total_pitches': '506',
 'pitch_usage': 67.2,
 'rowI