In [11]:
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the pitch data embedded in a Baseball Savant player page and load it
# into two DataFrames: one row per pitch (pitches_df) and the pitch-mix
# summary (distribution_df).

# requests applies no timeout by default; without one a stalled server would
# hang the kernel indefinitely.
REQUEST_TIMEOUT_S = 30


def _parse_js_literal(js_text):
    """Parse an object/array literal lifted from inline JavaScript.

    Strict JSON is tried first so that apostrophes (or the word "undefined")
    occurring inside string values are left untouched. Only when that fails
    is the text normalised -- single quotes become double quotes and
    ``undefined`` becomes ``null`` -- and parsed again.

    Args:
        js_text: Source text of the literal, e.g. ``{...}`` or ``[...]``.

    Returns:
        The decoded Python object (dict or list).

    Raises:
        json.JSONDecodeError: If the text cannot be parsed either way.
    """
    try:
        return json.loads(js_text)
    except json.JSONDecodeError:
        normalised = js_text.replace("'", '"').replace("undefined", "null")
        return json.loads(normalised)


# --- Fetch page content ---
url = "https://baseballsavant.mlb.com/savant-player/shohei-ohtani-660271?stats=statcast-r-pitching-mlb&playerType=pitcher"
response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
response.raise_for_status()  # fail here on 4xx/5xx instead of parsing an error page
html = response.text
soup = BeautifulSoup(html, "html.parser")

# --- Locate the inline <script> that carries the data ---
# A default of None lets us raise a descriptive error rather than leaking a
# bare StopIteration when the page layout changes.
script = next(
    (tag for tag in soup.find_all("script") if "statcastPitches" in tag.text),
    None,
)
if script is None:
    raise ValueError(f"No <script> containing 'statcastPitches' found at {url}")
script_text = script.text

# --- Extract statcastPitches block ---
# Non-greedy match: the object ends at the first '}' that is followed by the
# ', pitchBreakdown' key.
pitches_match = re.search(
    r"statcastPitches\s*:\s*(\{.*?\})\s*,\s*pitchBreakdown", script_text, re.DOTALL
)
if pitches_match is None:
    raise ValueError("Could not locate the 'statcastPitches' object in the page script")
pitches_raw = pitches_match.group(1)
statcast_pitches = _parse_js_literal(pitches_raw)

# Flatten {pitch_type: [pitch, ...]} into one record per pitch, tagging each
# record with the key it was stored under.
all_pitches = []
for pitch_type, pitches in statcast_pitches.items():
    for pitch in pitches:
        pitch["pitch_type"] = pitch_type
        all_pitches.append(pitch)

pitches_df = pd.DataFrame(all_pitches)

# --- Extract pitchBreakdown block ---
# The pattern stops at the first ']', so it relies on the array holding no
# nested arrays.
breakdown_match = re.search(r"pitchBreakdown\s*:\s*(\[[^\]]+\])", script_text)
if breakdown_match is None:
    raise ValueError("Could not locate the 'pitchBreakdown' array in the page script")
breakdown_raw = breakdown_match.group(1)
pitch_distribution = _parse_js_literal(breakdown_raw)
distribution_df = pd.DataFrame(pitch_distribution)

# --- Final Output ---
print("\nPitch Mix DataFrame:")
print(distribution_df[["pitchType", "name", "percent", "count"]])

print("\nSample Pitches DataFrame:")
print(pitches_df[["gd", "pitch_type", "vel", "x", "z"]].head())


Pitch Mix DataFrame:
  pitchType          name  percent  count
0        ST       Sweeper     35.7     10
1        FF   Four Seamer     32.1      9
2        SI        Sinker     28.6      8
3        FS  Split Finger      3.6      1

Sample Pitches DataFrame:
                         gd pitch_type    vel     x     z
0  2025-06-16T00:00:00.000Z         FF   98.3  0.22  2.77
1  2025-06-16T00:00:00.000Z         FF   98.6   1.5  1.44
2  2025-06-16T00:00:00.000Z         FF   99.1  1.12  1.89
3  2025-06-16T00:00:00.000Z         FF   98.3  2.04  0.78
4  2025-06-16T00:00:00.000Z         FF  100.2  1.25   1.1
