# Import Packages

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.parse import urlencode, urlparse, parse_qs
import numpy as np

# Extracting FanGraphs data (Leaderboard - Single Season)

In [2]:
def _npb_records_from_html(html: str, context: str) -> list:
    """Pull the player-record list out of a leaderboard page's embedded JSON.

    The page carries its data in a ``<script id="__NEXT_DATA__">`` tag. The
    record list is the first query result that is a non-empty list whose
    first element is a dict containing a ``'Name'`` key.

    Args:
        html: Raw HTML of the leaderboard page.
        context: Short description of the request (used in error messages).

    Returns:
        The list of record dicts found in the page.

    Raises:
        RuntimeError: If the ``__NEXT_DATA__`` tag is missing/empty, or no
            query result looks like a player-record list.
    """
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', id='__NEXT_DATA__')
    # Fail with a clear message instead of an AttributeError on None.string
    if script is None or not script.string:
        raise RuntimeError(f"__NEXT_DATA__ payload not found for {context}")

    data = json.loads(script.string)
    queries = data['props']['pageProps']['dehydratedState']['queries']

    for q in queries:
        dl = q.get('state', {}).get('data')
        # isinstance(dl[0], dict) keeps `'Name' in ...` from raising on scalars
        if isinstance(dl, list) and dl and isinstance(dl[0], dict) and 'Name' in dl[0]:
            return dl

    raise RuntimeError(f"No data for {context}")


def _npb_position_from_name(name_html):
    """Return the ``position`` query parameter of the first link in ``name_html``.

    Returns ``None`` when the value is empty, contains no ``<a href=...>``,
    or the link's query string has no ``position`` parameter.
    """
    anchor = BeautifulSoup(name_html or '', 'html.parser').find('a')
    if anchor is None or not anchor.has_attr('href'):
        return None
    query = parse_qs(urlparse(anchor['href']).query)
    return query.get('position', [None])[0]


def fetch_npb_lb_single(
    start_year: int,
    end_year: int,
    stats_type: str,
    qual: int = 0,
    type_: int = 0,
    timeout: float = 30.0
) -> pd.DataFrame:
    """Fetch the NPB leaderboard one season and one team at a time.

    For every season in ``start_year..end_year`` (inclusive) and every team id
    in the hard-coded team list, the leaderboard page is requested and the
    records embedded in its ``__NEXT_DATA__`` JSON are collected. Each record
    is annotated in place with:

    * ``Season``   - the season that was requested,
    * ``League``   - ``'cl'`` or ``'pl'`` depending on the team id,
    * ``Position`` - the ``position`` query parameter of the link found in
      the record's ``Name`` field (``None`` if unavailable).

    Args:
        start_year: First season to fetch.
        end_year: Last season to fetch (inclusive). If smaller than
            ``start_year`` no request is made and an empty frame is returned.
        stats_type: Value sent as the ``stats`` query parameter
            (the notebook uses ``"bat"`` or ``"pit"``).
        qual: Value sent as the ``qual`` query parameter.
        type_: Currently unused; accepted so existing callers keep working.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame built with ``pd.json_normalize`` from all collected records.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
        RuntimeError: If a page has no ``__NEXT_DATA__`` payload or no
            player records for a given season/team.
    """
    # The order in which teams are queried
    teams = [4, 6, 7, 8, 9, 13,  3, 5, 10, 11, 12, 14]
    # Teams in the Central League
    cl_teams = {4, 6, 7, 8, 9, 13}
    # Teams in the Pacific League
    pl_teams = {3, 5, 10, 11, 12, 14}

    base_url = 'https://www.fangraphs.com/leaders/international/npb'

    all_records = []

    for season in range(start_year, end_year + 1):
        for team in teams:
            # Build the request URL
            params = {
                'qual': qual,
                'seasonstart': season,
                'seasonend':   season,
                'stats':       stats_type,
                'team':        team
            }
            url = base_url + '?' + urlencode(params)

            # Fetch the page; the timeout prevents a stalled connection from
            # hanging the whole notebook
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()

            records = _npb_records_from_html(
                res.text, f"season={season}, team={team}"
            )

            # The league depends only on the team, so resolve it once per page
            if team in cl_teams:
                league = 'cl'
            elif team in pl_teams:
                league = 'pl'
            else:
                league = None

            # Annotate each record with Season, League and Position
            for rec in records:
                rec['Season'] = season
                rec['League'] = league
                rec['Position'] = _npb_position_from_name(rec.get('Name'))

            all_records.extend(records)

    # Normalize the list of records into a DataFrame
    return pd.json_normalize(all_records)

In [3]:
# Leaderboard category to download: "bat" (batting) or "pit" (pitching)
stats_type = "bat"

# First and last season to download (here 2024 through 2025)
start_year, end_year = 2024, 2025

# Download the per-season leaderboard and discard the "Name" field
df_lb_single = fetch_npb_lb_single(
    start_year, end_year, stats_type
).drop(columns=['Name'])

# Preview the first five rows
df_lb_single.head()

Unnamed: 0,Team,G,AB,PA,H,1B,2B,3B,HR,R,...,wBsR,Age,Season,PlayerName,JName,teamid,playerids,minormasterid,League,Position
0,Giants (NPB),58,0,0,0,0,0,0,0,0,...,0.0,31,2024,Alberto Baldonado,,4,11812,sa550649,cl,P
1,Giants (NPB),56,221,240,65,46,11,0,8,34,...,-0.507462,29,2024,Elier Hernandez,,4,13750,sa659238,cl,DH/OF
2,Giants (NPB),2,1,1,0,0,0,0,0,0,...,0.0,29,2024,Yohander Méndez,,4,14391,sa657666,cl,P
3,Giants (NPB),20,30,35,3,1,2,0,0,0,...,-0.002984,28,2024,Foster Griffin,,4,16432,sa828674,cl,P
4,Giants (NPB),52,0,0,0,0,0,0,0,0,...,0.0,31,2024,Kyle Keller,,4,18890,sa875759,cl,P


# Extracting FanGraphs data (Leaderboard - Multiple Seasons)

In [10]:
def fetch_npb_lb_multiple(start_year: int,
                          end_year: int,
                          stats_type: str,
                          qual: int = 0,
                          timeout: float = 30.0) -> pd.DataFrame:
    """Fetch the NPB leaderboard for a season range with a single request.

    The leaderboard page is requested once with ``seasonstart``/``seasonend``
    set to the given years, and the records embedded in its
    ``__NEXT_DATA__`` JSON are returned. Each record is annotated in place
    with ``Position``: the ``position`` query parameter of the link found in
    the record's ``Name`` field (``None`` if unavailable).

    Args:
        start_year: Value sent as the ``seasonstart`` query parameter.
        end_year: Value sent as the ``seasonend`` query parameter.
        stats_type: Value sent as the ``stats`` query parameter
            (the notebook uses ``"bat"`` or ``"pit"``).
        qual: Value sent as the ``qual`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A DataFrame built with ``pd.json_normalize`` from the records.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the request exceeds ``timeout``.
        RuntimeError: If the page has no ``__NEXT_DATA__`` payload or no
            player-record list inside it.
    """
    # 1. Construct URL and fetch page. urlencode escapes the parameter values,
    #    which plain f-string interpolation would not.
    query_string = urlencode({
        'qual': qual,
        'seasonstart': start_year,
        'seasonend': end_year,
        'stats': stats_type,
    })
    url = 'https://www.fangraphs.com/leaders/international/npb?' + query_string

    # The timeout prevents a stalled connection from hanging the notebook
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # 2. Parse __NEXT_DATA__ JSON
    script = soup.find('script', id='__NEXT_DATA__')
    # Fail with a clear message instead of an AttributeError on None.string
    if script is None or not script.string:
        raise RuntimeError("__NEXT_DATA__ payload not found in the page.")
    data = json.loads(script.string)

    # 3. Search queries for the players data list: the first non-empty list
    #    whose first element is a dict with a 'Name' key
    queries = data['props']['pageProps']['dehydratedState']['queries']
    records = None
    for query in queries:
        data_list = query.get('state', {}).get('data')
        if (
            isinstance(data_list, list)
            and data_list
            and isinstance(data_list[0], dict)
            and 'Name' in data_list[0]
        ):
            records = data_list
            break

    if records is None:
        raise RuntimeError("Player data not found in the JSON structure.")

    # 4. Extract the Position parameter from each record's 'Name' field URL
    for rec in records:
        name_html = rec.get('Name') or ''
        anchor = BeautifulSoup(name_html, 'html.parser').find('a')
        if anchor is not None and anchor.has_attr('href'):
            link_params = parse_qs(urlparse(anchor['href']).query)
            rec['Position'] = link_params.get('position', [None])[0]
        else:
            rec['Position'] = None

    # 5. Normalize to pandas DataFrame
    return pd.json_normalize(records)

In [11]:
# Leaderboard category to download: "bat" (batting) or "pit" (pitching)
stats_type = "bat"

# Season range passed to the leaderboard query (here 2024 through 2025)
start_year, end_year = 2024, 2025

# Download the multi-season leaderboard and discard the "Name" field
df_lb_multiple = fetch_npb_lb_multiple(
    start_year, end_year, stats_type
).drop(columns=['Name'])

# Preview the first five rows
df_lb_multiple.head()

Unnamed: 0,Team,G,AB,PA,H,1B,2B,3B,HR,R,...,wRC+,wBsR,Age,Season,PlayerName,JName,teamid,playerids,minormasterid,Position
0,- - -,140,485,560,149,98,33,0,18,67,...,165.908064,-0.471926,—,2024,Domingo Santana,,8,10348,sa503853,OF
1,Lions (NPB),30,113,124,23,16,5,0,2,7,...,78.129095,-0.080567,—,2024,Jesús Aguilar,,3,11342,sa505981,1B
2,Marines (NPB),7,0,0,0,0,0,0,0,0,...,,0.0,—,2025,Tayron Guerrero,,11,11691,sa551046,P
3,- - -,60,0,0,0,0,0,0,0,0,...,,0.0,—,2024,Alberto Baldonado,,4,11812,sa550649,P
4,- - -,112,418,471,130,66,37,2,25,69,...,193.266397,-0.670609,—,2024,Tyler Austin,,13,11850,sa549178,1B
