# Import Packages

In [24]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# Extracting Fangraphs data (Leaderboard - Single Season)

In [15]:
def fetch_npb_lb_single(start_year, end_year, stats_type, qual=0, pos='all', type_=0, timeout=30):
    """
    Fetch NPB leaderboard data from Fangraphs, issuing one request per season.

    Each season page embeds its data as JSON in a ``<script id="__NEXT_DATA__">``
    tag; the player list is read from there and tagged with the season it was
    requested for.

    Arguments:
        start_year (int): The starting season (inclusive).
        end_year (int): The ending season (inclusive).
        stats_type (str): Stats type sent as the ``stats`` query parameter
            (e.g. 'bat' or 'pit').
        qual (int): Qualification filter, default 0.
        pos (str): Position filter, default 'all'.
        type_ (int): Accepted for interface compatibility; it is not
            currently included in the request URL.
        timeout (float): Seconds to wait for each HTTP request before
            giving up, default 30.

    Returns:
        pandas.DataFrame: DataFrame containing leader records for all seasons,
        with a 'Season' column set to the requested year. The frame is empty
        when ``start_year`` is greater than ``end_year`` (no request is made).

    Raises:
        requests.HTTPError: If a page responds with an error status.
        RuntimeError: If the embedded JSON payload or the player list cannot
            be located for a season.
        KeyError: If the JSON payload lacks the expected
            props/pageProps/dehydratedState/queries path.
    """
    all_records = []

    for year in range(start_year, end_year + 1):
        # 1. Construct URL for the single season and fetch page
        url = (
            'https://www.fangraphs.com/leaders/international/npb'
            f'?qual={qual}&seasonstart={year}&seasonend={year}&stats={stats_type}&pos={pos}'
        )
        # Without a timeout a stalled connection would block the loop forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Parse __NEXT_DATA__ JSON
        script = soup.find('script', id='__NEXT_DATA__')
        if script is None or not script.string:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError(f"__NEXT_DATA__ payload not found for season {year}.")
        data = json.loads(script.string)

        # 3. Search queries for the players data list
        queries = data['props']['pageProps']['dehydratedState']['queries']
        records = None
        for query in queries:
            # 'state' may be present but null, so fall back to an empty dict.
            data_list = (query.get('state') or {}).get('data')
            # Require a dict first element so that 'Name' is tested as a key,
            # not as a substring of some unrelated string entry.
            if (
                isinstance(data_list, list)
                and data_list
                and isinstance(data_list[0], dict)
                and 'Name' in data_list[0]
            ):
                records = data_list
                break

        if records is None:
            raise RuntimeError(f"Player data not found for season {year}.")

        # Add season metadata
        for rec in records:
            rec['Season'] = year

        all_records.extend(records)

    # 4. Normalize to pandas DataFrame
    return pd.json_normalize(all_records)

In [17]:
# Season window to pull (both ends inclusive) and the stat group to request:
# "bat" for batting, "pit" for pitching.
start_year, end_year = 2024, 2025
stats_type = "bat"

# Pull one leaderboard per season, then discard the "Name" column.
df_lb_single = fetch_npb_lb_single(start_year, end_year, stats_type).drop(columns=['Name'])

# Preview the first five rows of the result.
df_lb_single.head()

Unnamed: 0,Team,G,AB,PA,H,1B,2B,3B,HR,R,...,wOBA,wRC+,wBsR,Age,Season,PlayerName,JName,teamid,playerids,minormasterid
0,Swallows (NPB),122,419,484,132,86,29,0,17,57,...,0.413732,171.900434,-0.355131,31,2024,Domingo Santana,,8,10348,sa503853
1,Lions (NPB),30,113,124,23,16,5,0,2,7,...,0.272771,78.129095,-0.080567,34,2024,Jesús Aguilar,,3,11342,sa505981
2,Giants (NPB),58,0,0,0,0,0,0,0,0,...,0.0,,0.0,31,2024,Alberto Baldonado,,4,11812,sa550649
3,BayStars (NPB),106,396,445,125,64,34,2,25,66,...,0.44488,197.176828,-0.638756,32,2024,Tyler Austin,,13,11850,sa549178
4,Dragons (NPB),17,0,0,0,0,0,0,0,0,...,0.0,,0.0,31,2024,Michael Feliz,,7,11903,sa557309


# Extracting Fangraphs data (Leaderboard - Multiple Seasons)

In [21]:
def fetch_npb_lb_multiple(start_year, end_year, stats_type, qual=0, pos='all', type_=0, timeout=30):
    """
    Fetch NPB leaderboard data from Fangraphs with one request spanning a range of seasons.

    The whole range is sent as ``seasonstart``/``seasonend`` in a single
    request; how the seasons are combined is decided by Fangraphs. The player
    list is read from the JSON embedded in the page's
    ``<script id="__NEXT_DATA__">`` tag.

    Arguments:
        start_year (int): The starting season (inclusive).
        end_year (int): The ending season (inclusive).
        stats_type (str): Stats type sent as the ``stats`` query parameter
            (e.g. 'bat' or 'pit').
        qual (int): Qualification filter, default 0.
        pos (str): Position filter, default 'all'.
        type_ (int): Accepted for interface compatibility; it is not
            currently included in the request URL.
        timeout (float): Seconds to wait for the HTTP request before
            giving up, default 30.

    Returns:
        pandas.DataFrame: DataFrame containing the leader records.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        RuntimeError: If the embedded JSON payload or the player list cannot
            be located.
        KeyError: If the JSON payload lacks the expected
            props/pageProps/dehydratedState/queries path.
    """
    # 1. Construct URL and fetch page
    url = (
        'https://www.fangraphs.com/leaders/international/npb'
        f'?qual={qual}&seasonstart={start_year}&seasonend={end_year}&stats={stats_type}&pos={pos}'
    )
    # Without a timeout a stalled connection would block indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # 2. Parse __NEXT_DATA__ JSON
    script = soup.find('script', id='__NEXT_DATA__')
    if script is None or not script.string:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("__NEXT_DATA__ payload not found in the page.")
    data = json.loads(script.string)

    # 3. Search queries for the players data list
    queries = data['props']['pageProps']['dehydratedState']['queries']
    records = None
    for query in queries:
        # 'state' may be present but null, so fall back to an empty dict.
        data_list = (query.get('state') or {}).get('data')
        # Require a dict first element so that 'Name' is tested as a key,
        # not as a substring of some unrelated string entry.
        if (
            isinstance(data_list, list)
            and data_list
            and isinstance(data_list[0], dict)
            and 'Name' in data_list[0]
        ):
            records = data_list
            break

    if records is None:
        raise RuntimeError("Player data not found in the JSON structure.")

    # 4. Normalize to pandas DataFrame
    return pd.json_normalize(records)

In [22]:
# Season window to pull (both ends inclusive) and the stat group to request:
# "bat" for batting, "pit" for pitching.
start_year, end_year = 2024, 2025
stats_type = "bat"

# Pull the leaderboard for the whole range in one request, then discard the "Name" column.
df_lb_multiple = fetch_npb_lb_multiple(start_year, end_year, stats_type).drop(columns=['Name'])

# Preview the first five rows of the result.
df_lb_multiple.head()

Unnamed: 0,Team,G,AB,PA,H,1B,2B,3B,HR,R,...,wOBA,wRC+,wBsR,Age,Season,PlayerName,JName,teamid,playerids,minormasterid
0,- - -,138,477,551,146,95,33,0,18,67,...,0.403781,166.31544,-0.443745,—,2024,Domingo Santana,,8,10348,sa503853
1,Lions (NPB),30,113,124,23,16,5,0,2,7,...,0.272771,78.129095,-0.080567,—,2024,Jesús Aguilar,,3,11342,sa505981
2,Marines (NPB),5,0,0,0,0,0,0,0,0,...,0.0,,0.0,—,2025,Tayron Guerrero,,11,11691,sa551046
3,- - -,60,0,0,0,0,0,0,0,0,...,0.0,,0.0,—,2024,Alberto Baldonado,,4,11812,sa550649
4,- - -,112,418,471,130,66,37,2,25,69,...,0.43865,193.421657,-0.668294,—,2024,Tyler Austin,,13,11850,sa549178
