# Import Packages

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# Extracting Fangraphs data (Leaderboard - Pitching, Single Season)

In [7]:
def fetch_npb_lb_pit_single(start_year, end_year, qual=0, stats='pit', pos='all', type_=0, timeout=30):
    """
    Fetch NPB Pitching leaders data from Fangraphs for each season in the given range.

    One request is issued per season; the player records are read from the
    JSON embedded in the page's ``__NEXT_DATA__`` script tag, and every record
    is stamped with the season it was fetched for.

    Arguments:
        start_year (int): The starting season (inclusive).
        end_year (int): The ending season (inclusive).
        qual (int): Qualification filter, default 0.
        stats (str): Stats type, default 'pit'.
        pos (str): Position filter, default 'all'.
        type_ (int): Leader type, default 0.
        timeout (float): Seconds to wait for each HTTP request, default 30.

    Returns:
        pandas.DataFrame: DataFrame containing the leader records for all seasons.
        Empty when ``start_year`` is greater than ``end_year`` (no season is fetched).

    Raises:
        requests.HTTPError: If a page responds with an error status.
        RuntimeError: If the ``__NEXT_DATA__`` script or the player data list
            cannot be found for a season.
        KeyError: If the embedded JSON lacks the expected
            ``props.pageProps.dehydratedState.queries`` path.
    """
    all_records = []

    for year in range(start_year, end_year + 1):
        # 1. Fetch the page for this single season. Passing `params` lets
        #    requests URL-encode the values; `timeout` keeps a stalled
        #    connection from hanging the whole loop.
        response = requests.get(
            'https://www.fangraphs.com/leaders/international/npb',
            params={
                'qual': qual,
                'seasonstart': year,
                'seasonend': year,
                'type': type_,
                'stats': stats,
                'pos': pos,
            },
            timeout=timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Parse __NEXT_DATA__ JSON (fail with a clear message if the tag is absent or empty)
        script = soup.find('script', id='__NEXT_DATA__')
        if script is None or not script.string:
            raise RuntimeError(f"__NEXT_DATA__ script not found for season {year}.")
        data = json.loads(script.string)

        # 3. Search queries for the players data list: the first non-empty
        #    list whose leading element is a dict carrying a 'Name' key.
        queries = data['props']['pageProps']['dehydratedState']['queries']
        records = None
        for query in queries:
            # `or {}` also covers a query whose 'state' is explicitly null.
            data_list = (query.get('state') or {}).get('data')
            if (
                isinstance(data_list, list)
                and data_list
                and isinstance(data_list[0], dict)
                and 'Name' in data_list[0]
            ):
                records = data_list
                break

        if records is None:
            raise RuntimeError(f"Player data not found for season {year}.")

        # Add season metadata (overwrites any 'Season' value already present)
        for rec in records:
            rec['Season'] = year

        all_records.extend(records)

    # 4. Normalize to pandas DataFrame
    return pd.json_normalize(all_records)

In [5]:
# Season window to pull, e.g. 2024 through 2025
start_year, end_year = 2024, 2025

# Download the leaderboard, then discard the "Name" column
df_lb_pit_single = fetch_npb_lb_pit_single(start_year, end_year)
df_lb_pit_single = df_lb_pit_single.drop('Name', axis='columns')

# Preview the first 5 rows of the dataframe
df_lb_pit_single.head()

Unnamed: 0,Team,W,L,ERA,G,GS,CG,ShO,SV,IP,...,K-BB%,Pitches,HLD,BS,Age,Season,PlayerName,JName,playerids,minormasterid
0,Giants (NPB),2,3,2.43871,58,0,0,0,9,51.2,...,0.159091,966,22,3,31,2024,Alberto Baldonado,,11812,sa550649
1,Dragons (NPB),0,1,5.192309,17,0,0,0,0,17.1,...,0.027027,293,0,0,31,2024,Michael Feliz,,11903,sa557309
2,Hawks (NPB),2,2,3.759494,8,5,0,0,0,26.1,...,0.12963,429,3,0,43,2024,Tsuyoshi Wada,,13046,sa637062
3,Fighters (NPB),2,2,3.122449,9,9,0,0,0,49.0,...,0.155779,735,0,0,33,2024,Drew VerHagen,,13424,sa578024
4,Hawks (NPB),0,3,3.756522,39,0,0,0,24,38.1,...,0.082803,568,1,3,29,2024,Roberto Osuna,,13764,sa594306


# Extracting Fangraphs data (Leaderboard - Pitching, Multiple Seasons)

In [12]:
def fetch_npb_lb_pit_multiple(start_year, end_year, qual=0, stats='pit', pos='all', type_=0, timeout=30):
    """
    Fetch NPB Pitching leaders data from Fangraphs for a given range of seasons.

    A single request covering the whole range is issued; the player records
    are read from the JSON embedded in the page's ``__NEXT_DATA__`` script tag.

    Arguments:
        start_year (int): The starting season (inclusive).
        end_year (int): The ending season (inclusive).
        qual (int): Qualification filter, default 0.
        stats (str): Stats type, default 'pit'.
        pos (str): Position filter, default 'all'.
        type_ (int): Leader type, default 0.
        timeout (float): Seconds to wait for the HTTP request, default 30.

    Returns:
        pandas.DataFrame: DataFrame containing the leader records.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        RuntimeError: If the ``__NEXT_DATA__`` script or the player data list
            cannot be found.
        KeyError: If the embedded JSON lacks the expected
            ``props.pageProps.dehydratedState.queries`` path.
    """
    # 1. Fetch the page. Passing `params` lets requests URL-encode the values;
    #    `timeout` keeps a stalled connection from blocking indefinitely.
    response = requests.get(
        'https://www.fangraphs.com/leaders/international/npb',
        params={
            'qual': qual,
            'seasonstart': start_year,
            'seasonend': end_year,
            'type': type_,
            'stats': stats,
            'pos': pos,
        },
        timeout=timeout,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # 2. Parse __NEXT_DATA__ JSON (fail with a clear message if the tag is absent or empty)
    script = soup.find('script', id='__NEXT_DATA__')
    if script is None or not script.string:
        raise RuntimeError("__NEXT_DATA__ script not found in the page.")
    data = json.loads(script.string)

    # 3. Search queries for the players data list: the first non-empty list
    #    whose leading element is a dict carrying a 'Name' key.
    queries = data['props']['pageProps']['dehydratedState']['queries']
    records = None
    for query in queries:
        # `or {}` also covers a query whose 'state' is explicitly null.
        data_list = (query.get('state') or {}).get('data')
        if (
            isinstance(data_list, list)
            and data_list
            and isinstance(data_list[0], dict)
            and 'Name' in data_list[0]
        ):
            records = data_list
            break

    if records is None:
        raise RuntimeError("Player data not found in the JSON structure.")

    # 4. Normalize to pandas DataFrame
    return pd.json_normalize(records)

In [10]:
# Season window to pull, e.g. 2024 through 2025
start_year, end_year = 2024, 2025

# Download the leaderboard, then discard the "Name" column
df_lb_pit_multiple = fetch_npb_lb_pit_multiple(start_year, end_year)
df_lb_pit_multiple = df_lb_pit_multiple.drop('Name', axis='columns')

# Preview the first 5 rows of the dataframe
df_lb_pit_multiple.head()

Unnamed: 0,Team,W,L,ERA,G,GS,CG,ShO,SV,IP,...,K-BB%,Pitches,HLD,BS,Age,Season,PlayerName,JName,playerids,minormasterid
0,Marines (NPB),1,1,3.6,5,0,0,0,1,5.0,...,0.35,89,1,0,—,2025,Tayron Guerrero,,11691,sa551046
1,- - -,2,3,2.54717,60,0,0,0,9,53.0,...,0.144737,1013,23,3,—,2024,Alberto Baldonado,,11812,sa550649
2,Dragons (NPB),0,1,5.192309,17,0,0,0,0,17.1,...,0.027027,293,0,0,—,2024,Michael Feliz,,11903,sa557309
3,BayStars (NPB),0,2,4.909091,2,2,0,0,0,11.0,...,0.1875,174,0,0,—,2025,Trevor Bauer,,12703,sa597749
4,Hawks (NPB),2,2,3.759494,8,5,0,0,0,26.1,...,0.12963,429,3,0,—,2024,Tsuyoshi Wada,,13046,sa637062
