# Import Packages

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.parse import urlencode, urlparse, parse_qs
import numpy as np

# Calculate wOBA coefficients and wOBAscale

In [3]:
def calc_npb_coeffs(start_year: int, end_year: int, timeout: float = 30.0) -> pd.DataFrame:
    """Derive per-season NPB wOBA weights and run constants from FanGraphs data.

    For every season in ``start_year..end_year`` (inclusive) and each of the
    twelve team ids, the FanGraphs NPB batting leaderboard page is downloaded
    and the player records embedded in its ``__NEXT_DATA__`` JSON are collected.
    Per season, using only players with ``PA > 0``, the function computes:

    * ``uBB``, ``HBP``, ``1B``, ``2B``, ``3B``, ``HR``: least-squares weights
      fitted so that the weighted event counts reproduce
      ``wOBA * (AB + BB - IBB + SF + HBP)`` for each player.
    * ``lg_wOBA``: ``(H + uBB + HBP) / (AB + uBB + HBP + SF)`` on season totals.
    * ``wOBAscale``: mean of ``(wOBA - lg_wOBA) * PA / wRAA`` over players,
      after discarding values outside the 2.5%-97.5% quantile range.
    * ``lgR_PA``: total ``R`` divided by total ``PA``.
    * ``lgwSB``, ``runSB``, ``runCS``: stolen-base constants backed out of the
      ``wBsR`` column (``runSB`` is fixed at 0.2).

    Args:
        start_year: First season to fetch.
        end_year: Last season to fetch (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        One row per season, sorted by ``Season``. If the range contains no
        seasons (``start_year > end_year``) an empty frame with the same
        columns is returned.

    Raises:
        requests.HTTPError: If a leaderboard page returns an error status.
        RuntimeError: If a page has no ``__NEXT_DATA__`` payload or no player
            records for a season/team combination.
    """
    result_columns = [
        "Season", "uBB", "HBP", "1B", "2B", "3B", "HR",
        "lgR_PA", "lg_wOBA", "wOBAscale", "lgwSB", "runSB", "runCS",
    ]

    # --- Fetch and annotate records ---
    all_records = []
    stats_type = "bat"
    qual = 0

    # Teams in query order; ids outside cl_teams are labelled "pl"
    teams = [4,6,7,8,9,13,3,5,10,11,12,14]
    cl_teams = {4,6,7,8,9,13}
    base_url = "https://www.fangraphs.com/leaders/international/npb"

    for season in range(start_year, end_year + 1):
        for team in teams:
            params = {"qual": qual, "seasonstart": season, "seasonend": season, "stats": stats_type, "team": team}
            url = f"{base_url}?{urlencode(params)}"
            # A timeout keeps a stalled connection from hanging the notebook forever.
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")
            script = soup.find("script", id="__NEXT_DATA__")
            if script is None or not script.string:
                raise RuntimeError(f"No __NEXT_DATA__ payload for season={season}, team={team}")
            data = json.loads(script.string)
            queries = data["props"]["pageProps"]["dehydratedState"]["queries"]

            # The player table is the first query whose data is a non-empty
            # list of dicts carrying a "Name" key.
            records = None
            for q in queries:
                dl = (q.get("state") or {}).get("data")
                if isinstance(dl, list) and dl and isinstance(dl[0], dict) and "Name" in dl[0]:
                    records = dl
                    break
            if records is None:
                raise RuntimeError(f"No data for season={season}, team={team}")

            for rec in records:
                rec["Season"] = season
                rec["League"] = "cl" if team in cl_teams else "pl"
                # "Name" holds an HTML anchor; the position is read from the
                # "position" query parameter of its href when present.
                name_html = rec.get("Name") or ""
                tag_soup = BeautifulSoup(name_html, "html.parser")
                a = tag_soup.find("a")
                if a and a.has_attr("href"):
                    qs = urlparse(a["href"]).query
                    rec["Position"] = parse_qs(qs).get("position", [None])[0]
                else:
                    rec["Position"] = None

            all_records.extend(records)

    # Nothing fetched (empty season range): return an empty, correctly shaped
    # table instead of failing on the "Season" groupby below.
    if not all_records:
        return pd.DataFrame(columns=result_columns)

    df = pd.json_normalize(all_records)
    results = []

    # runSB is fixed
    runSB = 0.2
    event_cols = ["uBB", "HBP", "1B", "2B", "3B", "HR"]

    for season, df_sea in df.groupby("Season"):
        df_sea = df_sea[df_sea["PA"] > 0].copy()
        # Basic fields
        df_sea["uBB"] = df_sea["BB"] - df_sea["IBB"]
        # Denominator for stolen base linear weight
        df_sea["denomSB"] = df_sea["1B"] + df_sea["BB"] - df_sea["IBB"] + df_sea["HBP"]

        # Compute lgwSB: use records with SB==0, CS==0, denomSB!=0.
        # For those players the SB and CS terms vanish, leaving
        # wBsR = -lgwSB * denomSB.
        # NOTE(review): this treats wBsR as if it were wSB only - confirm.
        mask_lg = (df_sea["SB"] == 0) & (df_sea["CS"] == 0) & (df_sea["denomSB"] != 0)
        lgwSB_vals = df_sea.loc[mask_lg, "wBsR"] / df_sea.loc[mask_lg, "denomSB"] * -1
        lgwSB = lgwSB_vals.mean()

        # Compute runCS: use records with CS != 0
        mask_cs = df_sea["CS"] != 0
        runCS_vals = (
            df_sea.loc[mask_cs, "wBsR"]
            - runSB * df_sea.loc[mask_cs, "SB"]
            + lgwSB * df_sea.loc[mask_cs, "denomSB"]
        ) / df_sea.loc[mask_cs, "CS"]
        runCS = runCS_vals.mean()

        # Prepare regression target for event weights
        df_sea["denom"] = df_sea["AB"] + df_sea["BB"] - df_sea["IBB"] + df_sea["SF"] + df_sea["HBP"]
        df_sea["y"] = df_sea["denom"] * df_sea["wOBA"]
        # A single missing value would poison the whole least-squares fit, so
        # incomplete rows are dropped before solving.
        reg = df_sea[event_cols + ["y"]].dropna()
        if reg.empty:
            beta = np.full(len(event_cols), np.nan)
        else:
            X = reg[event_cols].to_numpy(dtype=float)
            y = reg["y"].to_numpy(dtype=float)
            beta, *_ = np.linalg.lstsq(X, y, rcond=None)

        # League wOBA
        tot_H   = df_sea["H"].sum()
        tot_uBB = df_sea["uBB"].sum()
        tot_HBP = df_sea["HBP"].sum()
        tot_SF  = df_sea["SF"].sum()
        tot_AB  = df_sea["AB"].sum()
        league_wOBA = (tot_H + tot_uBB + tot_HBP) / (tot_AB + tot_uBB + tot_HBP + tot_SF)

        # wOBA scale (trimmed mean); wRAA == 0 yields +/-inf, which is discarded
        df_sea["scale"] = ((df_sea["wOBA"] - league_wOBA) * df_sea["PA"]) / df_sea["wRAA"]
        scales = df_sea["scale"].replace([np.inf, -np.inf], np.nan).dropna()
        lo, hi = scales.quantile([0.025, 0.975])
        mean_scale = scales[(scales >= lo) & (scales <= hi)].mean()

        # Runs per PA
        total_R  = df_sea["R"].sum()
        total_PA = df_sea["PA"].sum()
        lgR_PA   = total_R / total_PA if total_PA > 0 else np.nan

        results.append({
            "Season":    season,
            "uBB":       beta[0],
            "HBP":       beta[1],
            "1B":        beta[2],
            "2B":        beta[3],
            "3B":        beta[4],
            "HR":        beta[5],
            "lgR_PA":    lgR_PA,
            "lg_wOBA":   league_wOBA,
            "wOBAscale": mean_scale,
            "lgwSB":     lgwSB,
            "runSB":     runSB,
            "runCS":     runCS,
        })

    return (
        pd.DataFrame(results, columns=result_columns)
        .sort_values("Season")
        .reset_index(drop=True)
    )

In [4]:
# Season window for the coefficient table: 2019 through 2025, both inclusive
start_year, end_year = 2019, 2025

# Download the batting data and derive one row of coefficients per season
df_coeffs_by_season = calc_npb_coeffs(start_year=start_year, end_year=end_year)

# Preview up to the first 10 rows of the result
df_coeffs_by_season.head(n=10)

Unnamed: 0,Season,uBB,HBP,1B,2B,3B,HR,lgR_PA,lg_wOBA,wOBAscale,lgwSB,runSB,runCS
0,2019,0.704138,0.736161,0.902682,1.286963,1.632815,2.114737,0.11135,0.321429,1.280934,0.001302,0.2,-0.392547
1,2020,0.707703,0.740318,0.909917,1.301299,1.653542,2.148281,0.109368,0.321833,1.304606,0.002227,0.2,-0.38655
2,2021,0.693758,0.727447,0.902627,1.30689,1.670726,2.199296,0.101582,0.312737,1.347542,0.001654,0.2,-0.360505
3,2022,0.686079,0.720881,0.901849,1.319469,1.695327,2.254673,0.095744,0.305781,1.392066,0.001709,0.2,-0.34138
4,2023,0.687127,0.722514,0.906523,1.331161,1.713335,2.286701,0.093369,0.305058,1.41546,0.00188,0.2,-0.334852
5,2024,0.687494,0.724387,0.916228,1.358939,1.757378,2.366495,0.088441,0.300512,1.475702,0.002984,0.2,-0.319472
6,2025,0.679623,0.717344,0.913494,1.366147,1.773536,2.407149,0.083168,0.294765,1.508845,0.005411,0.2,-0.305135


# Calculate Park Factors (from wRC+)

In [4]:
def calculate_pf(start_year: int, end_year: int, timeout: float = 30.0) -> pd.DataFrame:
    """Back out per-team park factors from FanGraphs NPB wRC+ data.

    For every season in ``start_year..end_year`` (inclusive) and each of the
    twelve team ids, the FanGraphs NPB batting leaderboard page is downloaded
    and the player records embedded in its ``__NEXT_DATA__`` JSON are collected.
    For each player row a park factor is computed as::

        PF = 2 - ((wRC+ / 100) * wRC_PA - wRAA / PA) / lgR_PA

    where ``lgR_PA`` is the season's total R / total PA (players with PA > 0)
    and ``wRC_PA`` is the season's wRC / PA of the player's league, excluding
    rows whose position is ``'P'``. The player values are then averaged per
    (Season, League, Team), weighted by PA.

    NOTE(review): the formula looks like the wRC+ definition solved for the
    park factor - confirm against the FanGraphs wRC+ formula.

    Args:
        start_year: First season to fetch.
        end_year: Last season to fetch (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame with columns ``Season``, ``League``, ``Team``, ``PF``, sorted
        by those keys. If the range contains no seasons
        (``start_year > end_year``) an empty frame with the same columns is
        returned.

    Raises:
        requests.HTTPError: If a leaderboard page returns an error status.
        RuntimeError: If a page has no ``__NEXT_DATA__`` payload or no player
            records for a season/team combination.
    """
    out_columns = ['Season', 'League', 'Team', 'PF']

    # --- 1) Fetch and annotate records ---
    all_records = []
    stats_type = "bat"

    # Teams in query order
    teams = [4,6,7,8,9,13,3,5,10,11,12,14]
    cl_teams = {4,6,7,8,9,13}
    pl_teams = {3,5,10,11,12,14}
    base_url = "https://www.fangraphs.com/leaders/international/npb"

    for season in range(start_year, end_year + 1):
        for team in teams:
            # Build URL with query parameters
            params = {"qual": 0, "seasonstart": season, "seasonend": season, "stats": stats_type, "team": team}
            url = f"{base_url}?{urlencode(params)}"

            # Request and parse page; the timeout keeps a stalled connection
            # from hanging the notebook forever.
            res = requests.get(url, timeout=timeout)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")

            # Extract embedded __NEXT_DATA__ JSON
            script = soup.find("script", id="__NEXT_DATA__")
            if script is None or not script.string:
                raise RuntimeError(f"No __NEXT_DATA__ payload for season={season}, team={team}")
            data = json.loads(script.string)
            queries = data["props"]["pageProps"]["dehydratedState"]["queries"]

            # Locate the records block: the first query whose data is a
            # non-empty list of dicts carrying a "Name" key.
            records = None
            for q in queries:
                dl = (q.get("state") or {}).get("data")
                if isinstance(dl, list) and dl and isinstance(dl[0], dict) and "Name" in dl[0]:
                    records = dl
                    break
            if records is None:
                raise RuntimeError(f"No data for season={season}, team={team}")

            # Annotate each record
            for rec in records:
                rec["Season"] = season
                rec["League"] = "cl" if team in cl_teams else "pl" if team in pl_teams else None

                # Extract Position from the Name field URL
                name_html = rec.get("Name") or ""
                tag_soup = BeautifulSoup(name_html, "html.parser")
                a = tag_soup.find("a")
                if a and a.has_attr("href"):
                    qs = urlparse(a["href"]).query
                    rec["Position"] = parse_qs(qs).get("position", [None])[0]
                else:
                    rec["Position"] = None

            all_records.extend(records)

    # Nothing fetched (empty season range): return an empty, correctly shaped
    # table instead of failing on the 'Season' groupby below.
    if not all_records:
        return pd.DataFrame(columns=out_columns)

    # Normalize to DataFrame
    df = pd.json_normalize(all_records)

    # --- 2) Compute season-level lgR_PA, wRC_PA_cl, wRC_PA_pl ---
    coef_list = []
    for season, grp in df.groupby('Season'):
        df_s = grp[grp['PA'] > 0]
        # overall runs per PA
        total_R = df_s['R'].sum()
        total_PA = df_s['PA'].sum()
        lgR_PA = total_R / total_PA if total_PA > 0 else np.nan

        # CL non-pitchers wRC per PA (rows with no known position are kept)
        m_cl = (df_s['League'] == 'cl') & (df_s['Position'] != 'P')
        pa_cl = df_s.loc[m_cl, 'PA'].sum()
        wrc_cl = df_s.loc[m_cl, 'wRC'].sum()
        wRC_PA_cl = wrc_cl / pa_cl if pa_cl > 0 else np.nan

        # PL non-pitchers wRC per PA (rows with no known position are kept)
        m_pl = (df_s['League'] == 'pl') & (df_s['Position'] != 'P')
        pa_pl = df_s.loc[m_pl, 'PA'].sum()
        wrc_pl = df_s.loc[m_pl, 'wRC'].sum()
        wRC_PA_pl = wrc_pl / pa_pl if pa_pl > 0 else np.nan

        coef_list.append({'Season': season, 'lgR_PA': lgR_PA, 'wRC_PA_cl': wRC_PA_cl, 'wRC_PA_pl': wRC_PA_pl})

    df_coef = pd.DataFrame(coef_list)

    # --- 3) Merge and choose wRC_PA by league ---
    df_all = pd.merge(df, df_coef, on='Season', how='left')
    df_all['wRC_PA'] = np.where(df_all['League'] == 'cl', df_all['wRC_PA_cl'], df_all['wRC_PA_pl'])
    # Rebind instead of mutating in place so the merged frame is never left
    # half-modified.
    df_all = df_all.drop(columns=['wRC_PA_cl', 'wRC_PA_pl'])

    # --- 4) Calculate PF for each record ---
    # Rows without a usable wRC+ or with PA == 0 get NaN and are excluded from
    # the team average below.
    # NOTE(review): -100 is presumably the source's placeholder for an
    # undefined wRC+ - confirm.
    df_all['PF'] = np.where(
        df_all['wRC+'].isna() |
        (df_all['wRC+'] == -100) |
        (df_all['PA'] == 0),
        np.nan,
        2 - (((df_all['wRC+'] / 100) * df_all['wRC_PA'] - (df_all['wRAA'] / df_all['PA'])) / df_all['lgR_PA']))

    # --- 5) PA-weighted average PF by Team, League, Season ---
    df_valid = df_all[df_all['PF'].notna()].copy()
    df_valid['PFxPA'] = df_valid['PF'] * df_valid['PA']
    df_out = (
        df_valid
        .groupby(['Season', 'League', 'Team'])
        .agg(
            total_PFxPA=('PFxPA', 'sum'),
            total_PA=('PA', 'sum')
        )
        .reset_index()
    )
    df_out['PF'] = df_out['total_PFxPA'] / df_out['total_PA']

    return (
        df_out[out_columns]
        .sort_values(['Season','League','Team'])
        .reset_index(drop=True)
    )

In [5]:
# Season window for the park-factor table: 2024 through 2025, both inclusive
start_year, end_year = 2024, 2025

# Download the batting data and derive one park factor per season/league/team
df_season_pf = calculate_pf(start_year=start_year, end_year=end_year)

# Preview up to the first 10 rows of the result
df_season_pf.head(n=10)

Unnamed: 0,Season,League,Team,PF
0,2024,cl,BayStars (NPB),1.030336
1,2024,cl,Carp (NPB),0.991043
2,2024,cl,Dragons (NPB),0.931763
3,2024,cl,Giants (NPB),1.001669
4,2024,cl,Swallows (NPB),1.057778
5,2024,cl,Tigers (NPB),0.979296
6,2024,pl,Buffaloes (NPB),0.982944
7,2024,pl,Fighters (NPB),1.028555
8,2024,pl,Golden Eagles (NPB),0.976696
9,2024,pl,Hawks (NPB),0.998346
