# Import Packages

In [8]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from urllib.parse import urlencode, urlparse, parse_qs
import numpy as np

# Calculate wRC+ by team

In [9]:
def _extract_position(name_html):
    """Return the ``position`` query parameter of the first link in *name_html*.

    Returns None when *name_html* is empty, contains no ``<a>`` tag, the tag
    has no ``href``, or the link's query string has no ``position`` key.
    """
    if not name_html:
        return None
    anchor = BeautifulSoup(name_html, "html.parser").find("a")
    if anchor is None or not anchor.has_attr("href"):
        return None
    query = urlparse(anchor["href"]).query
    return parse_qs(query).get("position", [None])[0]


def _fetch_team_records(season, team, base_url, stats_type, timeout):
    """Download one team/season leaderboard page and return its list of player records.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        RuntimeError: if the page has no ``__NEXT_DATA__`` payload, or the
            payload contains no non-empty record list with a ``Name`` field.
    """
    params = {"qual": 0, "seasonstart": season, "seasonend": season, "stats": stats_type, "team": team}
    url = f"{base_url}?{urlencode(params)}"
    # Without a timeout a single stalled connection blocks the whole scrape.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "html.parser")
    script = soup.find("script", id="__NEXT_DATA__")
    if script is None or not script.string:
        # Fail with the request context instead of an AttributeError on None.
        raise RuntimeError(f"No __NEXT_DATA__ payload for season={season}, team={team}")
    data = json.loads(script.string)
    queries = data["props"]["pageProps"]["dehydratedState"]["queries"]

    # The page embeds several queries; the one we want is the first whose data
    # is a non-empty list of records carrying a "Name" field.
    for q in queries:
        dl = (q.get("state") or {}).get("data")
        if isinstance(dl, list) and dl and "Name" in dl[0]:
            return dl
    raise RuntimeError(f"No data for season={season}, team={team}")


def calculate_wrc_plus(start_year: int, end_year: int, *, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape batting leaderboards and compute a team-level wRC+ per season.

    For every season in ``start_year..end_year`` (inclusive) and every team id
    in the built-in list, the batting leaderboard page is downloaded and its
    player records are collected. From those records the function:

    1. derives season-level runs per PA (all records with PA > 0) and, per
       league label, wRC per PA over records whose position is not ``"P"``;
    2. back-solves a factor ``PF`` for each player record from its published
       ``wRC+``, ``wRAA`` and ``PA``;
    3. averages ``PF`` per (Season, Team), weighted by PA;
    4. recombines team totals (position != ``"P"``) with that factor into a
       single ``wRCplus`` value per (Season, Team).

    Args:
        start_year: First season to fetch.
        end_year: Last season to fetch (inclusive).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        DataFrame with columns ``Season``, ``Team`` and ``wRCplus``, sorted by
        Season then Team. Empty (with those columns) when
        ``start_year > end_year``.

    Raises:
        requests.HTTPError: if any page request returns an error status.
        RuntimeError: if a page does not contain the expected record payload.
    """
    result_columns = ["Season", "Team", "wRCplus"]
    if start_year > end_year:
        # An empty season range would otherwise crash on the empty frame below.
        return pd.DataFrame(columns=result_columns)

    # --- 1) Fetch and annotate records ---
    stats_type = "bat"
    # Team ids as used in the leaderboard URL, grouped by the league label
    # attached to their records ("cl" / "pl").
    cl_teams = {4, 6, 7, 8, 9, 13}
    teams = (4, 6, 7, 8, 9, 13, 3, 5, 10, 11, 12, 14)
    base_url = "https://www.fangraphs.com/leaders/international/npb"

    all_records = []
    for season in range(start_year, end_year + 1):
        for team in teams:
            records = _fetch_team_records(season, team, base_url, stats_type, timeout)
            league = "cl" if team in cl_teams else "pl"
            for rec in records:
                rec["Season"] = season
                rec["League"] = league
                # The position is only exposed inside the player link's query string.
                rec["Position"] = _extract_position(rec.get("Name") or "")
            all_records.extend(records)

    df = pd.json_normalize(all_records)

    # --- 2) Compute season-level coefficients ---
    coef_list = []
    for season, grp in df.groupby("Season"):
        df_s = grp[grp["PA"] > 0]
        total_PA = df_s["PA"].sum()
        lgR_PA = df_s["R"].sum() / total_PA if total_PA > 0 else np.nan

        row = {"Season": season, "lgR_PA": lgR_PA}
        non_pitcher = df_s["Position"] != "P"
        for league in ("cl", "pl"):
            mask = (df_s["League"] == league) & non_pitcher
            pa = df_s.loc[mask, "PA"].sum()
            wrc = df_s.loc[mask, "wRC"].sum()
            row[f"wRC_PA_{league}"] = wrc / pa if pa > 0 else np.nan
        coef_list.append(row)

    df_coef = pd.DataFrame(coef_list)

    # --- 3) Merge coefficients and compute PF per record ---
    df_all = pd.merge(df, df_coef, on="Season", how="left")
    # Each record uses the wRC/PA baseline of its own league.
    df_all["wRC_PA"] = np.where(df_all["League"] == "cl", df_all["wRC_PA_cl"], df_all["wRC_PA_pl"])
    df_all = df_all.drop(columns=["wRC_PA_cl", "wRC_PA_pl"])

    # Records with no wRC+, a wRC+ of exactly -100, or zero PA get no PF and
    # are therefore left out of the team average in step 4.
    df_all["PF"] = np.where(
        df_all["wRC+"].isna() | (df_all["wRC+"] == -100) | (df_all["PA"] == 0),
        np.nan,
        2 - (((df_all["wRC+"] / 100) * df_all["wRC_PA"] - (df_all["wRAA"] / df_all["PA"])) / df_all["lgR_PA"])
    )

    # --- 4) Compute team-level PF weighted average (weights = PA) ---
    df_valid = df_all[df_all["PF"].notna()].copy()
    df_valid["PFxPA"] = df_valid["PF"] * df_valid["PA"]
    df_pf_team = (
        df_valid
        .groupby(["Season", "Team"])
        .agg(total_PFxPA=("PFxPA", "sum"), total_PA=("PA", "sum"))
        .reset_index()
    )
    df_pf_team["PF_wavg"] = df_pf_team["total_PFxPA"] / df_pf_team["total_PA"]

    # --- 5) Compute team aggregates for wRCplus (position != "P" only) ---
    df_team = df_all[df_all["Position"] != "P"].copy()
    df_team_agg = (
        df_team
        .groupby(["Season", "Team"])
        .agg(
            PA_team=("PA", "sum"),
            wRAA_team=("wRAA", "sum"),
            wRC_team=("wRC", "sum"),
            # Constant within a (Season, Team) group, so mean just carries it through.
            lgR_PA=("lgR_PA", "mean"),
            wRC_PA=("wRC_PA", "mean"),
        )
        .reset_index()
    )

    # --- 6) Merge PF and compute wRCplus ---
    df_merged = pd.merge(
        df_team_agg,
        df_pf_team[["Season", "Team", "PF_wavg"]],
        on=["Season", "Team"],
        how="left"
    )
    # Inverse of the step-3 relation, applied to team totals.
    df_merged["wRCplus"] = (
        (df_merged["wRAA_team"] / df_merged["PA_team"])
        + df_merged["lgR_PA"] + (1 - df_merged["PF_wavg"]) * df_merged["lgR_PA"]
    ) / df_merged["wRC_PA"] * 100

    # --- 7) Return Season, Team, wRCplus ---
    return (
        df_merged[result_columns]
        .sort_values(["Season", "Team"])
        .reset_index(drop=True)
    )

In [10]:
# Season range to analyse, inclusive on both ends (here 2024 through 2025)
start_year, end_year = 2024, 2025

# Scrape the leaderboards and compute team-level wRC+ for that range
df_season_pf = calculate_wrc_plus(start_year=start_year, end_year=end_year)

# Preview the first 10 rows of the resulting dataframe
df_season_pf.head(n=10)

Unnamed: 0,Season,Team,wRCplus
0,2024,BayStars (NPB),110.888564
1,2024,Buffaloes (NPB),94.753274
2,2024,Carp (NPB),85.467041
3,2024,Dragons (NPB),97.860201
4,2024,Fighters (NPB),102.761484
5,2024,Giants (NPB),104.007071
6,2024,Golden Eagles (NPB),99.499954
7,2024,Hawks (NPB),123.982634
8,2024,Lions (NPB),75.855952
9,2024,Marines (NPB),98.725796
