In [1]:
# ============================================================
# Cell 1: Imports & paths
# ============================================================
import importlib.util
import sys

import numpy as np
import pandas as pd

# Column names shared by every merge in this notebook
PLAYER_KEY = "INDEX"        # id column taken from the player index
JOIN_KEY   = "PLAYER_NORM"  # normalized-name column the LIVE data is joined on

# File locations (relative to the notebook's working directory)
LIVE_PATH  = "data/basketballReference_LIVE_L1.csv"  # LIVE stats CSV read in Cell 4
UTILS_PATH = "../utils/utils.py"                     # helper module loaded in Cell 2


In [2]:
# ============================================================
# Cell 2: Load utils.py (normalize_name, load_player_index)
# ============================================================
# utils.py is imported straight from its file path (UTILS_PATH) instead of by
# package name, and is given the module name "utils_module".
spec = importlib.util.spec_from_file_location("utils_module", UTILS_PATH)
utils_module = importlib.util.module_from_spec(spec)
# The module is registered in sys.modules *before* its body is executed.
sys.modules["utils_module"] = utils_module
spec.loader.exec_module(utils_module)

# Bind helpers
# normalize_name is required: if utils.py does not define it, this line
# raises AttributeError.
normalize_name    = utils_module.normalize_name
# load_player_index is looked up leniently and is None when utils.py does not
# define it; Cell 3 checks for None before calling it.
load_player_index = getattr(utils_module, "load_player_index", None)

print("✅ utils.py loaded from:", UTILS_PATH)

✅ utils.py loaded from: ../utils/utils.py


In [3]:
# ============================================================
# Cell 3: Load player index (INDEX, PLAYER, PLAYER_NORM)
# ============================================================
# load_player_index is bound with a None fallback in Cell 2, so fail early
# with a readable message instead of "NoneType is not callable".
if load_player_index is None:
    raise RuntimeError("utils.py must expose load_player_index().")

index_df = load_player_index(utils_dir="../utils", index_filename="playerIndex.csv")

# Sanity check
display(index_df.head(10))
print("Rows:", len(index_df), "| Unique INDEX:", index_df[PLAYER_KEY].nunique())

# Slim view for merge
index_slim = index_df[[PLAYER_KEY, JOIN_KEY]].drop_duplicates()

# The LIVE data is joined on JOIN_KEY only, so the slim index must hold exactly
# one row per JOIN_KEY. Dropping duplicate (INDEX, PLAYER_NORM) pairs does not
# guarantee that: a normalized name attached to two different INDEX values
# would duplicate that player's stat line in the left merge. Report such names
# and keep a single row per key so the merge stays many-to-one.
ambiguous_keys = index_slim[index_slim[JOIN_KEY].duplicated(keep=False)]
if not ambiguous_keys.empty:
    print(
        f"⚠️ {ambiguous_keys[JOIN_KEY].nunique()} normalized name(s) map to more than one "
        f"{PLAYER_KEY}; keeping the first {PLAYER_KEY} for each."
    )
    display(ambiguous_keys.sort_values(JOIN_KEY))
    index_slim = index_slim.drop_duplicates(subset=JOIN_KEY, keep="first")

Unnamed: 0,INDEX,PLAYER,PLAYER_NORM
0,10001,Nikola Jokic,nikola jokic
1,10002,Shai Gilgeous-Alexander,shai gilgeous alexander
2,10003,Victor Wembanyama,victor wembanyama
3,10004,Luka Doncic,luka doncic
4,10005,Giannis Antetokounmpo,giannis antetokounmpo
5,10006,Anthony Davis,anthony davis
6,10007,Cade Cunningham,cade cunningham
7,10008,Anthony Edwards,anthony edwards
8,10009,Karl-Anthony Towns,karl anthony towns
9,10010,James Harden,james harden


Rows: 547 | Unique INDEX: 547


In [4]:
# ============================================================
# Cell 4: Load basketballReference LIVE data
# ============================================================
# Columns carried forward from the raw file, in this order (absent ones are skipped).
keep_cols = ["PLAYER","G","MPG","PTS","FGM","FGA","FG%","FTA","FTM","FT%","3PM","REB","AST","STL","BLK","TO"]

df_live = pd.read_csv(LIVE_PATH)

# Standardize column casing
df_live = df_live.rename(columns={"Player": "PLAYER"})

# Everything below is keyed on the player name, so fail with a clear message
# (same exception type a bare column lookup would raise) when it is absent.
if "PLAYER" not in df_live.columns:
    raise KeyError(f"'PLAYER' column not found in {LIVE_PATH}; columns: {list(df_live.columns)}")

# Keep only relevant columns, in order
df_live = df_live[[c for c in keep_cols if c in df_live.columns]].copy()

# Drop rows without a usable name BEFORE casting to str: astype(str) turns a
# missing value into the literal string "nan", which would then be normalized
# and joined as if it were a real player name.
has_name = df_live["PLAYER"].notna() & df_live["PLAYER"].astype(str).str.strip().ne("")
n_nameless = int((~has_name).sum())
if n_nameless:
    print(f"⚠️ Dropped {n_nameless} row(s) with a missing or blank PLAYER name.")
df_live = df_live.loc[has_name].reset_index(drop=True)

# Normalize player names for merging
df_live["PLAYER"] = df_live["PLAYER"].astype(str).str.strip()
df_live[JOIN_KEY] = df_live["PLAYER"].apply(normalize_name)

# Convert numeric columns to float safely (unparseable values become NaN).
# Iterate the list, not a set, so the processing order is deterministic.
num_cols = ["PTS","G","MPG","FGM","FGA","FG%","FTA","FTM","FT%","3PM","REB","AST","STL","BLK","TO"]
for c in [col for col in num_cols if col in df_live.columns]:
    df_live[c] = pd.to_numeric(df_live[c], errors="coerce")

display(df_live.head(10))

Unnamed: 0,PLAYER,G,MPG,PTS,FGM,FGA,FG%,FTA,FTM,FT%,3PM,REB,AST,STL,BLK,TO,PLAYER_NORM
0,Luka Dončić,2.0,38.0,46.0,15.5,25.0,0.62,14.5,11.5,0.793,3.5,11.5,8.5,1.0,0.5,3.0,luka doncic
1,Tyrese Maxey,4.0,43.0,37.5,11.5,25.3,0.455,11.5,10.0,0.87,4.5,3.8,8.3,1.0,0.8,2.0,tyrese maxey
2,Giannis Antetokounmpo,4.0,32.8,36.3,14.3,20.5,0.695,10.5,6.8,0.643,1.0,14.0,7.0,0.8,1.3,3.3,giannis antetokounmpo
3,Shai Gilgeous-Alexander,5.0,38.6,34.8,12.6,24.4,0.516,9.6,8.2,0.854,1.4,6.2,5.4,1.4,1.4,2.4,shai gilgeous alexander
4,Austin Reaves,5.0,38.2,34.2,10.4,19.8,0.525,11.6,10.4,0.897,3.0,5.6,10.0,1.8,0.0,3.8,austin reaves
5,Lauri Markkanen,4.0,38.3,34.0,11.0,21.8,0.506,8.8,8.0,0.914,4.0,7.3,2.8,0.8,0.8,1.0,lauri markkanen
6,Jalen Brunson,4.0,34.8,31.8,10.8,22.3,0.483,8.8,7.8,0.886,2.5,4.0,5.0,1.3,0.0,2.0,jalen brunson
7,Bennedict Mathurin,2.0,36.5,31.0,8.5,15.5,0.548,13.0,11.5,0.885,2.5,7.0,2.5,0.0,0.0,2.5,bennedict mathurin
8,Victor Wembanyama,4.0,32.3,31.0,11.0,18.3,0.603,10.0,8.0,0.8,1.0,13.8,2.8,1.5,4.8,1.5,victor wembanyama
9,Devin Booker,5.0,36.2,29.2,9.2,19.4,0.474,10.0,8.2,0.82,2.6,4.0,6.0,0.4,0.4,4.6,devin booker


In [5]:
# ============================================================
# Cell 5: Merge LIVE file with player index
# ============================================================
# Left join: every LIVE row is kept; rows whose normalized name has no entry
# in the index come back with a null INDEX.
df_live_idx = pd.merge(df_live, index_slim, how="left", on=JOIN_KEY)

# Merge results summary
no_index_mask = df_live_idx[PLAYER_KEY].isna()
total = len(df_live_idx)
matched = df_live_idx[PLAYER_KEY].notna().sum()
unmatched = total - matched
print(f"Rows: {total} | Matched INDEX: {matched} | Unmatched: {unmatched}")

# Either confirm a full match or show a sample of the players that missed
if unmatched == 0:
    print("✅ All players matched to INDEX.")
else:
    unmatched_df = df_live_idx.loc[no_index_mask, ["PLAYER", JOIN_KEY]]
    display(unmatched_df.head(20))

Rows: 422 | Matched INDEX: 377 | Unmatched: 45


Unnamed: 0,PLAYER,PLAYER_NORM
222,Mac McClung,mac mcclung
235,Javon Small,javon small
241,E.J. Liddell,ej liddell
256,Will Richard,will richard
262,Caleb Love,caleb love
269,Tyler Kolek,tyler kolek
297,Daniss Jenkins,daniss jenkins
298,Keshad Johnson,keshad johnson
299,David Jones García,david jones garcia
309,Ben Saraf,ben saraf


In [6]:
# ============================================================
# Cell 6: Assemble final output shape & save
# ============================================================
# Exact column order of the saved file
final_cols = [
    "INDEX", "PLAYER_NORM", "G", "MPG", "PTS", "FG%", "FT%",
    "3PM", "REB", "AST", "STL", "BLK", "TO"
]

# Work on a copy so the merged frame from Cell 5 (df_live_idx) is never
# modified by this cell; re-running Cell 6 leaves earlier results untouched.
final_df = df_live_idx.copy()

# Ensure G and MPG exist; if not, create as NaN
for col in ["G", "MPG"]:
    if col not in final_df.columns:
        final_df[col] = np.nan

# Report expected columns that are absent instead of omitting them silently
missing_cols = [c for c in final_cols if c not in final_df.columns]
if missing_cols:
    print(f"⚠️ Columns missing from LIVE data (omitted from output): {missing_cols}")

# Reorder to the exact final spec. Selecting by name also discards every
# column that is not part of the spec (PLAYER, FGM, FGA, FTA, FTM, ...), so no
# separate drop step is needed.
final_df = final_df[[c for c in final_cols if c in final_df.columns]].copy()

# Coerce numeric columns to float: everything in the spec except the name key
numeric_cols = [c for c in final_cols if c != "PLAYER_NORM"]
for c in [col for col in numeric_cols if col in final_df.columns]:
    final_df[c] = pd.to_numeric(final_df[c], errors="coerce")

# Save (no unmatched file)
# NOTE(review): rows that found no INDEX are not filtered out, so they are
# written with an empty INDEX, and INDEX is stored as float (e.g. 10004.0).
# Confirm the L2 consumer expects both.
output_main = "../L2/data/live_L2.csv"
final_df.to_csv(output_main, index=False)
print(f"✅ Final LIVE output saved to: {output_main}")

# Quick preview
display(final_df.head(10))

# Match summary (still useful context)
total = len(df_live_idx)
matched = df_live_idx["INDEX"].notna().sum()
unmatched = total - matched
print(f"Rows: {total} | Matched INDEX: {matched} | Unmatched: {unmatched}")

✅ Final LIVE output saved to: ../L2/data/live_L2.csv


Unnamed: 0,INDEX,PLAYER_NORM,G,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO
0,10004.0,luka doncic,2.0,38.0,46.0,0.62,0.793,3.5,11.5,8.5,1.0,0.5,3.0
1,10014.0,tyrese maxey,4.0,43.0,37.5,0.455,0.87,4.5,3.8,8.3,1.0,0.8,2.0
2,10005.0,giannis antetokounmpo,4.0,32.8,36.3,0.695,0.643,1.0,14.0,7.0,0.8,1.3,3.3
3,10002.0,shai gilgeous alexander,5.0,38.6,34.8,0.516,0.854,1.4,6.2,5.4,1.4,1.4,2.4
4,10040.0,austin reaves,5.0,38.2,34.2,0.525,0.897,3.0,5.6,10.0,1.8,0.0,3.8
5,10064.0,lauri markkanen,4.0,38.3,34.0,0.506,0.914,4.0,7.3,2.8,0.8,0.8,1.0
6,10023.0,jalen brunson,4.0,34.8,31.8,0.483,0.886,2.5,4.0,5.0,1.3,0.0,2.0
7,10144.0,bennedict mathurin,2.0,36.5,31.0,0.548,0.885,2.5,7.0,2.5,0.0,0.0,2.5
8,10003.0,victor wembanyama,4.0,32.3,31.0,0.603,0.8,1.0,13.8,2.8,1.5,4.8,1.5
9,10011.0,devin booker,5.0,36.2,29.2,0.474,0.82,2.6,4.0,6.0,0.4,0.4,4.6


Rows: 422 | Matched INDEX: 377 | Unmatched: 45
