In [1]:
# ============================================================
# Cell 1: Imports & paths
# ============================================================
import importlib.util
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Paths
# Both paths are relative, so they resolve against the kernel's working directory.
# UTILS_PATH is the helper module that Cell 2 loads by file location.
UTILS_PATH = "../utils/utils.py"
# LIVE_PATH is the CSV that Cell 4 reads with pd.read_csv.
LIVE_PATH  = "data/basketballReference_LIVE_L1.csv"

# Keys for joins
# JOIN_KEY: column holding normalize_name() output; used as the merge key in Cell 5.
JOIN_KEY   = "PLAYER_NORM"
# PLAYER_KEY: identifier column taken from the player index; NaN after the
# left merge marks a LIVE row that found no index entry.
PLAYER_KEY = "INDEX"

In [2]:
# ============================================================
# Cell 2: Load utils.py (normalize_name, load_player_index)
# ============================================================
# Load utils.py by file location rather than by package import.
spec = importlib.util.spec_from_file_location("utils_module", UTILS_PATH)
if spec is None or spec.loader is None:
    # Without this guard the failure shows up later as an AttributeError on None.
    raise ImportError(f"Cannot build an import spec for: {UTILS_PATH}")

utils_module = importlib.util.module_from_spec(spec)

# Register before executing, as in the importlib "importing a source file
# directly" recipe; undo the registration if execution fails so that a
# half-initialised module is not left behind in sys.modules.
sys.modules["utils_module"] = utils_module
try:
    spec.loader.exec_module(utils_module)
except BaseException:
    sys.modules.pop("utils_module", None)
    raise

# Bind helpers
# normalize_name is required (AttributeError if absent); load_player_index is
# optional here and falls back to None, which Cell 3 checks before use.
normalize_name    = utils_module.normalize_name
load_player_index = getattr(utils_module, "load_player_index", None)

print("✅ utils.py loaded from:", UTILS_PATH)

✅ utils.py loaded from: ../utils/utils.py


In [3]:
# ============================================================
# Cell 3: Load player index (INDEX, PLAYER, PLAYER_NORM)
# ============================================================
# Fail fast: Cell 2 binds load_player_index to None when utils.py lacks it.
if load_player_index is None:
    raise RuntimeError("utils.py must expose load_player_index().")

index_df = load_player_index(
    utils_dir="../utils",
    index_filename="playerIndex.csv",
)

# Quick look at the index, then row count vs. number of distinct identifiers
display(index_df.head(10))
print(f"Rows: {len(index_df)} | Unique INDEX: {index_df[PLAYER_KEY].nunique()}")

# Two-column lookup (identifier + normalized name) used by the merge
index_slim = index_df.loc[:, [PLAYER_KEY, JOIN_KEY]].drop_duplicates()

Unnamed: 0,INDEX,PLAYER,PLAYER_NORM
0,10001,Nikola Jokic,nikola jokic
1,10002,Shai Gilgeous-Alexander,shai gilgeous alexander
2,10003,Victor Wembanyama,victor wembanyama
3,10004,Luka Doncic,luka doncic
4,10005,Giannis Antetokounmpo,giannis antetokounmpo
5,10006,Anthony Davis,anthony davis
6,10007,Cade Cunningham,cade cunningham
7,10008,Anthony Edwards,anthony edwards
8,10009,Karl-Anthony Towns,karl anthony towns
9,10010,James Harden,james harden


Rows: 546 | Unique INDEX: 546


In [4]:
# ============================================================
# Cell 4: Load basketballReference LIVE data
# ============================================================
# Columns retained from the LIVE file, in output order
keep_cols = [
    "PLAYER", "G", "MPG", "PTS", "FGM", "FGA", "FG%", "FTA",
    "FTM", "FT%", "3PM", "REB", "AST", "STL", "BLK", "TO",
]

# Columns that are coerced to numeric after loading
num_cols = [
    "PTS", "G", "MPG", "FGM", "FGA", "FG%", "FTA", "FTM",
    "FT%", "3PM", "REB", "AST", "STL", "BLK", "TO",
]

# Read -> unify the player column name -> keep wanted columns that exist ->
# trim the display name -> derive the normalized join key from it.
df_live = (
    pd.read_csv(LIVE_PATH)
    .rename(columns={"Player": "PLAYER"})
    .pipe(lambda frame: frame[[col for col in keep_cols if col in frame.columns]])
    .assign(PLAYER=lambda frame: frame["PLAYER"].astype(str).str.strip())
    .assign(**{JOIN_KEY: lambda frame: frame["PLAYER"].apply(normalize_name)})
)

# Values that cannot be parsed as numbers become NaN (errors="coerce")
present_num_cols = [col for col in num_cols if col in df_live.columns]
for stat_col in present_num_cols:
    df_live[stat_col] = pd.to_numeric(df_live[stat_col], errors="coerce")

display(df_live.head(10))

Unnamed: 0,PLAYER,G,MPG,PTS,FGM,FGA,FG%,FTA,FTM,FT%,3PM,REB,AST,STL,BLK,TO,PLAYER_NORM
0,Aaron Gordon,1.0,39.0,50.0,17.0,21.0,0.81,6.0,6.0,1.0,10.0,8.0,2.0,0.0,1.0,2.0,aaron gordon
1,Luka Dončić,2.0,38.0,46.0,15.5,25.0,0.62,14.5,11.5,0.793,3.5,11.5,8.5,1.0,0.5,3.0,luka doncic
2,Shai Gilgeous-Alexander,2.0,46.5,45.0,13.5,28.5,0.474,20.0,16.5,0.825,1.5,6.5,5.0,2.0,1.5,2.5,shai gilgeous alexander
3,Tyrese Maxey,1.0,41.0,40.0,13.0,24.0,0.542,8.0,7.0,0.875,7.0,2.0,6.0,2.0,1.0,1.0,tyrese maxey
4,Anthony Edwards,2.0,37.0,36.0,12.5,23.5,0.532,8.5,7.0,0.824,4.0,5.5,3.0,0.0,1.0,3.0,anthony edwards
5,Bennedict Mathurin,1.0,45.0,36.0,9.0,19.0,0.474,17.0,15.0,0.882,3.0,11.0,1.0,0.0,0.0,4.0,bennedict mathurin
6,Victor Wembanyama,2.0,31.0,34.5,14.0,22.0,0.636,7.5,6.0,0.8,0.5,13.0,1.5,1.0,6.0,0.0,victor wembanyama
7,Giannis Antetokounmpo,2.0,32.0,34.0,13.5,20.0,0.675,9.5,5.5,0.579,1.5,17.0,6.0,0.0,0.5,3.5,giannis antetokounmpo
8,VJ Edgecombe,1.0,42.0,34.0,13.0,26.0,0.5,6.0,3.0,0.5,5.0,7.0,3.0,1.0,0.0,2.0,vj edgecombe
9,Stephen Curry,3.0,32.0,33.3,10.7,20.3,0.525,6.7,6.7,1.0,5.3,4.3,4.7,2.3,1.3,2.7,stephen curry


In [5]:
# ============================================================
# Cell 5: Merge LIVE file with player index
# ============================================================
# Left merge: every LIVE row is kept; rows without an index entry get NaN in PLAYER_KEY.
df_live_idx = df_live.merge(index_slim, on=JOIN_KEY, how="left")

# index_slim is only de-duplicated on the (PLAYER_KEY, JOIN_KEY) pair, so one
# JOIN_KEY can still map to several identifiers. The left merge then repeats the
# LIVE row once per identifier; surface that instead of letting it pass silently.
extra_rows = len(df_live_idx) - len(df_live)
if extra_rows > 0:
    repeated_in_index = index_slim[JOIN_KEY].duplicated(keep=False)
    ambiguous_keys = index_slim.loc[repeated_in_index, JOIN_KEY]
    affected = df_live.loc[df_live[JOIN_KEY].isin(ambiguous_keys), JOIN_KEY].unique()
    print(
        f"⚠️ Merge added {extra_rows} extra row(s): these {JOIN_KEY} values map to "
        f"more than one {PLAYER_KEY} in the player index: {list(affected)}"
    )

# Merge results summary
total = len(df_live_idx)
matched = df_live_idx[PLAYER_KEY].notna().sum()
unmatched = total - matched
print(f"Rows: {total} | Matched INDEX: {matched} | Unmatched: {unmatched}")

# Show sample of unmatched players
if unmatched > 0:
    unmatched_df = df_live_idx[df_live_idx[PLAYER_KEY].isna()][["PLAYER", JOIN_KEY]]
    display(unmatched_df.head(20))
else:
    print("✅ All players matched to INDEX.")

Rows: 369 | Matched INDEX: 337 | Unmatched: 32


Unnamed: 0,PLAYER,PLAYER_NORM
105,Nic Claxton,nic claxton
128,Alex Sarr,alex sarr
164,Bub Carrington,bub carrington
217,Will Richard,will richard
240,Daniss Jenkins,daniss jenkins
242,Keshad Johnson,keshad johnson
243,David Jones García,david jones garcia
248,Ryan Nembhard,ryan nembhard
252,Ben Saraf,ben saraf
254,Tyler Kolek,tyler kolek


In [6]:
# ============================================================
# Cell 6: Assemble final output shape & save
# ============================================================
# Exact column order of the final file; the two key columns come from the
# config constants so this cell stays consistent with the merge.
final_cols = [
    PLAYER_KEY, JOIN_KEY, "G", "MPG", "PTS", "FG%", "FT%",
    "3PM", "REB", "AST", "STL", "BLK", "TO"
]

# Work on a copy so the merged frame built in the previous cell is left untouched
# and re-running this cell always starts from the same input.
final_df = df_live_idx.copy()

# Ensure G and MPG exist; if not, create as NaN
for col in ["G", "MPG"]:
    if col not in final_df.columns:
        final_df[col] = np.nan

# Selecting by final_cols both reorders and drops everything else
# (PLAYER, FGM, FGA, FTA, FTM), so no separate drop step is needed.
final_df = final_df.loc[:, [c for c in final_cols if c in final_df.columns]].copy()

# Coerce numeric columns; values that cannot be parsed become NaN
numeric_cols = [PLAYER_KEY, "G", "MPG", "PTS", "FG%", "FT%", "3PM", "REB", "AST", "STL", "BLK", "TO"]
for c in [col for col in numeric_cols if col in final_df.columns]:
    final_df[c] = pd.to_numeric(final_df[c], errors="coerce")

# Save (no unmatched file)
# NOTE(review): rows that found no index entry are not filtered out, so they are
# written with an empty identifier — confirm downstream consumers expect that.
output_main = "../L2/data/live_L2.csv"
# to_csv does not create missing directories; make sure the target folder exists.
Path(output_main).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_main, index=False)
print(f"✅ Final LIVE output saved to: {output_main}")

# Quick preview
display(final_df.head(10))

# Match summary (still useful context)
total = len(df_live_idx)
matched = df_live_idx[PLAYER_KEY].notna().sum()
unmatched = total - matched
print(f"Rows: {total} | Matched INDEX: {matched} | Unmatched: {unmatched}")

✅ Final LIVE output saved to: ../L2/data/live_L2.csv


Unnamed: 0,INDEX,PLAYER_NORM,G,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO
0,10146.0,aaron gordon,1.0,39.0,50.0,0.81,1.0,10.0,8.0,2.0,0.0,1.0,2.0
1,10004.0,luka doncic,2.0,38.0,46.0,0.62,0.793,3.5,11.5,8.5,1.0,0.5,3.0
2,10002.0,shai gilgeous alexander,2.0,46.5,45.0,0.474,0.825,1.5,6.5,5.0,2.0,1.5,2.5
3,10014.0,tyrese maxey,1.0,41.0,40.0,0.542,0.875,7.0,2.0,6.0,2.0,1.0,1.0
4,10008.0,anthony edwards,2.0,37.0,36.0,0.532,0.824,4.0,5.5,3.0,0.0,1.0,3.0
5,10144.0,bennedict mathurin,1.0,45.0,36.0,0.474,0.882,3.0,11.0,1.0,0.0,0.0,4.0
6,10003.0,victor wembanyama,2.0,31.0,34.5,0.636,0.8,0.5,13.0,1.5,1.0,6.0,0.0
7,10005.0,giannis antetokounmpo,2.0,32.0,34.0,0.675,0.579,1.5,17.0,6.0,0.0,0.5,3.5
8,10148.0,vj edgecombe,1.0,42.0,34.0,0.5,0.5,5.0,7.0,3.0,1.0,0.0,2.0
9,10015.0,stephen curry,3.0,32.0,33.3,0.525,1.0,5.3,4.3,4.7,2.3,1.3,2.7


Rows: 369 | Matched INDEX: 337 | Unmatched: 32
