In [1]:
# ============================================================
# Cell 1: Imports & paths
# ============================================================
import importlib.util
import sys

import numpy as np
import pandas as pd

# Input locations (relative paths)
LIVE_PATH = "data/basketballReference_LIVE_L1.csv"   # LIVE stats CSV read in Cell 4
UTILS_PATH = "../utils/utils.py"                     # helper module loaded in Cell 2

# Column names shared by the index and the LIVE frame
PLAYER_KEY, JOIN_KEY = "INDEX", "PLAYER_NORM"        # id column, merge column

In [2]:
# ============================================================
# Cell 2: Load utils.py (normalize_name, load_player_index)
# ============================================================
# utils.py is loaded directly from its file path instead of via a package import.
spec = importlib.util.spec_from_file_location("utils_module", UTILS_PATH)

# spec_from_file_location returns None when no loader can be determined for the
# path; fail with a readable message instead of an AttributeError further down.
if spec is None or spec.loader is None:
    raise ImportError(f"Could not create an import spec for {UTILS_PATH!r}")

utils_module = importlib.util.module_from_spec(spec)
# Register under its name before executing, as in the importlib recipe for
# importing a source file directly.
sys.modules["utils_module"] = utils_module
spec.loader.exec_module(utils_module)

# Bind helpers
# normalize_name is required (AttributeError here if utils.py lacks it);
# load_player_index is optional at this point and validated in Cell 3.
normalize_name    = utils_module.normalize_name
load_player_index = getattr(utils_module, "load_player_index", None)

print("✅ utils.py loaded from:", UTILS_PATH)

✅ utils.py loaded from: ../utils/utils.py


In [3]:
# ============================================================
# Cell 3: Load player index (INDEX, PLAYER, PLAYER_NORM)
# ============================================================
if load_player_index is None:
    raise RuntimeError("utils.py must expose load_player_index().")

index_df = load_player_index(utils_dir="../utils", index_filename="playerIndex.csv")

# Sanity check
display(index_df.head(10))
print("Rows:", len(index_df), "| Unique INDEX:", index_df[PLAYER_KEY].nunique())

# Slim view for merge
index_slim = index_df[[PLAYER_KEY, JOIN_KEY]].drop_duplicates()

# The merge in Cell 5 is keyed on JOIN_KEY, not PLAYER_KEY, so what matters is
# that each JOIN_KEY value appears once. Surface any ambiguous keys here: a
# repeated key would make the left merge emit one output row per matching INDEX.
ambiguous_keys = index_slim[index_slim.duplicated(subset=JOIN_KEY, keep=False)]
if not ambiguous_keys.empty:
    print(
        f"⚠️ {ambiguous_keys[JOIN_KEY].nunique()} {JOIN_KEY} value(s) "
        f"map to more than one {PLAYER_KEY}:"
    )
    display(ambiguous_keys.sort_values(JOIN_KEY))

Unnamed: 0,INDEX,PLAYER,PLAYER_NORM
0,10001,Nikola Jokic,nikola jokic
1,10002,Shai Gilgeous-Alexander,shai gilgeous alexander
2,10003,Victor Wembanyama,victor wembanyama
3,10004,Luka Doncic,luka doncic
4,10005,Giannis Antetokounmpo,giannis antetokounmpo
5,10006,Anthony Davis,anthony davis
6,10007,Cade Cunningham,cade cunningham
7,10008,Anthony Edwards,anthony edwards
8,10009,Karl-Anthony Towns,karl anthony towns
9,10010,James Harden,james harden


Rows: 547 | Unique INDEX: 547


In [4]:
# ============================================================
# Cell 4: Load basketballReference LIVE data
# ============================================================
keep_cols = ["PLAYER","G","MPG","PTS","FGM","FGA","FG%","FTA","FTM","FT%","3PM","REB","AST","STL","BLK","TO"]

# Read the file and standardize the name column's casing in one step
df_live = pd.read_csv(LIVE_PATH).rename(columns={"Player": "PLAYER"})

# Fail early, with the file name, if the name column is absent under either casing
if "PLAYER" not in df_live.columns:
    raise KeyError(
        f"{LIVE_PATH} has no 'Player'/'PLAYER' column; found: {list(df_live.columns)}"
    )

# Keep only relevant columns, in order. The explicit .copy() makes this an
# independent frame, so the column assignments below cannot trigger
# SettingWithCopyWarning.
df_live = df_live[[c for c in keep_cols if c in df_live.columns]].copy()

# Normalize player names for merging
df_live["PLAYER"] = df_live["PLAYER"].astype(str).str.strip()
df_live[JOIN_KEY] = df_live["PLAYER"].apply(normalize_name)

# Every kept column except the name is numeric; derive the list from keep_cols
# so the two cannot drift apart. Unparseable values become NaN (errors="coerce").
num_cols = [c for c in keep_cols if c != "PLAYER"]
for c in [col for col in num_cols if col in df_live.columns]:
    df_live[c] = pd.to_numeric(df_live[c], errors="coerce")

display(df_live.head(10))

Unnamed: 0,PLAYER,G,MPG,PTS,FGM,FGA,FG%,FTA,FTM,FT%,3PM,REB,AST,STL,BLK,TO,PLAYER_NORM
0,Luka Dončić,11.0,36.8,34.6,10.6,22.5,0.472,12.5,9.9,0.79,3.5,8.5,9.0,1.9,0.5,4.1,luka doncic
1,Tyrese Maxey,13.0,40.4,32.5,10.8,23.6,0.456,8.1,7.2,0.886,3.8,4.9,7.7,1.4,0.8,2.6,tyrese maxey
2,Shai Gilgeous-Alexander,15.0,33.3,31.9,10.7,20.2,0.531,9.1,8.2,0.904,2.3,4.9,6.7,1.5,0.9,1.7,shai gilgeous alexander
3,Giannis Antetokounmpo,13.0,31.8,31.2,12.0,19.1,0.629,10.2,6.5,0.636,0.7,10.8,6.8,0.9,1.2,3.5,giannis antetokounmpo
4,Donovan Mitchell,13.0,34.1,30.9,10.4,20.4,0.509,7.5,6.2,0.825,4.0,4.6,5.5,1.5,0.5,3.2,donovan mitchell
5,Lauri Markkanen,14.0,35.9,30.6,10.4,21.5,0.485,6.9,6.1,0.885,3.7,6.1,2.1,1.0,0.5,1.4,lauri markkanen
6,Nikola Jokić,13.0,34.5,29.2,10.9,16.8,0.648,6.5,5.5,0.857,1.8,13.4,11.1,1.8,0.8,3.2,nikola jokic
7,Bennedict Mathurin,3.0,32.7,29.0,8.0,15.7,0.511,13.3,10.3,0.775,2.7,5.7,2.3,0.3,0.0,3.0,bennedict mathurin
8,Austin Reaves,12.0,36.1,28.1,8.5,17.3,0.49,9.8,8.5,0.872,2.6,5.1,7.6,1.3,0.1,3.4,austin reaves
9,Jalen Brunson,11.0,33.9,28.0,9.7,20.8,0.467,6.6,5.7,0.863,2.8,3.5,6.5,0.8,0.0,2.7,jalen brunson


In [5]:
# ============================================================
# Cell 5: Merge LIVE file with player index
# ============================================================
# Left merge keeps every LIVE row; players missing from the index get NaN INDEX.
# validate="many_to_one" makes pandas raise MergeError if JOIN_KEY is not unique
# in index_slim, instead of silently duplicating a player's stat row.
df_live_idx = df_live.merge(
    index_slim, on=JOIN_KEY, how="left", validate="many_to_one"
)

# Merge results summary
total = len(df_live_idx)
matched = df_live_idx[PLAYER_KEY].notna().sum()
unmatched = total - matched
print(f"Rows: {total} | Matched INDEX: {matched} | Unmatched: {unmatched}")

# Show sample of unmatched players
if unmatched > 0:
    unmatched_df = df_live_idx.loc[df_live_idx[PLAYER_KEY].isna(), ["PLAYER", JOIN_KEY]]
    display(unmatched_df.head(20))
else:
    print("✅ All players matched to INDEX.")

Rows: 473 | Matched INDEX: 424 | Unmatched: 49


Unnamed: 0,PLAYER,PLAYER_NORM
141,Daniss Jenkins,daniss jenkins
178,Pete Nance,pete nance
201,Will Richard,will richard
249,Kobe Sanders,kobe sanders
265,Mac McClung,mac mcclung
286,Caleb Love,caleb love
297,Cam Christie,cam christie
299,Tyrese Proctor,tyrese proctor
305,Pat Spencer,pat spencer
310,Moussa Cisse,moussa cisse


In [6]:
# ============================================================
# Cell 6: Assemble final output shape & save
# ============================================================
# Destination of the final LIVE table
output_main = "../L2/data/live_L2.csv"

# Build final ordered columns
final_cols = [
    PLAYER_KEY, JOIN_KEY, "G", "MPG", "PTS", "FG%", "FT%",
    "3PM", "REB", "AST", "STL", "BLK", "TO"
]

# Work on a copy so re-running this cell never alters the merged frame from Cell 5
final_df = df_live_idx.copy()

# Ensure G and MPG exist; if not, create as NaN
for col in ["G", "MPG"]:
    if col not in final_df.columns:
        final_df[col] = np.nan

# Reorder to the exact final spec. Selecting by final_cols also drops every
# column not listed there (PLAYER, FGM, FGA, FTA, FTM), so no separate drop is
# needed; .copy() keeps the assignments below free of SettingWithCopyWarning.
final_df = final_df[[c for c in final_cols if c in final_df.columns]].copy()

# Coerce numeric columns to float (everything except the normalized name).
# INDEX is float rather than int because unmatched players carry NaN there.
numeric_cols = [c for c in final_cols if c != JOIN_KEY]
for c in [col for col in numeric_cols if col in final_df.columns]:
    final_df[c] = pd.to_numeric(final_df[c], errors="coerce")

# Save (no unmatched file); unmatched rows stay in the output with an empty INDEX
final_df.to_csv(output_main, index=False)
print(f"✅ Final LIVE output saved to: {output_main}")

# Quick preview
display(final_df.head(10))

# Match summary (still useful context)
total = len(df_live_idx)
matched = df_live_idx[PLAYER_KEY].notna().sum()
unmatched = total - matched
print(f"Rows: {total} | Matched INDEX: {matched} | Unmatched: {unmatched}")

✅ Final LIVE output saved to: ../L2/data/live_L2.csv


Unnamed: 0,INDEX,PLAYER_NORM,G,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO
0,10004.0,luka doncic,11.0,36.8,34.6,0.472,0.79,3.5,8.5,9.0,1.9,0.5,4.1
1,10014.0,tyrese maxey,13.0,40.4,32.5,0.456,0.886,3.8,4.9,7.7,1.4,0.8,2.6
2,10002.0,shai gilgeous alexander,15.0,33.3,31.9,0.531,0.904,2.3,4.9,6.7,1.5,0.9,1.7
3,10005.0,giannis antetokounmpo,13.0,31.8,31.2,0.629,0.636,0.7,10.8,6.8,0.9,1.2,3.5
4,10018.0,donovan mitchell,13.0,34.1,30.9,0.509,0.825,4.0,4.6,5.5,1.5,0.5,3.2
5,10064.0,lauri markkanen,14.0,35.9,30.6,0.485,0.885,3.7,6.1,2.1,1.0,0.5,1.4
6,10001.0,nikola jokic,13.0,34.5,29.2,0.648,0.857,1.8,13.4,11.1,1.8,0.8,3.2
7,10144.0,bennedict mathurin,3.0,32.7,29.0,0.511,0.775,2.7,5.7,2.3,0.3,0.0,3.0
8,10040.0,austin reaves,12.0,36.1,28.1,0.49,0.872,2.6,5.1,7.6,1.3,0.1,3.4
9,10023.0,jalen brunson,11.0,33.9,28.0,0.467,0.863,2.8,3.5,6.5,0.8,0.0,2.7


Rows: 473 | Matched INDEX: 424 | Unmatched: 49
