In [1]:
# Cell 1: Imports & display options
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)
pd.set_option("display.float_format", lambda v: f"{v:,.4f}")

In [2]:
# Cell 2: Load data
CSV_PATH = "data/projections_L2.csv"   # adjust if needed

df = pd.read_csv(CSV_PATH)
display(df.head(3))
df.shape

Unnamed: 0,INDEX,PLAYER_NORM,G,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO
0,10001.0,nikola jokic,73.2,35.7176,28.0773,0.5782,0.8078,1.8568,12.3045,10.0114,1.6273,0.7341,3.1932
1,10002.0,shai gilgeous alexander,74.0,33.8412,32.3432,0.521,0.8918,1.9818,5.2364,6.3955,1.7409,0.9795,2.3773
2,10003.0,victor wembanyama,67.8,32.9618,25.3318,0.4769,0.8265,2.9091,11.3705,4.0114,1.2273,3.7636,3.5068


(758, 13)

In [3]:
# Cell 3: Coerce numeric columns (safe conversion)
# - Attempts to convert anything that looks numeric
# - Leaves truly textual columns alone

df_numeric = df.copy()

for col in df.columns:
    # Skip obvious identifier columns
    if col.upper() in {"PLAYER", "PLAYER_NORM", "TEAM", "POS", "INDEX"}:
        continue

    # Try a numeric cast; if it doesn't change anything, keep original
    coerced = pd.to_numeric(df_numeric[col], errors="coerce")
    # If we gained some numerics, keep the coerced version
    if coerced.notna().sum() > 0 and (coerced.notna().sum() >= df_numeric[col].notna().sum() / 4):
        df_numeric[col] = coerced

# Optional: If % columns are scaled weirdly (e.g., 55 instead of 0.55), auto-correct.
# This only affects columns ending with '%' and only values clearly in 1..100 range.
for col in df_numeric.columns:
    if col.endswith("%") and pd.api.types.is_numeric_dtype(df_numeric[col]):
        s = df_numeric[col]
        needs_scaling = (s.dropna().between(1, 100)).mean() > 0.9  # majority 1..100
        if needs_scaling:
            df_numeric[col] = s / 100.0

display(df_numeric.head(3))

Unnamed: 0,INDEX,PLAYER_NORM,G,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO
0,10001.0,nikola jokic,73.2,35.7176,28.0773,0.5782,0.8078,1.8568,12.3045,10.0114,1.6273,0.7341,3.1932
1,10002.0,shai gilgeous alexander,74.0,33.8412,32.3432,0.521,0.8918,1.9818,5.2364,6.3955,1.7409,0.9795,2.3773
2,10003.0,victor wembanyama,67.8,32.9618,25.3318,0.4769,0.8265,2.9091,11.3705,4.0114,1.2273,3.7636,3.5068


In [4]:
# Cell 4: Build summary stats table (count, mean, std) for numeric columns only
numeric_cols = [c for c in df_numeric.columns if pd.api.types.is_numeric_dtype(df_numeric[c])]

summary = df_numeric[numeric_cols].agg(['count', 'mean', 'std']).T
summary = summary.rename(columns=str.capitalize)  # Count, Mean, Std
summary = summary[['Count', 'Mean', 'Std']]       # Column order
summary = summary.sort_index()                    # Stable ordering
display(summary)

Unnamed: 0,Count,Mean,Std
3PM,586.0,1.1894,0.8658
AST,586.0,2.3711,1.7251
BLK,586.0,0.4854,0.4187
FG%,586.0,0.436,0.1356
FT%,586.0,0.7084,0.2113
G,586.0,49.1375,22.4952
INDEX,586.0,10268.4317,153.6126
MPG,586.0,18.9476,9.5054
PTS,586.0,10.2977,5.9572
REB,586.0,4.1823,2.3748


In [5]:
# Cell 6: Compute per-column z-scores (standard deviations from mean)
z_df = df_numeric.copy()

for col in summary.index:
    mean = summary.loc[col, 'Mean']
    std = summary.loc[col, 'Std']
    
    # Only compute if std is nonzero and column exists
    if col in z_df.columns and pd.api.types.is_numeric_dtype(z_df[col]) and std > 0:
        z_df[col + "_z"] = (z_df[col] - mean) / std

display(z_df.head(3))

Unnamed: 0,INDEX,PLAYER_NORM,G,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO,3PM_z,AST_z,BLK_z,FG%_z,FT%_z,G_z,INDEX_z,MPG_z,PTS_z,REB_z,STL_z,TO_z
0,10001.0,nikola jokic,73.2,35.7176,28.0773,0.5782,0.8078,1.8568,12.3045,10.0114,1.6273,0.7341,3.1932,0.7709,4.429,0.5939,1.0483,0.4707,1.0697,-1.7409,1.7643,2.9845,3.4202,2.2714,2.608
1,10002.0,shai gilgeous alexander,74.0,33.8412,32.3432,0.521,0.8918,1.9818,5.2364,6.3955,1.7409,0.9795,2.3773,0.9153,2.3329,1.1801,0.6269,0.868,1.1052,-1.7344,1.5669,3.7006,0.4438,2.5672,1.5154
2,10003.0,victor wembanyama,67.8,32.9618,25.3318,0.4769,0.8265,2.9091,11.3705,4.0114,1.2273,3.7636,3.5068,1.9863,0.9508,7.829,0.3015,0.5594,0.8296,-1.7279,1.4743,2.5237,3.0268,1.2302,3.0281


In [6]:
# Clean INDEX column safely (handles NaN and bad strings)
z_df['INDEX'] = (
    z_df['INDEX']
    .astype(str)                      # make sure it's string
    .str.replace(',', '', regex=False) # remove commas
    .str.split('.').str[0]            # cut off decimals
)

# Convert to numeric safely
z_df['INDEX'] = pd.to_numeric(z_df['INDEX'], errors='coerce')

# Drop or fill NaN (optional: choose one)
z_df = z_df.dropna(subset=['INDEX'])           # drop rows missing INDEX
# OR if you prefer:
# z_df['INDEX'] = z_df['INDEX'].fillna(0)

# Finally convert to int
z_df['INDEX'] = z_df['INDEX'].astype(int)

In [7]:
# Define the desired column order
cols_to_keep = [
    'INDEX',
    'PLAYER_NORM',
    'G_z',
    'PTS_z',
    'FG%_z',
    'FT%_z',
    '3PM_z',
    'REB_z',
    'AST_z',
    'STL_z',
    'BLK_z',
    'TO_z'
]

# Subset and reorder
z_df = z_df[cols_to_keep]

display(z_df.head(3))

Unnamed: 0,INDEX,PLAYER_NORM,G_z,PTS_z,FG%_z,FT%_z,3PM_z,REB_z,AST_z,STL_z,BLK_z,TO_z
0,10001,nikola jokic,1.0697,2.9845,1.0483,0.4707,0.7709,3.4202,4.429,2.2714,0.5939,2.608
1,10002,shai gilgeous alexander,1.1052,3.7006,0.6269,0.868,0.9153,0.4438,2.3329,2.5672,1.1801,1.5154
2,10003,victor wembanyama,0.8296,2.5237,0.3015,0.5594,1.9863,3.0268,0.9508,1.2302,7.829,3.0281


In [8]:
# Create RANK column as the sum of all z-score stats
z_df['RANK'] = z_df[
    ['PTS_z', 'FG%_z', 'FT%_z', '3PM_z', 'REB_z', 'AST_z', 'STL_z', 'BLK_z', 'TO_z']
].sum(axis=1)

# Weight RANK by games played (availability adjustment)
z_df['RANK'] = z_df['RANK'] * (df_numeric['G'] / 82)

# Optional: sort by RANK descending (best players first)
z_df = z_df.sort_values('RANK', ascending=False).reset_index(drop=True)

display(z_df.head(3))

Unnamed: 0,INDEX,PLAYER_NORM,G_z,PTS_z,FG%_z,FT%_z,3PM_z,REB_z,AST_z,STL_z,BLK_z,TO_z,RANK
0,10003,victor wembanyama,0.8296,2.5237,0.3015,0.5594,1.9863,3.0268,0.9508,1.2302,7.829,3.0281,17.7238
1,10001,nikola jokic,1.0697,2.9845,1.0483,0.4707,0.7709,3.4202,4.429,2.2714,0.5939,2.608,16.6012
2,10004,luka doncic,0.963,3.3867,0.2938,0.3532,2.7791,1.7865,3.6042,2.3069,0.0077,3.375,15.4493
3,10012,trae young,1.0163,2.4474,-0.0991,0.7642,2.1097,-0.4835,5.1509,1.3071,-0.7467,4.4007,13.0397
4,10002,shai gilgeous alexander,1.1052,3.7006,0.6269,0.868,0.9153,0.4438,2.3329,2.5672,1.1801,1.5154,12.7698
5,10008,anthony edwards,1.3275,2.9418,0.1192,0.5933,3.1073,0.6764,1.4791,1.3189,0.3551,2.6446,12.7515
6,10007,cade cunningham,1.0163,2.6954,0.2411,0.6833,1.1804,0.6955,3.8611,0.6859,0.5614,3.8711,12.71
7,10010,james harden,1.1319,1.7534,-0.1113,0.7832,1.9102,0.5596,3.5265,1.5023,0.5451,3.2989,12.5255
8,10005,giannis antetokounmpo,0.9274,3.5297,1.1939,-0.3562,-0.9092,3.2182,2.5555,0.5025,1.4298,2.885,11.9933
9,10033,josh giddey,1.0697,1.2189,0.2843,0.3686,0.4821,1.8909,3.3236,1.0882,0.48,2.6598,10.5304


In [9]:
import sys, os

# Go up one level (adjust path if your utils.py lives elsewhere)
sys.path.append(os.path.abspath("../utils"))

from utils import get_stat_weights

# Define the z-score columns you want to weight
z_cols = ['PTS_z', 'REB_z', 'AST_z', 'STL_z', 'BLK_z', '3PM_z', 'FG%_z', 'FT%_z', 'TO_z']

# Pull matching weights from utils
weights = get_stat_weights(include=[c.replace('_z', '') for c in z_cols])

# Apply weights dynamically (using the resolved stat name)
for col in z_cols:
    stat_name = col.replace('_z', '')  # e.g., 'PTS'
    w = weights.get(stat_name, 1.0)    # default to 1.0 if not found
    z_df[col] = z_df[col] * w

# Recalculate weighted RANK
z_df['RANK'] = z_df[z_cols].sum(axis=1)

# Apply games-played weighting (availability adjustment)
z_df['RANK'] = z_df['RANK'] * (df_numeric['G'] / 82) * 1.3

# Sort by new RANK
z_df = z_df.sort_values('RANK', ascending=False).reset_index(drop=True)

In [10]:
display(z_df.head(15))

Unnamed: 0,INDEX,PLAYER_NORM,G_z,PTS_z,FG%_z,FT%_z,3PM_z,REB_z,AST_z,STL_z,BLK_z,TO_z,RANK
0,10003,victor wembanyama,0.8296,1.8819,0.128,0.1673,0.8573,1.6817,0.4939,0.7039,3.6358,1.1385,12.4037
1,10001,nikola jokic,1.0697,2.2256,0.445,0.1408,0.3327,1.9003,2.3004,1.2997,0.2758,0.9806,11.6154
2,10004,luka doncic,0.963,2.5254,0.1247,0.1056,1.1995,0.9926,1.872,1.32,0.0036,1.269,10.1173
3,10005,giannis antetokounmpo,0.9274,2.6321,0.5068,-0.1065,-0.3924,1.7881,1.3273,0.2875,0.664,1.0848,8.6222
4,10002,shai gilgeous alexander,1.1052,2.7596,0.2661,0.2596,0.395,0.2466,1.2117,1.469,0.5481,0.5698,8.5734
5,10010,james harden,1.1319,1.3075,-0.0472,0.2342,0.8244,0.3109,1.8317,0.8596,0.2531,1.2404,8.535
6,10007,cade cunningham,1.0163,2.0099,0.1023,0.2044,0.5095,0.3864,2.0055,0.3925,0.2607,1.4556,8.3632
7,10012,trae young,1.0163,1.825,-0.0421,0.2286,0.9105,-0.2686,2.6754,0.7479,-0.3468,1.6547,8.2888
8,10006,anthony davis,0.634,1.8122,0.269,0.1111,-0.2281,1.7035,0.396,0.6193,1.8764,0.5457,8.1553
9,10017,lamelo ball,0.0739,1.9118,-0.0652,0.195,1.3298,0.3221,1.6524,0.7919,-0.2107,1.2244,8.1178


In [11]:
# Cell 5: (Optional) Save the summary
OUT_PATH = "../L3/data/rankings_L3.csv"
z_df.to_csv(OUT_PATH, index=True)
OUT_PATH

'../L3/data/rankings_L3.csv'