In [34]:
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer


In [21]:
def normalize_name(x):
    """Return a canonical lowercase "first last" form of a name, for use as a join key.

    Missing values become "". Accented letters are folded to their base ASCII
    letter, every character other than a-z, whitespace and commas is removed,
    a "Last, First" name is reordered to "First Last", and leftover commas and
    runs of whitespace are collapsed before the result is trimmed.

    Parameters
    ----------
    x : str or missing scalar
        Raw name. Non-string scalars are converted with ``str`` first.

    Returns
    -------
    str
        The normalized name, or "" when ``x`` is missing.
    """
    if pd.isna(x):
        return ""
    if not isinstance(x, str):
        x = str(x)

    # Fold accents ("José" -> "Jose") so the a-z filter below does not delete
    # the letter itself and produce a truncated key.
    decomposed = unicodedata.normalize("NFKD", x)
    x = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    x = x.strip().lower()
    x = re.sub(r"[^a-z\s,]", "", x)  # remove punctuation except comma

    # Handle Last, First → First Last (only the first comma separates the parts)
    if "," in x:
        last, first = [t.strip() for t in x.split(",", 1)]
        x = f"{first} {last}"

    # Any further commas are noise for matching purposes.
    x = x.replace(",", " ")

    # Collapse whitespace runs and trim: removing punctuation or swapping an
    # empty first/last part can leave leading or trailing spaces behind.
    x = re.sub(r"\s+", " ", x).strip()
    return x


In [None]:
# Load the two source tables (paths are relative to the working directory).
rookiestats = pd.read_csv("nba-players.csv")
combineStats_df = pd.read_csv("nbadraftcombinestats.csv")

# Build a shared join key: both name columns go through the same normalizer.
rookiestats["name"] = rookiestats["name"].apply(normalize_name)
combineStats_df["name"] = combineStats_df["PLAYER"].apply(normalize_name)

# normalize_name maps missing names to "", so unnamed rows on both sides would
# otherwise inner-join to each other; keep only rows that have a real key.
combine_keyed = combineStats_df[combineStats_df["name"] != ""]
rookies_keyed = rookiestats[rookiestats["name"] != ""]

merged_data = combine_keyed.merge(rookies_keyed, on="name", how="inner")
merged_data = merged_data.drop_duplicates()

# Drop the join key and the combine measurements that are not carried forward.
# NOTE(review): the recorded output of the imputation cell still lists an
# "Unnamed: 0" column — presumably a saved row index from one of the CSVs;
# confirm and drop it here if nothing downstream needs it.
merged_data = merged_data.drop(columns=["name", "PAN", "HANDL", "HANDW", "SHUTTLE"])



In [45]:
# -----------------------------
# 1. Numeric columns → KNN imputation
# -----------------------------
num_cols = merged_data.select_dtypes(include=[np.number]).columns.tolist()

# One pass over the numeric columns: True where a column has no observed value.
is_all_nan = merged_data[num_cols].isna().all()

# Split numeric columns into:
# - columns with at least one non-NaN (usable for KNN)
# - columns that are entirely NaN (KNN can't infer anything there)
num_cols_for_knn = [c for c in num_cols if not is_all_nan[c]]
all_na_num_cols = [c for c in num_cols if is_all_nan[c]]

# KNNImputer returns a float array, which would turn gap-free integer columns
# into floats on assignment. Remember their dtypes so they can be restored;
# their values are untouched by imputation, so the cast back is lossless.
int_dtypes = {
    c: merged_data[c].dtype
    for c in num_cols_for_knn
    if pd.api.types.is_integer_dtype(merged_data[c].dtype)
    and not merged_data[c].isna().any()
}

# Run KNN only if we have something to impute on
# NOTE(review): neighbours are found on the raw, unscaled numeric columns, so
# large-magnitude columns dominate the distance — consider scaling first.
if num_cols_for_knn:
    imputer = KNNImputer(n_neighbors=5)
    merged_data[num_cols_for_knn] = imputer.fit_transform(merged_data[num_cols_for_knn])
    merged_data = merged_data.astype(int_dtypes)

# For numeric columns that were all NaN there is nothing to learn from: fill with 0
for c in all_na_num_cols:
    merged_data[c] = 0

# -----------------------------
# 2. Non-numeric columns → mode imputation
# -----------------------------
non_num_cols = merged_data.select_dtypes(exclude=[np.number]).columns.tolist()

for c in non_num_cols:
    if merged_data[c].isna().any():
        mode = merged_data[c].mode()
        if len(mode) > 0:
            merged_data[c] = merged_data[c].fillna(mode.iloc[0])  # most frequent category
        else:
            # Column is entirely missing, so there is no mode to use.
            merged_data[c] = merged_data[c].fillna("Unknown")

# -----------------------------
# 3. Sanity check: must be 0 before the file is written
# -----------------------------
remaining_nans = merged_data.isna().sum()
print("Total remaining NaNs:", remaining_nans.sum())
print(remaining_nans)
assert remaining_nans.sum() == 0, "imputation left missing values; not saving"

# Save the fully imputed table
merged_data.to_csv("final_data_no_nans.csv", index=False)
print("\nImputation complete! All NaNs should be gone.")

Total remaining NaNs: 0
YEAR           0
PLAYER         0
POS            0
HGT            0
WGT            0
BMI            0
BF             0
WNGSPN         0
STNDRCH        0
STNDVERT       0
LPVERT         0
LANE           0
SPRINT         0
BENCH          0
BAR            0
PBHGT          0
PDHGT          0
Unnamed: 0     0
gp             0
min            0
pts            0
fgm            0
fga            0
fg             0
3p_made        0
3pa            0
3p             0
ftm            0
fta            0
ft             0
oreb           0
dreb           0
reb            0
ast            0
stl            0
blk            0
tov            0
target_5yrs    0
POSITION       0
dtype: int64

Imputation complete! All NaNs should be gone.
