In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
import random


# Notebook-wide display settings: show every DataFrame column and use
# matplotlib's dark theme for any matplotlib figures.
pd.options.display.max_columns = None
plt.style.use("dark_background")

In [4]:
def add_noise(df, col, m, sd, rng=None):
    """Return a copy of ``df`` with Gaussian noise added to column ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is NOT modified; a new frame is returned.
    col : str
        Name of the numeric column to perturb.
    m : float
        Mean of the noise distribution.
    sd : float
        Standard deviation of the noise distribution.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (so ``np.random.seed`` still controls the result).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``col`` has one independent noise draw added
        to every row.
    """
    # Both the legacy ``np.random`` module and Generator objects expose
    # ``normal(loc, scale, size)``, so either can serve as the sampler.
    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=m, scale=sd, size=len(df))

    # ``assign`` builds a new frame, leaving the caller's object untouched.
    return df.assign(**{col: df[col] + noise})


def load_historical_stats(position):
    """Load per-season NHL regular-season stats for one position group.

    Reads every CSV in ``../nhl-data/data`` whose filename contains
    ``position``, keeps NHL regular-season rows, merges rows that share a
    (playerId, season) pair, and derives rate statistics.

    Returns a DataFrame with one row per (playerId, season) holding the summed
    counting stats plus ``avgToi``, ``yoe`` (1-based count of the player's
    seasons in the data), ``seasonLength``, ``gpa``, per-game rates and
    per-60-minute rates.
    """

    def _toi_minutes(clock):
        # "MM:SS" string -> minutes as a float
        mins, secs = clock.split(":")
        return int(mins) + int(secs) / 60

    # Gather every file for this position and stack them
    frames = []
    for f in os.listdir("../nhl-data/data"):
        if position in f:
            frames.append(pd.read_csv(f"../nhl-data/data/{f}"))
    raw = pd.concat(frames)

    # NHL regular seasons only, ordered by player then season
    is_regular = raw["gameTypeId"] == "regular season"
    is_nhl = raw["leagueAbbrev"] == "NHL"
    stats = (
        raw[is_regular & is_nhl]
        .sort_values(["playerId", "season"])
        .drop(columns=["Unnamed: 0"])
    )
    stats["playerName"] = stats["firstName"] + " " + stats["lastName"]

    # Total ice time per row = average minutes per game * games played
    stats["avgToi_adj"] = stats["avgToi"].apply(_toi_minutes)
    stats["totalToi"] = stats["avgToi_adj"] * stats["gamesPlayed"]

    # A player can have several rows in one season (one per team);
    # collapse them into a single season row.
    summed = [
        "gamesPlayed",
        "goals",
        "assists",
        "points",
        "plusMinus",
        "pim",
        "powerPlayPoints",
        "shots",
        "totalToi",
    ]
    aggregations = {
        "team": ("teamName", list),
        "playerName": ("playerName", "max"),
        "age": ("age", "max"),
    }
    aggregations.update({name: (name, "sum") for name in summed})
    seasons = stats.groupby(["playerId", "season"]).agg(**aggregations).reset_index()

    seasons["avgToi"] = seasons["totalToi"] / seasons["gamesPlayed"]
    seasons["yoe"] = seasons.groupby("playerId").cumcount() + 1
    seasons["playerId"] = seasons["playerId"].astype(str)
    seasons["season"] = seasons["season"].astype(str)

    # Seasons that were not the default 82 games long
    season_lengths = {
        "20192020": 70,    # COVID-shortened (approx; varies a bit by team)
        "20202021": 56,    # COVID-shortened
        "20042005": 0,     # lockout — drop if present
    }
    seasons["seasonLength"] = seasons["season"].map(season_lengths).fillna(82)
    seasons["gpa"] = seasons["gamesPlayed"] / seasons["seasonLength"]
    seasons = seasons[seasons["seasonLength"] > 0]

    # Rates per game played
    per_game = (
        ("gpg", "goals"),
        ("apg", "assists"),
        ("spg", "shots"),
        ("ppppg", "powerPlayPoints"),
    )
    for rate_col, count_col in per_game:
        seasons[rate_col] = seasons[count_col] / seasons["gamesPlayed"]

    # Rates per 60 minutes of ice time
    per_sixty = (
        ("g/60", "goals"),
        ("a/60", "assists"),
        ("s/60", "shots"),
        ("ppp/60", "powerPlayPoints"),
    )
    for rate_col, count_col in per_sixty:
        seasons[rate_col] = (seasons[count_col] / seasons["totalToi"]) * 60

    return seasons


def pivot_stats(df, time_col, time_vals):
    """Pivot per-season stats into one wide row per player.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format stats with ``playerId``, ``playerName``, ``age`` and
        ``time_col`` columns; every remaining column is treated as a stat.
    time_col : str
        Column whose values become the column suffixes (e.g. ``"yoe"``).
    time_vals : iterable
        Values of ``time_col`` to keep. Any number of values is supported.

    Returns
    -------
    pandas.DataFrame
        One row per (playerId, playerName) with columns named
        ``<stat>_<time value>``. Players lacking a row for ANY of the
        requested time values are removed.
    """
    # Materialise once so generators / Series can be iterated twice below.
    time_vals = list(time_vals)

    # Filter to target time values
    df = df[df[time_col].isin(time_vals)]

    # Pivot stats from target times into single row for each player
    pdf = pd.pivot(data=df, index=["playerId", "playerName"], columns=time_col)
    pdf.columns = ['_'.join(map(str, col)) for col in pdf.columns]
    pdf = pdf.reset_index()

    # Keep only players that have every requested time value. ``age`` is used
    # as the presence marker: it is NaN after the pivot when the player had no
    # row for that time value. Checking all of ``time_vals`` (not just the
    # first two) makes any lookback length work.
    required = [f"age_{val}" for val in time_vals]
    pdf = pdf[pdf[required].notna().all(axis=1)]

    return pdf


def load_bio_stats(position):
    """Load one biographical row per player for a position group.

    Reads every CSV in ``../nhl-data/bio_stats`` whose filename contains
    ``position``, keeps the first row seen for each ``id``, and returns the
    columns ``id``, ``positionCode``, ``shootsCatches``, ``heightInInches``
    and ``weightInPounds``. Height and weight get Gaussian noise
    (mean 0, sd 0.2) added via ``add_noise``.
    """
    matching_files = [f for f in os.listdir("../nhl-data/bio_stats") if position in f]
    frames = [pd.read_csv(f"../nhl-data/bio_stats/{f}") for f in matching_files]

    bio = pd.concat(frames).drop(columns=["Unnamed: 0"])
    bio["id"] = bio["id"].astype(str)

    # One row per player, restricted to the columns used downstream
    wanted = ["id", "positionCode", "shootsCatches", "heightInInches", "weightInPounds"]
    bio = bio.drop_duplicates(subset=["id"])[wanted]

    # Jitter the body measurements (height first, then weight)
    for measurement in ("heightInInches", "weightInPounds"):
        bio = add_noise(bio, measurement, 0, 0.2)

    return bio


def train_model(db_matrix):
    """Standardise and impute the feature matrix, then fit a cosine KNN index.

    Parameters
    ----------
    db_matrix : pandas.DataFrame
        Numeric feature matrix; index and column labels are preserved.

    Returns
    -------
    (NearestNeighbors, pandas.DataFrame)
        The fitted neighbour index and the transformed matrix it was fitted on.
    """

    def _fit_transform_frame(transformer, frame):
        # Run a transformer and re-wrap the array with the frame's labels
        return pd.DataFrame(
            transformer.fit_transform(frame),
            columns=frame.columns,
            index=frame.index,
        )

    # Standardise each feature first ...
    standardized = _fit_transform_frame(StandardScaler(), db_matrix)

    # ... then replace remaining missing entries with the constant 0
    prepared = _fit_transform_frame(
        SimpleImputer(strategy="constant", fill_value=0), standardized
    )

    # Fit the neighbour index using cosine distance
    model = NearestNeighbors(metric="cosine")
    model.fit(prepared)

    return model, prepared


def run_inference(input_id, knn, db_matrix, df, num_neighbors):
    """Find the players most similar to ``input_id`` and return their stats.

    Parameters
    ----------
    input_id : str
        Index label of the query player in ``db_matrix``.
    knn : object
        Fitted neighbour index exposing ``kneighbors(X, n_neighbors=...)``.
    db_matrix : pandas.DataFrame
        Feature matrix the index was fitted on, indexed by player id.
    df : pandas.DataFrame
        Stats frame with ``playerId`` and ``playerName`` columns.
    num_neighbors : int
        Number of similar players wanted (the query player is extra).

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` for the returned neighbours (including the query
        player when the index returns it), in neighbour order.

    Side effects
    ------------
    Prints the names of the similar players, excluding the query player.
    """
    # Query with a one-row DataFrame rather than a bare ndarray so the
    # feature names travel with the data (avoids the "X does not have valid
    # feature names" warning from scikit-learn).
    input_vector = db_matrix.loc[[input_id]]

    # +1 because the query player is normally its own nearest neighbour;
    # never ask for more neighbours than there are rows.
    k = min(num_neighbors + 1, len(db_matrix))
    _, indices = knn.kneighbors(input_vector, n_neighbors=k)

    # Name the key column explicitly so the merge works whatever the
    # index of db_matrix happens to be called.
    nbrs = pd.DataFrame({"playerId": db_matrix.index[indices[0]]})
    nbrs_stats = nbrs.merge(df, on=["playerId"], how="inner")

    # Exclude the query player by id instead of assuming it is first.
    is_query = nbrs_stats["playerId"] == input_id
    print(nbrs_stats.loc[~is_query, "playerName"].unique()[:num_neighbors])

    return nbrs_stats


def runner(position, input_text, time_col, lookback, num_neighbors):
    """Find players whose recent stats resemble those of a named player.

    Parameters
    ----------
    position : str
        Position group passed to the loaders (e.g. ``"forwards"``).
    input_text : str
        Pattern matched against ``playerName`` with ``str.contains``.
    time_col : str
        Column used as the time axis (e.g. ``"yoe"`` or ``"season"``).
    lookback : int
        How many of the player's most recent ``time_col`` values to compare on.
    num_neighbors : int
        Number of similar players to retrieve.

    Returns
    -------
    (pandas.DataFrame, list)
        Historical stats of the neighbours (as returned by ``run_inference``)
        and the ``time_col`` values that were used for the comparison.

    Raises
    ------
    ValueError
        If no player name matches ``input_text``.
    """
    hdf = load_historical_stats(position)

    # Resolve ONE player. When the pattern matches several players, take the
    # one with the largest playerId and read the name and time values from
    # that same player's rows, so the three never refer to different people.
    matches = hdf[hdf["playerName"].str.contains(input_text, na=False)]
    if matches.empty:
        raise ValueError(
            f"No player name matches {input_text!r} for position {position!r}"
        )
    input_id = max(matches["playerId"])
    player_rows = matches[matches["playerId"] == input_id]
    input_name = max(player_rows["playerName"])
    # Last `lookback` rows of this player; relies on hdf keeping a player's
    # rows in season order (NOTE(review): confirm if the loader changes).
    time_vals = list(player_rows[time_col][-lookback:])

    print(f"Looking for players similar to {input_name} [{input_id}] using stats from {time_col} = {time_vals}")

    pdf = pivot_stats(hdf, time_col, time_vals)
    bdf = load_bio_stats(position)
    joined = pdf.merge(bdf, left_on="playerId", right_on="id", how="left")

    # Column-name prefixes that are allowed into the feature matrix. Pivoted
    # stat columns are named "<stat>_<time value>", hence prefix matching.
    SCHEMA = [
        "playerId",
        "playerName",
        "age",
        "yoe",
        "gpa", # games played adjusted
        "g/60", # goals per 60
        "a/60",
        "s/60",
        "ppp/60",
        # "avgToi",
        # "positionCode",
        # "shootsCatches",
        "heightInInches",
        "weightInPounds",
    ]
    joined = joined[[col for col in joined.columns if any(col.startswith(prefix) for prefix in SCHEMA)]]
    db_matrix = joined.set_index("playerId")
    db_matrix = db_matrix.drop(columns=["playerName"])

    knn, db_matrix = train_model(db_matrix)
    nbrs_stats = run_inference(input_id, knn, db_matrix, hdf, num_neighbors)

    return nbrs_stats, time_vals

In [8]:
# Run configuration
POSITION = "forwards"  # one of {forwards, defensemen, goalies}
INPUT = "Cuylle"       # text matched against player names
TIME_COL = "yoe"       # column used as the time axis
LOOKBACK = 2           # number of most recent TIME_COL values to compare on
NUM_NEIGHBORS = 5      # number of similar players to retrieve

# Find the neighbours, then order their history along the time axis for plotting
NBRS_DF, TIME_VALS = runner(
    position=POSITION,
    input_text=INPUT,
    time_col=TIME_COL,
    lookback=LOOKBACK,
    num_neighbors=NUM_NEIGHBORS,
)
NBRS_DF = NBRS_DF.sort_values(by=[TIME_COL])

def plot_stat(df, time_vals, stat, time_col=None):
    """Plot one stat over time as a line per player and show the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Stats with ``playerName``, ``stat`` and the time-axis column.
    time_vals : iterable
        Time-axis values to mark with dotted vertical lines.
    stat : str
        Column to plot on the y axis.
    time_col : str, optional
        Column to plot on the x axis. Defaults to the notebook-level
        ``TIME_COL`` so existing three-argument calls behave as before.
    """
    # Fall back to the global at call time (not definition time) so the
    # function follows later changes to TIME_COL.
    if time_col is None:
        time_col = TIME_COL

    fig = px.line(
        data_frame=df,
        x=time_col,
        y=stat,
        color="playerName",
        markers=True,
        labels={time_col: time_col, stat: stat},
        title=f"{stat} Over Time"
    )

    # Mark the time values that were used for the similarity search
    for s in time_vals:
        fig.add_vline(x=s, line_dash='dot', line_color='gray')

    fig.show()

# One figure per rate stat, in this order
for rate_stat in ("g/60", "a/60", "ppp/60"):
    plot_stat(NBRS_DF, TIME_VALS, rate_stat)

Looking for players similar to Will Cuylle [8482157] using stats from yoe = [2, 3]
['Tomas Hertl' 'Jordan Staal' 'Tyson Foerster' 'Jordan Greenway'
 'Warren Foegele']



X does not have valid feature names, but NearestNeighbors was fitted with feature names


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul



In [9]:
# Bio stats for the query player and the retrieved neighbours.
# Use POSITION (not a hard-coded "forwards") so this cell stays consistent
# with the search above; with the wrong position the inner merge is empty.
# Note: load_bio_stats adds fresh random noise to height/weight on every call,
# so these values differ slightly from the ones used to fit the model.
BIO_STATS = load_bio_stats(POSITION)
BIO_STATS = BIO_STATS.merge(NBRS_DF[["playerId", "playerName"]].drop_duplicates(), left_on="id", right_on="playerId", how="inner")
BIO_STATS

Unnamed: 0,id,positionCode,shootsCatches,heightInInches,weightInPounds,playerId,playerName
0,8478413,L,L,77.689979,231.027881,8478413,Jordan Greenway
1,8482159,R,R,74.002597,214.926604,8482159,Tyson Foerster
2,8473533,C,L,76.026312,219.905489,8473533,Jordan Staal
3,8476881,C,L,75.086726,219.998842,8476881,Tomas Hertl
4,8477998,L,L,73.924718,204.047876,8477998,Warren Foegele
5,8482157,L,L,75.027994,211.922517,8482157,Will Cuylle


In [10]:
# Height against weight for the query player and neighbours; hover shows the name
fig = px.scatter(
    data_frame=BIO_STATS,
    x="weightInPounds",
    y="heightInInches",
    hover_name="playerName",
)
fig.show()