In [1]:
import os
import re
import json
import numpy as np
import pandas as pd

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

import clean_reports
import preprocess_reports
import setup_predictor
from model import *
from train_test_predictor import train_and_test

nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mrquo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mrquo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mrquo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mrquo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# dataset location
DATASET = "data/prospect-data.csv"

# load dataset into dataframe
data = clean_reports.clean(DATASET, raw=True)

data.head()

Unnamed: 0,Year,Position,Height,Weight,Drafted,Team,Average Ranking,Name,Description - Corey Pronman,Description - Scott Wheeler,Description - Smaht Scouting,Description - ESPN (Chris Peters),Description - EP Rinkside,Description - EP Rinkside Part 2,Description - The Painted Lines,Description - FCHockey
0,2023,C,69.75,185.0,,,1.0,Connor Bedard,Bedard is a potential franchise-changing No. 1...,Bedard’s statistical profile speaks for itself...,Connor Bedard is an extremely gifted generatio...,One of the most naturally gifted goal scorers ...,,Connor Bedard is the premier prospect in the w...,,
1,2023,C,74.0,187.0,,,2.0,Adam Fantilli,There's so much to love about Fantilli's NHL p...,"Fantilli is a big, strong, powerful center who...",Adam Fantilli has every tool that an NHL team ...,"A 6-foot-2, 200-pound power center with touch,...",,"A fantastic consolation prize, Adam Fantilli w...",,
2,2023,RW,70.0,148.0,,,3.0,Matvei Michkov,Michkov is one of the very best first-year dra...,Michkov is the best Russian prospect since Ale...,"A smart, dynamic goal-scoring winger, Michkov ...","For the last few years, I’ve described Michkov...",,"Statistically, Matvei Michkov is *another* fir...",,
3,2023,C,75.0,194.0,,,4.0,Leo Carlsson,"Carlsson has elite skill, which when combined ...",Though he doesn’t play the game with some of t...,Carlsson has been played extremely well at the...,The buzz is growing (and rightfully so) that C...,,"Oh, look, another first-overall talent. Leo Ca...",,
4,2023,LW,69.75,170.0,,,5.0,Zach Benson,Benson has a ton of creativity and offense in ...,"There were a lot of nights last season, on an ...",While I don’t necessarily see Zach Benson reac...,"An offensive dynamo with deft scoring touch, B...",,Some people are worried about selecting a 5-9 ...,,


In [3]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Year                               402 non-null    int64  
 1   Position                           402 non-null    object 
 2   Height                             402 non-null    float64
 3   Weight                             402 non-null    float64
 4   Drafted                            360 non-null    float64
 5   Team                               360 non-null    object 
 6   Average Ranking                    162 non-null    float64
 7   Name                               402 non-null    object 
 8   Description - Corey Pronman        389 non-null    object 
 9   Description - Scott Wheeler        213 non-null    object 
 10  Description - Smaht Scouting       149 non-null    object 
 11  Description - ESPN (Chris Peters)  229 non-null    object 

In [4]:
# clean up dataset
# might have to look at dropping seattle in the future but for clustering it 
# should not matter
data = data[data['Team'] != 'SEA']

# try with only forwards
# data = data[
#     (data['Position'] == 'C') | 
#     (data['Position'] == 'LW') | 
#     (data['Position'] == 'RW')
# ]

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 401
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Year                               397 non-null    int64  
 1   Position                           397 non-null    object 
 2   Height                             397 non-null    float64
 3   Weight                             397 non-null    float64
 4   Drafted                            355 non-null    float64
 5   Team                               355 non-null    object 
 6   Average Ranking                    157 non-null    float64
 7   Name                               397 non-null    object 
 8   Description - Corey Pronman        384 non-null    object 
 9   Description - Scott Wheeler        208 non-null    object 
 10  Description - Smaht Scouting       146 non-null    object 
 11  Description - ESPN (Chris Peters)  224 non-null    object 

In [5]:
data.sample(5)

Unnamed: 0,Year,Position,Height,Weight,Drafted,Team,Average Ranking,Name,Description - Corey Pronman,Description - Scott Wheeler,Description - Smaht Scouting,Description - ESPN (Chris Peters),Description - EP Rinkside,Description - EP Rinkside Part 2,Description - The Painted Lines,Description - FCHockey
51,2022,LW,74.5,200.0,5.0,PHI,10.0,Cutter Gauthier,Gauthier has the tools that can make you easil...,"As the season progressed, Gauthier's athletic ...",Although Gauthier does have the ability to pla...,"Though he played wing a lot this season, NHL t...",A power forward with good feet and a heavy rel...,There's something to be said for a complementa...,Gauthier is a player that matured and progress...,Cutter Gauthier is a power forward with higher...
198,2019,G,79.5,199.0,37.0,OTT,,Mads Sogaard,,,,"A mountain in net, the big Dane has great mobi...",,,,A large goalie with a massive presence in the ...
156,2020,D,72.0,196.0,72.0,CGY,35.0,Jeremie Poirier,"Poirier had a fantastic season offensively, pu...",Poirier may be the most dynamic offensive defe...,Poirier is an outstanding offensive defenseman...,There is little debate that Poirier is among t...,A toolsy defender with above-average skating a...,,"I liked Poirier last season, and my appreciati...",Poirier is an incredibly skilled and talented ...
119,2021,LW,70.0,168.0,73.0,DAL,38.0,Ayrton Martino,Martino ended up in the USHL after the BCHL's ...,Martino's one of the oldest players on this li...,"In the grand scheme of things, Ayrton Martino ...",,,Even with two high-scoring OJHL seasons under ...,Transitioned well to USHL. Excellent hockey se...,"When watching Martino, you can tell he likes h..."
182,2019,RW,73.0,212.0,21.0,PIT,,Samuel Poulin,"Poulin, the 2017 second-overall pick in the QM...","Poulin, taken second-overall in the 2017 QMJHL...",,Poulin has been one of the toughest players fo...,You just get the sense watching Samuel Poulin ...,There wasn't a lot of talent around Samuel Pou...,"Poulin is a strong, power forward type player ...",A decent skater with an above-average top gear...


In [6]:
HOCKEY_WORDS = ["usntdp", "ntdp", "development", "program",
                "khl", "shl", "ushl", "ncaa", "ohl", "chl", "whl", "qmjhl",
                "sweden", "russia", "usa", "canada", "ojhl", "finland", 
                "finnish", "swedish", "russian", "american", "wisconsin",
                "michigan", "bc", "boston", "london", "bchl", "kelowna",
                "liiga", 
                "portland", "minnesota", "ska", "frolunda", "sjhl", "college",
                "center", "left", "right", "saginaw", "kelowna", "frolunda",
                "slovakia"]

# scouting report columns
mask = data.columns.str.match('Description')
scouting_reports = data.columns[mask]

# preprocess data with NLTK
preprocessed_df = data.copy()
for report in scouting_reports:
    # skip columns with ALL missing values
    if data[report].isnull().all():
        continue
    report_preprocessor = preprocess_reports.NltkPreprocessor(data[report])
    preprocessed_df.loc[:,report] = report_preprocessor\
        .remove_names(data['Name'])\
        .remove_whitespace()\
        .remove_words(HOCKEY_WORDS)\
        .get_text()


In [7]:
# transform from wide to long data frame
long_df = preprocessed_df.melt(
    id_vars=['Year', 'Position', 'Height', 'Weight', 'Drafted', 'Team', 'Average Ranking', 'Name'],
    value_vars=scouting_reports.tolist(),
    var_name='reporter',  
    value_name='text'
).dropna(
    subset=['text']
)



In [9]:
openai_embeddings_path = 'data/reports_with_embeddings.csv'
if os.path.exists(openai_embeddings_path):
    openai_df = pd.read_csv(openai_embeddings_path)
    openai_df['embeddings'] = openai_df.embeddings.apply(eval).apply(np.array)
    embeddings = np.vstack(openai_df['embeddings'].values).astype(np.float64)
    openai_df['embeddings'] = [np.array(x, dtype=np.float64) for x in embeddings]

    openai_cols = [f'openai{i}' for i in range(openai_df['embeddings'].iloc[0].shape[0])]

    # create individual columns for each openai embedding
    embeddings_df = pd.DataFrame(
        np.concatenate([x.reshape(1,-1) for x in openai_df['embeddings']]),
        columns=openai_cols
    )

    embeddings_df.loc[:,'player_name'] = openai_df['player_name']

    full_df = pd.merge(preprocessed_df, embeddings_df, left_on='Name', right_on='player_name')

In [16]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)

X_pca = pca.fit_transform(
    pd.DataFrame(
        np.concatenate([x.reshape(1,-1) for x in openai_df['embeddings']]),
    )
)

openai_pca_cols = [f'openai_pca{i}' for i in range(X_pca.shape[1])]

embeddings_pca_df = pd.DataFrame(X_pca, columns=openai_pca_cols)

embeddings_pca_df.loc[:,'player_name'] = openai_df['player_name']

full_df = pd.merge(preprocessed_df, embeddings_pca_df, left_on='Name', right_on='player_name')

In [17]:
# check that every player has OpenAI embeddings
full_df[full_df.columns[:20]].info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397 entries, 0 to 396
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Year                               397 non-null    int64  
 1   Position                           397 non-null    object 
 2   Height                             397 non-null    float64
 3   Weight                             397 non-null    float64
 4   Drafted                            355 non-null    float64
 5   Team                               355 non-null    object 
 6   Average Ranking                    157 non-null    float64
 7   Name                               397 non-null    object 
 8   Description - Corey Pronman        384 non-null    object 
 9   Description - Scott Wheeler        208 non-null    object 
 10  Description - Smaht Scouting       146 non-null    object 
 11  Description - ESPN (Chris Peters)  224 non-null    object 

In [18]:
# setup model architecture
numeric_cols = ['Height', 'Weight'] + openai_pca_cols
categorical_cols = ['Position']
# text_cols = scouting_reports.tolist()
text_cols = []
lr_model = setup_predictor.setup(
    numeric_cols=numeric_cols, 
    categorical_cols=categorical_cols,
    text_cols=text_cols,
    func=LogisticOrdinalRegression()
)
knn_model = setup_predictor.setup(
    numeric_cols=numeric_cols, 
    categorical_cols=categorical_cols,
    text_cols=text_cols,
    func=OrdinalKNeighborsClassifier()
)
rf_model = setup_predictor.setup(
    numeric_cols=numeric_cols, 
    categorical_cols=categorical_cols,
    text_cols=text_cols,
    func=RandomForestOrdinalClassifier()
)

In [19]:
X = full_df[numeric_cols + categorical_cols + text_cols]
y = full_df['Drafted']
groups = full_df['Name']

mean_df = pd.DataFrame(columns=['accuracy', 'f1', 'precision', 'recall'])
std_df = pd.DataFrame(columns=['accuracy', 'f1', 'precision', 'recall'])

In [20]:
train_idx = full_df[full_df['Year'] <= 2022].index.tolist()
test_idx = full_df[full_df['Year'] == 2023].index.tolist()

X_train = X.iloc[train_idx]
y_train = y.iloc[train_idx]
X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]

In [21]:
# Random Forest Classification model
param_grid = {
    'clf__n_estimators' : np.arange(60, 110, 20).tolist(),
    'clf__max_depth' : np.arange(20, 100, 20).tolist(),
}

label = 'OpenAI_rand_forest_2023_prediction'

rf_metrics = train_and_test(rf_model, X_train, y_train, groups[train_idx], param_grid, notes=label)

rf_mean = {k : np.mean(v) for k,v in rf_metrics.items()}
rf_std = {k : np.std(v) for k,v in rf_metrics.items()}

mean_df.loc[label] = pd.Series(rf_mean)
std_df.loc[label] = pd.Series(rf_std)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


100%|██████████| 52/52 [00:03<00:00, 16.16it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 12 candidates, totalling 36 fits


100%|██████████| 52/52 [00:03<00:00, 13.10it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 12 candidates, totalling 36 fits


100%|██████████| 60/60 [00:02<00:00, 21.25it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 2023 Predictions

In [None]:
# try just one model fit
rf_model = setup_predictor.setup(
    numeric_cols=numeric_cols, 
    categorical_cols=categorical_cols,
    text_cols=text_cols,
    func=RandomForestOrdinalClassifier(n_estimators=80, max_depth=40, random_state=42)
)

rf_model.fit(X_train, y_train)

100%|██████████| 63/63 [00:05<00:00, 11.34it/s]


Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['openai_pca0', 'openai_pca1',
                                                   'openai_pca2', 'openai_pca3',
                                                   'openai_pca4', 'openai_pca5',
                                                   'openai_pca6', 'openai_pca7',
                                                   'openai_pca8', 'openai_pca9',
                                                   'openai_pca10',
                                                   'openai_pca11',
                          

In [None]:
# make predictions on test set
# since it is entire class of 2023, we can actually rank them
y_test_pred = rf_model.predict(X_test).argsort()

In [None]:
foo = pd.DataFrame()
foo.loc[:,'name'] = groups[test_idx]
foo.loc[:,'ranking'] = y_test_pred + 1

In [None]:
foo.sort_values(by='ranking')

Unnamed: 0,name,ranking
0,Connor Bedard,1
9,Eduard Sale,2
33,Oscar Fisker Molgaard,3
26,Lukas Dragicevic,4
11,Brayden Yager,5
23,Gavin Brindley,6
35,Michael Hrabal,7
10,Colby Barlow,8
2,Matvei Michkov,9
28,Kasper Halttunen,10
