# Task 6 - Advanced Models

In [79]:
from ift6758.data.nhl_data_parser import NHLDataParser
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  RocCurveDisplay
from sklearn.calibration import CalibrationDisplay
import matplotlib.pyplot as plt
import os
import wandb
from ift6758.data.data_logger import DataLogger

In [80]:
# Load the data
data_parser = NHLDataParser()
train_df = data_parser.get_shot_and_goal_pbp_df_for_seasons(2016, 2019, with_playoff_season=False)
test_df = data_parser.get_shot_and_goal_pbp_df_for_season(2020, with_playoff_season=False)

In [81]:
#default values
project_name = "IFT6758.2024-B08"
entity_name = "team08"
dir_path = os.getenv('WANDB_DIR')


In [82]:
train_df

Unnamed: 0,gameId,timeRemaining,periodNumber,timeInPeriod,isGoal,shotType,emptyNet,xCoord,yCoord,zoneCode,...,shootingPlayer,goalieInNet,previousEvent,timeDiff,previousEventX,previousEventY,rebound,distanceDiff,shotAngleDiff,speed
0,2016020001,1129,1,01:11,0,wrist,0,-77.0,5.0,O,...,Mitch Marner,Craig Anderson,blocked-shot,1.0,-61.0,11.0,0,17.088007,0.000000,17.088007
1,2016020001,1027,1,02:53,0,wrist,0,86.0,13.0,O,...,Chris Kelly,Frederik Andersen,giveaway,5.0,54.0,-5.0,0,36.715120,0.000000,7.343024
2,2016020001,959,1,04:01,0,wrist,0,23.0,-38.0,N,...,Cody Ceci,Frederik Andersen,missed-shot,18.0,-72.0,0.0,0,102.318131,0.000000,5.684341
3,2016020001,914,1,04:46,0,slap,0,33.0,-15.0,O,...,Erik Karlsson,Frederik Andersen,missed-shot,19.0,77.0,-2.0,0,45.880279,0.000000,2.414752
4,2016020001,794,1,06:46,0,wrist,0,-34.0,28.0,O,...,Martin Marincin,Craig Anderson,hit,16.0,47.0,34.0,0,81.221918,0.000000,5.076370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305764,2019021082,195,3,16:45,0,backhand,0,77.0,-23.0,O,...,Nicholas Paul,Cal Petersen,blocked-shot,21.0,-72.0,-21.0,0,149.013422,0.000000,7.095877
305765,2019021082,160,3,17:20,0,slap,0,-76.0,38.0,O,...,Drew Doughty,Craig Anderson,hit,16.0,-95.0,-17.0,0,58.189346,0.000000,3.636834
305766,2019021082,158,3,17:22,0,backhand,0,-79.0,10.0,O,...,Gabriel Vilardi,Craig Anderson,shot-on-goal,2.0,-76.0,38.0,1,28.160256,26.113913,14.080128
305767,2019021082,70,3,18:50,0,snap,0,81.0,-7.0,O,...,Thomas Chabot,Cal Petersen,missed-shot,32.0,-60.0,40.0,0,148.627050,0.000000,4.644595


In [83]:
#Data Preprocessing
dropped_columns = ['gameId',
                   'timeRemaining',
                   'periodNumber',
                   'timeInPeriod',
                   'xCoord',
                   'yCoord',
                   'zoneCode',
                   'shootingTeam',
                   'shootingPlayer',
                   'previousEventX',
                   'previousEventY',
                   'goalieInNet',
                   'shootingTeamSide',
                   ]
train_df_clean = train_df.drop(columns=dropped_columns)

#Drop rows with missing values
train_df_clean = train_df_clean.dropna()


#one hot encoding
train_df_clean = pd.get_dummies(train_df_clean,columns = ['shotType','previousEvent'])



## Decision Tree

In [84]:
# load path and make dirs to store model
model_path = os.getenv("MODEL_PATH") #edit .env file to change path
test_path = os.path.join(model_path, "decision_tree")
if not os.path.exists(test_path):
    os.makedirs(test_path)


In [None]:

# Define the sweep configuration
sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'roc_auc_score',
        'goal': 'maximize'
    },
    'parameters': {
        'criterion': {
            'values': ['gini', 'entropy']
        },
        'splitter': {
            'values': ['best', 'random']
        },
        'max_depth': {
            'values': [None,5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        },
        'min_samples_split': {
            'values': [2, 5, 10]
        },
        'min_samples_leaf': {
            'values': [1, 2, 4]
        },
        'max_features': {
            'values': ['sqrt', 'log2']
        }
        
    }
}

# Define the training function
def train(config=None):
    # Initialize a new wandb run
    with wandb.init(dir=dir_path, 
                    group='decision_tree',
                    job_type='log_hyperparameter_sweep',
                    name=f'log_hyperparameter_sweep: decision_tree'):

        # Get the hyperparameters
        config = wandb.config

       # Split the data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(train_df_clean.drop(columns='isGoal'), train_df_clean['isGoal'], test_size=0.3, stratify=train_df_clean['isGoal'])
    
        # Initialize the model
        model = DecisionTreeClassifier(
            criterion=config.criterion,
            splitter=config.splitter,
            max_depth=config.max_depth,
            min_samples_split=config.min_samples_split,
            min_samples_leaf=config.min_samples_leaf,
            max_features=config.max_features
        )

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_val)

        # Calculate metrics
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        roc_auc = roc_auc_score(y_val, y_pred)
        avg_precision = average_precision_score(y_val, y_pred)

        # Log metrics to wandb
        wandb.log({
            'precision_score': precision,
            'recall_score': recall,
            'f1_score': f1,
            'roc_auc_score': roc_auc,
            'average_precision_score': avg_precision
        })

   




In [86]:
sweep_id = wandb.sweep(sweep_config, project=project_name, entity=entity_name)
wandb.agent(sweep_id, function=train, count=100)
wandb.teardown()

Create sweep with ID: 1ra4oxhs
Sweep URL: https://wandb.ai/team08/IFT6758.2024-B08/sweeps/1ra4oxhs


wandb: Agent Starting Run: 2qd1axgz with config:
wandb: 	criterion: gini
wandb: 	max_depth: 45
wandb: 	max_features: log2
wandb: 	min_samples_leaf: 1
wandb: 	min_samples_split: 2
wandb: 	splitter: random


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_precision_score,▁
f1_score,▁
precision_score,▁
recall_score,▁
roc_auc_score,▁

0,1
average_precision_score,0.11925
f1_score,0.20495
precision_score,0.20344
recall_score,0.20649
roc_auc_score,0.55965


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: zc0i6x5c with config:
wandb: 	criterion: entropy
wandb: 	max_depth: None
wandb: 	max_features: log2
wandb: 	min_samples_leaf: 4
wandb: 	min_samples_split: 2
wandb: 	splitter: best


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_precision_score,▁
f1_score,▁
precision_score,▁
recall_score,▁
roc_auc_score,▁

0,1
average_precision_score,0.12832
f1_score,0.1938
precision_score,0.32029
recall_score,0.13894
roc_auc_score,0.55357


wandb: Agent Starting Run: e27ef9uw with config:
wandb: 	criterion: entropy
wandb: 	max_depth: 25
wandb: 	max_features: log2
wandb: 	min_samples_leaf: 2
wandb: 	min_samples_split: 2
wandb: 	splitter: random


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_precision_score,▁
f1_score,▁
precision_score,▁
recall_score,▁
roc_auc_score,▁

0,1
average_precision_score,0.1209
f1_score,0.09318
precision_score,0.56098
recall_score,0.05081
roc_auc_score,0.52326


wandb: Ctrl + C detected. Stopping sweep.
