# TEST ON TEST SET

In [None]:
from ift6758.data.nhl_data_parser import NHLDataParser
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score,RocCurveDisplay
from sklearn.calibration import CalibrationDisplay
import wandb
import os
import pandas as pd

In [None]:
# Load the data
data_parser = NHLDataParser()
test_df = data_parser.get_shot_and_goal_pbp_df_for_season(2020, with_playoff_season=False)

In [None]:
#Data Preprocessing
def preprocess_data(train_df,scaling = False):
    dropped_columns = ['gameId',
                   'timeRemaining',
                   'periodNumber',
                   'timeInPeriod',
                   'xCoord',
                   'yCoord',
                   'zoneCode',
                   'shootingTeam',
                   'shootingPlayer',
                   'previousEventX',
                   'previousEventY',
                   'goalieInNet',
                   'shootingTeamSide',
                   ]
    numeric_columns = ['shotDistance',
                       'shotAngle',
                       'timeDiff',
                       'distanceDiff',
                       'shotAngleDiff',
                       'speed']
    train_df_clean = train_df.drop(columns=dropped_columns)

    #Drop rows with missing values
    train_df_clean = train_df_clean.dropna()
    
    #scale the numerical columns
    if scaling:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        train_df_clean[numeric_columns] = scaler.fit_transform(train_df_clean[numeric_columns])
    
   

    #one hot encoding
    train_df_clean = pd.get_dummies(train_df_clean,columns = ['shotType','previousEvent'])

    
    
    return train_df_clean






In [None]:
#default values
project_name = "IFT6758.2024-B08"
entity_name = "team08"
dir_path = os.getenv('WANDB_DIR')


In [None]:
import joblib
model_list = ['Decision_Tree:latest',
              'Linear_SVC:latest',
              'Kernel_SVC:latest',
              'Logistic_Regression:latest',
              'Random_Forest:latest',
              'lg_distance:latest',
              'lg_angle:latest',
              'lg_angle_distance:latest',
              'xgb_Final:latest']


# Initialize wandb
wandb.init(project=project_name, entity=entity_name, dir=dir_path)

# Download the model from wandb
for name in model_list:
    model_artifact = wandb.use_artifact(name)
    model_dir = model_artifact.download()
    model_file_name = os.path.basename(model_dir)
wandb.finish()





In [None]:
# Load the model (assuming it's a scikit-learn model saved as a pickle file)
model_path = os.path.join(model_dir, 'random_forest.pkl')
model = joblib.load(model_path)


In [None]:
#load the models

d_tree = joblib.load('artifacts\\Decision_Tree-v0\\decision_tree_undersample_newfeatures.pkl')
linear_svc = joblib.load('artifacts\\Linear_SVC-v0\\linearSvc_balanced_Kfold_newfeatures.pkl')
kernel_svc = joblib.load('artifacts\\Kernel_SVC-v0\\svc_balanced_Kfold_newfeatures.pkl')
lg_best = joblib.load('artifacts\\Logistic_Regression-v0\\logistic_newfeatures.pkl')
random_forest = joblib.load('artifacts\\Random_Forest-v0\\random_forest_balanced_newfeatures.pkl')
lg_distance = joblib.load('artifacts\\lg_distance-v0\\lg_distance.pkl')
lg_angle = joblib.load('artifacts\\lg_angle-v0\\lg_angle.pkl')
lg_angle_distance = joblib.load('artifacts\\lg_angle_distance-v0\\lg_distance_angle.pkl')
xgb = joblib.load('artifacts\\xgb_Final-v0\\xgb_final.pkl')


#make a dictionary of models
models = {'Random Forest':random_forest,
          'Logistic Regression Distance':lg_distance,
          'Logistic Regression Angle':lg_angle,
          'Logistic Regression Angle Distance':lg_angle_distance,
          'XGBoost':xgb}


## Best Model

In [38]:
test_df_clean = preprocess_data(test_df)
X_test = test_df_clean.drop(columns=['isGoal'])
y_test = test_df_clean['isGoal']
prob_scores_rf = random_forest.predict_proba(X_test)[:,1]


# Logistic Regression : Angle

In [40]:
X_test = test_df[['shotAngle']]
y_test = test_df['isGoal']
prob_scores_lg_angle = lg_angle.predict_proba(X_test)[:,1]

# Logistic Regression : Distance

In [44]:
X_test = test_df[['shotDistance']]
y_test = test_df['isGoal']
prob_scores_lg_distance = lg_distance.predict_proba(X_test)[:,1]

# Logistic Regression : Angle and Distance

In [49]:
X_test = test_df[['shotDistance','shotAngle']]
y_test = test_df['isGoal']
prob_scores_lg_angle_distance = lg_angle_distance.predict_proba(X_test)[:,1]

# XgBoost

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X_test = test_df[['periodNumber',
                   'shotAngle', 
                   'shotDistance', 
                   'shotType', 
                   'xCoord',
                   'yCoord', 
                   'previousEventX', 
                   'previousEventY', 
                   'rebound', 
                   'timeDiff',
                   'distanceDiff', 
                   'shotAngleDiff', 
                   'speed']]

y_test = test_df['isGoal']

prob_scores_xgb = xgb.predict_proba(X_test)[:,1]


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:shotType: object