<a href="https://colab.research.google.com/github/rafabandoni/nfl-predict/blob/main/notebooks/03_nfl_predict_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from xgboost import XGBClassifier

import shap
import optuna
import pickle

# 0.3 Preprocessing

In [None]:
# Read the engineered feature table directly from the project repository
featured_df_url = 'https://github.com/rafabandoni/nfl-predict/raw/refs/heads/main/data/output/featured_df.parquet'
featured_df = pd.read_parquet(featured_df_url)

# Preview the first rows to sanity-check the load
featured_df.head()

## Train Test Split

**IMPORTANT**: Separate X and y and perform the train/test split *before* any preprocessing, to avoid data leakage.

In [None]:
# Label column; every remaining column is treated as a feature
target = 'home_winner'

# y is kept as a single-column DataFrame, X is everything except the label
y = featured_df[[target]]
X = featured_df.drop(columns=target)

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Scaling

In [None]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)

## Principal Component Analysis (PCA)

PCA is an unsupervised learning algorithm, meaning it doesn’t require prior knowledge of target variables. It’s commonly used in exploratory data analysis and machine learning to simplify datasets without losing critical information.

Know more: https://www.geeksforgeeks.org/principal-component-analysis-pca/

In [None]:
# The model seems to perform better without PCA, so this step is left disabled

# pca = PCA(n_components=0.7)
# X_train = pca.fit_transform(X_train)
# X_train

In [None]:
# X_test = scaler.transform(X_test)
# X_test = pca.transform(X_test)

# 0.3.1 ML Model

## Building model

In [None]:
# Objective function for the Optuna hyperparameter search
def objective(trial):
    """Score one XGBoost hyperparameter configuration for Optuna.

    Samples a configuration from ``trial`` and estimates its accuracy with
    5-fold stratified cross-validation on the training split only, so the
    held-out test set plays no part in the search.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean cross-validated accuracy as a plain ``float``. The same
        value is stored on the trial under the ``"acc"`` user attribute.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'max_depth': trial.suggest_int('max_depth', 2, 25),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.2, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 2, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1, 15.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 5.0),
    }

    model = XGBClassifier(
        **params,
        # 'logloss' is the binary metric that matches the binary:logistic
        # objective; 'mlogloss' is its multiclass counterpart.
        eval_metric='logloss',
        tree_method='hist',
        device='cuda',
        objective='binary:logistic',
    )

    # Fixed seed: every trial is scored on the same folds, so score
    # differences come from the hyperparameters rather than the fold split.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # cross_val_score fits clones of `model`, so no separate fit on the
    # training data (or prediction on the test set) is needed here.
    scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    acc = float(scores.mean())
    trial.set_user_attr("acc", acc)
    return acc

# Callback to print the MCC score for each trial
# def print_mcc_callback(study, trial):
#     mcc = trial.user_attrs["mcc"]
#     print(f"Trial {trial.number}: MCC = {mcc}")

# def print_accuracy_callback(study, trial):
#     acc = trial.user_attrs["acc"]
#     print(f"Trial {trial.number}: Accuracy score = {acc}")

# Run the Optuna search, looking for the configuration that maximizes the
# value returned by `objective`
n_search_trials = 30
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_search_trials)

# Keep and report the best hyperparameters found
best_params = study.best_params
print(f"Best parameters: {best_params}")

In [None]:
# Refit the classifier on the whole training split using the best
# hyperparameters found by the study
xgb = XGBClassifier(
    **best_params,
    # 'logloss' is the binary metric that matches the binary:logistic
    # objective; 'mlogloss' is its multiclass counterpart.
    eval_metric='logloss',
    tree_method='hist',
    device='cuda',
    objective='binary:logistic',
)
xgb.fit(X_train, y_train)

## Predicting

In [None]:
# Predict class labels for the held-out test features with the fitted model
predictions = xgb.predict(X_test)

## Evaluating model

In [None]:
# Confusion matrix of true test labels against the model's predictions
cf_matrix = confusion_matrix(y_test, predictions)

# Flatten once; the cells are labelled in this flattened order
flat_cells = cf_matrix.flatten()

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
# Absolute count per cell, and each cell's share of all predictions
group_counts = [f'{value:0.0f}' for value in flat_cells]
group_percentages = [f'{value:.2%}' for value in flat_cells / np.sum(cf_matrix)]

# One three-line annotation per cell: name, count, percentage
labels = np.asarray(
    ['\n'.join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

# fmt='' because the annotations are already formatted strings
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Reds')

In [None]:
# Print the per-class classification metrics for the test-set predictions
print(classification_report(y_test, predictions))

In [None]:
# Background data for the explainer: the training features cast to float64
# and labelled with the original feature column names.
# (Alternative kept for reference: shap.Explainer(xgb, X_test))
background_features = pd.DataFrame(X_train.astype('float64'), columns=X.columns)
explainer = shap.Explainer(xgb, background_features)

# Compute SHAP values for the test rows.
# (Alternative kept for reference: explainer(X_test))
explained_features = pd.DataFrame(X_test, columns=X.columns)
shap_values = explainer(explained_features)

# Waterfall plot for the first explained test row
shap.plots.waterfall(shap_values[0])

In [None]:
# Beeswarm plot of the SHAP values computed for the test rows
shap.plots.beeswarm(shap_values)

## Saving model

In [None]:
# Serialize the fitted classifier to disk with pickle
with open('model_nfl.pkl', 'wb') as model_file:
    pickle.dump(xgb, model_file)