<a href="https://colab.research.google.com/github/rafabandoni/nfl-predict/blob/main/notebooks/03_nfl_predict_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from xgboost import XGBClassifier

import shap

# 0.3 Preprocessing

In [None]:
# Remote location of featured_df.parquet in the project repository.
FEATURED_DF_URL = 'https://github.com/rafabandoni/nfl-predict/raw/refs/heads/main/data/output/featured_df.parquet'

# Read the parquet file straight from GitHub and preview its first rows.
featured_df = pd.read_parquet(FEATURED_DF_URL)
featured_df.head()

## Train Test Split

**IMPORTANT**: Separate `X` and `y` and perform the train/test split *before* any preprocessing, to avoid data leakage.

In [None]:
# Label column; every remaining column of featured_df is used as a feature.
target = 'home_winner'

# Double brackets keep y as a single-column DataFrame rather than a Series.
y = featured_df[[target]]
X = featured_df.drop(columns=[target])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Scaling

In [None]:
# Fit the scaler on the training split only, then standardise that split with it.
# NOTE: X_train is overwritten here, so re-running this cell without re-running
# the train/test split would fit the scaler on already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

## Principal Component Analysis (PCA)

PCA is an unsupervised learning algorithm, meaning it doesn’t require prior knowledge of target variables. It’s commonly used in exploratory data analysis and machine learning to simplify datasets without losing critical information.

Know more: https://www.geeksforgeeks.org/principal-component-analysis-pca/

In [None]:
# PCA is currently disabled: the model seemed to work better without it.
# To re-enable it, uncomment the lines below together with the matching
# `pca.transform(X_test)` line in the following cell.

# pca = PCA(n_components=0.7)
# X_train = pca.fit_transform(X_train)
# X_train

In [None]:
# Reuse the scaler that was fitted on the training split (transform only, no
# refit) so that no statistics from the test split leak into preprocessing.
X_test = scaler.transform(X_test)
# Only needed when the PCA step above is enabled:
# X_test = pca.transform(X_test)

# 0.3.1 ML Model

## Building model

In [None]:
# Gradient-boosted tree classifier with fixed, hand-picked hyperparameters.
# Options that were tried earlier and left disabled: scale_pos_weight, an
# explicit objective / num_class, enable_categorical, and the GPU 'hist' setup.
xgb = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    min_child_weight=4,
    learning_rate=0.2,
    # Row and column subsampling applied when building each tree.
    subsample=0.7,
    colsample_bytree=0.5,
    gamma=0.1,
    # L2 and L1 regularisation. `reg_alpha` is the scikit-learn-API keyword for
    # the L1 term; the former `alpha=2` only worked as a pass-through alias and
    # was inconsistent with `reg_lambda` next to it.
    reg_lambda=2,
    reg_alpha=2,
)
# Train on the scaled training split only.
xgb.fit(X_train, y_train)

## Predicting

In [None]:
# Predicted class labels for the held-out (scaled) test split.
predictions = xgb.predict(X_test)

## Evaluating model

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, predictions)

In [None]:
# Counts of true vs. predicted labels; in scikit-learn's layout rows are the
# true classes and columns are the predicted classes.
confusion_matrix(y_test, predictions)

In [None]:
# Per-class precision, recall and F1. The report is a preformatted string, so
# it is printed rather than shown as the cell's bare value.
print(classification_report(y_test, predictions))

In [None]:
# Re-wrap the scaled splits as DataFrames carrying the original column labels
# so the SHAP plots are labelled with feature names.
feature_names = X.columns
shap_background = pd.DataFrame(X_train, columns=feature_names)
shap_inputs = pd.DataFrame(X_test, columns=feature_names)

# The training split is handed to the explainer as its background data;
# the test split is what gets explained.
explainer = shap.Explainer(xgb, shap_background)
shap_values = explainer(shap_inputs)

# Feature-by-feature breakdown of the prediction for the first test row.
shap.plots.waterfall(shap_values[0])

In [None]:
# Summary plot of the SHAP values of every feature across all explained test rows.
shap.plots.beeswarm(shap_values)