In [9]:
import pandas as pd
import os
import joblib
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils.data_handler import get_games, compute_head_to_head_avg, get_season_start

In [10]:
# Get the data
df = get_games()
df = df.fillna(0)

df["home_avg_points"] = (
    df.groupby("home_teamId")["home_teamScore"]
    .transform(lambda x: x.shift(1).expanding().mean())
)

df["away_avg_points"] = (
    df.groupby("away_teamId")["away_teamScore"]
    .transform(lambda x: x.shift(1).expanding().mean())
)

df[["home_head_to_head_avg_points", "away_head_to_head_avg_points"]] = df.apply(
    lambda row: compute_head_to_head_avg(row, df), axis=1
)

df["home_last_5_win_percentage"] = (
    df.groupby("home_teamId")["home_win"]
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)
df["away_last_5_win_percentage"] = (
    df.groupby("away_teamId")["away_win"]
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

df["gameDate"] = pd.to_datetime(df["gameDate"], errors="coerce", utc=True)

df["season"] = df["gameDate"].apply(get_season_start)

df = df.sort_values("gameDate").reset_index(drop=True)

df["home_season_win_percentage"] = (
    df.groupby(["home_teamId", "season"])["home_win"]
    .transform(lambda x: x.shift(1).expanding().mean())
)

df["away_season_win_percentage"] = (
    df.groupby(["away_teamId", "season"])["away_win"]
    .transform(lambda x: x.shift(1).expanding().mean())
)

df["home_advantage"] = 1

In [11]:
features = [
    "home_avg_points",
    "away_avg_points",
    "home_head_to_head_avg_points",
    "away_head_to_head_avg_points",
    "home_last_5_win_percentage",
    "away_last_5_win_percentage",
    "home_advantage",
]

df = df.dropna(subset=features + ["overtime"]).reset_index(drop=True)
X = df[features]
y = df["overtime"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=3,
    random_state=42
)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")
print(f"F1 Score: {f1:.3f}")

sample = X_test_scaled[:1]
prob_ot = model.predict_proba(sample)[0, 1]
print(f"\nPredicted overtime probability: {prob_ot*100:.2f}%")

Accuracy: 0.951
AUC: 0.501
F1 Score: 0.000

Predicted overtime probability: 4.34%


In [None]:
base_dir = os.path.dirname(os.getcwd())
models_dir = os.path.join(base_dir, "models")
scalers_dir = os.path.join(base_dir, 'scalers')

MODEL_PATH = os.path.join(models_dir, "overtime_model_gb.pkl")
SCALER_PATH = os.path.join(scalers_dir, "overtime_scaler.pkl")

joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)