In [1]:
import pandas as pd
import os
import sys
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils.data_handler import get_games, compute_head_to_head_avg, get_season_start

df = get_games()

/Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference


In [None]:
df['home_avg_points'] = (
    df.groupby('home_teamId')['home_teamScore']
    .transform(lambda x: x.shift(1).expanding().mean())
)
df['away_avg_points'] = (
    df.groupby('away_teamId')['away_teamScore']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df[['home_head_to_head_avg_points', 'away_head_to_head_avg_points']] = df.apply(
    lambda row: compute_head_to_head_avg(row, df), axis=1
)

df['home_last_5_win_percentage'] = (
    df.groupby('home_teamId')['home_win']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)
df['away_last_5_win_percentage'] = (
    df.groupby('away_teamId')['away_win']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

df['gameDate'] = pd.to_datetime(df['gameDate'], errors='coerce', utc=True)

df['season'] = df['gameDate'].apply(get_season_start)

df = df.sort_values('gameDate').reset_index(drop=True)

df['home_season_win_percentage'] = (
    df.groupby(['home_teamId', 'season'])['home_win']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df['away_season_win_percentage'] = (
    df.groupby(['away_teamId', 'season'])['away_win']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df['points_avg_diff'] = df['home_avg_points'] - df['away_avg_points']
df['winrate_diff'] = df['home_season_win_percentage'] - df['away_season_win_percentage']

df['home_advantage'] = 1
df['margin'] = df['home_teamScore'] - df['away_teamScore']

In [3]:
features = [
    'home_avg_points',
    'away_avg_points',
    'points_avg_diff',
    'winrate_diff',
    'home_head_to_head_avg_points',
    'away_head_to_head_avg_points',
    'home_last_5_win_percentage',
    'away_last_5_win_percentage',
    'home_season_win_percentage',
    'away_season_win_percentage',
    'home_advantage'
]
df = df.dropna(subset=features + ['margin']).reset_index(drop=True)

X = df[features]
y = df['margin']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae:.2f}')
print(f'R²: {r2:.3f}')

MAE: 12.50
R²: -0.752


In [None]:
base_dir = os.path.dirname(os.getcwd())
models_dir = os.path.join(base_dir, "models")

MODEL_PATH = os.path.join(models_dir, "expected_margin_model.pkl")
SCALER_PATH = os.path.join(models_dir, "expected_margin_scaler.pkl")

joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)