In [1]:
import pandas as pd
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score
import datetime as dt
import joblib

# Add parent folder to the system path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.data_handler import get_games, compute_head_to_head_avg, get_season_start

/Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference


In [2]:
# Get the data
df = get_games()
df = df.fillna(0)

df['home_avg_points'] = (
    df.groupby('home_teamId')['home_teamScore']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df['away_avg_points'] = (
    df.groupby('away_teamId')['away_teamScore']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df[['home_head_to_head_avg_points', 'away_head_to_head_avg_points']] = df.apply(
    lambda row: compute_head_to_head_avg(row, df), axis=1
)

df['home_last_5_win_percentage'] = (
    df.groupby('home_teamId')['home_win']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)
df['away_last_5_win_percentage'] = (
    df.groupby('away_teamId')['away_win']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

df['gameDate'] = pd.to_datetime(df['gameDate'], errors='coerce', utc=True)

df['season'] = df['gameDate'].apply(get_season_start)

df = df.sort_values('gameDate').reset_index(drop=True)

df['home_season_win_percentage'] = (
    df.groupby(['home_teamId', 'season'])['home_win']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df['away_season_win_percentage'] = (
    df.groupby(['away_teamId', 'season'])['away_win']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df['home_advantage'] = 1

In [3]:
# # # # # # # # # # # #
# Logistic Regression #
# # # # # # # # # # # #

features = [
    'home_avg_points',
    'away_avg_points',
    'home_head_to_head_avg_points',
    'away_head_to_head_avg_points',
    'home_last_5_win_percentage',
    'away_last_5_win_percentage',
    'home_season_win_percentage',
    'away_season_win_percentage',
]
df = df.dropna(subset=features + ['home_win']).reset_index(drop=True)

X = df[features]
y = df['home_win']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
r2 = r2_score(y_test, y_prob)

print(f'Accuracy: {acc:.3f}')
print(f'AUC: {auc:.3f}')

Accuracy: 0.625
AUC: 0.571


In [None]:
base_dir = os.path.dirname(os.getcwd())  # this points to 'Scripts/ai'
models_dir = os.path.join(base_dir, 'models')
scalers_dir = os.path.join(base_dir, 'scalers')

MODEL_PATH = os.path.join(models_dir, 'win_probability_model.pkl')
SCALER_PATH = os.path.join(scalers_dir, 'win_probability_scaler.pkl')

joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

['/Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/scalers/win_probability_scaler.pkl']