In [None]:
import pandas as pd
import os
import sys
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Put the parent of the current working directory on the import path so the
# project-local `utils` package imported below can be found.
sys.path.append(
    os.path.abspath(
        os.path.join(os.getcwd(), '..')
    )
)
from utils.data_handler import (
    compute_head_to_head_avg,
    get_games,
    get_season_start,
)

# Load the games frame that the later cells derive features from.
df = get_games()

/Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference

Error while processing file: /Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference/2020_21/sportsref_download(2).xls
ImportError: Missing optional dependency 'lxml'.  Use pip or conda to install lxml.
No columns

Error while processing file: /Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference/2020_21/sportsref_download(3).xls
ImportError: Missing optional dependency 'lxml'.  Use pip or conda to install lxml.
No columns

Error while processing file: /Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference/2020_21/sportsref_download(8).xls
ImportError: Missing optional dependency 'lxml'.  Use pip or conda to install lxml.
No columns

Error while processing file: /Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/data/basketball_reference/2020_21/sportsref_download(4).xls
ImportError: Missing optional

In [None]:
# --- Feature engineering ---------------------------------------------------
# Every expanding/rolling feature below uses shift(1) so that a game only sees
# the games that precede it *in row order*. That is only "the past" when the
# rows are chronological, so parse the dates and sort BEFORE deriving any
# feature (previously the sort happened after the first batch of features,
# leaving them dependent on whatever order get_games() returned).
df['gameDate'] = pd.to_datetime(df['gameDate'], errors='coerce', utc=True)

# mergesort is stable: games sharing a date keep their incoming relative
# order, which makes the result deterministic across runs. Rows whose date
# could not be parsed (NaT) are placed last by sort_values.
df = df.sort_values('gameDate', kind='mergesort').reset_index(drop=True)

df['season'] = df['gameDate'].apply(get_season_start)

# Average points scored so far, per team, over that team's earlier home
# (resp. away) games. The first game of each group has no history -> NaN.
df['home_avg_points'] = (
    df.groupby('home_teamId')['home_teamScore']
    .transform(lambda x: x.shift(1).expanding().mean())
)
df['away_avg_points'] = (
    df.groupby('away_teamId')['away_teamScore']
    .transform(lambda x: x.shift(1).expanding().mean())
)

# Head-to-head scoring averages come from the project helper, called once per
# row with the full frame.
# NOTE(review): the helper now receives the date-parsed, chronologically
# sorted frame instead of the raw one — confirm it does not expect raw
# (string) gameDate values.
df[['home_head_to_head_avg_points', 'away_head_to_head_avg_points']] = df.apply(
    lambda row: compute_head_to_head_avg(row, df), axis=1
)

# Win rate over the previous (up to) five home / away games of each team.
df['home_last_5_win_percentage'] = (
    df.groupby('home_teamId')['home_win']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)
df['away_last_5_win_percentage'] = (
    df.groupby('away_teamId')['away_win']
    .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
)

# Win rate so far within the current season, per team and venue role.
df['home_season_win_percentage'] = (
    df.groupby(['home_teamId', 'season'])['home_win']
    .transform(lambda x: x.shift(1).expanding().mean())
)

df['away_season_win_percentage'] = (
    df.groupby(['away_teamId', 'season'])['away_win']
    .transform(lambda x: x.shift(1).expanding().mean())
)

# Constant column; every row is written from the home team's perspective.
df['home_advantage'] = 1

In [None]:
# Columns fed to both regressors.
features = [
    'home_avg_points',
    'away_avg_points',
    'home_head_to_head_avg_points',
    'away_head_to_head_avg_points',
    'home_last_5_win_percentage',
    'away_last_5_win_percentage',
    'home_advantage',
]

# Drop games that are missing any feature or either target score.
df = df.dropna(
    subset=[*features, 'home_teamScore', 'away_teamScore']
).reset_index(drop=True)

X = df[features]
y_home, y_away = df['home_teamScore'], df['away_teamScore']

# A single call splits the features and both targets with one shared shuffle,
# so home and away targets line up with the same train/test rows.
(
    X_train, X_test,
    yh_train, yh_test,
    ya_train, ya_test,
) = train_test_split(X, y_home, y_away, test_size=0.2, random_state=42)

# Scaling statistics come from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Home-score model: fit, predict on the held-out rows, score.
home_model = RandomForestRegressor(n_estimators=200, random_state=42)
home_model.fit(X_train_scaled, yh_train)
home_pred = home_model.predict(X_test_scaled)
mae_home = mean_absolute_error(yh_test, home_pred)
r2_home = r2_score(yh_test, home_pred)

# Away-score model: same configuration, trained on the away target.
away_model = RandomForestRegressor(n_estimators=200, random_state=42)
away_model.fit(X_train_scaled, ya_train)
away_pred = away_model.predict(X_test_scaled)
mae_away = mean_absolute_error(ya_test, away_pred)
r2_away = r2_score(ya_test, away_pred)

print(f'Home Points - MAE: {mae_home:.2f}, R²: {r2_home:.3f}')
print(f'Away Points - MAE: {mae_away:.2f}, R²: {r2_away:.3f}')

Home Points - MAE: 10.02, R²: 0.018
Away Points - MAE: 10.33, R²: 0.014


In [None]:
# --- Persist the trained artifacts -----------------------------------------
# Output folders live in the parent of the current working directory:
#   <parent of cwd>/models   and   <parent of cwd>/scalers
base_dir = os.path.dirname(os.getcwd())
models_dir = os.path.join(base_dir, 'models')
scalers_dir = os.path.join(base_dir, 'scalers')

# joblib.dump does not create missing parent directories; create them up
# front so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(scalers_dir, exist_ok=True)

HOME_MODEL_PATH = os.path.join(models_dir, 'home_points_model.pkl')
AWAY_MODEL_PATH = os.path.join(models_dir, 'away_points_model.pkl')
SCALER_PATH = os.path.join(scalers_dir, 'points_scaler.pkl')

joblib.dump(home_model, HOME_MODEL_PATH)
joblib.dump(away_model, AWAY_MODEL_PATH)
# Kept as the cell's final expression so the notebook shows the written path.
joblib.dump(scaler, SCALER_PATH)


['/Users/simonsalaj/Documents/test_ai/StatistIQApp/Scripts/ai/models/points_scaler.pkl']