In [2]:
from sports_modelling.data_management.NCAABManager import NCAABManager

In [3]:
ncaab_manager = NCAABManager()

data = ncaab_manager.load_season_data(2024)

In [5]:
data.keys()

dict_keys(['game_info', 'box_scores', 'play_by_play'])

Todo:
- build power rankings for teams
- build position power rankings, offensive and defensive

- try to build model for game totals? can backtest, etc. after scraping odds portal.

In [2]:
import pandas as pd
import numpy as np
from typing import Dict, Tuple
from dataclasses import dataclass
from sklearn.preprocessing import StandardScaler

@dataclass
class TeamStats:
    """Container for team statistics."""
    games_played: int
    points_scored: float
    points_allowed: float
    possessions: float
    offensive_rating: float  # Points per 100 possessions
    defensive_rating: float  # Points allowed per 100 possessions
    net_rating: float       # Difference between offensive and defensive rating
    
class PowerRankings:
    """Calculate and maintain basketball power rankings."""
    
    def __init__(self):
        self.team_stats: Dict[str, TeamStats] = {}
        self.adjusted_ratings: Dict[str, Dict[str, float]] = {}
        self.scaler = StandardScaler()
        
    def estimate_possessions(self, team_box: pd.DataFrame) -> float:
        """
        Estimate possessions using the NBA formula.
        Poss = FGA - OReb + TO + (0.44 * FTA)
        """
        fga = team_box['fga'].sum()
        oreb = team_box['oreb'].sum()
        to = team_box['to'].sum()
        fta = team_box['fta'].sum()
        
        return fga - oreb + to + (0.44 * fta)
    
    def calculate_base_ratings(self, game_info: pd.DataFrame, box_scores: pd.DataFrame) -> None:
        """Calculate base (unadjusted) ratings for all teams."""
        # Initialize containers for team stats
        team_possessions = {}
        team_points_scored = {}
        team_points_allowed = {}
        games_played = {}
        
        # Process each game
        for _, game in game_info.iterrows():
            home_team = game['home_team']
            away_team = game['away_team']
            game_id = game['game_id']
            
            # Get box scores for this game
            game_box = box_scores[box_scores['game_id'] == game_id]
            home_box = game_box[game_box['team'] == home_team]
            away_box = game_box[game_box['team'] == away_team]
            
            # Calculate possessions
            home_poss = self.estimate_possessions(home_box)
            away_poss = self.estimate_possessions(away_box)
            avg_poss = (home_poss + away_poss) / 2  # Use average for consistency
            
            # Update team stats
            for team, score, opp_score, poss in [
                (home_team, game['home_score'], game['away_score'], avg_poss),
                (away_team, game['away_score'], game['home_score'], avg_poss)
            ]:
                team_possessions[team] = team_possessions.get(team, 0) + poss
                team_points_scored[team] = team_points_scored.get(team, 0) + score
                team_points_allowed[team] = team_points_allowed.get(team, 0) + opp_score
                games_played[team] = games_played.get(team, 0) + 1
        
        # Calculate per-game and per-possession stats
        for team in games_played.keys():
            games = games_played[team]
            possessions = team_possessions[team]
            points_scored = team_points_scored[team]
            points_allowed = team_points_allowed[team]
            
            # Calculate ratings per 100 possessions
            offensive_rating = (points_scored / possessions) * 100
            defensive_rating = (points_allowed / possessions) * 100
            net_rating = offensive_rating - defensive_rating
            
            self.team_stats[team] = TeamStats(
                games_played=games,
                points_scored=points_scored/games,
                points_allowed=points_allowed/games,
                possessions=possessions/games,
                offensive_rating=offensive_rating,
                defensive_rating=defensive_rating,
                net_rating=net_rating
            )
    
    def calculate_adjusted_ratings(self, game_info: pd.DataFrame, iterations: int = 10) -> None:
        """
        Calculate adjusted ratings accounting for opponent strength.
        Uses an iterative approach similar to KenPom's adjustments.
        """
        # Initialize with base ratings
        adjusted = {team: {
            'offensive': stats.offensive_rating,
            'defensive': stats.defensive_rating,
            'net': stats.net_rating
        } for team, stats in self.team_stats.items()}
        
        # Iteratively adjust ratings
        for _ in range(iterations):
            new_adjusted = {team: {'offensive': 0, 'defensive': 0, 'games': 0}
                          for team in self.team_stats.keys()}
            
            # Process each game
            for _, game in game_info.iterrows():
                home_team = game['home_team']
                away_team = game['away_team']
                home_score = game['home_score']
                away_score = game['away_score']
                
                # Get average possession length for this game
                home_poss = self.team_stats[home_team].possessions
                away_poss = self.team_stats[away_team].possessions
                avg_poss = (home_poss + away_poss) / 2
                
                # Calculate tempo-free scores (per 100 possessions)
                home_off = (home_score / avg_poss) * 100
                away_off = (away_score / avg_poss) * 100
                
                # Adjust based on opponent's current defensive rating
                new_adjusted[home_team]['offensive'] += home_off + (100 - adjusted[away_team]['defensive'])
                new_adjusted[away_team]['offensive'] += away_off + (100 - adjusted[home_team]['defensive'])
                
                # Adjust based on opponent's current offensive rating
                new_adjusted[home_team]['defensive'] += away_off + (100 - adjusted[away_team]['offensive'])
                new_adjusted[away_team]['defensive'] += home_off + (100 - adjusted[home_team]['offensive'])
                
                new_adjusted[home_team]['games'] += 1
                new_adjusted[away_team]['games'] += 1
            
            # Calculate averages
            for team in adjusted:
                if new_adjusted[team]['games'] > 0:
                    adjusted[team]['offensive'] = new_adjusted[team]['offensive'] / new_adjusted[team]['games']
                    adjusted[team]['defensive'] = new_adjusted[team]['defensive'] / new_adjusted[team]['games']
                    adjusted[team]['net'] = adjusted[team]['offensive'] - adjusted[team]['defensive']
        
        self.adjusted_ratings = adjusted
    
    def get_standardized_ratings(self) -> pd.DataFrame:
        """
        Convert ratings to standardized scores (z-scores).
        Returns DataFrame with standardized ratings for each team.
        """
        ratings_df = pd.DataFrame.from_dict(
            self.adjusted_ratings,
            orient='index',
            columns=['offensive', 'defensive', 'net']
        )
        
        # Convert to z-scores
        standardized = pd.DataFrame(
            self.scaler.fit_transform(ratings_df),
            index=ratings_df.index,
            columns=ratings_df.columns
        )
        
        return standardized

def calculate_power_rankings(game_info: pd.DataFrame, box_scores: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate power rankings from game data.
    
    Args:
        game_info (pd.DataFrame): Game information DataFrame
        box_scores (pd.DataFrame): Box scores DataFrame
    
    Returns:
        pd.DataFrame: Power rankings for each team
    """
    rankings = PowerRankings()
    
    # Calculate base ratings
    rankings.calculate_base_ratings(game_info, box_scores)
    
    # Calculate adjusted ratings
    rankings.calculate_adjusted_ratings(game_info)
    
    # Get standardized ratings
    return rankings.get_standardized_ratings()

In [4]:
from sports_modelling.data_management.NCAABManager import NCAABManager
# Example usage
def main():
    # Initialize NCAABManager and load data
    manager = NCAABManager()
    season_data = manager.load_season_data(2024)
    
    # Calculate power rankings
    rankings_df = calculate_power_rankings(
        season_data['game_info'],
        season_data['box_scores']
    )
    
    # Sort by net rating and display top 10
    print("\nTop 10 Teams by Net Rating:")
    print(rankings_df.sort_values('net', ascending=False).head(10))


main()


Top 10 Teams by Net Rating:
                          offensive  defensive       net
UConn Huskies              1.858179  -1.717207  1.945466
Houston Cougars            1.455756  -2.047882  1.921042
Purdue Boilermakers        1.941465  -1.483357  1.857074
Arizona Wildcats           1.966027  -1.371696  1.806886
Iowa State Cyclones        1.406923  -1.809713  1.760860
Auburn Tigers              1.764035  -1.456371  1.748696
Tennessee Volunteers       1.502959  -1.652975  1.722724
North Carolina Tar Heels   1.772717  -1.373507  1.706441
Duke Blue Devils           1.688269  -1.437642  1.698345
Alabama Crimson Tide       2.217445  -0.892330  1.668047


In [2]:
from pathlib import Path
from sports_modelling.data_management.NCAABManager import NCAABManager
from sports_modelling.power_rankings.PowerRankingsManager import PowerRankingsManager
import pandas as pd

# Load data
ncaab = NCAABManager()
data = ncaab.load_season_data(2024)

# Initialize rankings manager
manager = PowerRankingsManager(ncaab.data_dir)

# Calculate historical rankings
rankings_df = manager.calculate_historical_rankings(
    game_info=data['game_info'],
    box_scores=data['box_scores']
)

# Save rankings
manager.save_rankings(rankings_df, season=2024)

In [4]:
# Get rankings for specific date
date_rankings = manager.get_rankings_for_date(pd.Timestamp('2024-04-02'), season=2024)

# Get team history
duke_history = manager.get_team_ranking_history('Duke Blue Devils', season=2024)

# Get date range
march_rankings = manager.get_rankings_date_range(
    start_date=pd.Timestamp('2024-03-01'),
    end_date=pd.Timestamp('2024-03-31'),
    season=2024
)

date_rankings

Unnamed: 0,team,elo,efficiency_offensive_offensive,efficiency_offensive_defensive,efficiency_defensive_offensive,efficiency_defensive_defensive,form,date,elo_standardized,efficiency_offensive_offensive_standardized,efficiency_offensive_defensive_standardized,efficiency_defensive_offensive_standardized,efficiency_defensive_defensive_standardized,form_standardized
0,Indiana State Sycamores,1503.452346,67.294751,60.565276,67.294751,60.565276,0.0,"April 02, 2024",0.902967,1.172684,0.43787,1.172684,0.43787,0.0
1,Utah Utes,1496.547654,60.483871,67.204301,60.483871,67.204301,0.0,"April 02, 2024",-0.902967,0.375882,1.286024,0.375882,1.286024,0.0
2,Seton Hall Pirates,1504.161384,57.502738,45.865279,57.502738,45.865279,0.0,"April 02, 2024",1.088417,0.027121,-1.440094,0.027121,-1.440094,0.0
3,Georgia Bulldogs,1495.838616,43.802301,54.916318,43.802301,54.916318,0.0,"April 02, 2024",-1.088417,-1.575687,-0.2838,-1.575687,-0.2838,0.0


In [6]:
ranks = manager.load_rankings(2024)

In [13]:
manager.get_rankings_for_date(pd.Timestamp('2024-03-01'), 2024).sort_values('efficiency_offensive_offensive', ascending=False)

Unnamed: 0,team,elo,efficiency_offensive_offensive,efficiency_offensive_defensive,efficiency_defensive_offensive,efficiency_defensive_defensive,form,date,elo_standardized,efficiency_offensive_offensive_standardized,efficiency_offensive_defensive_standardized,efficiency_defensive_offensive_standardized,efficiency_defensive_defensive_standardized,form_standardized
46118,Gonzaga Bulldogs,1564.263854,61.581970,49.783123,61.581970,49.783123,0.0,"March 01, 2024",2.468478,1.583642,-0.870490,1.583642,-0.870490,0.0
46042,UConn Huskies,1566.367994,61.206606,50.657230,61.206606,50.657230,0.0,"March 01, 2024",2.549301,1.539660,-0.759419,1.539660,-0.759419,0.0
46129,Illinois Fighting Illini,1540.995431,60.940407,54.140740,60.940407,54.140740,0.0,"March 01, 2024",1.574700,1.508469,-0.316777,1.508469,-0.316777,0.0
46040,Purdue Boilermakers,1560.318368,60.772953,52.273017,60.772953,52.273017,0.0,"March 01, 2024",2.316926,1.488848,-0.554105,1.488848,-0.554105,0.0
46043,Alabama Crimson Tide,1540.470395,60.454189,54.289150,60.454189,54.289150,0.0,"March 01, 2024",1.554533,1.451498,-0.297919,1.451498,-0.297919,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46485,Portland Bible Arrows,1495.173462,23.167075,70.182611,23.167075,70.182611,0.0,"March 01, 2024",-0.185395,-2.917473,1.721632,-2.917473,1.721632,0.0
46436,Cairn University Highlanders,1495.328320,22.147651,59.060403,22.147651,59.060403,0.0,"March 01, 2024",-0.179447,-3.036921,0.308355,-3.036921,0.308355,0.0
46487,Saint Mary-Of-The-Woods College SMWC,1495.234292,21.848517,67.531780,21.848517,67.531780,0.0,"March 01, 2024",-0.183059,-3.071970,1.384796,-3.071970,1.384796,0.0
46395,Central Penn Knights,1495.034819,20.588235,65.294118,20.588235,65.294118,0.0,"March 01, 2024",-0.190721,-3.219639,1.100461,-3.219639,1.100461,0.0


In [14]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import xgboost as xgb

class GameFeatureGenerator:
    def __init__(self, rankings_manager):
        self.rankings_manager = rankings_manager
        
    def _get_team_stats_for_date(self, game_info: pd.DataFrame, team: str) -> Dict:
        """Calculate aggregated team stats from previous games."""
        games = game_info[
            ((game_info['home_team'] == team) | (game_info['away_team'] == team))
        ]
        
        if games.empty:
            return {
                'avg_points_scored': 0,
                'avg_points_allowed': 0,
                'win_pct': 0,
                'home_win_pct': 0,
                'away_win_pct': 0,
                'avg_margin': 0,
                'games_played': 0
            }
        
        # Calculate various team statistics
        points_scored = []
        points_allowed = []
        wins = []
        home_games = 0
        home_wins = 0
        away_games = 0
        away_wins = 0
        
        for _, game in games.iterrows():
            if game['home_team'] == team:
                points_scored.append(game['home_score'])
                points_allowed.append(game['away_score'])
                wins.append(1 if game['home_score'] > game['away_score'] else 0)
                home_games += 1
                if game['home_score'] > game['away_score']:
                    home_wins += 1
            else:
                points_scored.append(game['away_score'])
                points_allowed.append(game['home_score'])
                wins.append(1 if game['away_score'] > game['home_score'] else 0)
                away_games += 1
                if game['away_score'] > game['home_score']:
                    away_wins += 1
        
        return {
            'avg_points_scored': np.mean(points_scored) if points_scored else 0,
            'avg_points_allowed': np.mean(points_allowed) if points_allowed else 0,
            'win_pct': np.mean(wins) if wins else 0,
            'home_win_pct': home_wins / home_games if home_games > 0 else 0,
            'away_win_pct': away_wins / away_games if away_games > 0 else 0,
            'avg_margin': np.mean(np.array(points_scored) - np.array(points_allowed)),
            'games_played': len(games)
        }

    def create_game_features(self, 
                           game_info: pd.DataFrame,
                           box_scores: pd.DataFrame,
                           season: int,
                           rankings_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Create features for all games in the dataset.
        
        Args:
            game_info: DataFrame containing game information
            box_scores: DataFrame containing box scores
            season: Season year
            rankings_df: Optional pre-loaded rankings DataFrame
            
        Returns:
            DataFrame with features for each game
        """
        if rankings_df is None:
            rankings_df = self.rankings_manager.load_rankings(season)
        
        features_list = []
        
        for idx, game in game_info.iterrows():
            # Get game date in correct format
            game_date = pd.to_datetime(game['game_day']).strftime('%B %d, %Y')
            
            # Get rankings for this date
            date_rankings = rankings_df[rankings_df['date'] == game_date]
            
            if date_rankings.empty:
                continue
                
            # Get home and away team rankings
            home_team = game['home_team']
            away_team = game['away_team']
            
            home_rankings = date_rankings[date_rankings['team'] == home_team].iloc[0]
            away_rankings = date_rankings[date_rankings['team'] == away_team].iloc[0]
            
            # Get team stats up to this game
            prev_games = game_info[game_info['game_day'] < game['game_day']]
            home_stats = self._get_team_stats_for_date(prev_games, home_team)
            away_stats = self._get_team_stats_for_date(prev_games, away_team)
            
            # Combine all features
            features = {
                'game_id': game['game_id'],
                'game_day': game['game_day'],
                'home_team': home_team,
                'away_team': away_team,
                'is_conference': game['is_conference'],
                'is_neutral': game['is_neutral'],
                'is_postseason': game['is_postseason'],
                
                # Power Rankings Features
                'home_elo': home_rankings['elo'],
                'away_elo': away_rankings['elo'],
                'home_elo_standardized': home_rankings['elo_standardized'],
                'away_elo_standardized': away_rankings['elo_standardized'],
                'home_off_eff': home_rankings['efficiency_offensive'],
                'away_off_eff': away_rankings['efficiency_offensive'],
                'home_def_eff': home_rankings['efficiency_defensive'],
                'away_def_eff': away_rankings['efficiency_defensive'],
                'home_form': home_rankings['form'],
                'away_form': away_rankings['form'],
                
                # Team Stats Features
                'home_avg_points': home_stats['avg_points_scored'],
                'away_avg_points': away_stats['avg_points_scored'],
                'home_avg_points_allowed': home_stats['avg_points_allowed'],
                'away_avg_points_allowed': away_stats['avg_points_allowed'],
                'home_win_pct': home_stats['win_pct'],
                'away_win_pct': away_stats['win_pct'],
                'home_home_win_pct': home_stats['home_win_pct'],
                'away_away_win_pct': away_stats['away_win_pct'],
                'home_avg_margin': home_stats['avg_margin'],
                'away_avg_margin': away_stats['avg_margin'],
                'home_games_played': home_stats['games_played'],
                'away_games_played': away_stats['games_played'],
                
                # Target variables
                'home_score': game['home_score'],
                'away_score': game['away_score']
            }
            
            features_list.append(features)
        
        return pd.DataFrame(features_list)

def train_point_predictor(features_df: pd.DataFrame, target: str = 'home_score') -> xgb.XGBRegressor:
    """
    Train XGBoost quantile regression model.
    
    Args:
        features_df: DataFrame containing game features
        target: Which target to predict ('home_score' or 'away_score')
        
    Returns:
        Trained XGBoost model
    """
    # Define features to use
    feature_cols = [
        'home_elo_standardized', 'away_elo_standardized',
        'home_off_eff', 'away_off_eff',
        'home_def_eff', 'away_def_eff',
        'home_form', 'away_form',
        'home_avg_points', 'away_avg_points',
        'home_avg_points_allowed', 'away_avg_points_allowed',
        'home_win_pct', 'away_win_pct',
        'home_home_win_pct', 'away_away_win_pct',
        'home_avg_margin', 'away_avg_margin',
        'home_games_played', 'away_games_played',
        'is_conference', 'is_neutral', 'is_postseason'
    ]
    
    # Convert boolean columns to int
    for col in ['is_conference', 'is_neutral', 'is_postseason']:
        features_df[col] = features_df[col].astype(int)
    
    # Create model
    model = xgb.XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=0.5,  # Median prediction
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist'  # For faster training
    )
    
    # Train model
    model.fit(features_df[feature_cols], features_df[target])
    
    return model

# Example usage:
if __name__ == '__main__':
    from sports_modelling.data_management.NCAABManager import NCAABManager
    from sports_modelling.power_rankings.PowerRankingsManager import PowerRankingsManager
    
    # Initialize managers
    ncaab = NCAABManager()
    rankings_manager = PowerRankingsManager(ncaab.data_dir)
    
    # Load data
    data = ncaab.load_season_data(2024)
    
    # Load or calculate rankings
    try:
        rankings_df = rankings_manager.load_rankings(2024)
    except FileNotFoundError:
        rankings_df = rankings_manager.calculate_historical_rankings(
            game_info=data['game_info'],
            box_scores=data['box_scores']
        )
        rankings_manager.save_rankings(rankings_df, season=2024)
    
    # Create feature generator
    feature_generator = GameFeatureGenerator(rankings_manager)
    
    # Generate features
    features_df = feature_generator.create_game_features(
        game_info=data['game_info'],
        box_scores=data['box_scores'],
        season=2024,
        rankings_df=rankings_df
    )
    
    # Train models for home and away scoring
    home_model = train_point_predictor(features_df, target='home_score')
    away_model = train_point_predictor(features_df, target='away_score')
    
    # Example prediction for a game
    example_game = features_df.iloc[-1]
    home_pred = home_model.predict(example_game[home_model.feature_names_])[0]
    away_pred = away_model.predict(example_game[away_model.feature_names_])[0]
    
    print(f"Predicted score: {example_game['home_team']} {home_pred:.1f} - {away_pred:.1f} {example_game['away_team']}")
    print(f"Actual score: {example_game['home_team']} {example_game['home_score']} - {example_game['away_score']} {example_game['away_team']}")

KeyError: 'efficiency_offensive'

In [15]:
class GameFeatureGenerator:
    def __init__(self, rankings_manager):
        self.rankings_manager = rankings_manager
        
    def _get_team_stats_for_date(self, game_info: pd.DataFrame, team: str) -> Dict:
        """Calculate aggregated team stats from previous games."""
        # (Same as before)
        games = game_info[
            ((game_info['home_team'] == team) | (game_info['away_team'] == team))
        ]
        
        if games.empty:
            return {
                'avg_points_scored': 0,
                'avg_points_allowed': 0,
                'win_pct': 0,
                'home_win_pct': 0,
                'away_win_pct': 0,
                'avg_margin': 0,
                'games_played': 0
            }
        
        points_scored = []
        points_allowed = []
        wins = []
        home_games = 0
        home_wins = 0
        away_games = 0
        away_wins = 0
        
        for _, game in games.iterrows():
            if game['home_team'] == team:
                points_scored.append(game['home_score'])
                points_allowed.append(game['away_score'])
                wins.append(1 if game['home_score'] > game['away_score'] else 0)
                home_games += 1
                if game['home_score'] > game['away_score']:
                    home_wins += 1
            else:
                points_scored.append(game['away_score'])
                points_allowed.append(game['home_score'])
                wins.append(1 if game['away_score'] > game['home_score'] else 0)
                away_games += 1
                if game['away_score'] > game['home_score']:
                    away_wins += 1
        
        return {
            'avg_points_scored': np.mean(points_scored) if points_scored else 0,
            'avg_points_allowed': np.mean(points_allowed) if points_allowed else 0,
            'win_pct': np.mean(wins) if wins else 0,
            'home_win_pct': home_wins / home_games if home_games > 0 else 0,
            'away_win_pct': away_wins / away_games if away_games > 0 else 0,
            'avg_margin': np.mean(np.array(points_scored) - np.array(points_allowed)),
            'games_played': len(games)
        }

    def create_game_features(self, 
                           game_info: pd.DataFrame,
                           box_scores: pd.DataFrame,
                           season: int,
                           rankings_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Create features for all games in the dataset."""
        if rankings_df is None:
            rankings_df = self.rankings_manager.load_rankings(season)
            
        # First, let's check what columns we actually have in the rankings
        print("Available ranking columns:", rankings_df.columns.tolist())
        
        features_list = []
        
        for idx, game in game_info.iterrows():
            # Get game date in correct format
            game_date = pd.to_datetime(game['game_day']).strftime('%B %d, %Y')
            
            # Get rankings for this date
            date_rankings = rankings_df[rankings_df['date'] == game_date]
            
            if date_rankings.empty:
                continue
                
            # Get home and away team rankings
            home_team = game['home_team']
            away_team = game['away_team']
            
            home_rankings = date_rankings[date_rankings['team'] == home_team].iloc[0]
            away_rankings = date_rankings[date_rankings['team'] == away_team].iloc[0]
            
            # Get team stats up to this game
            prev_games = game_info[game_info['game_day'] < game['game_day']]
            home_stats = self._get_team_stats_for_date(prev_games, home_team)
            away_stats = self._get_team_stats_for_date(prev_games, away_team)
            
            # Combine all features
            features = {
                'game_id': game['game_id'],
                'game_day': game['game_day'],
                'home_team': home_team,
                'away_team': away_team,
                'is_conference': game['is_conference'],
                'is_neutral': game['is_neutral'],
                'is_postseason': game['is_postseason'],
            }
            
            # Add power ranking features (checking for existence)
            if 'elo' in home_rankings:
                features.update({
                    'home_elo': home_rankings['elo'],
                    'away_elo': away_rankings['elo'],
                })
                
            if 'elo_standardized' in home_rankings:
                features.update({
                    'home_elo_standardized': home_rankings['elo_standardized'],
                    'away_elo_standardized': away_rankings['elo_standardized'],
                })
            
            if 'efficiency_offensive' in home_rankings:
                features.update({
                    'home_off_eff': home_rankings['efficiency_offensive'],
                    'away_off_eff': away_rankings['efficiency_offensive'],
                    'home_def_eff': home_rankings['efficiency_defensive'],
                    'away_def_eff': away_rankings['efficiency_defensive'],
                })
            elif 'efficiency' in home_rankings:  # Different naming convention
                features.update({
                    'home_off_eff': home_rankings['efficiency'],
                    'away_off_eff': away_rankings['efficiency'],
                })
            
            if 'form' in home_rankings:
                features.update({
                    'home_form': home_rankings['form'],
                    'away_form': away_rankings['form'],
                })
                
            # Add team stats features
            features.update({
                'home_avg_points': home_stats['avg_points_scored'],
                'away_avg_points': away_stats['avg_points_scored'],
                'home_avg_points_allowed': home_stats['avg_points_allowed'],
                'away_avg_points_allowed': away_stats['avg_points_allowed'],
                'home_win_pct': home_stats['win_pct'],
                'away_win_pct': away_stats['win_pct'],
                'home_home_win_pct': home_stats['home_win_pct'],
                'away_away_win_pct': away_stats['away_win_pct'],
                'home_avg_margin': home_stats['avg_margin'],
                'away_avg_margin': away_stats['avg_margin'],
                'home_games_played': home_stats['games_played'],
                'away_games_played': away_stats['games_played'],
                
                # Target variables
                'home_score': game['home_score'],
                'away_score': game['away_score']
            })
            
            features_list.append(features)
        
        # Create DataFrame and handle missing columns
        features_df = pd.DataFrame(features_list)
        
        # Print the columns we ended up with
        print("\nFinal feature columns:", features_df.columns.tolist())
        
        return features_df

def train_point_predictor(features_df: pd.DataFrame, target: str = 'home_score') -> xgb.XGBRegressor:
    """Train XGBoost quantile regression model."""
    # Get available feature columns (excluding certain columns)
    exclude_cols = ['game_id', 'game_day', 'home_team', 'away_team', 'home_score', 'away_score']
    feature_cols = [col for col in features_df.columns if col not in exclude_cols]
    
    # Convert boolean columns to int
    for col in ['is_conference', 'is_neutral', 'is_postseason']:
        if col in features_df.columns:
            features_df[col] = features_df[col].astype(int)
    
    # Create model
    model = xgb.XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=0.5,  # Median prediction
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist'
    )
    
    # Train model with available features
    model.fit(features_df[feature_cols], features_df[target])
    
    return model, feature_cols

In [16]:
# Initialize managers
ncaab = NCAABManager()
rankings_manager = PowerRankingsManager(ncaab.data_dir)

# Load data
data = ncaab.load_season_data(2024)
rankings_df = rankings_manager.load_rankings(2024)

# Create feature generator
feature_generator = GameFeatureGenerator(rankings_manager)

# Generate features
features_df = feature_generator.create_game_features(
    game_info=data['game_info'],
    box_scores=data['box_scores'],
    season=2024,
    rankings_df=rankings_df
)

# Train models with available features
home_model, feature_cols = train_point_predictor(features_df, target='home_score')
away_model, _ = train_point_predictor(features_df, target='away_score')

Available ranking columns: ['team', 'elo', 'efficiency_offensive_offensive', 'efficiency_offensive_defensive', 'efficiency_defensive_offensive', 'efficiency_defensive_defensive', 'form', 'date', 'elo_standardized', 'efficiency_offensive_offensive_standardized', 'efficiency_offensive_defensive_standardized', 'efficiency_defensive_offensive_standardized', 'efficiency_defensive_defensive_standardized', 'form_standardized']

Final feature columns: ['game_id', 'game_day', 'home_team', 'away_team', 'is_conference', 'is_neutral', 'is_postseason', 'home_elo', 'away_elo', 'home_elo_standardized', 'away_elo_standardized', 'home_form', 'away_form', 'home_avg_points', 'away_avg_points', 'home_avg_points_allowed', 'away_avg_points_allowed', 'home_win_pct', 'away_win_pct', 'home_home_win_pct', 'away_away_win_pct', 'home_avg_margin', 'away_avg_margin', 'home_games_played', 'away_games_played', 'home_score', 'away_score']


In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict

def train_test_split_by_date(features_df: pd.DataFrame, train_frac: float = 0.75) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the dataset by date.
    
    Args:
        features_df: DataFrame containing game features
        train_frac: Fraction of season to use for training
    
    Returns:
        train_df, test_df
    """
    # Convert game_day to datetime if it's not already
    features_df = features_df.copy()
    features_df['game_day'] = pd.to_datetime(features_df['game_day'])
    
    # Sort by date
    features_df = features_df.sort_values('game_day')
    
    # Find split date
    dates = features_df['game_day'].unique()
    split_idx = int(len(dates) * train_frac)
    split_date = dates[split_idx]
    
    # Split data
    train_df = features_df[features_df['game_day'] < split_date]
    test_df = features_df[features_df['game_day'] >= split_date]
    
    print(f"Training on games from {train_df['game_day'].min()} to {train_df['game_day'].max()}")
    print(f"Testing on games from {test_df['game_day'].min()} to {test_df['game_day'].max()}")
    print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
    
    return train_df, test_df

def evaluate_predictions(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
    """Calculate various error metrics."""
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'Mean Error': np.mean(y_pred - y_true),
        'Median Error': np.median(y_pred - y_true)
    }

def predict_and_evaluate(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    exclude_cols: List[str] = ['game_id', 'game_day', 'home_team', 'away_team', 'home_score', 'away_score']
) -> Tuple[pd.DataFrame, Dict]:
    """
    Train models and make predictions on test set.
    
    Returns:
        DataFrame with predictions, Dict with evaluation metrics
    """
    # Get feature columns
    feature_cols = [col for col in train_df.columns if col not in exclude_cols]
    
    # Train home and away score models
    home_model = xgb.XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=0.5,
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist'
    )
    
    away_model = xgb.XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=0.5,
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist'
    )
    
    # Train models
    home_model.fit(train_df[feature_cols], train_df['home_score'])
    away_model.fit(train_df[feature_cols], train_df['away_score'])
    
    # Make predictions
    test_df = test_df.copy()
    test_df['predicted_home_score'] = home_model.predict(test_df[feature_cols])
    test_df['predicted_away_score'] = away_model.predict(test_df[feature_cols])
    
    # Calculate predicted winner
    test_df['predicted_winner'] = np.where(
        test_df['predicted_home_score'] > test_df['predicted_away_score'],
        test_df['home_team'],
        test_df['away_team']
    )
    
    # Calculate actual winner
    test_df['actual_winner'] = np.where(
        test_df['home_score'] > test_df['away_score'],
        test_df['home_team'],
        test_df['away_team']
    )
    
    # Calculate evaluation metrics
    metrics = {
        'home_score': evaluate_predictions(test_df['home_score'], test_df['predicted_home_score']),
        'away_score': evaluate_predictions(test_df['away_score'], test_df['predicted_away_score']),
        'prediction_accuracy': (test_df['predicted_winner'] == test_df['actual_winner']).mean()
    }
    
    return test_df, metrics

# Example usage:
if __name__ == '__main__':
    from sports_modelling.data_management.NCAABManager import NCAABManager
    from sports_modelling.power_rankings.PowerRankingsManager import PowerRankingsManager
    
    # Initialize managers
    ncaab = NCAABManager()
    rankings_manager = PowerRankingsManager(ncaab.data_dir)
    
    # Load data
    print("Loading data...")
    data = ncaab.load_season_data(2024)
    rankings_df = rankings_manager.load_rankings(2024)
    
    # Create features
    print("Generating features...")
    feature_generator = GameFeatureGenerator(rankings_manager)
    features_df = feature_generator.create_game_features(
        game_info=data['game_info'],
        box_scores=data['box_scores'],
        season=2024,
        rankings_df=rankings_df
    )
    
    # Split data
    print("\nSplitting data...")
    train_df, test_df = train_test_split_by_date(features_df, train_frac=0.9)
    
    # Train and evaluate
    print("\nTraining models and making predictions...")
    predictions_df, metrics = predict_and_evaluate(train_df, test_df)
    
    # Print results
    print("\nEvaluation Metrics:")
    print("\nHome Score Predictions:")
    for metric, value in metrics['home_score'].items():
        print(f"{metric}: {value:.2f}")
    
    print("\nAway Score Predictions:")
    for metric, value in metrics['away_score'].items():
        print(f"{metric}: {value:.2f}")
    
    print(f"\nWinner Prediction Accuracy: {metrics['prediction_accuracy']:.3f}")
    
    # Show some example predictions
    print("\nExample Predictions:")
    example_preds = predictions_df[['game_day', 'home_team', 'away_team', 
                                  'home_score', 'predicted_home_score',
                                  'away_score', 'predicted_away_score',
                                  'actual_winner', 'predicted_winner']].head()
    print(example_preds)
    
    # Calculate error distribution
    home_errors = predictions_df['predicted_home_score'] - predictions_df['home_score']
    away_errors = predictions_df['predicted_away_score'] - predictions_df['away_score']
    
    print("\nError Distribution:")
    print("\nHome Score Errors:")
    print(home_errors.describe())
    print("\nAway Score Errors:")
    print(away_errors.describe())

Loading data...
Generating features...
Available ranking columns: ['team', 'elo', 'efficiency_offensive_offensive', 'efficiency_offensive_defensive', 'efficiency_defensive_offensive', 'efficiency_defensive_defensive', 'form', 'date', 'elo_standardized', 'efficiency_offensive_offensive_standardized', 'efficiency_offensive_defensive_standardized', 'efficiency_defensive_offensive_standardized', 'efficiency_defensive_defensive_standardized', 'form_standardized']

Final feature columns: ['game_id', 'game_day', 'home_team', 'away_team', 'is_conference', 'is_neutral', 'is_postseason', 'home_elo', 'away_elo', 'home_elo_standardized', 'away_elo_standardized', 'home_form', 'away_form', 'home_avg_points', 'away_avg_points', 'home_avg_points_allowed', 'away_avg_points_allowed', 'home_win_pct', 'away_win_pct', 'home_home_win_pct', 'away_away_win_pct', 'home_avg_margin', 'away_avg_margin', 'home_games_played', 'away_games_played', 'home_score', 'away_score']

Splitting data...
Training on games from