In [3]:
# FILE: data_collector.py

import os
import time
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Any, Set

from sportsmonks_api_client import SportMonksAPI
from sportsmonks_database import SportMonksDB

# Logging setup: INFO-and-above records go to a log file whose relative name
# is resolved against the current working directory.
logging.basicConfig(
    filename='sportsmonks_collector.log',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by every method below
logger = logging.getLogger(__name__)

class SportMonksCollector:
    """Class to collect data from SportMonks API and save to database"""
    
    def __init__(self, api_token: str, db_path: str = "sportmonks_football.db"):
        """
        Build the API client and the database handle, then set up the tables.

        Args:
            api_token: Token passed to the SportMonksAPI client
            db_path: Path passed to SportMonksDB
        """
        self.api = SportMonksAPI(api_token)
        self.db = SportMonksDB(db_path)

        # Run both table-creation steps inside one database context
        with self.db:
            for setup_name in ("create_tables", "create_metadata_table"):
                getattr(self.db, setup_name)()
        
    def collect_reference_data(self):
        """
        Fetch the core reference entities and store each set in the database.

        Continents, countries, regions and cities are handled in that order.
        Each set is fetched through ``fetch_core_data`` and written inside its
        own database context.
        """
        logger.info("Collecting reference data...")

        # The entity name doubles as the endpoint name, the log label and the
        # suffix of the matching ``insert_or_update_*`` database method.
        for entity in ("continents", "countries", "regions", "cities"):
            logger.info(f"Collecting {entity}...")
            records = self.api.fetch_core_data(entity)
            with self.db:
                store = getattr(self.db, f"insert_or_update_{entity}")
                store(records)
            logger.info(f"Collected {len(records)} {entity}")

        logger.info("Reference data collection completed")
        
    def collect_states_and_venues(self):
        """Fetch match states and venues from the football API and store them."""
        # (log label, endpoint) pairs; the endpoint name is also the suffix of
        # the ``insert_or_update_*`` database method that stores the rows.
        for label, endpoint in (("match states", "states"), ("venues", "venues")):
            logger.info(f"Collecting {label}...")
            rows = self.api.fetch_football_data(endpoint)
            with self.db:
                getattr(self.db, f"insert_or_update_{endpoint}")(rows)

        logger.info("States and venues collection completed")
    
    def collect_leagues(self, country_ids: List[int] = None) -> List[Dict]:
        """
        Collect leagues data, optionally filtered by country IDs

        The league list is fetched once and narrowed locally. A bracket-style
        query parameter such as ``filter[country_id]`` is rejected by the v3
        API with HTTP 400 / code 5010 ("Filters should be passed as a
        string"), so no server-side filter is sent.

        Args:
            country_ids: Optional list of country IDs to filter leagues.
                ``None`` or an empty list collects every league.

        Returns:
            List of collected leagues (the same records that were saved)
        """
        logger.info("Collecting leagues...")

        leagues = self.api.fetch_football_data("leagues")

        if country_ids:
            wanted_countries = set(country_ids)
            # NOTE(review): assumes every league record carries a "country_id"
            # field - confirm against the payload returned by the API client.
            leagues = [
                league for league in leagues
                if league.get("country_id") in wanted_countries
            ]

        # Save to database
        with self.db:
            self.db.insert_or_update_leagues(leagues)

        logger.info(f"Collected {len(leagues)} leagues")
        logger.info("Leagues collection completed")

        return leagues
    
    def collect_seasons(self, league_ids: List[int] = None, years_back: int = 5) -> List[Dict]:
        """
        Collect seasons data for specified leagues

        For every league the seasons are fetched from the API, narrowed to the
        recent ones by ``_select_recent_seasons`` and saved to the database.

        Args:
            league_ids: List of league IDs to collect seasons for. ``None``
                means every league currently stored in the database.
            years_back: Number of years back to collect seasons

        Returns:
            List of collected seasons
        """
        logger.info("Collecting seasons for leagues...")

        # A single query provides both the default ID list and the names used
        # in the log lines, instead of re-reading the table for every league.
        with self.db:
            league_names = {league["id"]: league["name"] for league in self.db.get_all_leagues()}
        if league_ids is None:
            league_ids = list(league_names)

        # Calculate cutoff date for filtering seasons
        cutoff_date = (datetime.now() - timedelta(days=365 * years_back)).strftime("%Y-%m-%d")

        all_seasons = []
        for league_id in league_ids:
            league_name = league_names.get(league_id, f"ID: {league_id}")

            logger.info(f"Collecting seasons for league {league_id} ({league_name})...")
            seasons = self.api.fetch_seasons_by_league(league_id)

            recent_seasons = self._select_recent_seasons(seasons, cutoff_date, years_back)

            # Save to database
            with self.db:
                self.db.insert_or_update_seasons(recent_seasons)

            all_seasons.extend(recent_seasons)
            logger.info(f"Collected {len(recent_seasons)} seasons for league {league_id}")

        logger.info(f"Seasons collection completed. Total: {len(all_seasons)} seasons")
        return all_seasons

    @staticmethod
    def _select_recent_seasons(seasons: List[Dict], cutoff_date: str, years_back: int) -> List[Dict]:
        """
        Keep seasons starting on or after the cutoff, topped up with undated ones.

        Seasons with a start date are kept only when that date is not older
        than ``cutoff_date``. Seasons without a usable start date fill the
        remaining slots (up to ``years_back`` in total), highest ID first.

        Args:
            seasons: Season records as returned by the API client
            cutoff_date: Oldest accepted start date, formatted ``YYYY-MM-DD``
            years_back: Upper bound used when topping up with undated seasons

        Returns:
            Dated seasons first (input order), then the chosen undated ones
        """
        dated = []
        undated = []
        for season in seasons:
            # "start_date" is the key this collector has always read.
            # NOTE(review): SportMonks v3 season payloads appear to expose the
            # start as "starting_at" - confirm which key the client returns.
            start = season.get("start_date") or season.get("starting_at")
            if not start:
                undated.append(season)
            elif str(start) >= cutoff_date:
                # ISO-formatted dates compare correctly as plain strings
                dated.append(season)

        free_slots = max(years_back - len(dated), 0)
        undated.sort(key=lambda season: season.get("id") or 0, reverse=True)
        return dated + undated[:free_slots]
    
    def collect_fixtures_for_league(self, league_id: int, recent_seasons_only: bool = True,
                                    *, max_seasons: int = 5) -> int:
        """
        Collect fixtures for a league

        Seasons are read from the database, so ``collect_seasons`` must have
        stored them first; fixtures are then fetched season by season.

        Args:
            league_id: League ID to collect fixtures for
            recent_seasons_only: If True, collect only for the most recent
                ``max_seasons`` seasons (ordered by start date)
            max_seasons: How many seasons "recent" means; defaults to 5

        Returns:
            Number of fixtures collected
        """
        logger.info(f"Collecting fixtures for league {league_id}…")

        # Get seasons for this league
        with self.db:
            seasons = self.db.get_seasons_by_league(league_id)

        if recent_seasons_only:
            # Newest first. A missing or None start date sorts as "" (oldest)
            # so that None and str values are never compared with each other.
            seasons = sorted(
                seasons,
                key=lambda season: season.get("start_date") or "",
                reverse=True,
            )[:max_seasons]

        fixture_count = 0

        # Collect fixtures for each season
        for season in seasons:
            season_id = season["id"]
            logger.info(f"  → Season {season_id}: fetching fixtures…")

            fixtures = self.api.fetch_fixtures_by_season(season_id)

            # Save basic fixture info to database
            with self.db:
                self.db.insert_or_update_fixtures(fixtures)

            fixture_count += len(fixtures)

            # Sleep briefly to avoid hitting rate limits
            time.sleep(0.1)

        logger.info("Fixtures collection completed")
        return fixture_count
    
    def collect_fixtures_for_all_leagues(self, league_ids: List[int] = None) -> int:
        """
        Collect fixtures for all leagues or specified league IDs

        Collection is best effort: a league that raises is logged with its
        traceback and skipped, and the remaining leagues are still processed.

        Args:
            league_ids: Optional list of league IDs to collect fixtures for.
                ``None`` means every league currently stored in the database.

        Returns:
            Total number of fixtures collected (failed leagues contribute 0)
        """
        logger.info("Collecting fixtures for all leagues...")

        # A single query provides both the default ID list and the names used
        # in the log lines, instead of re-reading the table for every league.
        with self.db:
            league_names = {league["id"]: league["name"] for league in self.db.get_all_leagues()}
        if league_ids is None:
            league_ids = list(league_names)

        total_fixtures = 0
        for league_id in league_ids:
            league_name = league_names.get(league_id, f"ID: {league_id}")

            logger.info(f"Processing league: {league_name} (ID: {league_id})")

            try:
                fixtures_count = self.collect_fixtures_for_league(league_id)
            except Exception as e:
                # logger.exception keeps the traceback alongside the message
                logger.exception(f"Error collecting fixtures for league {league_id}: {e}")
            else:
                # Report success only when the league actually succeeded
                total_fixtures += fixtures_count
                logger.info(f"Fixtures collected for league: {league_name} (ID: {league_id})")

            # Sleep briefly to avoid hitting rate limits
            time.sleep(0.5)

        logger.info("All fixtures collection completed.")
        return total_fixtures
    
    def collect_fixture_details(self, fixture_id: int) -> Dict:
        """
        Collect detailed information for a fixture
        
        Args:
            fixture_id: Fixture ID to collect details for
            
        Returns:
            Fixture details
        """
        fixture_data = self.api.fetch_fixture_details(fixture_id)
        
        # Save fixture details to database
        if "data" in fixture_data:
            with self.db:
                self.db.insert_or_update_fixture(fixture_data["data"])
            
            return fixture_data["data"]
        
        return None
    
    def collect_fixture_details_batch(self, fixture_ids: List[int]) -> List[Dict]:
        """
        Collect detailed information for a batch of fixtures
        
        Args:
            fixture_ids: List of fixture IDs to collect details for
            
        Returns:
            List of fixture details
        """
        # Process in batches of 50 (SportMonks API limit)
        batch_size = 50
        all_details = []
        
        for i in range(0, len(fixture_ids), batch_size):
            batch = fixture_ids[i:i + batch_size]
            fixtures_data = self.api.fetch_fixture_details_batch(batch)
            
            # Save fixture details to database
            with self.db:
                for fixture in fixtures_data:
                    self.db.insert_or_update_fixture(fixture)
            
            all_details.extend(fixtures_data)
            
            # Sleep briefly to avoid hitting rate limits
            time.sleep(0.2)
        
        return all_details
    
    def collect_all_fixture_details(self, league_id: int = None, season_id: int = None):
        """
        Fetch and store details for fixtures already present in the database.

        Args:
            league_id: When truthy, restrict to fixtures of this league
            season_id: When truthy, restrict to fixtures of this season
        """
        # Choose the query text and its bound parameters first; ``bound`` is
        # either empty or holds the single parameter tuple for ``execute``.
        if league_id and season_id:
            query = """
                    SELECT id FROM fixtures 
                    WHERE league_id = ? AND season_id = ?
                    """
            bound = ((league_id, season_id),)
        elif league_id:
            query = """
                    SELECT id FROM fixtures 
                    WHERE league_id = ?
                    """
            bound = ((league_id,),)
        elif season_id:
            query = """
                    SELECT id FROM fixtures 
                    WHERE season_id = ?
                    """
            bound = ((season_id,),)
        else:
            query = "SELECT id FROM fixtures"
            bound = ()

        with self.db:
            rows = self.db.execute(query, *bound).fetchall()

        fixture_ids = [row["id"] for row in rows]
        total = len(fixture_ids)

        # Hand the IDs over in slices of 50 so progress can be logged
        step = 50
        for offset in range(0, total, step):
            chunk = fixture_ids[offset:offset + step]
            self.collect_fixture_details_batch(chunk)
            logger.info(f"Processed {offset + len(chunk)}/{total} fixtures")
    
    def collect_standings(self, season_ids: List[int]) -> Dict[int, List[Dict]]:
        """
        Collect standings for specified seasons
        
        Args:
            season_ids: List of season IDs to collect standings for
            
        Returns:
            Dictionary mapping season IDs to standings
        """
        standings_by_season = {}
        
        for season_id in season_ids:
            try:
                standings = self.api.fetch_standings_by_season(season_id)
                
                # Save to database
                with self.db:
                    self.db.insert_or_update_standings(season_id, standings)
                
                standings_by_season[season_id] = standings
                
            except Exception as e:
                logger.error(f"Error collecting standings for season {season_id}: {e}")
            
            # Sleep briefly to avoid hitting rate limits
            time.sleep(0.2)
        
        return standings_by_season
    
    def collect_top_performers(self, season_ids: List[int]) -> Dict[int, Dict[str, List[Dict]]]:
        """
        Collect top performers for specified seasons
        
        Args:
            season_ids: List of season IDs to collect top performers for
            
        Returns:
            Dictionary mapping season IDs to top performers by category
        """
        performers_by_season = {}
        
        for season_id in season_ids:
            try:
                # Get top scorers
                top_scorers = self.api.fetch_topscorers_by_season(season_id)
                
                # Save to database
                with self.db:
                    self.db.insert_or_update_top_performers(season_id, "goals", top_scorers)
                
                performers_by_season[season_id] = {"goals": top_scorers}
                
            except Exception as e:
                logger.error(f"Error collecting top performers for season {season_id}: {e}")
            
            # Sleep briefly to avoid hitting rate limits
            time.sleep(0.2)
        
        return performers_by_season
    
    def collect_all_data(self, league_ids: List[int] = None, years_back: int = 5):
        """
        Collect all data following the recommended order:
        1. Reference data
        2. Match states and venues
        3. Leagues
        4. Seasons
        5. Fixtures
        6. Fixture details
        7. Standings and top performers

        Args:
            league_ids: Optional list of league IDs to focus on. When None,
                a built-in list of five major leagues is used.
            years_back: Number of years back to collect seasons
        """
        # Step 1: Collect reference data
        self.collect_reference_data()

        # Step 2: Collect states and venues
        self.collect_states_and_venues()

        # Step 3: Collect leagues. The return value is not needed here:
        # collect_seasons reads the leagues back from the database.
        self.collect_leagues()

        # If league_ids not specified, use major leagues
        if league_ids is None:
            league_ids = [
                8,    # Premier League
                564,  # La Liga
                82,   # Bundesliga
                384,  # Serie A
                301,  # Ligue 1
            ]

        # Step 4: Collect seasons for specified leagues
        self.collect_seasons(league_ids, years_back)

        # Step 5: Collect fixtures for these leagues
        self.collect_fixtures_for_all_leagues(league_ids)

        # Step 6: Collect fixture details for every stored fixture of the
        # specified leagues
        fixture_ids = []
        with self.db:
            for league_id in league_ids:
                rows = self.db.execute(
                    "SELECT id FROM fixtures WHERE league_id = ?",
                    (league_id,)
                ).fetchall()
                fixture_ids.extend(row["id"] for row in rows)

        # collect_fixture_details_batch splits the IDs into API-sized requests
        self.collect_fixture_details_batch(fixture_ids)

        # Step 7: Collect standings and top performers
        season_ids = []
        with self.db:
            for league_id in league_ids:
                # NOTE(review): years_back is passed as a second argument here,
                # unlike other callers - confirm get_seasons_by_league accepts it.
                seasons = self.db.get_seasons_by_league(league_id, years_back)
                season_ids.extend(season["id"] for season in seasons)

        self.collect_standings(season_ids)
        self.collect_top_performers(season_ids)

# Set the API token only after defining the class.
# The token is read from the SPORTMONKS_API_TOKEN environment variable so the
# credential is never stored in the source file.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and rotated.
API_TOKEN = os.environ.get("SPORTMONKS_API_TOKEN", "")
if not API_TOKEN:
    logger.warning("SPORTMONKS_API_TOKEN is not set; API requests are expected to fail")

# Create the collector instance after the class definition
collector = SportMonksCollector(API_TOKEN)

# Example usage
if __name__ == "__main__":
    if not API_TOKEN:
        raise SystemExit("Set the SPORTMONKS_API_TOKEN environment variable before running")

    # Use premier league ID as an example
    premier_league_id = 8

    # Create SQLite connection for checking results
    import sqlite3
    conn = sqlite3.connect("sportmonks_football.db")
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    try:
        # Collect reference data (continents, countries, etc.)
        collector.collect_reference_data()

        # Collect states and venues
        collector.collect_states_and_venues()

        # Collect leagues
        collector.collect_leagues()

        # Collect seasons for Premier League
        collector.collect_seasons([premier_league_id])

        # Let's handle any potential errors with fixtures collection
        try:
            collector.collect_fixtures_for_league(premier_league_id)
            print("Premier League fixtures collected")
        except Exception as e:
            print(f"Error collecting fixtures: {e}")

            # If the error is related to API rate limiting, we can add a retry with delay
            print("Waiting 10 seconds before retry...")
            time.sleep(10)
            try:
                collector.collect_fixtures_for_league(premier_league_id)
                print("Premier League fixtures collected (retry succeeded)")
            except Exception as e2:
                print(f"Retry failed: {e2}")
                print("You may need to check the SportMonksAPI.fetch_fixtures_by_season method")

        # Verify the seasons we have
        try:
            # Use the database context like every collector method does
            with collector.db:
                seasons = collector.db.get_seasons_by_league(premier_league_id)
            print(f"Found {len(seasons)} seasons for Premier League")

            # Print the first 5 seasons returned
            for season in seasons[:5]:
                print(f"Season ID: {season['id']}, Name: {season.get('name', 'N/A')}")

            # Try a direct SQL query to see if fixtures were collected
            try:
                cursor.execute("SELECT COUNT(*) FROM fixtures WHERE league_id = ?", (premier_league_id,))
                fixture_count = cursor.fetchone()[0]
                print(f"Total Premier League fixtures in database: {fixture_count}")
            except sqlite3.OperationalError:
                print("Could not query fixtures table - it may not exist yet")
        except Exception as e:
            print(f"Error verifying seasons: {e}")

    finally:
        # Close the connection
        conn.close()

API request error: 400 Client Error: Bad Request for url: https://api.sportmonks.com/v3/football/seasons?filter%5Bleague_id%5D=8&per_page=1000&page=1&api_token=oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd
Response content: b'    {"message":"Filters should be passed as a string","link":"https:\\/\\/docs.sportmonks.com\\/football\\/api\\/response-codes\\/filtering-and-complexity-exceptions","code":5010}'


HTTPError: 400 Client Error: Bad Request for url: https://api.sportmonks.com/v3/football/seasons?filter%5Bleague_id%5D=8&per_page=1000&page=1&api_token=oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd