In [3]:
season_ids = pd.read_csv(SEASONS_FILE)
print(season_ids.head(10))


   league_id     league_name  season_id season_name  is_current
0          8  Premier League          2   2010/2011         NaN
1          8  Premier League          3   2013/2014         NaN
2          8  Premier League          6   2008/2009         NaN
3          8  Premier League          7   2012/2013         NaN
4          8  Premier League          8   2006/2007         NaN
5          8  Premier League          9   2011/2012         NaN
6          8  Premier League         10   2015/2016         NaN
7          8  Premier League         11   2009/2010         NaN
8          8  Premier League         12   2014/2015         NaN
9          8  Premier League         13   2016/2017         NaN


In [7]:
import sys
sys.path.append("/Users/sebastianvinther/Desktop/Sportsmonks/sportmonks_api")

# Test import
import sportmonks_calls
print("✅ Import succeeded!")

✅ Import succeeded!


In [5]:
# === SportMonks Teams Fetcher ===
import requests
import pandas as pd
import time
from tqdm.notebook import tqdm

# === CONFIG ===
API_TOKEN = "oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd"
OUTPUT_FILE = "/Users/sebastianvinther/Desktop/Sportsmonks/all_teams.csv"
BASE_URL = "https://api.sportmonks.com/v3/football/teams"
INCLUDE = "country;venue"
MAX_CALLS_PER_HOUR = 53000
MIN_SECONDS_BETWEEN_CALLS = 3600 / MAX_CALLS_PER_HOUR
PER_PAGE = 100  # Request maximum items per page

# === INITIAL SETUP ===
headers = {"Accept": "application/json"}
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10)
session.mount("https://", adapter)

all_teams = []
api_calls = 0
start_time = time.time()

# === FUNCTION TO FLATTEN NESTED DICTIONARIES ===
def flatten_dict(data, prefix=""):
    flat = {}
    for k, v in data.items():
        key = f"{prefix}_{k}" if prefix else k
        if isinstance(v, dict):
            flat.update(flatten_dict(v, key))
        else:
            flat[key] = v
    return flat

# === FUNCTION TO MAKE API CALL WITH RATE LIMITING ===
def make_api_call(url, params):
    global api_calls, start_time
    
    # Rate-limiting delay
    elapsed = time.time() - start_time
    ideal_elapsed = api_calls * MIN_SECONDS_BETWEEN_CALLS
    if elapsed < ideal_elapsed:
        time.sleep(ideal_elapsed - elapsed)
    
    try:
        response = session.get(url, headers=headers, params=params, timeout=15)
        api_calls += 1
        
        if response.status_code != 200:
            print(f"❌ Error {response.status_code}: {response.text}")
            return None
        
        return response.json()
    except Exception as e:
        print(f"❌ Exception: {e}")
        return None

# === FETCH ALL TEAMS ===
print("🚀 Starting to fetch all teams...")

# For first request, get page count
params = {
    "api_token": API_TOKEN, 
    "include": INCLUDE,
    "per_page": PER_PAGE,
    "page": 1
}

initial_response = make_api_call(BASE_URL, params)
if not initial_response:
    print("❌ Failed to get initial response")
else:
    # Extract pagination information
    pagination = initial_response.get("meta", {}).get("pagination", {})
    total_pages = pagination.get("total_pages", 1)
    total_count = pagination.get("total", 0)
    
    print(f"📊 Found {total_count} teams across {total_pages} pages")
    
    # Process first page data
    teams = initial_response.get("data", [])
    for team in teams:
        all_teams.append(flatten_dict(team))
    
    # Process remaining pages
    if total_pages > 1:
        for page in tqdm(range(2, total_pages + 1), desc="Fetching pages"):
            params["page"] = page
            response = make_api_call(BASE_URL, params)
            
            if not response:
                print(f"❌ Failed to get page {page}")
                continue
                
            teams = response.get("data", [])
            for team in teams:
                all_teams.append(flatten_dict(team))
            
            # Optional: Show progress
            if page % 5 == 0 or page == total_pages:
                print(f"📦 Processed {len(all_teams)} teams so far...")

# Save all teams
if all_teams:
    df = pd.DataFrame(all_teams)
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✅ Saved {len(df)} teams to {OUTPUT_FILE}")
    print(f"📋 DataFrame shape: {df.shape}")
    print(f"📊 Column preview: {', '.join(df.columns[:5])}...")
else:
    print("❌ No teams were fetched")

🚀 Starting to fetch all teams...
📊 Found 0 teams across 1 pages
✅ Saved 25 teams to /Users/sebastianvinther/Desktop/Sportsmonks/all_teams.csv
📋 DataFrame shape: (25, 34)
📊 Column preview: id, sport_id, country_id, venue_id, gender...


In [10]:
from sportmonks_api.sportmonks_calls import Teams


In [6]:
API_TOKEN = "oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd"

In [8]:
import requests

API_TOKEN = "oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd"

# Correct way to make the request
response = requests.get(
    "https://api.sportmonks.com/v3/football/teams",
    params={"api_token": API_TOKEN}
)
teams = response.json()

In [9]:
teams

{'data': [{'id': 1,
   'sport_id': 1,
   'country_id': 462,
   'venue_id': 214,
   'gender': 'male',
   'name': 'West Ham United',
   'short_code': 'WHU',
   'image_path': 'https://cdn.sportmonks.com/images/soccer/teams/1/1.png',
   'founded': 1895,
   'type': 'domestic',
   'placeholder': False,
   'last_played_at': '2025-04-26 14:00:00'},
  {'id': 2,
   'sport_id': 1,
   'country_id': 462,
   'venue_id': 8,
   'gender': 'male',
   'name': 'Blackburn Rovers',
   'short_code': 'BBR',
   'image_path': 'https://cdn.sportmonks.com/images/soccer/teams/2/2.png',
   'founded': 1875,
   'type': 'domestic',
   'placeholder': False,
   'last_played_at': '2025-04-26 14:00:00'},
  {'id': 3,
   'sport_id': 1,
   'country_id': 462,
   'venue_id': 212,
   'gender': 'male',
   'name': 'Sunderland',
   'short_code': 'SUN',
   'image_path': 'https://cdn.sportmonks.com/images/soccer/teams/3/3.png',
   'founded': 1879,
   'type': 'domestic',
   'placeholder': False,
   'last_played_at': '2025-04-26 14:00

In [10]:
import requests

API_TOKEN = "oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd"

# Make the request
response = requests.get(
    "https://api.sportmonks.com/v3/football/teams",
    params={"api_token": API_TOKEN}
)
result = response.json()

# Extract the total count from pagination metadata
total_teams = result.get("meta", {}).get("pagination", {}).get("total", 0)

# Show the total number of teams
print(f"Total teams available: {total_teams}")

# Show how many teams were returned in this response
teams_in_response = len(result.get("data", []))
print(f"Teams in current response: {teams_in_response}")

Total teams available: 0
Teams in current response: 25


In [13]:
# === SportMonks Teams Fetcher with Full Data (CSV Output) - Improved Pagination ===
import requests
import pandas as pd
import time
import os
from datetime import datetime
from tqdm.notebook import tqdm

# === CONFIG ===
API_TOKEN = "oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd"
BASE_URL = "https://api.sportmonks.com/v3/football/teams"
INCLUDE = "players;sport;sidelined;sidelinedHistory;coaches;rivals;statistics"
OUTPUT_DIR = "/Users/sebastianvinther/Desktop/Sportsmonks"
MAX_CALLS_PER_HOUR = 53000
MIN_SECONDS_BETWEEN_CALLS = 3600 / MAX_CALLS_PER_HOUR
PER_PAGE = 100  # Increased to get more teams per request

# Create output directory if it doesn't exist
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Generate timestamp for files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
TEAMS_FULL_FILE = f"{OUTPUT_DIR}/teams_full_{timestamp}.csv"
TEAMS_SUMMARY_FILE = f"{OUTPUT_DIR}/teams_summary_{timestamp}.csv"
PLAYERS_FILE = f"{OUTPUT_DIR}/team_players_{timestamp}.csv"
COACHES_FILE = f"{OUTPUT_DIR}/team_coaches_{timestamp}.csv"
SIDELINED_FILE = f"{OUTPUT_DIR}/team_sidelined_{timestamp}.csv"
STATS_FILE = f"{OUTPUT_DIR}/team_statistics_{timestamp}.csv"

# === INITIAL SETUP ===
headers = {"Accept": "application/json"}
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10)
session.mount("https://", adapter)

# Track API calls and timing
api_calls = 0
start_time = time.time()

# === FUNCTION TO MAKE API CALL WITH RATE LIMITING ===
def make_api_call(url, params):
    global api_calls, start_time
    
    # Rate-limiting delay
    elapsed = time.time() - start_time
    ideal_elapsed = api_calls * MIN_SECONDS_BETWEEN_CALLS
    if elapsed < ideal_elapsed:
        time.sleep(ideal_elapsed - elapsed)
    
    try:
        response = session.get(url, headers=headers, params=params, timeout=30)
        api_calls += 1
        
        if response.status_code != 200:
            print(f"❌ Error {response.status_code}: {response.text}")
            return None
        
        return response.json()
    except Exception as e:
        print(f"❌ Exception: {e}")
        return None

# === FUNCTIONS TO FLATTEN NESTED DATA ===
def flatten_team(team):
    """Extract and flatten main team data"""
    flat_team = {}
    
    # Copy all direct properties
    for key, value in team.items():
        # Skip nested arrays that we'll process separately
        if key in ['players', 'coaches', 'sidelined', 'sidelined_history', 'rivals', 'statistics']:
            continue
            
        # Handle country object
        if key == 'country' and isinstance(value, dict):
            for country_key, country_val in value.items():
                flat_team[f'country_{country_key}'] = country_val
        # Handle sport object
        elif key == 'sport' and isinstance(value, dict):
            for sport_key, sport_val in value.items():
                flat_team[f'sport_{sport_key}'] = sport_val
        # Handle venue object
        elif key == 'venue' and isinstance(value, dict):
            for venue_key, venue_val in value.items():
                flat_team[f'venue_{venue_key}'] = venue_val
        # Regular properties
        else:
            flat_team[key] = value
            
    # Add counts for related entities
    flat_team['players_count'] = len(team.get('players', []))
    flat_team['coaches_count'] = len(team.get('coaches', []))
    flat_team['sidelined_count'] = len(team.get('sidelined', []))
    flat_team['sidelined_history_count'] = len(team.get('sidelined_history', []))
    flat_team['rivals_count'] = len(team.get('rivals', []))
    flat_team['statistics_count'] = len(team.get('statistics', []))
    
    return flat_team

def process_players(team):
    """Extract and flatten players data"""
    players = []
    team_id = team.get('id')
    team_name = team.get('name')
    
    for player in team.get('players', []):
        flat_player = {'team_id': team_id, 'team_name': team_name}
        
        # Add all player properties
        for key, value in player.items():
            if isinstance(value, dict):
                for sub_key, sub_val in value.items():
                    flat_player[f'{key}_{sub_key}'] = sub_val
            else:
                flat_player[key] = value
                
        players.append(flat_player)
        
    return players

def process_coaches(team):
    """Extract and flatten coaches data"""
    coaches = []
    team_id = team.get('id')
    team_name = team.get('name')
    
    for coach in team.get('coaches', []):
        flat_coach = {'team_id': team_id, 'team_name': team_name}
        
        # Add all coach properties
        for key, value in coach.items():
            if isinstance(value, dict):
                for sub_key, sub_val in value.items():
                    flat_coach[f'{key}_{sub_key}'] = sub_val
            else:
                flat_coach[key] = value
                
        coaches.append(flat_coach)
        
    return coaches

def process_sidelined(team):
    """Extract and flatten sidelined data"""
    sidelined_list = []
    team_id = team.get('id')
    team_name = team.get('name')
    
    # Current sidelined
    for item in team.get('sidelined', []):
        flat_item = {'team_id': team_id, 'team_name': team_name, 'type': 'current'}
        
        # Add all sidelined properties
        for key, value in item.items():
            if isinstance(value, dict):
                for sub_key, sub_val in value.items():
                    flat_item[f'{key}_{sub_key}'] = sub_val
            else:
                flat_item[key] = value
                
        sidelined_list.append(flat_item)
    
    # Sidelined history
    for item in team.get('sidelined_history', []):
        flat_item = {'team_id': team_id, 'team_name': team_name, 'type': 'history'}
        
        # Add all sidelined properties
        for key, value in item.items():
            if isinstance(value, dict):
                for sub_key, sub_val in value.items():
                    flat_item[f'{key}_{sub_key}'] = sub_val
            else:
                flat_item[key] = value
                
        sidelined_list.append(flat_item)
        
    return sidelined_list

def process_statistics(team):
    """Extract and flatten statistics data"""
    stats_list = []
    team_id = team.get('id')
    team_name = team.get('name')
    
    for stat in team.get('statistics', []):
        flat_stat = {'team_id': team_id, 'team_name': team_name}
        
        # Add all statistics properties
        for key, value in stat.items():
            if isinstance(value, dict):
                for sub_key, sub_val in value.items():
                    flat_stat[f'{key}_{sub_key}'] = sub_val
            else:
                flat_stat[key] = value
                
        stats_list.append(flat_stat)
        
    return stats_list

# === DATA STORAGE ===
all_teams = []
all_players = []
all_coaches = []
all_sidelined = []
all_statistics = []

# === SAVE DATA FUNCTION ===
def save_progress(force_save=False):
    """Save collected data to CSV files"""
    # Save teams data
    if all_teams:
        df_teams = pd.DataFrame(all_teams)
        if not os.path.exists(TEAMS_FULL_FILE):
            df_teams.to_csv(TEAMS_FULL_FILE, index=False)
        else:
            df_teams.to_csv(TEAMS_FULL_FILE, mode='a', header=False, index=False)
            
    # Save players data
    if all_players:
        df_players = pd.DataFrame(all_players)
        if not os.path.exists(PLAYERS_FILE):
            df_players.to_csv(PLAYERS_FILE, index=False)
        else:
            df_players.to_csv(PLAYERS_FILE, mode='a', header=False, index=False)
            
    # Save coaches data
    if all_coaches:
        df_coaches = pd.DataFrame(all_coaches)
        if not os.path.exists(COACHES_FILE):
            df_coaches.to_csv(COACHES_FILE, index=False)
        else:
            df_coaches.to_csv(COACHES_FILE, mode='a', header=False, index=False)
    
    # Save sidelined data
    if all_sidelined:
        df_sidelined = pd.DataFrame(all_sidelined)
        if not os.path.exists(SIDELINED_FILE):
            df_sidelined.to_csv(SIDELINED_FILE, index=False)
        else:
            df_sidelined.to_csv(SIDELINED_FILE, mode='a', header=False, index=False)
            
    # Save statistics data
    if all_statistics:
        df_stats = pd.DataFrame(all_statistics)
        if not os.path.exists(STATS_FILE):
            df_stats.to_csv(STATS_FILE, index=False)
        else:
            df_stats.to_csv(STATS_FILE, mode='a', header=False, index=False)
    
    print(f"💾 Saved data: {len(all_teams)} teams, {len(all_players)} players, {len(all_coaches)} coaches, " +
          f"{len(all_sidelined)} sidelined, {len(all_statistics)} statistics")
    
    # Clear memory after saving
    all_teams.clear()
    all_players.clear()
    all_coaches.clear()
    all_sidelined.clear()
    all_statistics.clear()

# === FETCH ALL TEAMS WITH INCLUDES - IMPROVED PAGINATION STRATEGY ===
print("🚀 Starting to fetch all teams with detailed includes...")

# Initialize tracking variables
page = 1
more_data = True
total_teams_fetched = 0
save_counter = 0

# Loop until we get no more data
while more_data:
    # Prepare parameters for this page
    params = {
        "api_token": API_TOKEN, 
        "include": INCLUDE,
        "per_page": PER_PAGE,
        "page": page
    }
    
    print(f"📄 Fetching page {page}...")
    response = make_api_call(BASE_URL, params)
    
    if not response:
        print(f"❌ Failed to get page {page}, stopping.")
        break
    
    # Get teams from this page
    teams = response.get("data", [])
    teams_count = len(teams)
    
    if teams_count == 0:
        print("🛑 No more teams returned, finished fetching.")
        more_data = False
        break
    
    print(f"✅ Page {page}: Found {teams_count} teams")
    total_teams_fetched += teams_count
    
    # Process teams data
    for team in teams:
        # Process main team data
        all_teams.append(flatten_team(team))
        
        # Process related entities
        all_players.extend(process_players(team))
        all_coaches.extend(process_coaches(team))
        all_sidelined.extend(process_sidelined(team))
        all_statistics.extend(process_statistics(team))
    
    # Check if we should save progress (every 5 pages or 500 teams)
    save_counter += teams_count
    if save_counter >= 500 or page % 5 == 0:
        save_progress()
        save_counter = 0
    
    # Check pagination info if available
    pagination = response.get("meta", {}).get("pagination", {})
    current_page = pagination.get("current_page", page)
    next_page = pagination.get("next_page")
    
    # Determine if we should continue
    if next_page is None:
        # Check if we've reached the total pages or the API doesn't give pagination info
        total_pages = pagination.get("total_pages")
        if total_pages and current_page >= total_pages:
            more_data = False
            print(f"🏁 Reached final page {current_page} of {total_pages}")
        else:
            # If no pagination info but we got teams, try the next page
            page += 1
    else:
        # Use the next_page value if provided
        page = next_page

# Final save (in case we have leftover data)
save_progress(force_save=True)

# Create summary from full data
print("📊 Creating team summary...")
df_summary = pd.read_csv(TEAMS_FULL_FILE)
df_summary.to_csv(TEAMS_SUMMARY_FILE, index=False)

# Print final stats
elapsed_time = time.time() - start_time
print(f"🏁 DONE! Fetched {total_teams_fetched} teams with full details")
print(f"📞 API calls made: {api_calls}")
print(f"⏱️ Total time elapsed: {elapsed_time:.1f} seconds")
print(f"📁 Files saved:")
print(f"- Teams: {TEAMS_FULL_FILE}")
print(f"- Summary: {TEAMS_SUMMARY_FILE}")
print(f"- Players: {PLAYERS_FILE}")
print(f"- Coaches: {COACHES_FILE}")
print(f"- Sidelined: {SIDELINED_FILE}")
print(f"- Statistics: {STATS_FILE}")

# Display sample statistics
try:
    df_teams = pd.read_csv(TEAMS_FULL_FILE)
    df_players = pd.read_csv(PLAYERS_FILE)
    
    print("\n📊 Summary Statistics:")
    print(f"- Total teams collected: {len(df_teams)}")
    print(f"- Total players collected: {len(df_players)}")
    print(f"- Average players per team: {df_teams['players_count'].mean():.1f}")
    print(f"- Teams with most players: {df_teams.nlargest(3, 'players_count')[['name', 'players_count']].to_string(index=False)}")
except Exception as e:
    print(f"⚠️ Couldn't generate final statistics: {e}")

🚀 Starting to fetch all teams with detailed includes...
📄 Fetching page 1...
✅ Page 1: Found 25 teams
📄 Fetching page 2...
✅ Page 2: Found 25 teams
📄 Fetching page 3...
✅ Page 3: Found 25 teams
📄 Fetching page 4...
✅ Page 4: Found 25 teams
📄 Fetching page 5...
✅ Page 5: Found 25 teams
💾 Saved data: 125 teams, 3409 players, 508 coaches, 318 sidelined, 9469 statistics
📄 Fetching page 6...
✅ Page 6: Found 25 teams
📄 Fetching page 7...
✅ Page 7: Found 25 teams
📄 Fetching page 8...
✅ Page 8: Found 25 teams
📄 Fetching page 9...
✅ Page 9: Found 25 teams
📄 Fetching page 10...
✅ Page 10: Found 25 teams
💾 Saved data: 125 teams, 3445 players, 439 coaches, 217 sidelined, 8497 statistics
📄 Fetching page 11...
✅ Page 11: Found 25 teams
📄 Fetching page 12...
✅ Page 12: Found 25 teams
📄 Fetching page 13...
✅ Page 13: Found 25 teams
📄 Fetching page 14...
✅ Page 14: Found 25 teams
📄 Fetching page 15...
✅ Page 15: Found 25 teams
💾 Saved data: 125 teams, 3229 players, 422 coaches, 297 sidelined, 8054 stat

In [15]:
# ✅ Cell 1: Imports and Setup
import pandas as pd
import os
from tqdm import tqdm
from sportmonks_api.sportmonks_calls import Teams

# === CONFIG ===
API_TOKEN = "oYeoAVFUTQpu7MfoFqbvyiYfgRRkuBWW0p2atkZnySe4X3xrHkjgGhOvI0pd"
SEASONS_FILE = "/Users/sebastianvinther/Desktop/Sportsmonks/seasons.csv"
OUTPUT_FILE = "/Users/sebastianvinther/Desktop/Sportsmonks/teams_by_season.csv"
SAVE_EVERY = 20

In [16]:
# ✅ Cell 2: Load season_ids
season_df = pd.read_csv(SEASONS_FILE)
if 'season_id' not in season_df.columns:
    raise ValueError("❌ 'season_id' column not found in seasons.csv")
season_ids = season_df['season_id'].dropna().astype(int).unique().tolist()

if os.path.exists(OUTPUT_FILE):
    existing = pd.read_csv(OUTPUT_FILE)
    done_ids = existing['season_id'].unique().tolist()
    remaining_ids = [sid for sid in season_ids if sid not in done_ids]
    print(f"🔁 Resuming: {len(done_ids)} done, {len(remaining_ids)} remaining.")
else:
    remaining_ids = season_ids
    print(f"🆕 Starting fresh with {len(remaining_ids)} seasons.")

🆕 Starting fresh with 531 seasons.
