In [22]:
import json
import time
import numpy as np
import pandas as pd

from pydantic import BaseModel
from typing import List, Optional

from bs4 import BeautifulSoup
from selenium import webdriver

from supabase import create_client, Client

In [23]:
# Pydantic Models
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float] = None
    blocked_y: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str

In [24]:
# Core scraping function
def scrape_single_match(url, driver):
    """Scrape match events from a single WhoScored URL"""
    print(f"Scraping: {url}")
    
    driver.get(url)
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    if not element:
        print("❌ matchCentreData not found")
        return None
    
    # Extract and parse match data
    match_data_raw = element.text.split("matchCentreData: ")[1].split(',\n')[0]
    matchdict = json.loads(match_data_raw)
    
    # Process events
    match_events = matchdict['events']
    df = pd.DataFrame(match_events)
    
    if df.empty:
        print("⚠️ No events found")
        return None
    
    # Clean and transform data
    df.dropna(subset='playerId', inplace=True)
    df = df.where(pd.notnull(df), None)
    
    # Rename columns
    df = df.rename({
        'eventId': 'event_id',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    }, axis=1)
    
    # Add display names
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    
    # Drop original columns
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    # Handle missing columns
    if 'is_goal' not in df.columns:
        df['is_goal'] = False
    if 'card_type' not in df.columns:
        df['card_type'] = False
    
    # Filter and select columns
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name', 'period_display_name'
    ]]
    
    # Convert data types
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype('int64')
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    # Handle NaN values
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    # Validate with Pydantic
    validated_events = []
    for _, row in df.iterrows():
        try:
            event = MatchEvent(**row.to_dict())
            validated_events.append(event.dict())
        except Exception as e:
            print(f"Validation error: {e}")
    
    validated_df = pd.DataFrame(validated_events)
    print(f'✅ Successfully scraped {len(validated_df)} events')
    return validated_df

In [25]:
def get_eredivisie_match_urls(driver):
    """
    Navigate to Eredivisie fixtures page and extract URLs for completed matches
    """
    # Navigate to Eredivisie fixtures page with CORRECT season and stage IDs
    fixtures_url = "https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026"
    driver.get(fixtures_url)
    
    # Wait for page to load
    time.sleep(5)
    
    # Parse the page content - THIS LINE WAS MISSING
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all match links with CORRECTED lowercase pattern
    match_links = soup.find_all('a', href=lambda x: x and '/live/' in x.lower())
    
    completed_match_urls = []
    
    for link in match_links:
        # Check if match is completed (look for "FT" in the parent div)
        parent_div = link.find_parent('div')
        if parent_div and 'FT' in parent_div.get_text():
            full_url = f"https://www.whoscored.com{link['href']}"
            completed_match_urls.append(full_url)
    
    print(f"Found {len(completed_match_urls)} completed matches")
    return completed_match_urls

In [26]:
def scrape_single_eredivisie_match(match_url, driver):
    """
    Scrape a single Eredivisie match from WhoScored
    """
    print(f"Scraping match: {match_url}")
    
    driver.get(match_url)
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    if not element:
        print("❌ matchCentreData not found")
        return None
    
    # Extract and parse match data
    match_data_raw = element.text.split("matchCentreData: ")[1].split(',\n')[0]
    matchdict = json.loads(match_data_raw)
    
    # Process events
    match_events = matchdict['events']
    df = pd.DataFrame(match_events)
    
    if df.empty:
        print("⚠️ No events found")
        return None
    
    # Clean and transform data
    df.dropna(subset='playerId', inplace=True)
    df = df.where(pd.notnull(df), None)
    
    # Add display names
    df['type_display_name'] = df['type'].apply(lambda x: x.get('displayName', '') if x else '')
    df['outcome_type_display_name'] = df['outcomeType'].apply(lambda x: x.get('displayName', '') if x else '')
    df['period_display_name'] = df['period'].apply(lambda x: x.get('displayName', '') if x else '')
    
    print(f"✅ Successfully scraped {len(df)} events")
    return df

In [27]:
def scrape_all_eredivisie_matches(driver, include_future=False):
    """
    Scrape all Eredivisie matches from the fixtures page
    """
    # Navigate to Eredivisie fixtures page
    fixtures_url = "https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026"
    print(f"Loading fixtures page: {fixtures_url}")
    
    driver.get(fixtures_url)
    time.sleep(5)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all match links - FIXED: Use more specific selector to avoid duplicates
    match_links = soup.find_all('a', {'class': 'Match-module_score__5Ghhj'})
    
    match_urls = set()  # Use set to automatically remove duplicates
    
    for link in match_links:
        # Check match status
        parent_div = link.find_parent('div', class_='Match-module_match__XlKTY')
        if parent_div:
            status_element = parent_div.find('span', class_=['Match-module_FT__2rmH7', 'Match-module_startTime_lineup__H1Krq'])
            
            if status_element:
                status_text = status_element.get_text().strip()
                
                # Include completed matches (FT) and optionally future matches (time format)
                if status_text == 'FT' or (include_future and ':' in status_text):
                    full_url = f"https://www.whoscored.com{link['href']}"
                    match_urls.add(full_url)  # Add to set (removes duplicates)
    
    match_urls = list(match_urls)  # Convert back to list
    print(f"Found {len(match_urls)} unique matches to scrape")
    
    # Scrape each match with better error handling
    all_match_data = []
    
    for i, url in enumerate(match_urls, 1):
        print(f"\nScraping match {i}/{len(match_urls)}")
        
        try:
            match_df = scrape_single_eredivisie_match(url, driver)
            
            if match_df is not None:
                # Add match URL for reference
                match_df['match_url'] = url
                all_match_data.append(match_df)
        
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
            break
        except Exception as e:
            print(f"❌ Error scraping {url}: {str(e)}")
            continue
        
        # Small delay between requests
        time.sleep(2)
    
    if all_match_data:
        # Combine all match data
        combined_df = pd.concat(all_match_data, ignore_index=True)
        print(f"\n✅ Successfully scraped {len(all_match_data)} matches with {len(combined_df)} total events")
        return combined_df
    else:
        print("❌ No match data was scraped")
        return None

In [28]:
# Initialize driver
driver = webdriver.Chrome()

# Scrape a single match
#single_match_url = "https://www.whoscored.com/matches/1903733/live/netherlands-eredivisie-2025-2026-fortuna-sittard-go-ahead-eagles"
#df_single = scrape_single_eredivisie_match(single_match_url, driver)

# Scrape all completed matches
df_all = scrape_all_eredivisie_matches(driver, include_future=False)

# Scrape all matches including future ones
#df_all_with_future = scrape_all_eredivisie_matches(driver, include_future=True)

driver.quit()

Loading fixtures page: https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026
Found 18 unique matches to scrape

Scraping match 1/18
Scraping match: https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam
✅ Successfully scraped 1368 events

Scraping match 2/18
Scraping match: https://www.whoscored.com/matches/1903738/live/netherlands-eredivisie-2025-2026-pec-zwolle-twente
✅ Successfully scraped 1472 events

Scraping match 3/18
Scraping match: https://www.whoscored.com/matches/1903739/live/netherlands-eredivisie-2025-2026-ajax-telstar
✅ Successfully scraped 1556 events

Scraping match 4/18
Scraping match: https://www.whoscored.com/matches/1903735/live/netherlands-eredivisie-2025-2026-feyenoord-nac-breda
✅ Successfully scraped 1429 events

Scraping match 5/18
Scraping match: https://www.whoscored.com/matches/1903737/live/netherlands-eredivisie-2025-2026-psv-eindhoven-spart

In [30]:
#df_single.head()


In [31]:
df_all.head()

Unnamed: 0,id,eventId,minute,second,teamId,x,y,expandedMinute,period,type,...,isGoal,isShot,blockedX,blockedY,cardType,type_display_name,outcome_type_display_name,period_display_name,match_url,isOwnGoal
0,2833207000.0,3,0,0.0,287,50.1,49.7,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
1,2833207000.0,4,0,2.0,287,47.1,53.6,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
2,2833207000.0,5,0,6.0,287,34.9,24.2,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
3,2833207000.0,6,0,9.0,287,56.1,15.8,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
4,2833207000.0,7,0,12.0,287,45.2,57.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,


In [1]:
df_all.columns

NameError: name 'df_all' is not defined

In [32]:
df_all.to_csv('/Users/ricardoheredia/Desktop/automated-football-data-reports/data/all_matches.csv')


In [2]:
import pandas as pd

# Read the CSV file
df_loaded = pd.read_csv('/Users/ricardoheredia/Desktop/automated-football-data-reports/data/all_matches.csv', index_col=0)

# Display the first few rows
df_loaded.head()

Unnamed: 0,id,eventId,minute,second,teamId,x,y,expandedMinute,period,type,...,isGoal,isShot,blockedX,blockedY,cardType,type_display_name,outcome_type_display_name,period_display_name,match_url,isOwnGoal
0,2833207000.0,3,0,0.0,287,50.1,49.7,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
1,2833207000.0,4,0,2.0,287,47.1,53.6,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
2,2833207000.0,5,0,6.0,287,34.9,24.2,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
3,2833207000.0,6,0,9.0,287,56.1,15.8,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,
4,2833207000.0,7,0,12.0,287,45.2,57.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...,


In [3]:
df_loaded.columns

Index(['id', 'eventId', 'minute', 'second', 'teamId', 'x', 'y',
       'expandedMinute', 'period', 'type', 'outcomeType', 'qualifiers',
       'satisfiedEventsTypes', 'isTouch', 'playerId', 'endX', 'endY',
       'relatedEventId', 'relatedPlayerId', 'goalMouthZ', 'goalMouthY',
       'isGoal', 'isShot', 'blockedX', 'blockedY', 'cardType',
       'type_display_name', 'outcome_type_display_name', 'period_display_name',
       'match_url', 'isOwnGoal'],
      dtype='object')

### Improving scraping data extraction 

The goal is to extract the latest match fixtures from the latest match day. 

- Upserting to Supabase the recent scraped matches.
- Only scraping when there are new matches in the latest match day, not updating the old ones and keeping them.


In [20]:
import json
import time
import numpy as np
import pandas as pd

from pydantic import BaseModel
from typing import List, Optional

from bs4 import BeautifulSoup
from selenium import webdriver

#from supabase import create_client, Client

In [19]:
# Pydantic Models
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float] = None
    blocked_y: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str

In [8]:
# Pydantic Models
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float] = None
    blocked_y: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str
    match_url: str  # Nueva columna añadida

In [9]:
# Core scraping function
def scrape_single_match(url, driver):
    """Scrape match events from a single WhoScored URL"""
    print(f"Scraping: {url}")
    
    driver.get(url)
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    if not element:
        print("❌ matchCentreData not found")
        return None
    
    # Extract and parse match data
    match_data_raw = element.text.split("matchCentreData: ")[1].split(',\n')[0]
    matchdict = json.loads(match_data_raw)
    
    # Process events
    match_events = matchdict['events']
    df = pd.DataFrame(match_events)
    
    if df.empty:
        print("⚠️ No events found")
        return None
    
    # Clean and transform data
    df.dropna(subset='playerId', inplace=True)
    df = df.where(pd.notnull(df), None)
    
    # Rename columns
    df = df.rename({
        'eventId': 'event_id',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    }, axis=1)
    
    # Add display names
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    
    # Drop original columns
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    # Handle missing columns
    if 'is_goal' not in df.columns:
        df['is_goal'] = False
    if 'card_type' not in df.columns:
        df['card_type'] = False
    
    # Filter and select columns
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name', 'period_display_name'
    ]]
    
    # Convert data types
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype('int64')
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    # Handle NaN values
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    # Validate with Pydantic
    validated_events = []
    for _, row in df.iterrows():
        try:
            event = MatchEvent(**row.to_dict())
            validated_events.append(event.dict())
        except Exception as e:
            print(f"Validation error: {e}")
    
    validated_df = pd.DataFrame(validated_events)
    print(f'✅ Successfully scraped {len(validated_df)} events')
    return validated_df

In [10]:
def scrape_single_eredivisie_match(match_url, driver):
    """
    Scrape a single Eredivisie match from WhoScored
    """
    print(f"Scraping match: {match_url}")
    
    driver.get(match_url)
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    if not element:
        print("❌ matchCentreData not found")
        return None
    
    # Extract and parse match data
    match_data_raw = element.text.split("matchCentreData: ")[1].split(',\n')[0]
    matchdict = json.loads(match_data_raw)
    
    # Process events
    match_events = matchdict['events']
    df = pd.DataFrame(match_events)
    
    if df.empty:
        print("⚠️ No events found")
        return None
    
    # Clean and transform data
    df.dropna(subset='playerId', inplace=True)
    df = df.where(pd.notnull(df), None)
    
    # Add display names
    df['type_display_name'] = df['type'].apply(lambda x: x.get('displayName', '') if x else '')
    df['outcome_type_display_name'] = df['outcomeType'].apply(lambda x: x.get('displayName', '') if x else '')
    df['period_display_name'] = df['period'].apply(lambda x: x.get('displayName', '') if x else '')
    
    print(f"✅ Successfully scraped {len(df)} events")
    return df

In [11]:
def scrape_all_eredivisie_matches(driver, include_future=False):
    """
    Scrape all Eredivisie matches from the fixtures page
    """
    # Navigate to Eredivisie fixtures page
    fixtures_url = "https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026"
    print(f"Loading fixtures page: {fixtures_url}")
    
    driver.get(fixtures_url)
    time.sleep(5)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all match links - FIXED: Use more specific selector to avoid duplicates
    match_links = soup.find_all('a', {'class': 'Match-module_score__5Ghhj'})
    
    match_urls = set()  # Use set to automatically remove duplicates
    
    for link in match_links:
        # Check match status
        parent_div = link.find_parent('div', class_='Match-module_match__XlKTY')
        if parent_div:
            status_element = parent_div.find('span', class_=['Match-module_FT__2rmH7', 'Match-module_startTime_lineup__H1Krq'])
            
            if status_element:
                status_text = status_element.get_text().strip()
                
                # Include completed matches (FT) and optionally future matches (time format)
                if status_text == 'FT' or (include_future and ':' in status_text):
                    full_url = f"https://www.whoscored.com{link['href']}"
                    match_urls.add(full_url)  # Add to set (removes duplicates)
    
    match_urls = list(match_urls)  # Convert back to list
    print(f"Found {len(match_urls)} unique matches to scrape")
    
    # Scrape each match with better error handling
    all_match_data = []
    
    for i, url in enumerate(match_urls, 1):
        print(f"\nScraping match {i}/{len(match_urls)}")
        
        try:
            match_df = scrape_single_match(url, driver)
            
            if match_df is not None:
                # Add match URL for reference
                match_df['match_url'] = url
                all_match_data.append(match_df)
        
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
            break
        except Exception as e:
            print(f"❌ Error scraping {url}: {str(e)}")
            continue
        
        # Small delay between requests
        time.sleep(2)
    
    if all_match_data:
        # Combine all match data
        combined_df = pd.concat(all_match_data, ignore_index=True)
        print(f"\n✅ Successfully scraped {len(all_match_data)} matches with {len(combined_df)} total events")
        return combined_df
    else:
        print("❌ No match data was scraped")
        return None

In [12]:
# Initialize driver
driver = webdriver.Chrome()

# Scrape a single match
#single_match_url = "https://www.whoscored.com/matches/1903733/live/netherlands-eredivisie-2025-2026-fortuna-sittard-go-ahead-eagles"
#df_single = scrape_single_eredivisie_match(single_match_url, driver)

# Scrape all completed matches
df_all = scrape_all_eredivisie_matches(driver, include_future=False)

# Scrape all matches including future ones
#df_all_with_future = scrape_all_eredivisie_matches(driver, include_future=True)

driver.quit()

Loading fixtures page: https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026
Found 10 unique matches to scrape

Scraping match 1/10
Scraping: https://www.whoscored.com/matches/1903768/live/netherlands-eredivisie-2025-2026-ajax-pec-zwolle
Validation error: 1 validation error for MatchEvent
match_url
  Field required [type=missing, input_value={'id': 2845345635, 'event...play_name': 'FirstHalf'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
Validation error: 1 validation error for MatchEvent
match_url
  Field required [type=missing, input_value={'id': 2845345709, 'event...play_name': 'FirstHalf'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
Validation error: 1 validation error for MatchEvent
match_url
  Field required [type=missing, input_value={'id': 2845345739, 'event...play_name': 'FirstHalf'}, input_type=dict]
    For f

In [18]:
pd.set_option('display.max_columns', None)
df_single.head()

Unnamed: 0,id,eventId,minute,second,teamId,x,y,expandedMinute,period,type,outcomeType,qualifiers,satisfiedEventsTypes,isTouch,playerId,endX,endY,relatedEventId,relatedPlayerId,blockedX,blockedY,goalMouthZ,goalMouthY,isShot,isOwnGoal,isGoal,cardType,type_display_name,outcome_type_display_name,period_display_name
2,2832630000.0,3,0,0.0,874,50.1,49.9,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}","{'value': 1, 'displayName': 'Successful'}","[{'type': {'value': 140, 'displayName': 'PassE...","[91, 117, 30, 35, 38, 216, 218]",True,436147.0,38.6,38.0,,,,,,,,,,,Pass,Successful,FirstHalf
3,2832630000.0,4,0,3.0,874,39.3,37.7,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}","{'value': 1, 'displayName': 'Successful'}","[{'type': {'value': 213, 'displayName': 'Angle...","[91, 117, 30, 36, 37, 216, 218]",True,422633.0,41.6,60.5,,,,,,,,,,,Pass,Successful,FirstHalf
4,2832630000.0,5,0,5.0,874,39.6,54.4,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}","{'value': 0, 'displayName': 'Unsuccessful'}","[{'type': {'value': 56, 'displayName': 'Zone'}...","[91, 120, 124, 128, 36, 38, 217, 218]",True,445056.0,73.2,10.9,,,,,,,,,,,Pass,Unsuccessful,FirstHalf
5,2832630000.0,3,0,5.0,242,23.8,86.9,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 8, 'displayName': 'Interception'}","{'value': 1, 'displayName': 'Successful'}","[{'type': {'value': 15, 'displayName': 'Head'}...","[91, 101, 56]",True,400011.0,,,,,,,,,,,,,Interception,Successful,FirstHalf
6,2832630000.0,4,0,8.0,242,34.2,88.9,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 49, 'displayName': 'BallRecovery'}","{'value': 1, 'displayName': 'Successful'}",[],[93],False,400011.0,,,,,,,,,,,,,BallRecovery,Successful,FirstHalf


In [29]:
pd.set_option('display.max_colwidth', None)
df_all.head()

Unnamed: 0,id,event_id,minute,second,team_id,player_id,x,y,end_x,end_y,...,blocked_y,goal_mouth_z,goal_mouth_y,is_shot,card_type,is_goal,type_display_name,outcome_type_display_name,period_display_name,match_url
0,2833207221,3,0,0.0,287,422634,50.1,49.7,47.1,53.6,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam
1,2833207247,4,0,2.0,287,531002,47.1,53.6,34.9,26.0,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam
2,2833207303,5,0,6.0,287,380731,34.9,24.2,57.8,8.6,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam
3,2833207341,6,0,9.0,287,535678,56.1,15.8,45.8,55.2,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam
4,2833207391,7,0,12.0,287,436168,45.2,57.0,40.5,23.9,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam


In [16]:
df_all.columns

Index(['id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y',
       'end_x', 'end_y', 'qualifiers', 'is_touch', 'blocked_x', 'blocked_y',
       'goal_mouth_z', 'goal_mouth_y', 'is_shot', 'card_type', 'is_goal',
       'type_display_name', 'outcome_type_display_name', 'period_display_name',
       'match_url'],
      dtype='object')

In [22]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26557 entries, 0 to 26556
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         26557 non-null  int64  
 1   event_id                   26557 non-null  int64  
 2   minute                     26557 non-null  int64  
 3   second                     26557 non-null  float64
 4   team_id                    26557 non-null  int64  
 5   player_id                  26557 non-null  int64  
 6   x                          26557 non-null  float64
 7   y                          26557 non-null  float64
 8   end_x                      17617 non-null  float64
 9   end_y                      17617 non-null  float64
 10  qualifiers                 26557 non-null  object 
 11  is_touch                   26557 non-null  bool   
 12  blocked_x                  306 non-null    float64
 13  blocked_y                  306 non-null    flo

In [17]:
df_all.match_url.unique()

array(['https://www.whoscored.com/matches/1903736/live/netherlands-eredivisie-2025-2026-sc-heerenveen-fc-volendam',
       'https://www.whoscored.com/matches/1903739/live/netherlands-eredivisie-2025-2026-ajax-telstar',
       'https://www.whoscored.com/matches/1903747/live/netherlands-eredivisie-2025-2026-sparta-rotterdam-fc-utrecht',
       'https://www.whoscored.com/matches/1903744/live/netherlands-eredivisie-2025-2026-excelsior-feyenoord',
       'https://www.whoscored.com/matches/1903743/live/netherlands-eredivisie-2025-2026-fc-groningen-sc-heerenveen',
       'https://www.whoscored.com/matches/1903742/live/netherlands-eredivisie-2025-2026-telstar-pec-zwolle',
       'https://www.whoscored.com/matches/1903735/live/netherlands-eredivisie-2025-2026-feyenoord-nac-breda',
       'https://www.whoscored.com/matches/1903737/live/netherlands-eredivisie-2025-2026-psv-eindhoven-sparta-rotterdam',
       'https://www.whoscored.com/matches/1903749/live/netherlands-eredivisie-2025-2026-fc-volen

In [18]:
df_all.to_csv("/Users/ricardoheredia/Desktop/automated-football-data-reports/data/all_matches_matchday12_eredivisie.csv")

In [4]:
# Read the CSV file
df_all = pd.read_csv('/Users/ricardoheredia/Desktop/automated-football-data-reports/data/all_matches_matchday12_eredivisie.csv', index_col=0)

# Display the first few rows
df_all.head()

Unnamed: 0,id,event_id,minute,second,team_id,player_id,x,y,end_x,end_y,...,blocked_y,goal_mouth_z,goal_mouth_y,is_shot,card_type,is_goal,type_display_name,outcome_type_display_name,period_display_name,match_url
0,2833207221,3,0,0.0,287,422634,50.1,49.7,47.1,53.6,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
1,2833207247,4,0,2.0,287,531002,47.1,53.6,34.9,26.0,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
2,2833207303,5,0,6.0,287,380731,34.9,24.2,57.8,8.6,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
3,2833207341,6,0,9.0,287,535678,56.1,15.8,45.8,55.2,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
4,2833207391,7,0,12.0,287,436168,45.2,57.0,40.5,23.9,...,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...


#### Supabase Configs

In [5]:
project_url = 'https://qhdejetgajuancazelgs.supabase.co'
api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InFoZGVqZXRnYWp1YW5jYXplbGdzIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTEwOTAzNjIsImV4cCI6MjA2NjY2NjM2Mn0.WvouVFrRK3BDxyQysXcYpwnyyGb_yUGrxqScIGF3VXY'
supabase_password = 'Vanepirlo8.'

In [6]:
supabase = create_client(project_url, api_key)

In [18]:
pd.set_option('display.max_columns', None)
df_all[:5]

Unnamed: 0,id,event_id,minute,second,team_id,player_id,x,y,end_x,end_y,qualifiers,is_touch,blocked_x,blocked_y,goal_mouth_z,goal_mouth_y,is_shot,card_type,is_goal,type_display_name,outcome_type_display_name,period_display_name,match_url
0,2833207221,3,0,0.0,287,422634,50.1,49.7,47.1,53.6,"[{'type': {'value': 56, 'displayName': 'Zone'}...",True,,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
1,2833207247,4,0,2.0,287,531002,47.1,53.6,34.9,26.0,"[{'type': {'value': 140, 'displayName': 'PassE...",True,,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
2,2833207303,5,0,6.0,287,380731,34.9,24.2,57.8,8.6,"[{'type': {'value': 213, 'displayName': 'Angle...",True,,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
3,2833207341,6,0,9.0,287,535678,56.1,15.8,45.8,55.2,"[{'type': {'value': 212, 'displayName': 'Lengt...",True,,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...
4,2833207391,7,0,12.0,287,436168,45.2,57.0,40.5,23.9,"[{'type': {'value': 56, 'displayName': 'Zone'}...",True,,,,,False,False,False,Pass,Successful,FirstHalf,https://www.whoscored.com/matches/1903736/live...


In [16]:
df_all.qualifiers[0]

"[{'type': {'value': 56, 'displayName': 'Zone'}, 'value': 'Back'}, {'type': {'value': 141, 'displayName': 'PassEndY'}, 'value': '53.6'}, {'type': {'value': 213, 'displayName': 'Angle'}, 'value': '2.44'}, {'type': {'value': 140, 'displayName': 'PassEndX'}, 'value': '47.1'}, {'type': {'value': 212, 'displayName': 'Length'}, 'value': '4.1'}]"

In [19]:
import math, ast, json
from typing import Any, List, Dict

In [20]:
def parse_qualifiers(df: pd.DataFrame) -> pd.DataFrame:
    """Convierte `qualifiers` (string) a lista/dict JSON-compatibles y tipa valores numéricos."""
    def to_jsonable(cell: Any) -> Any:
        if cell is None or (isinstance(cell, float) and (math.isnan(cell) or math.isinf(cell))):
            return None
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except Exception:
                return None
        def walk(x: Any) -> Any:
            if isinstance(x, dict):
                return {k: (float(v) if k == "value" and isinstance(v, (int, float, str)) and str(v).replace('.','',1).isdigit() else walk(v))
                        for k, v in x.items()}
            if isinstance(x, list):
                return [walk(v) for v in x]
            if isinstance(x, (np.floating,)):
                return None if (np.isnan(x) or np.isinf(x)) else float(x)
            if isinstance(x, float):
                return None if (math.isnan(x) or math.isinf(x)) else x
            return x
        return walk(cell)
    df = df.copy()
    df['qualifiers'] = df['qualifiers'].apply(to_jsonable)
    return df

In [21]:
def sanitize_records(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Reemplaza NaN/Inf por None en TODO el payload final (top-level y nested)."""
    records = df.to_dict('records')
    def clean(x: Any) -> Any:
        if isinstance(x, dict):
            return {k: clean(v) for k, v in x.items()}
        if isinstance(x, list):
            return [clean(v) for v in x]
        if isinstance(x, (np.integer,)):
            return int(x)
        if isinstance(x, (np.floating,)):
            return None if (np.isnan(x) or np.isinf(x)) else float(x)
        if isinstance(x, float):
            return None if (math.isnan(x) or math.isinf(x)) else x
        return x
    return [clean(r) for r in records]

In [22]:
# 1) Cargar matches (FK) y 2) upsert de eventos en batches
def upsert_events_minimal(supabase, df_all: pd.DataFrame, batch_size: int = 2000) -> None:
    # 1) Upsert de matches por FK
    urls = sorted(set(df_all['match_url'].tolist()))
    if urls:
        supabase.table('eredivisie_matches').upsert(
            [{'match_url': u} for u in urls], on_conflict='match_url'
        ).execute()

    # 2) Limpiar y upsert de eventos
    df_clean = parse_qualifiers(df_all)
    payload = sanitize_records(df_clean)

    print(f"Upserting {len(payload)} eventos en batches de {batch_size}...")
    for i in range(0, len(payload), batch_size):
        supabase.table('eredivisie_match_events').upsert(
            payload[i:i+batch_size], on_conflict='id'
        ).execute()
        print(f"✅ Batch {(i//batch_size)+1}: {len(payload[i:i+batch_size])}")
    print("Listo.")

In [23]:
upsert_events_minimal(supabase, df_all, batch_size=2000)

Upserting 26557 eventos en batches de 2000...
✅ Batch 1: 2000
✅ Batch 2: 2000
✅ Batch 3: 2000
✅ Batch 4: 2000
✅ Batch 5: 2000
✅ Batch 6: 2000
✅ Batch 7: 2000
✅ Batch 8: 2000
✅ Batch 9: 2000
✅ Batch 10: 2000
✅ Batch 11: 2000
✅ Batch 12: 2000
✅ Batch 13: 2000
✅ Batch 14: 557
Listo.


### Going deeper

17/09/25 

This approach is working, the one from before not. 

In [14]:
import json
import time
import numpy as np
import pandas as pd

from pydantic import BaseModel
from typing import List, Optional

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

#from supabase import create_client, Client

In [15]:
# Pydantic Models
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float] = None
    blocked_y: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str
    match_url: str

In [23]:
# Core scraping function
def scrape_single_match(url, driver):
    """Scrape match events from a single WhoScored URL"""
    print(f"Scraping: {url}")
    
    driver.get(url)
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    if not element:
        print("❌ matchCentreData not found")
        return None
    
    # Extract and parse match data
    match_data_raw = element.text.split("matchCentreData: ")[1].split(',\n')[0]
    matchdict = json.loads(match_data_raw)
    
    # Process events
    match_events = matchdict['events']
    df = pd.DataFrame(match_events)
    
    if df.empty:
        print("⚠️ No events found")
        return None
    
    # Clean and transform data
    df.dropna(subset='playerId', inplace=True)
    df = df.where(pd.notnull(df), None)
    
    # Rename columns
    df = df.rename({
        'eventId': 'event_id',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    }, axis=1)
    
    # Add display names
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    
    # Drop original columns
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    # Handle missing columns
    if 'is_goal' not in df.columns:
        df['is_goal'] = False
    if 'card_type' not in df.columns:
        df['card_type'] = False
    
    # Filter and select columns
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name', 'period_display_name'
    ]]
    
    # Convert data types
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype('int64')
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    # Handle NaN values
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    # Add match URL to each row
    df['match_url'] = url
    
    # Validate with Pydantic
    validated_events = []
    for _, row in df.iterrows():
        try:
            event = MatchEvent(**row.to_dict())
            validated_events.append(event.model_dump())
        except Exception as e:
            print(f"Validation error: {e}")
    
    validated_df = pd.DataFrame(validated_events)
    print(f'✅ Successfully scraped {len(validated_df)} events')
    return validated_df

In [17]:
def scrape_all_eredivisie_matches_with_debug(driver, include_future=False):
    """
    Enhanced scraper with comprehensive debug logging to understand what's on the page
    """
    # Navigate to Eredivisie fixtures page
    fixtures_url = "https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026"
    print(f"Loading fixtures page: {fixtures_url}")
    
    driver.get(fixtures_url)
    time.sleep(5)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    print("\n" + "="*60)
    print("🔍 DEBUG ANALYSIS OF FIXTURES PAGE")
    print("="*60)
    
    # 1. Check accordion sections (dates)
    accordion_sections = soup.find_all('div', class_='Accordion-module_header__HqzWD')
    print(f"\n📅 Date sections found: {len(accordion_sections)}")
    for i, section in enumerate(accordion_sections):
        date_text = section.get_text().strip()
        print(f"  {i+1}. {date_text}")
    
    # 2. Check all match containers
    all_match_containers = soup.find_all('div', class_='Match-module_match__XlKTY')
    print(f"\n⚽ Total match containers: {len(all_match_containers)}")
    
    # 3. Check match statuses
    ft_matches = soup.find_all('span', class_='Match-module_FT__2rmH7')
    upcoming_matches = soup.find_all('span', class_='Match-module_startTime__c49c8')
    lineup_matches = soup.find_all('span', class_='Match-module_startTime_lineup__H1Krq')
    
    print(f"📊 Match statuses:")
    print(f"  - Finished (FT): {len(ft_matches)}")
    print(f"  - Upcoming (time): {len(upcoming_matches)}")
    print(f"  - Lineup confirmed: {len(lineup_matches)}")
    
    # 4. Check score links (what your original code uses)
    score_links = soup.find_all('a', {'class': 'Match-module_score__5Ghhj'})
    print(f"🔗 Score links found: {len(score_links)}")
    
    # 5. Detailed analysis of each match
    print(f"\n📋 DETAILED MATCH ANALYSIS:")
    print("-" * 40)
    
    match_urls = set()
    match_details = []
    
    for i, container in enumerate(all_match_containers):
        # Find status
        status_ft = container.find('span', class_='Match-module_FT__2rmH7')
        status_time = container.find('span', class_='Match-module_startTime__c49c8')
        status_lineup = container.find('span', class_='Match-module_startTime_lineup__H1Krq')
        
        if status_ft:
            status = "FT"
        elif status_lineup:
            status = status_lineup.get_text().strip()
        elif status_time:
            status = status_time.get_text().strip()
        else:
            status = "UNKNOWN"
        
        # Find teams
        team_elements = container.find_all('a', class_='Match-module_teamNameText__Dqv-G')
        teams = [team.get_text().strip() for team in team_elements] if team_elements else ["Unknown", "Unknown"]
        
        # Find score link
        score_link = container.find('a', class_='Match-module_score__5Ghhj')
        match_url = None
        if score_link and score_link.get('href'):
            match_url = f"https://www.whoscored.com{score_link['href']}"
            if status == "FT" or (include_future and ':' in status):
                match_urls.add(match_url)
        
        match_info = {
            'teams': ' vs '.join(teams),
            'status': status,
            'url': match_url,
            'include_in_scrape': match_url is not None and (status == "FT" or (include_future and ':' in status))
        }
        match_details.append(match_info)
        
        print(f"  {i+1:2d}. {match_info['teams']:<35} | {status:<8} | Include: {match_info['include_in_scrape']}")
    
    # 6. Calendar navigation check
    print(f"\n🗓️  CALENDAR CONTROLS:")
    prev_button = soup.find('button', id='dayChangeBtn-prev')
    next_button = soup.find('button', id='dayChangeBtn-next')
    calendar_toggle = soup.find('button', id='toggleCalendar')
    
    print(f"  - Previous button: {'Found' if prev_button else 'Not found'}")
    print(f"  - Next button: {'Found' if next_button else 'Not found'}")
    print(f"  - Calendar toggle: {'Found' if calendar_toggle else 'Not found'}")
    
    # 7. Summary
    print(f"\n📈 SUMMARY:")
    print(f"  - Total matches visible: {len(all_match_containers)}")
    print(f"  - Finished matches (FT): {len([m for m in match_details if m['status'] == 'FT'])}")
    print(f"  - Matches to scrape: {len(match_urls)}")
    print(f"  - Date range visible: {accordion_sections[0].get_text().strip() if accordion_sections else 'None'} to {accordion_sections[-1].get_text().strip() if accordion_sections else 'None'}")
    
    print("="*60)
    print("🔍 END DEBUG ANALYSIS")
    print("="*60 + "\n")
    
    # Convert to list and proceed with scraping
    match_urls = list(match_urls)
    print(f"Proceeding to scrape {len(match_urls)} matches...")
    
    # Scrape each match with better error handling
    all_match_data = []
    
    for i, url in enumerate(match_urls, 1):
        print(f"\nScraping match {i}/{len(match_urls)}")
        
        try:
            match_df = scrape_single_match(url, driver)
            
            if match_df is not None:
                all_match_data.append(match_df)
        
        except KeyboardInterrupt:
            print("\n⚠️ Scraping interrupted by user")
            break
        except Exception as e:
            print(f"❌ Error scraping {url}: {str(e)}")
            continue
        
        # Small delay between requests
        time.sleep(2)
    
    if all_match_data:
        # Combine all match data
        combined_df = pd.concat(all_match_data, ignore_index=True)
        print(f"\n✅ Successfully scraped {len(all_match_data)} matches with {len(combined_df)} total events")
        return combined_df
    else:
        print("❌ No match data was scraped")
        return None

In [18]:
def investigate_calendar_navigation(driver):
    """
    Test if clicking calendar controls reveals more historical matches
    """
    print("\n🔍 INVESTIGATING CALENDAR NAVIGATION...")
    
    fixtures_url = "https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026"
    driver.get(fixtures_url)
    time.sleep(5)
    
    initial_soup = BeautifulSoup(driver.page_source, 'html.parser')
    initial_count = len(initial_soup.find_all('span', class_='Match-module_FT__2rmH7'))
    print(f"Initial FT matches: {initial_count}")
    
    # Try clicking previous button to go back in time
    try:
        prev_button = driver.find_element(By.ID, "dayChangeBtn-prev")
        print("Found previous button, clicking to go back...")
        
        for i in range(5):  # Try going back 5 times
            prev_button.click()
            time.sleep(3)  # Wait for content to load
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            ft_count = len(soup.find_all('span', class_='Match-module_FT__2rmH7'))
            
            # Check current visible date range
            accordion_sections = soup.find_all('div', class_='Accordion-module_header__HqzWD')
            if accordion_sections:
                first_date = accordion_sections[0].get_text().strip()
                last_date = accordion_sections[-1].get_text().strip()
                print(f"  Click {i+1}: {ft_count} FT matches | Date range: {first_date} to {last_date}")
            else:
                print(f"  Click {i+1}: {ft_count} FT matches | No date sections found")
            
            # Update the prev_button reference (page might have reloaded)
            try:
                prev_button = driver.find_element(By.ID, "dayChangeBtn-prev")
            except:
                print("  Previous button no longer found, stopping")
                break
                
    except Exception as e:
        print(f"Could not find or click previous button: {e}")

In [24]:
def collect_all_season_matches(driver):
    """Navigate through calendar to collect all season matches"""
    fixtures_url = "https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026"
    driver.get(fixtures_url)
    time.sleep(5)
    
    all_match_urls = set()
    
    # Go back to August
    for i in range(15):
        try:
            prev_button = driver.find_element(By.ID, "dayChangeBtn-prev")
            prev_button.click()
            time.sleep(2)
        except:
            break
    
    # Go forward collecting all finished matches
    for period in range(30):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        containers = soup.find_all('div', class_='Match-module_match__XlKTY')
        for container in containers:
            if container.find('span', class_='Match-module_FT__2rmH7'):
                score_link = container.find('a', class_='Match-module_score__5Ghhj')
                if score_link and score_link.get('href'):
                    match_url = f"https://www.whoscored.com{score_link['href']}"
                    all_match_urls.add(match_url)
        
        try:
            next_button = driver.find_element(By.ID, "dayChangeBtn-next")
            next_button.click()
            time.sleep(2)
        except:
            break
    
    return list(all_match_urls)

In [6]:
# Main execution
if __name__ == "__main__":
    # Initialize driver
    driver = webdriver.Chrome()
    
    try:
        print("🚀 Starting Enhanced WhoScored Scraper with Debug Analysis")
        
        # First, run the debug analysis
        df_all = scrape_all_eredivisie_matches_with_debug(driver, include_future=False)
        
        if df_all is not None:
            print(f"\n📊 Final Results:")
            print(f"  - Total events scraped: {len(df_all)}")
            print(f"  - Unique matches: {df_all['match_url'].nunique()}")
            print(f"  - Sample of data:")
            print(df_all[['match_url', 'type_display_name', 'minute']].head())
        
        # Optional: Test calendar navigation
        print("\n" + "="*60)
        user_input = input("Do you want to test calendar navigation? (y/n): ")
        if user_input.lower() == 'y':
            investigate_calendar_navigation(driver)
    
    finally:
        driver.quit()
        print("\n🏁 Scraping completed and browser closed.")

🚀 Starting Enhanced WhoScored Scraper with Debug Analysis
Loading fixtures page: https://www.whoscored.com/Regions/155/Tournaments/13/Seasons/10752/Stages/24542/Fixtures/Netherlands-Eredivisie-2025-2026

🔍 DEBUG ANALYSIS OF FIXTURES PAGE

📅 Date sections found: 10
  1. Saturday, Sep 13 2025
  2. Sunday, Sep 14 2025
  3. Wednesday, Sep 17 2025
  4. Friday, Sep 19 2025
  5. Saturday, Sep 20 2025
  6. Sunday, Sep 21 2025
  7. Wednesday, Sep 24 2025
  8. Friday, Sep 26 2025
  9. Saturday, Sep 27 2025
  10. Sunday, Sep 28 2025

⚽ Total match containers: 29
📊 Match statuses:
  - Finished (FT): 10
  - Upcoming (time): 19
  - Lineup confirmed: 0
🔗 Score links found: 29

📋 DETAILED MATCH ANALYSIS:
----------------------------------------
   1. Ajax vs PEC Zwolle                  | FT       | Include: True
   2. NEC Nijmegen vs PSV Eindhoven       | FT       | Include: True
   3. Go Ahead Eagles vs FC Volendam      | FT       | Include: True
   4. Feyenoord vs SC Heerenveen          | FT       |

/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1465 events

Scraping match 2/10
Scraping: https://www.whoscored.com/matches/1903775/live/netherlands-eredivisie-2025-2026-fc-utrecht-fc-groningen


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1476 events

Scraping match 3/10
Scraping: https://www.whoscored.com/matches/1903772/live/netherlands-eredivisie-2025-2026-twente-nac-breda


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1427 events

Scraping match 4/10
Scraping: https://www.whoscored.com/matches/1903776/live/netherlands-eredivisie-2025-2026-telstar-fortuna-sittard
❌ Error scraping https://www.whoscored.com/matches/1903776/live/netherlands-eredivisie-2025-2026-telstar-fortuna-sittard: HTTPConnectionPool(host='localhost', port=49952): Read timed out. (read timeout=120)

Scraping match 5/10
Scraping: https://www.whoscored.com/matches/1903770/live/netherlands-eredivisie-2025-2026-nec-nijmegen-psv-eindhoven


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1471 events

Scraping match 6/10
Scraping: https://www.whoscored.com/matches/1903773/live/netherlands-eredivisie-2025-2026-excelsior-sparta-rotterdam


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1537 events

Scraping match 7/10
Scraping: https://www.whoscored.com/matches/1903769/live/netherlands-eredivisie-2025-2026-go-ahead-eagles-fc-volendam


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1391 events

Scraping match 8/10
Scraping: https://www.whoscored.com/matches/1903774/live/netherlands-eredivisie-2025-2026-heracles-az-alkmaar


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1433 events

Scraping match 9/10
Scraping: https://www.whoscored.com/matches/1903777/live/netherlands-eredivisie-2025-2026-feyenoord-fortuna-sittard


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1418 events

Scraping match 10/10
Scraping: https://www.whoscored.com/matches/1903771/live/netherlands-eredivisie-2025-2026-feyenoord-sc-heerenveen


/var/folders/b5/qz8x1m_510q3zk9lktbr3gx40000gn/T/ipykernel_1675/1672100987.py:90: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  validated_events.append(event.dict())


✅ Successfully scraped 1360 events

✅ Successfully scraped 9 matches with 12978 total events

📊 Final Results:
  - Total events scraped: 12978
  - Unique matches: 9
  - Sample of data:
                                           match_url type_display_name  minute
0  https://www.whoscored.com/matches/1903768/live...              Pass       0
1  https://www.whoscored.com/matches/1903768/live...              Pass       0
2  https://www.whoscored.com/matches/1903768/live...            Aerial       0
3  https://www.whoscored.com/matches/1903768/live...            Aerial       0
4  https://www.whoscored.com/matches/1903768/live...         Clearance       0


🔍 INVESTIGATING CALENDAR NAVIGATION...
Initial FT matches: 10
Could not find or click previous button: name 'By' is not defined

🏁 Scraping completed and browser closed.


#### Collecting all season matches

Testing locally first then moving on to script

In [25]:
def collect_and_scrape_complete_season(driver):
    """Complete season collection with detailed progress output"""
    
    print("Step 1: Collecting all match URLs...")
    all_match_urls = collect_all_season_matches(driver)
    print(f"Found {len(all_match_urls)} finished matches\n")
    
    if not all_match_urls:
        print("No matches found. Exiting.")
        return None
    
    print("Step 2: Scraping match data...")
    all_match_data = []
    
    for i, url in enumerate(all_match_urls, 1):
        print(f"Scraping match {i}/{len(all_match_urls)}")
        print(f"Scraping: {url}")
        
        try:
            match_df = scrape_single_match(url, driver)
            
            if match_df is not None:
                all_match_data.append(match_df)
                print(f"✅ Successfully scraped {len(match_df)} events\n")
            else:
                print("❌ No data returned\n")
                
        except Exception as e:
            print(f"❌ Error scraping {url}: {str(e)}\n")
            continue
        
        time.sleep(2)
    
    if all_match_data:
        combined_df = pd.concat(all_match_data, ignore_index=True)
        print(f"✅ Successfully scraped {len(all_match_data)} matches with {len(combined_df)} total events")
        return combined_df
    else:
        print("❌ No match data was scraped")
        return None

In [26]:
# Main execution
driver = webdriver.Chrome()

try:
    print("Starting complete season collection...\n")
    season_data = collect_and_scrape_complete_season(driver)
    
    if season_data is not None:
        print(f"\nFinal Results:")
        print(f"- Total events: {len(season_data)}")
        print(f"- Unique matches: {season_data['match_url'].nunique()}")
        
        # Save for MinIO upload
        season_data.to_csv('eredivisie_2025_2026_backfill.csv', index=False)
        print("- Saved to eredivisie_2025_2026_backfill.csv")

finally:
    driver.quit()

Starting complete season collection...

Step 1: Collecting all match URLs...
Found 44 finished matches

Step 2: Scraping match data...
Scraping match 1/44
Scraping: https://www.whoscored.com/matches/1903775/live/netherlands-eredivisie-2025-2026-fc-utrecht-fc-groningen
Scraping: https://www.whoscored.com/matches/1903775/live/netherlands-eredivisie-2025-2026-fc-utrecht-fc-groningen
✅ Successfully scraped 1476 events
✅ Successfully scraped 1476 events

Scraping match 2/44
Scraping: https://www.whoscored.com/matches/1903772/live/netherlands-eredivisie-2025-2026-twente-nac-breda
Scraping: https://www.whoscored.com/matches/1903772/live/netherlands-eredivisie-2025-2026-twente-nac-breda
✅ Successfully scraped 1427 events
✅ Successfully scraped 1427 events

Scraping match 3/44
Scraping: https://www.whoscored.com/matches/1903765/live/netherlands-eredivisie-2025-2026-sparta-rotterdam-feyenoord
Scraping: https://www.whoscored.com/matches/1903765/live/netherlands-eredivisie-2025-2026-sparta-rotterd