In [1]:
import json
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pydantic import BaseModel, ValidationError
from typing import List, Optional
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from supabase import create_client, Client
import warnings
from dotenv import load_dotenv
import os
from datetime import datetime

# Suppress all warnings
warnings.filterwarnings('ignore')

# EventModel class definition
class EventModel(BaseModel):
    id: int
    match_id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    x_coordinate: float
    y_coordinate: float
    expanded_minute: int
    satisfied_events_types: List[int]
    is_touch: bool
    player_id: int
    end_x_coordinate: Optional[float] = None
    end_y_coordinate: Optional[float] = None
    related_event_id: Optional[int] = None
    related_player_id: Optional[int] = None
    blocked_x_coordinate: Optional[float] = None
    blocked_y_coordinate: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_own_goal: bool
    is_goal: bool
    period_display_name: str
    type_display_name: str
    outcome_type_display_name: str
    qualifiers_type_display_name: List[str]

# Updated Player model to include team_name
class Player(BaseModel):
    player_id: int
    shirt_no: int
    name: str
    age: int
    position: str
    team_id: int
    team_name: str

# Function to initialize the WebDriver
def init_webdriver():
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Function to get the page source using Selenium
def get_page_source(driver, url):
    driver.get(url)
    return driver.page_source

# Function to parse the page source with BeautifulSoup
def parse_page_source(page_source):
    return BeautifulSoup(page_source, "html.parser")

# Function to extract match data from the soup
def extract_match_data(soup):
    element = soup.select_one("script:-soup-contains(matchCentreData)")
    if not element:
        return None
    match_data_json = element.text.split("matchCentreData: ")[1].split(',\n')[0]
    return json.loads(match_data_json)

def transform_events_to_df(events, match_id):
    df_events = pd.DataFrame(events)
    df_events = df_events.rename(
        {
            'id': 'id',
            'eventId': 'event_id',
            'minute': 'minute',
            'second': 'second',
            'teamId': 'team_id',
            'x': 'x_coordinate',
            'y': 'y_coordinate',
            'expandedMinute': 'expanded_minute',
            'period': 'period',
            'type': 'event_type',
            'outcomeType': 'outcome_type',
            'qualifiers': 'qualifiers',
            'satisfiedEventsTypes': 'satisfied_events_types',
            'isTouch': 'is_touch',
            'playerId': 'player_id',
            'endX': 'end_x_coordinate',
            'endY': 'end_y_coordinate',
            'relatedEventId': 'related_event_id',
            'relatedPlayerId': 'related_player_id',
            'blockedX': 'blocked_x_coordinate',
            'blockedY': 'blocked_y_coordinate',
            'goalMouthZ': 'goal_mouth_z',
            'goalMouthY': 'goal_mouth_y',
            'isShot': 'is_shot',
            'cardType': 'card_type',
            'isOwnGoal': 'is_own_goal',
            'isGoal': 'is_goal'
        }, axis=1
    )

    # Add match_id to the DataFrame
    df_events['match_id'] = match_id

    # Process DataFrame columns
    df_events.dropna(subset=['player_id'], inplace=True)
    df_events['id'] = df_events['id'].astype(int)
    df_events['player_id'] = df_events['player_id'].astype(int)
    df_events['second'] = df_events['second'].astype(float, errors='ignore')
    df_events = df_events.where(pd.notnull(df_events), None)

    bool_columns = ['is_shot', 'is_own_goal', 'is_goal', 'card_type']
    for col in bool_columns:
        if col in df_events.columns:
            df_events[col] = df_events[col].astype(bool).fillna(False)
        else:
            df_events[col] = False

    if 'period' in df_events.columns:
        df_events['period_display_name'] = df_events['period'].apply(lambda x: x['displayName'] if isinstance(x, dict) and x is not None else None)
    if 'event_type' in df_events.columns:
        df_events['type_display_name'] = df_events['event_type'].apply(lambda x: x['displayName'] if isinstance(x, dict) and x is not None else None)
    if 'outcome_type' in df_events.columns:
        df_events['outcome_type_display_name'] = df_events['outcome_type'].apply(lambda x: x['displayName'] if isinstance(x, dict) and x is not None else None)

    if 'qualifiers' in df_events.columns:
        df_events['qualifiers_type_display_name'] = df_events['qualifiers'].apply(extract_qualifier_display_names)

    columns_to_drop = ['period', 'event_type', 'outcome_type', 'qualifiers']
    df_events.drop(columns=[col for col in columns_to_drop if col in df_events.columns], inplace=True)

    df_events = df_events[df_events['type_display_name'] != 'OffsideGiven']

    for col in df_events.columns:
        if df_events[col].dtype in [np.float64, np.float32]:
            df_events[col] = np.where(pd.notnull(df_events[col]), df_events[col], None)

    return df_events

# Function to extract display names from qualifiers
def extract_qualifier_display_names(qualifiers):
    if qualifiers is None:
        return None
    return [qualifier['displayName'] for qualifier in qualifiers if 'displayName' in qualifier]

# Function to initialize Supabase client
def init_supabase_client():
    load_dotenv(dotenv_path='keys.env')
    SUPABASE_URL = os.getenv('SUPABASE_URL')
    SUPABASE_KEY = os.getenv('SUPABASE_KEY')
    return create_client(SUPABASE_URL, SUPABASE_KEY)

# Function to insert match events into Supabase
def insert_match_events(df_events, supabase: Client):
    events = []
    for x in df_events.to_dict(orient='records'):
        try:
            event = EventModel(**x).dict()
            events.append(event)
        except ValidationError as e:
            print(f"Validation error for record {x['id']}: {e}")

    if events:
        execution = supabase.table('euros_2024_match_events').upsert(events).execute()
        return execution
    else:
        print("No valid events to insert.")
        return None

# Function to insert player data into Supabase
def insert_players(team_info, supabase: Client):
    players = []

    for team in team_info:
        team_name = team.get('name')
        for player in team.get('players', []):
            player_id = player.get('player_id') or player.get('playerId')
            team_id = team.get('team_id') or team.get('teamId')
            shirt_no = player.get('shirtNo') or player.get('shirt_no')
            name = player.get('name')
            position = player.get('position')
            age = player.get('age')

            if player_id is not None and team_id is not None and shirt_no is not None and name is not None and position is not None and age is not None:
                try:
                    player_data = {
                        'player_id': player_id,
                        'team_id': team_id,
                        'shirt_no': shirt_no,
                        'name': name,
                        'position': position,
                        'age': age,
                        'team_name': team_name
                    }
                    player_obj = Player(**player_data).dict()
                    players.append(player_obj)
                except ValidationError as e:
                    print(f"Validation error for player {player_id}: {e}")
            else:
                print(f"Skipping player with missing data: {player}")

    if players:
        execution = supabase.table('players_euros_2024').upsert(players).execute()
        return execution
    else:
        print("No valid players to insert.")
        return None

# Function to fetch the page source using Selenium
def fetch_page_source(url):
    driver = init_webdriver()
    driver.get(url)
    driver.implicitly_wait(10)
    page_source = driver.page_source
    driver.quit()
    return page_source

# Function to parse the fixture URLs from the main page
def parse_fixture_urls(page_source):
    soup = BeautifulSoup(page_source, "html.parser")
    fixtures = []
    for link in soup.find_all('a', href=True):
        if '/Live' in link['href']:
            fixtures.append("https://www.whoscored.com" + link['href'])
    return fixtures

# Function to check if URL is already processed
def is_url_processed(url, supabase: Client):
    response = supabase.table('processed_urls').select('*').eq('url', url).execute()
    return len(response.data) > 0

# Function to mark URL as processed
def mark_url_as_processed(url, match_id, supabase: Client):
    data = {
        'url': url,
        'match_id': match_id,
        'processed_at': datetime.utcnow().isoformat()
    }
    supabase.table('processed_urls').insert(data).execute()

# Main function to run the script
def main():
    # Fetch the main fixtures page
    main_url = "https://www.whoscored.com/Regions/247/Tournaments/124/Seasons/9299/Stages/23157/Fixtures/International-European-Championship-2024"
    main_page_source = fetch_page_source(main_url)
    fixture_urls = parse_fixture_urls(main_page_source)

    # Initialize Supabase client
    supabase = init_supabase_client()

    # Loop over each fixture URL and fetch match data
    for url in fixture_urls:
        if is_url_processed(url, supabase):
            print(f"Data for {url} already processed, skipping.")
            continue

        try:
            match_page_source = fetch_page_source(url)
            soup = parse_page_source(match_page_source)
            matchdict = extract_match_data(soup)
            if not matchdict:
                print(f"No data for {url}, skipping.")
                continue

            # Get the match_id
            match_id = int(url.split('/')[4])

            # Process events data
            df_events = transform_events_to_df(matchdict['events'], match_id)
            insert_match_events(df_events, supabase)

            # Process team and player data
            team_info = [
                {
                    'team_id': matchdict['home']['teamId'],
                    'name': matchdict['home']['name'],
                    'country_name': matchdict['home']['countryName'],
                    'manager_name': matchdict['home']['managerName'],
                    'players': matchdict['home']['players'],
                },
                {
                    'team_id': matchdict['away']['teamId'],
                    'name': matchdict['away']['name'],
                    'country_name': matchdict['away']['countryName'],
                    'manager_name': matchdict['away']['managerName'],
                    'players': matchdict['away']['players'],
                }
            ]
            insert_players(team_info, supabase)

            # Mark URL as processed
            mark_url_as_processed(url, match_id, supabase)

            print(f"Data fetched and inserted for {url}")

        except Exception as e:
            print(f"Error fetching data for {url}: {e}, skipping.")

    print(f"Total matches processed: {len(fixture_urls)}")

if __name__ == "__main__":
    main()


No data for https://www.whoscored.com/LiveScores, skipping.
Data fetched and inserted for https://www.whoscored.com/Matches/1787315/Live/International-European-Championship-Germany-Scotland
Data for https://www.whoscored.com/Matches/1787315/Live/International-European-Championship-Germany-Scotland already processed, skipping.
Data fetched and inserted for https://www.whoscored.com/Matches/1787328/Live/International-European-Championship-Hungary-Switzerland
Data for https://www.whoscored.com/Matches/1787328/Live/International-European-Championship-Hungary-Switzerland already processed, skipping.
Data fetched and inserted for https://www.whoscored.com/Matches/1787317/Live/International-European-Championship-Spain-Croatia
Data for https://www.whoscored.com/Matches/1787317/Live/International-European-Championship-Spain-Croatia already processed, skipping.
Data fetched and inserted for https://www.whoscored.com/Matches/1787330/Live/International-European-Championship-Italy-Albania
Data for 