### Import Libraries

In [135]:
import json
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pydantic import BaseModel
from typing import List, Optional
from selenium import webdriver
from supabase import create_client, Client
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [136]:
driver = webdriver.Chrome()

In [137]:
whoscored_url = "https://www.whoscored.com/Matches/1787315/Live/International-European-Championship-Germany-Scotland"

In [138]:
driver.get(whoscored_url)

In [139]:
soup = BeautifulSoup(driver.page_source, "html.parser")

In [140]:
element = soup.select_one("script:-soup-contains(matchCentreData)")

In [167]:
matchdict = element.text.split("matchCentreData: ")[1].split(',\n')[0]
matchdict = json.loads(matchdict)


In [143]:
# Extract events data from the match dictionary
events = matchdict['events']

# Convert events data to a pandas DataFrame
df_events = pd.DataFrame(events)

df_events = df_events.rename(
    {
    'id': 'id',
    'eventId': 'event_id',
    'minute': 'minute',
    'second': 'second',
    'teamId': 'team_id',
    'x': 'x_coordinate',
    'y': 'y_coordinate',
    'expandedMinute': 'expanded_minute',
    'period': 'period',
    'type': 'event_type',
    'outcomeType': 'outcome_type',
    'qualifiers': 'qualifiers',
    'satisfiedEventsTypes': 'satisfied_events_types',
    'isTouch': 'is_touch',
    'playerId': 'player_id',
    'endX': 'end_x_coordinate',
    'endY': 'end_y_coordinate',
    'relatedEventId': 'related_event_id',
    'relatedPlayerId': 'related_player_id',
    'blockedX': 'blocked_x_coordinate',
    'blockedY': 'blocked_y_coordinate',
    'goalMouthZ': 'goal_mouth_z',
    'goalMouthY': 'goal_mouth_y',
    'isShot': 'is_shot',
    'cardType': 'card_type',
    'isOwnGoal': 'is_own_goal',
    'isGoal': 'is_goal'
}, axis=1
)

In [169]:
# Remove rows with NaN in 'player_id'
df_events.dropna(subset=['player_id'], inplace=True)

# Convert 'id', 'player_id', and 'second' columns to the appropriate types
df_events['id'] = df_events['id'].astype(int)
df_events['player_id'] = df_events['player_id'].astype(int)
df_events['second'] = df_events['second'].astype(float)

# Replace NaN values with None for the entire DataFrame
df_events = df_events.where(pd.notnull(df_events), None)
# Convert boolean columns and handle missing columns
bool_columns = ['is_shot', 'is_own_goal', 'is_goal', 'card_type']
for col in bool_columns:
    if col in df_events.columns:
        df_events[col] = df_events[col].astype(bool).fillna(False)
    else:
        df_events[col] = False

# Extract and create new columns for display names if the columns exist
if 'period' in df_events.columns:
    df_events['period_display_name'] = df_events['period'].apply(lambda x: x['displayName'] if x is not None else None)
if 'event_type' in df_events.columns:
    df_events['type_display_name'] = df_events['event_type'].apply(lambda x: x['displayName'] if x is not None else None)
if 'outcome_type' in df_events.columns:
    df_events['outcome_type_display_name'] = df_events['outcome_type'].apply(lambda x: x['displayName'] if x is not None else None)

# Function to extract display names from qualifiers
def extract_qualifier_display_names(qualifiers):
    if qualifiers is None:
        return None
    return [qualifier['displayName'] for qualifier in qualifiers if 'displayName' in qualifier]

# Apply the function to create the new column for qualifiers display names if the column exists
if 'qualifiers' in df_events.columns:
    df_events['qualifiers_type_display_name'] = df_events['qualifiers'].apply(extract_qualifier_display_names)

# Drop the original columns if they exist
columns_to_drop = ['period', 'event_type', 'outcome_type', 'qualifiers']
existing_columns_to_drop = [col for col in columns_to_drop if col in df_events.columns]
df_events.drop(columns=existing_columns_to_drop, inplace=True)

# Exclude rows where 'type_display_name' is 'OffsideGiven'
df_events = df_events[df_events['type_display_name'] != 'OffsideGiven']

# Display the cleaned DataFrame
df_events.head()


Unnamed: 0,id,event_id,minute,team_id,player_id,x_coordinate,y_coordinate,expanded_minute,satisfied_events_types,is_touch,...,is_goal,is_shot,blocked_x_coordinate,blocked_y_coordinate,card_type,is_own_goal,period_display_name,type_display_name,outcome_type_display_name,qualifiers_type_display_name
3,2696027193,3,0,336,326413,50.0,50.0,0,"[91, 117, 30, 35, 37, 216, 218]",True,...,False,False,,,False,False,FirstHalf,Pass,Successful,[]
4,2696027199,4,0,336,261020,36.3,71.6,0,"[91, 119, 117, 127, 205, 36, 38, 217, 218]",True,...,False,False,,,False,False,FirstHalf,Pass,Successful,[]
5,2696027201,5,0,336,326413,77.6,42.2,0,"[197, 199]",False,...,False,False,,,False,False,FirstHalf,Aerial,Successful,[]
6,2696027197,3,0,424,280210,22.4,57.8,0,"[198, 200]",False,...,False,False,,,False,False,FirstHalf,Aerial,Unsuccessful,[]
7,2696027205,6,0,336,326413,82.5,43.0,0,"[91, 120, 29, 139, 35, 37, 217, 218]",True,...,False,False,,,False,False,FirstHalf,Pass,Unsuccessful,[]


In [146]:
for col in df_events.columns:
    if df_events[col].dtype == np.float64 or df_events[col].dtype == np.float32:
        df_events[col] = np.where(pd.notnull(df_events[col]), df_events[col], None)


In [147]:
df_events.head()

Unnamed: 0,id,event_id,minute,team_id,player_id,x_coordinate,y_coordinate,expanded_minute,satisfied_events_types,is_touch,...,is_goal,is_shot,blocked_x_coordinate,blocked_y_coordinate,card_type,is_own_goal,period_display_name,type_display_name,outcome_type_display_name,qualifiers_type_display_name
0,2696038265,11020,0,336,394786,67.8,59.5,0,[61],False,...,False,False,,,False,False,FirstHalf,OffsideGiven,Unsuccessful,[]
3,2696027193,3,0,336,326413,50.0,50.0,0,"[91, 117, 30, 35, 37, 216, 218]",True,...,False,False,,,False,False,FirstHalf,Pass,Successful,[]
4,2696027199,4,0,336,261020,36.3,71.6,0,"[91, 119, 117, 127, 205, 36, 38, 217, 218]",True,...,False,False,,,False,False,FirstHalf,Pass,Successful,[]
5,2696027201,5,0,336,326413,77.6,42.2,0,"[197, 199]",False,...,False,False,,,False,False,FirstHalf,Aerial,Successful,[]
6,2696027197,3,0,424,280210,22.4,57.8,0,"[198, 200]",False,...,False,False,,,False,False,FirstHalf,Aerial,Unsuccessful,[]


In [148]:
class EventModel(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    x_coordinate: float
    y_coordinate: float
    expanded_minute: int
    satisfied_events_types: List[int]
    is_touch: bool
    player_id: int
    end_x_coordinate: Optional[float] = None
    end_y_coordinate: Optional[float] = None
    related_event_id: Optional[int] = None
    related_player_id: Optional[int] = None
    blocked_x_coordinate: Optional[float] = None
    blocked_y_coordinate: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_own_goal: bool
    is_goal: bool
    period_display_name: str
    type_display_name: str
    outcome_type_display_name: str
    qualifiers_type_display_name: List[str]

In [149]:
from pydantic import ValidationError

# Assuming df_events is your DataFrame and EventModel is the Pydantic model defined earlier
for x in df_events.to_dict(orient="records"):
    try:
        event = EventModel(**x)
        # Process the validated event object
        print(event)
    except ValidationError as e:
        # Handle validation errors
        print(f"Validation error for record {x['id']}: {e}")


id=2696038265 event_id=11020 minute=0 second=None team_id=336 x_coordinate=67.8 y_coordinate=59.5 expanded_minute=0 satisfied_events_types=[61] is_touch=False player_id=394786 end_x_coordinate=None end_y_coordinate=None related_event_id=None related_player_id=None blocked_x_coordinate=None blocked_y_coordinate=None goal_mouth_z=None goal_mouth_y=None is_shot=False card_type=False is_own_goal=False is_goal=False period_display_name='FirstHalf' type_display_name='OffsideGiven' outcome_type_display_name='Unsuccessful' qualifiers_type_display_name=[]
id=2696027193 event_id=3 minute=0 second=0.0 team_id=336 x_coordinate=50.0 y_coordinate=50.0 expanded_minute=0 satisfied_events_types=[91, 117, 30, 35, 37, 216, 218] is_touch=True player_id=326413 end_x_coordinate=36.3 end_y_coordinate=76.5 related_event_id=None related_player_id=None blocked_x_coordinate=None blocked_y_coordinate=None goal_mouth_z=None goal_mouth_y=None is_shot=False card_type=False is_own_goal=False is_goal=False period_disp

In [150]:
from dotenv import load_dotenv
import os

# Load environment variables from keys.env file
load_dotenv(dotenv_path='keys.env')

# Access the environment variables
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_KEY')
SUPABASE_PASSWORD = os.getenv('SUPABASE_PASSWORD')

supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

In [151]:
# Function to insert match events
def insert_match_events(df_events, supabase: Client):
    events = []
    
    for x in df_events.to_dict(orient='records'):
        try:
            event = EventModel(**x).dict()
            events.append(event)
        except ValidationError as e:
            print(f"Validation error for record {x['id']}: {e}")
    
    if events:
        execution = supabase.table('euros_2024_match_events').upsert(events).execute()
        return execution
    else:
        print("No valid events to insert.")
        return None

response = insert_match_events(df_events, supabase)

In [160]:
team_info = []
team_info.append({
	'team_id': matchdict['home']['teamId'],
	'name': matchdict['home']['name'],
	'country_name': matchdict['home']['countryName'],
	'manager_name': matchdict['home']['managerName'],
	'players': matchdict['home']['players'],
})

team_info.append({
	'team_id': matchdict['away']['teamId'],
	'name': matchdict['away']['name'],
	'country_name': matchdict['away']['countryName'],
	'manager_name': matchdict['away']['managerName'],
	'players': matchdict['away']['players'],

})

In [165]:
matchdict['away']['teamId']

424

In [166]:
# Updated Player model to include team_name
class Player(BaseModel):
    player_id: int
    shirt_no: int
    name: str
    age: int
    position: str
    team_id: int
    team_name: str

# Function to insert player data
def insert_players(team_info, supabase: Client):
    players = []

    for team in team_info:
        team_name = team.get('name')
        for player in team.get('players', []):  # Assuming team_info contains a list of teams, each with a list of players
            # Normalize keys to a consistent format
            player_id = player.get('player_id') or player.get('playerId')
            team_id = team.get('team_id') or team.get('teamId')
            shirt_no = player.get('shirtNo') or player.get('shirt_no')
            name = player.get('name')
            position = player.get('position')
            age = player.get('age')
            
            # Ensure critical fields are present
            if player_id is not None and team_id is not None and shirt_no is not None and name is not None and position is not None and age is not None:
                try:
                    player_data = {
                        'player_id': player_id,
                        'team_id': team_id,
                        'shirt_no': shirt_no,
                        'name': name,
                        'position': position,
                        'age': age,
                        'team_name': team_name
                    }
                    player_obj = Player(**player_data).dict()
                    players.append(player_obj)
                except ValidationError as e:
                    print(f"Validation error for player {player_id}: {e}")
            else:
                print(f"Skipping player with missing data: {player}")

    if players:
        execution = supabase.table('players_euros_2024').upsert(players).execute()
        return execution
    else:
        print("No valid players to insert.")
        return None

# Assuming matchdict is your data structure with match information
team_info = []
team_info.append({
    'team_id': matchdict['home']['teamId'],
    'name': matchdict['home']['name'],
    'country_name': matchdict['home']['countryName'],
    'manager_name': matchdict['home']['managerName'],
    'players': matchdict['home']['players'],
})

team_info.append({
    'team_id': matchdict['away']['teamId'],
    'name': matchdict['away']['name'],
    'country_name': matchdict['away']['countryName'],
    'manager_name': matchdict['away']['managerName'],
    'players': matchdict['away']['players'],
})

# Insert the data into Supabase
response = insert_players(team_info, supabase)
print(response)


data=[{'player_id': 13754, 'shirt_no': 1, 'name': 'Manuel Neuer', 'age': 38, 'position': 'GK', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 283323, 'shirt_no': 6, 'name': 'Joshua Kimmich', 'age': 29, 'position': 'DR', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 104010, 'shirt_no': 2, 'name': 'Antonio Rüdiger', 'age': 31, 'position': 'DC', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 134946, 'shirt_no': 4, 'name': 'Jonathan Tah', 'age': 28, 'position': 'DC', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 261020, 'shirt_no': 18, 'name': 'Maximilian Mittelstädt', 'age': 27, 'position': 'DL', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 31772, 'shirt_no': 8, 'name': 'Toni Kroos', 'age': 34, 'position': 'DMC', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 122140, 'shirt_no': 23, 'name': 'Robert Andrich', 'age': 29, 'position': 'DMC', 'team_id': 336, 'team_name': 'Germany'}, {'player_id': 395252, 'shirt_no': 10, 'name': 'Jamal Musiala