In [1]:
import json
import requests
import numpy as np
import pandas as pd

from slugify import slugify
from pprint import pprint
from bs4 import BeautifulSoup

In [2]:
def get_country_athletes_total() -> None: ...
#     """Get the number of athletes each country sent to the 2024 Olympics."""
    # https://olympics.com/en/paris-2024/athletes/all-disc/argentina
    # <div class="mirs-pagination-right"><span class="pe-3">143 Elements</span>
    # 1. get list of countries.
    # 2. query base url for each and extract number from "143 Elements"

In [3]:

# headers = {
#     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
# }
# url_medallists = "https://olympics.com/en/paris-2024/medals/medallists"
# r = requests.get(url_medallists, headers=headers)

In [21]:
"""Calculate win percentage `win_pct`

# of medals won
divided by
# of events competed in
"""

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
}


def is_team_event(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["is_team_event"] = np.where(df["event"].str.contains("TEAM"), True, False)
    return df


def scraped_medal_count_data(
    medal_url: str = "https://olympics.com/en/paris-2024/medals/medallists"
) -> dict:
    
    r = requests.get(medal_url, headers=HEADERS)

    soup = BeautifulSoup(r.content, 'html.parser')
    script_tag = soup.find('script', id='__NEXT_DATA__')
    data: dict = json.loads(script_tag.string)
    return data


def raw_medal_tbl(scraped_medal_count_data: dict) -> pd.DataFrame:
    medal_tbl = pd.json_normalize(
        scraped_medal_count_data["props"]["pageProps"]['initialMedallist']["athletes"],
        record_path='medals',
        meta=[
            'organisation', 
            'organisationName', 
            'code', 
            'fullName', 
            'initialName', 
            'tvName', 
            'tvInitialName', 
            'gender', 
            'medalsGold', 
            'medalsSilver', 
            'medalsBronze', 
            'medalsTotal'
        ]
    )

    return medal_tbl


def normalized_medal_tbl(
    raw_medal_tbl: pd.DataFrame
) -> pd.DataFrame:
    medal_count = (
        raw_medal_tbl.pipe(is_team_event)
        .groupby([
            "organisation", 
            "disciplineName",
            # "event", 
            "eventName", 
            "medalType"
        ])
        .agg(n_medalled_athletes=("fullName", "nunique"))
        .reset_index()
        .assign(
            norm_medals_total=lambda _df: np.where(
                _df["n_medalled_athletes"] != 1, 1, _df["n_medalled_athletes"]
            )
        )
        .assign(
            total_country_medals=lambda _df: _df.groupby("organisation")[
                "norm_medals_total"
            ].transform("sum")
        )
        .assign(
            total_country_medalled_athletes=lambda _df: _df.groupby("organisation")[
                "n_medalled_athletes"
            ].transform("sum")
        )
        # check total medals for each country
        # .loc[:, ["organisation", "total_country_medals"]].drop_duplicates()
        # .sort_values("total_country_medals", ascending=False)
        # end check
        # TODO - calculate this properly
        # .assign(ratio=lambda _df: _df["n_medalled_athletes"] / _df["total_country_medals"])
    )

    return medal_count

In [5]:
url: str = "https://olympics.com/en/paris-2024/athletes/sitemap-en.xml"
 
r=requests.get("https://olympics.com/en/paris-2024/athletes", headers=HEADERS)

In [6]:
url: str = "https://olympics.com/en/paris-2024/athletes/sitemap-en.xml"


In [9]:
# get the country endpoints to extract total athletes
from hamilton.htypes import Parallelizable, Collect
import requests
from bs4 import BeautifulSoup


def sitemap_urls(
    # url: str = "https://olympics.com/en/paris-2024/athletes/sitemap-en.xml"
    url: str = "https://olympics.com/OG2024/assets/sitemap-en.xml"

) -> list:
    try:
        # Send a GET request to the sitemap URL
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)

        # Parse the sitemap XML content
        sitemap = response.content
        soup = BeautifulSoup(sitemap, 'xml')  # Use 'xml' parser for XML content

        # Find all <url> tags in the sitemap
        urls = soup.find_all('url')

        # Extract the loc (location) for each URL
        sitemap_urls = [url.find('loc').text for url in urls]

        return sitemap_urls

    except requests.exceptions.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return []


def all_country_endpoint_slugs(sitemap_urls: list) -> list:
    """Returns the country slug for their respective endpoints."""
    return sorted([
        url.split("/")[-1] 
        for url in sitemap_urls 
        if "noc-entries" in url
    ])


def generated_country_athlete_url(all_country_endpoint_slugs: list) -> Parallelizable[str]:
    base_url = "https://olympics.com/en/paris-2024/athletes"
    for country in all_country_endpoint_slugs:
        print(f"{base_url}/all-disc/{country}")
        yield f"{base_url}/all-disc/{country}" 
    


import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options


def scraped_country_athlete_count(generated_country_athlete_url: str) -> pd.DataFrame:
    """Get the number of athletes each country sent to the 2024 Olympics."""
    # https://olympics.com/en/paris-2024/athletes/all-disc/argentina
    # <div class="mirs-pagination-right"><span class="pe-3">143 Elements</span>
    # 1. get list of countries.
    # 2. query base url for each and extract number from "143 Elements"
    
    country_slug = generated_country_athlete_url.split("/")[-1]

    # Set up Chrome options
    chrome_options = Options()
    # does not seem to work in headless mode
    # chrome_options.add_argument("--headless")  # Enable headless mode
    # chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    # chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems

    # Set up the Chrome WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        # Navigate to the website
        driver.get(generated_country_athlete_url)

        # Wait for javascript to render page
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.mirs-pagination-right span.pe-3'))
        ) # represents number of athletes for country
    
        m = re.search("(?P<n_athletes>\d+) Elements", [e.text for e in elements][0])
        n_athletes = int(m.group("n_athletes")) 

    finally:
        # Close the WebDriver
        driver.quit()

    return pd.DataFrame(dict(country=[country_slug], n_athletes=[n_athletes]))


def all_country_athlete_counts(scraped_country_athlete_count: Collect[pd.DataFrame]) -> pd.DataFrame:
    return pd.concat(scraped_country_athlete_count, ignore_index=True)

In [10]:
if __name__ == "__main__":
    
    import __main__

    from hamilton import driver, base
    from hamilton.execution import executors

    dr = (
        driver.Builder()
        .with_modules(__main__)
        .with_adapter(base.PandasDataFrameResult())
        .enable_dynamic_execution(allow_experimental_mode=True)
        .with_local_executor(executors.SynchronousLocalTaskExecutor())
        # .with_remote_executor(executors.MultiProcessingExecutor(max_tasks=5))
        .build()
    )

    # dr.visualize_execution(["all_country_athlete_counts"], "dag.png")
    df = dr.execute(["all_country_athlete_counts"])

https://olympics.com/en/paris-2024/athletes/all-disc/afghanistan
https://olympics.com/en/paris-2024/athletes/all-disc/ain
https://olympics.com/en/paris-2024/athletes/all-disc/albania
https://olympics.com/en/paris-2024/athletes/all-disc/algeria
https://olympics.com/en/paris-2024/athletes/all-disc/american-samoa
https://olympics.com/en/paris-2024/athletes/all-disc/andorra
https://olympics.com/en/paris-2024/athletes/all-disc/angola
https://olympics.com/en/paris-2024/athletes/all-disc/antigua-and-barbuda
https://olympics.com/en/paris-2024/athletes/all-disc/argentina
https://olympics.com/en/paris-2024/athletes/all-disc/armenia
https://olympics.com/en/paris-2024/athletes/all-disc/aruba
https://olympics.com/en/paris-2024/athletes/all-disc/australia
https://olympics.com/en/paris-2024/athletes/all-disc/austria
https://olympics.com/en/paris-2024/athletes/all-disc/azerbaijan
https://olympics.com/en/paris-2024/athletes/all-disc/bahamas
https://olympics.com/en/paris-2024/athletes/all-disc/bahrain
h

In [22]:
if __name__ == "__main__":
    
    import __main__

    from hamilton import driver, base
    from hamilton.execution import executors

    dr = (
        driver.Builder()
        .with_modules(__main__)
        .with_adapter(base.PandasDataFrameResult())
        .enable_dynamic_execution(allow_experimental_mode=True)
        .with_local_executor(executors.SynchronousLocalTaskExecutor())
        # .with_remote_executor(executors.MultiProcessingExecutor(max_tasks=5))
        .build()
    )


# df.to_csv("country-athlete-count.csv")

In [23]:
raw_medals = scraped_medal_count_data()
medal_count = dr.execute(["normalized_medal_tbl"])

In [48]:
nocs = pd.DataFrame({(noc["id"], noc["longName"]) for noc in raw_medals["props"]["pageProps"]["nocList"]}, columns=["countryId", "countryName"])
nocs.query("countryId.str.startswith('C')")

Unnamed: 0,countryId,countryName
0,CHA,Chad
20,CAM,Cambodia
38,CRC,Costa Rica
57,CAN,Canada
60,CHN,People's Republic of China
64,CHI,Chile
66,CIV,Côte d'Ivoire
86,COK,Cook Islands
91,CRO,Croatia
99,CAY,Cayman Islands


In [34]:
medal_count.sort_values("total_country_medals").head(50)

Unnamed: 0,organisation,disciplineName,eventName,medalType,n_medalled_athletes,norm_medals_total,total_country_medals,total_country_medalled_athletes
129,COL,Artistic Gymnastics,Men's Horizontal Bar,ME_SILVER,1,1,1,1
400,MGL,Judo,Women -48 kg,ME_SILVER,1,1,1,1
130,CPV,Boxing,Men's 51kg,ME_BRONZE,1,1,1,1
390,LCA,Athletics,Women's 100m,ME_GOLD,1,1,1,1
433,POR,Judo,Women -78 kg,ME_BRONZE,1,1,1,1
285,INA,Badminton,Women's Singles,ME_BRONZE,1,1,1,1
75,CHI,Shooting,Skeet Women,ME_GOLD,1,1,1,1
157,FIJ,Rugby Sevens,Men,ME_SILVER,14,1,1,14
39,AUT,Judo,Women -70 kg,ME_BRONZE,1,1,1,1
457,SVK,Canoe Slalom,Men's Canoe Single,ME_BRONZE,1,1,1,1


In [32]:
medal_count.query("organisation=='AUS'")

Unnamed: 0,organisation,disciplineName,eventName,medalType,n_medalled_athletes,norm_medals_total,total_country_medals,total_country_medalled_athletes
7,AUS,Athletics,Women's 20km Race Walk,ME_BRONZE,1,1,32,67
8,AUS,Athletics,Women's High Jump,ME_BRONZE,1,1,32,67
9,AUS,Athletics,Women's High Jump,ME_SILVER,1,1,32,67
10,AUS,Canoe Slalom,Women's Canoe Single,ME_GOLD,1,1,32,67
11,AUS,Canoe Slalom,Women's Kayak Cross,ME_GOLD,1,1,32,67
12,AUS,Canoe Slalom,Women's Kayak Single,ME_GOLD,1,1,32,67
13,AUS,Cycling BMX Freestyle,Women's Park,ME_BRONZE,1,1,32,67
14,AUS,Cycling BMX Racing,Women,ME_GOLD,1,1,32,67
15,AUS,Cycling Road,Women's Individual Time Trial,ME_GOLD,1,1,32,67
16,AUS,Equestrian,Eventing Individual,ME_SILVER,1,1,32,67


In [122]:
import numpy as np


def is_team_event(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["is_team_event"] = np.where(df["event"].str.contains("TEAM"), True, False)
    return df


(
    df.pipe(is_team_event)
    .groupby([
        "organisation", 
        "disciplineName",
        # "event", 
        "eventName", 
        "medalType"
    ])
    .agg(n_medalled_athletes=("fullName", "nunique"))
    .reset_index()
    .assign(
        norm_medals_total=lambda _df: np.where(
            _df["n_medalled_athletes"] != 1, 1, _df["n_medalled_athletes"]
        )
    )
    .assign(
        total_country_medals=lambda _df: _df.groupby("organisation")[
            "norm_medals_total"
        ].transform("sum")
    )
    .assign(
        total_country_medalled_athletes=lambda _df: _df.groupby("organisation")[
            "n_medalled_athletes"
        ].transform("sum")
    )
    # check total medals for each country
    # .loc[:, ["organisation", "total_country_medals"]].drop_duplicates()
    # .sort_values("total_country_medals", ascending=False)
    # end check

    # .assign(ratio=lambda _df: _df["n_medalled_athletes"] / _df["total_country_medals"])
)

Unnamed: 0,organisation,disciplineName,eventName,medalType,n_medalled_athletes,norm_medals_total,total_country_medals,total_country_medalled_athletes
0,AIN,Trampoline Gymnastics,Men,ME_GOLD,1,1,2,2
1,AIN,Trampoline Gymnastics,Women,ME_SILVER,1,1,2,2
2,ARG,Cycling BMX Freestyle,Men's Park,ME_GOLD,1,1,1,1
3,AUS,Athletics,Women's 20km Race Walk,ME_BRONZE,1,1,21,41
4,AUS,Canoe Slalom,Women's Canoe Single,ME_GOLD,1,1,21,41
...,...,...,...,...,...,...,...,...
336,USA,Swimming,Women's 400m Individual Medley,ME_BRONZE,1,1,42,94
337,USA,Swimming,Women's 400m Individual Medley,ME_SILVER,1,1,42,94
338,UZB,Judo,Men +100 kg,ME_BRONZE,1,1,3,3
339,UZB,Judo,Men -100 kg,ME_BRONZE,1,1,3,3


In [None]:
df.groupby(["organisation","disciplineName", "eventName"])["medalsTotal"].sum().unstack("disciplineName")

Unnamed: 0_level_0,disciplineName,Archery,Artistic Gymnastics,Canoe Slalom,Cycling BMX Freestyle,Cycling Mountain Bike,Cycling Road,Diving,Equestrian,Fencing,Rowing,Rugby Sevens,Sailing,Shooting,Skateboarding,Swimming
organisation,eventName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
USA,50m Rifle 3 Positions Women,,,,,,,,,,,,,1.0,,
USA,Jumping Team,,,,,,,,3.0,,,,,,,
USA,Men's 100m Backstroke,,,,,,,,,,,,,,,1.0
USA,Men's 100m Breaststroke,,,,,,,,,,,,,,,1.0
USA,Men's 200m Freestyle,,,,,,,,,,,,,,,2.0
USA,Men's 4 x 100m Freestyle Relay,,,,,,,,,,,,,,,7.0
USA,Men's 4 x 200m Freestyle Relay,,,,,,,,,,,,,,,10.0
USA,Men's 400m Individual Medley,,,,,,,,,,,,,,,2.0
USA,Men's 800m Freestyle,,,,,,,,,,,,,,,1.0
USA,Men's Foil Individual,,,,,,,,,1.0,,,,,,


In [51]:
df.query("disciplineName=='Rugby Sevens'").head()

Unnamed: 0,medalType,event,eventName,category,date,disciplineCode,disciplineName,official,extraData.detailUrl,organisation,...,code,fullName,initialName,tvName,tvInitialName,gender,medalsGold,medalsSilver,medalsBronze,medalsTotal
68,ME_BRONZE,RU7WTEAM7-------------------------,Women,W,2024-07-30,RU7,Rugby Sevens,True,/en/paris-2024/medals-and-ranking/rugby-sevens...,USA,...,1950485,CANETT Kayla,CANETT K,Kayla CANETT,K. CANETT,F,0,0,1,1
69,ME_BRONZE,RU7WTEAM7-------------------------,Women,W,2024-07-30,RU7,Rugby Sevens,True,/en/paris-2024/medals-and-ranking/rugby-sevens...,USA,...,1950487,DOYLE Lauren,DOYLE L,Lauren DOYLE,L. DOYLE,F,0,0,1,1
78,ME_BRONZE,RU7WTEAM7-------------------------,Women,W,2024-07-30,RU7,Rugby Sevens,True,/en/paris-2024/medals-and-ranking/rugby-sevens...,USA,...,1950494,KELTER Alev,KELTER A,Alev KELTER,A. KELTER,F,0,0,1,1
79,ME_BRONZE,RU7WTEAM7-------------------------,Women,W,2024-07-30,RU7,Rugby Sevens,True,/en/paris-2024/medals-and-ranking/rugby-sevens...,USA,...,1950495,KIRSHE Kristi,KIRSHE K,Kristi KIRSHE,K. KIRSHE,F,0,0,1,1
81,ME_BRONZE,RU7WTEAM7-------------------------,Women,W,2024-07-30,RU7,Rugby Sevens,True,/en/paris-2024/medals-and-ranking/rugby-sevens...,USA,...,1950496,LEVY Sarah,LEVY S,Sarah LEVY,S. LEVY,F,0,0,1,1


In [15]:
with open("arg-athletes.html", "rb") as f:
    df = pd.read_html(f.read())[0]
df

Unnamed: 0,Name Move up,Team/NOC Move up,Discipline Move up
0,ALBERTARRIO Agustina ALBERTARRIO A,Argentina,Hockey Hockey
1,ALMADA Thiago ALMADA T,Argentina,Football Football
2,ALONSO Agostina ALONSO A,Argentina,Hockey Hockey
3,ALVAREZ Julian ALVAREZ J,Argentina,Football Football
4,ALVAREZ Santiago ALVAREZ S,Argentina,Rugby Sevens Rugby Sevens
...,...,...,...
95,MOURINO Gaston MOURINO G,Argentina,Handball Handball
96,MOYANO Andres MOYANO A,Argentina,Handball Handball
97,NAVONE Mariano NAVONE M,Argentina,Tennis Tennis
98,OCAMPO Daiana OCAMPO D,Argentina,Athletics Athletics


In [19]:
sitemap = "https://olympics.com/OG2024/assets/sitemap-en.xml"
r = requests.get(sitemap, headers=HEADERS)

In [23]:
r

<Response [200]>

In [20]:
import requests
from bs4 import BeautifulSoup

def sitemap_urls(
    # url: str = "https://olympics.com/en/paris-2024/athletes/sitemap-en.xml"
    url: str = "https://olympics.com/OG2024/assets/sitemap-en.xml"
) -> list:
    try:
        # Send a GET request to the sitemap URL
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)

        # Parse the sitemap XML content
        sitemap = response.content
        soup = BeautifulSoup(sitemap, 'xml')  # Use 'xml' parser for XML content

        # Find all <url> tags in the sitemap
        urls = soup.find_all('url')

        # Extract the loc (location) for each URL
        sitemap_urls = [url.find('loc').text for url in urls]

        return sitemap_urls

    except requests.exceptions.RequestException as e:
        print(f"Error fetching sitemap: {e}")
        return []


def get_all_countries_endpoint_slugs(sitemap_urls: list) -> list:
    """Returns the country slug for their respective endpoints."""
    return sorted([
        url.split("/")[-1] 
        for url in sitemap_urls 
        if "noc-entries" in url
    ])


# URL of the sitemap to fetch
sitemap_url = 'https://olympics.com/en/paris-2024/athletes/sitemap-en.xml'
sitemap_url = "https://olympics.com/OG2024/assets/sitemap-en.xml"
urls = sitemap_urls(sitemap_url)

In [22]:
len(get_all_countries_endpoint_slugs(urls))

206

In [14]:
# https://olympics.com/en/paris-2024/profile/spain

sorted([url.split("/")[-1] for url in urls if "noc-entries" in url])


['afghanistan',
 'ain',
 'albania',
 'algeria',
 'american-samoa',
 'andorra',
 'angola',
 'antigua-and-barbuda',
 'argentina',
 'armenia',
 'aruba',
 'australia',
 'austria',
 'azerbaijan',
 'bahamas',
 'bahrain',
 'bangladesh',
 'barbados',
 'belgium',
 'belize',
 'benin',
 'bermuda',
 'bhutan',
 'bolivia',
 'bosnia-and-herzegovina',
 'botswana',
 'brazil',
 'brunei-darussalam',
 'bulgaria',
 'burkina-faso',
 'burundi',
 'cabo-verde',
 'cambodia',
 'cameroon',
 'canada',
 'cayman-islands',
 'centr-afric-rep',
 'chad',
 'chile',
 'china',
 'chinese-taipei',
 'colombia',
 'comoros',
 'congo',
 'cook-islands',
 'costa-rica',
 'cote-d-ivoire',
 'croatia',
 'cuba',
 'cyprus',
 'czechia',
 'denmark',
 'djibouti',
 'dominica',
 'dominican-republic',
 'dpr-korea',
 'dr-congo',
 'ecuador',
 'egypt',
 'el-salvador',
 'eor',
 'equatorial-guinea',
 'eritrea',
 'estonia',
 'eswatini',
 'ethiopia',
 'fiji',
 'finland',
 'france',
 'gabon',
 'gambia',
 'georgia',
 'germany',
 'ghana',
 'great-brita