## Introduction
This notebook scrapes each climber's name, country, and total points per discipline from the International Federation of Sport Climbing (IFSC) rankings website. It extracts data for the boulder, lead, and combined (Boulder & Lead) disciplines, for both men and women.


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

### Scraping IFSC with Robust Points Extraction (unchanged)
def _resolve_discipline(category_name):
    """Return the ``(discipline, tab_name)`` pair implied by a category label."""
    if "combined" in category_name:
        return "combined", "Boulder & Lead"
    if "lead" in category_name:
        return "lead", "Lead"
    return "boulder", "Boulder"


def _parse_climbers(html, discipline):
    """Extract one record per ranking-table row that carries a climber name."""
    soup = BeautifulSoup(html, "html.parser")
    climbers = []
    debug_printed = False
    for row in soup.find_all("tr"):
        fname = row.find("span", class_="font-normal")
        sname = row.find("span", class_="font-bold uppercase")
        if not (fname and sname):
            # Rows without both name spans are skipped (e.g. header rows).
            continue

        full_name = f"{fname.text.strip()} {sname.text.strip()}"
        columns = row.find_all("td")
        if len(columns) >= 4:  # Expect picture, name, country, points
            # Country: third column (index 2)
            country_span = columns[2].find("span")
            country = country_span.text.strip() if country_span else "N/A"

            # Points: fourth column (index 3); take the first span containing a digit
            points_td = columns[3]
            points_spans = points_td.find_all("span")
            points = "0"
            for span in points_spans:
                text = span.text.strip()
                if text and any(c.isdigit() for c in text):
                    points = text
                    break

            # One-off diagnostic dump when a known climber unexpectedly has no points
            if points == "0" and full_name in ["jongwon CHON", "anze PEHARC"] and not debug_printed:
                print(f"Debug for {full_name}:")
                print(f"  Points TD: {points_td.prettify()[:300]}...")
                print(f"  Points Spans Found: {[s.text.strip() for s in points_spans]}")
                debug_printed = True

            # Points may contain decimals; anything unparsable falls back to 0.0
            try:
                points_value = float(points)
            except ValueError:
                points_value = 0.0
        else:
            country = "N/A"
            points_value = 0.0
            print(f"Debug: Row for {full_name} has {len(columns)} columns: {row.prettify()[:200]}...")

        climbers.append({
            "name": full_name,
            "country": country,
            f"{discipline}_points": points_value
        })
    return climbers


def scrape_ifsc_data(url, category_name):
    """Scrape climber names, countries, and points from an IFSC rankings page.

    A Chrome session is started, the cookie popup is accepted if present, the
    tab matching the discipline is clicked, and the rendered page source is
    parsed. The browser is always shut down, even if navigation fails.

    Args:
        url: Rankings page to load.
        category_name: Label such as ``"boulder_men"``. A label containing
            ``"combined"`` selects the combined discipline, one containing
            ``"lead"`` selects lead, and anything else selects boulder.

    Returns:
        A list of dicts with keys ``"name"``, ``"country"`` and
        ``"<discipline>_points"`` (a float), one per table row that has a
        climber name.
    """
    discipline, tab_name = _resolve_discipline(category_name)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    try:
        print(f"Scraping {category_name} from {url}")
        driver.get(url)

        # Handle cookie popup with a slight delay. Best-effort: a missing
        # popup must not abort the scrape, hence the broad except.
        try:
            time.sleep(1)
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')]"))
            )
            driver.execute_script("arguments[0].click();", accept_button)
            print(f"Accepted cookie popup for {category_name}")
        except Exception as e:
            print(f"No cookie popup found or error accepting it for {category_name}: {e}")

        # Click the appropriate tab. Also best-effort: on failure the page is
        # parsed as loaded from the URL.
        try:
            tab = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable(
                    (By.XPATH, f"//a[contains(@class, 'd3-ty-navigation-large') and contains(text(), '{tab_name}')]"))
            )
            tab.click()
            print(f"Clicked '{tab_name}' tab for {category_name}")

            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "font-normal"))
            )
            print(f"{tab_name} rankings data loaded for {category_name}!")
        except Exception as e:
            print(f"Could not click '{tab_name}' tab or load data for {category_name}: {e}")
            print("Attempting to proceed with URL as-is...")

        html = driver.page_source
    finally:
        # Always release the browser so a failed navigation does not leak a
        # Chrome/chromedriver process.
        driver.quit()

    climbers = _parse_climbers(html, discipline)
    print(f"Collected {len(climbers)} climbers for {category_name}")
    return climbers

#### Modified Merging and Saving Data to CSV
def _merge_gender_data(boulder_data, lead_data, combined_data, gender):
    """Combine per-discipline records of one gender into one record per name.

    Climbers are keyed by ``"name"``. The country comes from the first record
    seen for a name; a discipline with no record for that name keeps 0.0.
    """
    point_columns = ("boulder_points", "lead_points", "combined_points")
    merged = {}
    for column, records in zip(point_columns, (boulder_data, lead_data, combined_data)):
        for climber in records:
            name = climber["name"]
            if name not in merged:
                merged[name] = {
                    "name": name,
                    "country": climber["country"],
                    "gender": gender,
                    **dict.fromkeys(point_columns, 0.0),
                }
            merged[name][column] = climber[column]
    return list(merged.values())


def merge_and_save_data(men_boulder, men_lead, men_combined,
                        women_boulder, women_lead, women_combined,
                        output_dir=None):
    """Merge all climber data into one dataset with a gender column and save it.

    Args:
        men_boulder, men_lead, men_combined: Lists of dicts for male climbers,
            each with ``"name"``, ``"country"`` and the matching
            ``"boulder_points"`` / ``"lead_points"`` / ``"combined_points"`` key.
        women_boulder, women_lead, women_combined: The same for female climbers.
        output_dir: Directory that receives ``ifsc_climbers.csv``. Defaults to
            ``../data/ifsc_data``; it is created if missing.

    Returns:
        None. The merged table is written to ``<output_dir>/ifsc_climbers.csv``.
    """
    if output_dir is None:
        output_dir = os.path.join("../data", "ifsc_data")
    os.makedirs(output_dir, exist_ok=True)

    # Men and women are merged separately, so a name shared across genders
    # yields two rows.
    men_data = _merge_gender_data(men_boulder, men_lead, men_combined, "male")
    women_data = _merge_gender_data(women_boulder, women_lead, women_combined, "female")
    all_climbers = men_data + women_data

    # Fix the column order explicitly so the CSV layout is stable even when
    # there are no rows at all.
    columns = ["name", "country", "gender", "boulder_points", "lead_points", "combined_points"]
    climbers_df = pd.DataFrame(all_climbers, columns=columns)

    filepath = os.path.join(output_dir, "ifsc_climbers.csv")
    climbers_df.to_csv(filepath, index=False)

    print(f"Saved {len(all_climbers)} unique climbers (men and women) to {filepath}")

#### Running the Scraper (unchanged)
# (label, rankings URL) pairs, listed in the order they are scraped.
categories = [
    ("boulder_men", "https://www.ifsc-climbing.org/rankings/index?discipline=boulder&category=men"),
    ("boulder_women", "https://www.ifsc-climbing.org/rankings/index?discipline=boulder&category=women"),
    ("lead_men", "https://www.ifsc-climbing.org/rankings/index?discipline=lead&category=men"),
    ("lead_women", "https://www.ifsc-climbing.org/rankings/index?discipline=lead&category=women"),
    ("combined_men", "https://www.ifsc-climbing.org/rankings/index?discipline=boulder-lead&category=men"),
    ("combined_women", "https://www.ifsc-climbing.org/rankings/index?discipline=boulder-lead&category=women")
]

print("Starting scraping process...")

# Scrape every category in list order and keep the results by label, so the
# lookups below do not depend on positions within ``categories``.
scraped = {}
for label, page_url in categories:
    scraped[label] = scrape_ifsc_data(page_url, label)

men_boulder = scraped["boulder_men"]
women_boulder = scraped["boulder_women"]
men_lead = scraped["lead_men"]
women_lead = scraped["lead_women"]
men_combined = scraped["combined_men"]
women_combined = scraped["combined_women"]

merge_and_save_data(men_boulder, men_lead, men_combined, women_boulder, women_lead, women_combined)
print("Scraping and merging process completed!")

Starting scraping process...
Scraping boulder_men from https://www.ifsc-climbing.org/rankings/index?discipline=boulder&category=men
Accepted cookie popup for boulder_men
Clicked 'Boulder' tab for boulder_men
Boulder rankings data loaded for boulder_men!
Collected 214 climbers for boulder_men
Scraping boulder_women from https://www.ifsc-climbing.org/rankings/index?discipline=boulder&category=women
Accepted cookie popup for boulder_women
Clicked 'Boulder' tab for boulder_women
Boulder rankings data loaded for boulder_women!
Collected 194 climbers for boulder_women
Scraping lead_men from https://www.ifsc-climbing.org/rankings/index?discipline=lead&category=men
Accepted cookie popup for lead_men
Clicked 'Lead' tab for lead_men
Lead rankings data loaded for lead_men!
Collected 189 climbers for lead_men
Scraping lead_women from https://www.ifsc-climbing.org/rankings/index?discipline=lead&category=women
Accepted cookie popup for lead_women
Clicked 'Lead' tab for lead_women
Lead rankings data 