### Find ifsc climbers profiles on 8a.nu using automated search

In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os
import urllib.parse
from fuzzywuzzy import fuzz

# Function to capitalize first letters only
def capitalize_name(name):
    return " ".join(word.capitalize() for word in name.split())

# Function to append a row to CSV
def append_to_csv(data, output_file):
    """Append a single row to the CSV file."""
    df = pd.DataFrame([data])
    if os.path.exists(output_file):
        df.to_csv(output_file, mode='a', header=False, index=False)
    else:
        df.to_csv(output_file, mode='w', header=True, index=False)

# Optional nickname lookup table
NICKNAME_LOOKUP = {
    "Nikolay Rusev": ["Niki Rusev"],
    "Alexander Megos": ["Alex Megos"],
    # Add more known aliases here as needed
}

# Function to search for a climber on 8a.nu and collect the highest probable profile link
def search_8a_nu(climber_name, country, driver, output_file, similarity_threshold=90):
    """Search 8a.nu for a climber and save the highest probable profile link with ascents to CSV."""
    # Capitalize only first letters
    climber_name = capitalize_name(climber_name)
    encoded_name = urllib.parse.quote(climber_name)
    search_url = f"https://www.8a.nu/search/users?query={encoded_name}"
    print(f"Searching for {climber_name} from {country} on 8a.nu...")

    try:
        # Navigate directly to the search URL
        driver.get(search_url)

        # Wait for search results table rows to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "tr"))
        )

        # Parse the search results page
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find climber profile links with ascent counts
        candidates = []
        result_rows = soup.find_all("tr")
        nicknames = NICKNAME_LOOKUP.get(climber_name, [])  # Get possible nicknames
        for row in result_rows:
            name_link = row.find("a", href=lambda href: href and "/user/" in href)
            if name_link:
                link_text = name_link.text.strip()
                link_href = name_link["href"]
                # Calculate similarity with IFSC name or check nicknames
                similarity = fuzz.partial_ratio(climber_name.lower(), link_text.lower())
                is_nickname = any(nick.lower() in link_text.lower() for nick in nicknames)
                if similarity >= similarity_threshold or is_nickname:
                    # Extract country from the row
                    country_td = row.find("td", class_="col-flag")
                    found_country = "N/A"
                    country_code = "N/A"
                    if country_td:
                        country_text = country_td.text.strip()
                        found_country = country_text if country_text else "N/A"
                        country_span = country_td.find("span", class_=lambda x: x and x.startswith("f-"))
                        if country_span and country_span.get("class"):
                            try:
                                country_code = country_span["class"][0].split("-")[1].upper()
                            except (IndexError, AttributeError):
                                country_code = "N/A"

                    # Extract ascent count
                    ascent_td = row.find("td", class_="col-ascents")
                    ascent_count = 0
                    if ascent_td:
                        ascent_text = ascent_td.text.strip().replace(" ", "")
                        try:
                            ascent_count = int(ascent_text) if ascent_text else 0
                        except ValueError:
                            ascent_count = 0

                    # Only include profiles with ascents > 0
                    full_url = f"https://www.8a.nu{link_href}"
                    if ascent_count > 0:
                        candidates.append({
                            "url": full_url,
                            "name": link_text,
                            "similarity": similarity if not is_nickname else 100,  # Nicknames get max score
                            "country": found_country,
                            "country_code": country_code,
                            "ascents": ascent_count,
                            "verified": country == found_country or country == country_code
                        })
                        print(f"Found candidate profile for {climber_name}: {full_url} (Name: {link_text}, Similarity: {similarity}%, Country: {found_country}, Ascents: {ascent_count})")

        # Select the highest probable profile
        profile_link = None
        if candidates:
            best_candidate = max(candidates, key=lambda x: x["similarity"])  # Highest similarity
            profile_link = best_candidate["url"]
            if best_candidate["verified"]:
                print(f"Selected verified profile: {profile_link} (Similarity: {best_candidate['similarity']}%)")
            else:
                print(f"Selected potential profile: {profile_link} (Similarity: {best_candidate['similarity']}%, expected country: {country})")

        # Only save to CSV if a profile with ascents is found
        if profile_link:
            data = {"name": climber_name, "possible_profile_link_1": profile_link}
            print(f"Selected 1 profile with ascents for {climber_name}")
            append_to_csv(data, output_file)
            return data
        else:
            print(f"No profile with ascents found for {climber_name}")
            return None

    except Exception as e:
        print(f"Error searching for {climber_name}: {e}")
        return None

# Main script to process IFSC climbers and save to CSV incrementally
def process_ifsc_climbers(ifsc_dir="../data/ifsc_data", output_dir="../data/8anu_data"):
    """Process IFSC climbers and save their highest probable 8a.nu profile link with ascents to CSV."""
    men_df = pd.read_csv(f"{ifsc_dir}/men_climbers.csv")
    women_df = pd.read_csv(f"{ifsc_dir}/women_climbers.csv")
    climbers_df = pd.concat([men_df, women_df], ignore_index=True)

    output_file = f"{output_dir}/8a_nu_profiles.csv"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize a single WebDriver instance
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    total_climbers = len(climbers_df)
    processed_with_profiles = 0
    try:
        for index, row in climbers_df.iterrows():
            climber_name = row["name"]
            country = row["country"]
            result = search_8a_nu(climber_name, country, driver, output_file)
            if result:
                processed_with_profiles += 1
            print(f"Processed {index + 1}/{total_climbers} climbers ({processed_with_profiles} with profiles and ascents)")
    finally:
        driver.quit()  # Ensure driver closes even if an error occurs

# Run the script
if __name__ == "__main__":
    print("Starting 8a.nu scraping process...")
    process_ifsc_climbers()
    print("8a.nu scraping process completed!")

Starting 8a.nu scraping process...
Searching for Sorato Anraku from JPN on 8a.nu...
No profile with ascents found for Sorato Anraku
Processed 1/558 climbers (0 with profiles and ascents)
Searching for Dohyun Lee from KOR on 8a.nu...
No profile with ascents found for Dohyun Lee
Processed 2/558 climbers (0 with profiles and ascents)
Searching for Meichi Narasaki from JPN on 8a.nu...
No profile with ascents found for Meichi Narasaki
Processed 3/558 climbers (0 with profiles and ascents)
Searching for Tomoa Narasaki from JPN on 8a.nu...
No profile with ascents found for Tomoa Narasaki
Processed 4/558 climbers (0 with profiles and ascents)
Searching for Sohta Amagasa from JPN on 8a.nu...
No profile with ascents found for Sohta Amagasa
Processed 5/558 climbers (0 with profiles and ascents)
Searching for Toby Roberts from GBR on 8a.nu...
Found candidate profile for Toby Roberts: https://www.8a.nu/user/toby-roberts-e1619 (Name: Toby  Roberts, Similarity: 92%, Country: GBR, Ascents: 4)
Selected

***

### Scrape max grade, avg grade data from 8a.nu

In [7]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import re
import time
import os
import random
from selenium.common.exceptions import TimeoutException, WebDriverException

# Function to convert climbing grade to numerical value for comparison and ranking
def grade_to_number(grade):
    grade = grade.strip()
    # Handle sport climbing grades (e.g., 8a+, 9a)
    match = re.match(r"(\d+)([abc])(\+)?", grade)
    if match:
        number, letter, plus = match.groups()
        number = int(number)

        # Base grade based on letter
        if letter == "a":
            base = number
        elif letter == "b":
            base = number + 0.25
        elif letter == "c":
            base = number + 0.5

        # Add 0.25 if there's a "+"
        if plus:
            base += 0.25
        return base

    return 0  # Invalid grade

# Function to convert climbing grade to linear scale (7a=1, 7a+=2, etc.)
def grade_to_linear_scale(grade):
    grade = grade.strip()
    match = re.match(r"(\d+)([abc])(\+)?", grade)
    if not match:
        return 0

    number, letter, plus = match.groups()
    number = int(number)

    # Base value starts at 7a = 1
    base_value = (number - 7) * 6  # Each number adds 6 values (a, a+, b, b+, c, c+)

    # Add for the letter
    if letter == "a":
        letter_value = 0
    elif letter == "b":
        letter_value = 2
    elif letter == "c":
        letter_value = 4

    # Add for the plus
    plus_value = 1 if plus else 0

    # Final value (7a = 1, 7a+ = 2, 7b = 3, etc.)
    linear_value = base_value + letter_value + plus_value + 1

    return linear_value

# Function to check the total number of routes
def get_total_ascents(driver, url, retries=2):
    for attempt in range(retries):
        try:
            driver.get(url)
            # Wait for the second-line div to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "second-line"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")

            second_line = soup.find("div", {"class": "second-line"})
            if not second_line:
                print(f"Could not find second-line div to check total ascents for {url}")
                return 0

            stats = second_line.find("div", {"class": "statistics"})
            if not stats:
                print(f"Could not find statistics div to check total ascents for {url}")
                return 0

            routes = 0
            stat_cells = stats.find_all("div", {"class": "statistics-cell"})
            for cell in stat_cells:
                title = cell.find("div", {"class": "statistics-title"}).text.strip()
                value = cell.find("div", {"class": "statistics-value"}).text.strip().replace(" ", "")
                if title == "Routes":
                    routes = int(value)

            return routes

        except TimeoutException as e:
            print(f"Timeout on attempt {attempt + 1} for {url}: {e}")
            if attempt == retries - 1:
                return 0
            time.sleep(random.uniform(2, 5))
        except Exception as e:
            print(f"Error on attempt {attempt + 1} for {url}: {e}")
            if attempt == retries - 1:
                return 0
            time.sleep(random.uniform(2, 5))


# Function to get sport climbing data: highest grade, 8c+ count, and average of first 5 unique grade rows
def get_sport_climbing_data(driver, url, retries=2):
    for attempt in range(retries):
        try:
            driver.get(url)
            # Wait for the page to load key elements
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "statistics-header"))
            )
            time.sleep(3)  # Brief pause for JavaScript to settle

            # Target the correct dropdown more precisely
            try:
                # Scope to statistics-header to avoid other dropdowns
                stats_header = driver.find_element(By.CLASS_NAME, "statistics-header")
                # Use a general selector for the dropdown input
                dropdown_input = stats_header.find_element(By.CSS_SELECTOR, "input[type='text']")
                current_value = dropdown_input.get_attribute("placeholder") or dropdown_input.get_attribute("value")
                print(f"Current dropdown value: {current_value}")

                # If not on "All Time," adjust it
                if "All Time" not in current_value:
                    ActionChains(driver).move_to_element(dropdown_input).click().perform()
                    print("Clicked dropdown input")
                    time.sleep(1)  # Wait for options to appear

                    # Select "All Time" from options
                    all_time_option = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'All Time')]"))
                    )
                    ActionChains(driver).move_to_element(all_time_option).click().perform()
                    print("Selected 'All Time' option")
                    time.sleep(3)  # Wait for page update
                else:
                    print("Already on 'All Time', no change needed")

            except Exception as e:
                print(f"Error interacting with dropdown: {e}")
                print("Proceeding with current selection")

            # Extract data
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "statistics-body"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")
            stats_lines = soup.find_all("div", {"class": "statistics-line stats"})

            if not stats_lines:
                print(f"No statistics lines found for {url}")
                return None, 0, 0

            print(f"Found {len(stats_lines)} statistics lines for {url}")

            # Highest grade (first row)
            highest_grade = None
            if stats_lines:
                grade_elem = stats_lines[0].find("span", {"class": "difficulty"})
                highest_grade = grade_elem.text.strip() if grade_elem else None

            # Count 8c+ or above ascents
            count_8c_plus = 0
            if highest_grade:
                highest_numerical = grade_to_number(highest_grade)
                if highest_numerical >= 8.75:  # 8c+ threshold
                    for line in stats_lines:
                        grade_elem = line.find("span", {"class": "difficulty"})
                        if grade_elem and grade_to_number(grade_elem.text.strip()) >= 8.75:
                            number_grid = line.find("div", {"class": "number-grid"})
                            if number_grid:
                                total = int(number_grid.find_all("div", {"class": "number-cell"})[-1].text.strip())
                                count_8c_plus += total

            # Average grade of first 5 unique rows
            max_rows = min(5, len(stats_lines))
            weighted_sum, total_ascents = 0, 0
            for i in range(max_rows):
                line = stats_lines[i]
                grade_elem = line.find("span", {"class": "difficulty"})
                if grade_elem:
                    grade = grade_elem.text.strip()
                    linear_value = grade_to_linear_scale(grade)
                    number_grid = line.find("div", {"class": "number-grid"})
                    if number_grid:
                        total_ascents_row = int(number_grid.find_all("div", {"class": "number-cell"})[-1].text.strip())
                        weighted_sum += linear_value * total_ascents_row
                        total_ascents += total_ascents_row

            avg_grade_linear = weighted_sum / total_ascents if total_ascents > 0 else 0
            return highest_grade, count_8c_plus, round(avg_grade_linear, 2)

        except TimeoutException as e:
            print(f"Timeout on attempt {attempt + 1} for {url}: {e}")
            if attempt == retries - 1:
                return None, 0, 0
            time.sleep(3)
        except WebDriverException as e:
            print(f"WebDriver error on attempt {attempt + 1} for {url}: {e}")
            if attempt == retries - 1:
                return None, 0, 0
            time.sleep(3)

    return None, 0, 0

# Main script
# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
chrome_options.set_capability("goog:loggingPrefs", {"browser": "ALL"})
try:
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
except Exception as e:
    print(f"Failed to initialize WebDriver: {e}")
    exit(1)
csv_path = "../data/8anu_data/8a_nu_profiles.csv"
try:
    df = pd.read_csv(csv_path, header=None, names=["name", "url"])
    print(f"Loaded CSV with {len(df)} climbers.")
except FileNotFoundError:
    print(f"CSV file not found at {csv_path}.")
    driver.quit()
    exit(1)
except Exception as e:
    print(f"Error loading CSV from {csv_path}: {e}")
    driver.quit()
    exit(1)
filtered_data = []
for index, row in df.iterrows():
    name = row["name"]
    base_url = row["url"]
    print(f"Processing {name}...")
    try:
        sportclimbing_url = f"{base_url}/sportclimbing"
        highest_grade, count_8c_plus, avg_grade_first5 = get_sport_climbing_data(driver, sportclimbing_url)
        if highest_grade is None:
            print(f"Could not extract sport climbing data for {name}, skipping.")
            continue
        print(f"{name} Highest Grade: {highest_grade}, 8c+ Ascents: {count_8c_plus}, Avg Grade (First 5): {avg_grade_first5}")
        filtered_data.append({
            "name": name,
            "url": base_url,
            "highest_grade": highest_grade,
            "count_8c_plus": count_8c_plus,
            "avg_grade_first5": avg_grade_first5
        })
    except Exception as e:
        print(f"Error processing {name}: {e}")
        continue
    time.sleep(random.uniform(3, 6))
try:
    driver.quit()
except Exception as e:
    print(f"Error closing WebDriver: {e}")
print("Climbers included in the output CSV:")
for climber in filtered_data:
    print(f"- {climber['name']}: Highest Grade: {climber['highest_grade']}, 8c+ Ascents: {climber['count_8c_plus']}, Avg First 5: {climber['avg_grade_first5']}")
if filtered_data:
    filtered_df = pd.DataFrame(filtered_data)
else:
    filtered_df = pd.DataFrame(columns=["name", "url", "highest_grade", "count_8c_plus", "avg_grade_first5"])
output_dir = "../data/8anu_data"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "sport_climbing_data.csv")
filtered_df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

Loaded CSV with 200 climbers.
Processing Toby Roberts...
Current dropdown value: All Time
Already on 'All Time', no change needed
No statistics lines found for https://www.8a.nu/user/toby-roberts-e1619/sportclimbing
Could not extract sport climbing data for Toby Roberts, skipping.
Processing Anze Peharc...
Current dropdown value: All Time
Already on 'All Time', no change needed
Found 3 statistics lines for https://www.8a.nu/user/ane-peharc/sportclimbing
Anze Peharc Highest Grade: 8b, 8c+ Ascents: 0, Avg Grade (First 5): 7.78
Processing Hannes Van Duysen...
Current dropdown value: All Time
Already on 'All Time', no change needed
Found 1 statistics lines for https://www.8a.nu/user/hannes-van-duysen/sportclimbing
Hannes Van Duysen Highest Grade: 8a, 8c+ Ascents: 0, Avg Grade (First 5): 7.0
Processing Jakob Schubert...
Current dropdown value: All Time
Already on 'All Time', no change needed
Found 8 statistics lines for https://www.8a.nu/user/jakob-schubert/sportclimbing
Jakob Schubert High

KeyboardInterrupt: 