### Scrape max grade, avg grade data from 8a.nu profiles

In [33]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import re
import time
import os
import random
from selenium.common.exceptions import TimeoutException, WebDriverException

# Function to convert climbing grade to numerical value for comparison and ranking
def grade_to_number(grade):
    """Convert a sport climbing grade such as ``"8a+"`` to a sortable number.

    The integer part of the result is the grade's number and the fractional
    part encodes the letter and the optional ``+``:

        a -> .0, a+ -> .125, b -> .25, b+ -> .375, c -> .5, c+ -> .75

    The mapping is strictly increasing, so two different grades never
    compare equal (a flat "+0.25 for a plus" scheme would make 8a+ == 8b and
    8b+ == 8c). The values for a, b, c and c+ are deliberately kept at
    .0/.25/.5/.75 so that thresholds expressed on that scale, e.g. ``8.75``
    for 8c+, keep their meaning.

    Args:
        grade: Grade text; surrounding whitespace is ignored. Only the start
            of the string has to look like a grade.

    Returns:
        The numeric value of the grade, or ``0`` when ``grade`` is not a
        string or does not start with ``<digits><a|b|c>[+]``.
    """
    if not isinstance(grade, str):
        return 0  # e.g. None / NaN coming from missing data

    # Handle sport climbing grades (e.g., 8a+, 9a)
    match = re.match(r"(\d+)([abc])(\+)?", grade.strip())
    if not match:
        return 0  # Invalid grade

    number, letter, plus = match.groups()

    # Offset of every letter / plus combination within one number grade.
    offsets = {
        "a": 0,
        "a+": 0.125,
        "b": 0.25,
        "b+": 0.375,
        "c": 0.5,
        "c+": 0.75,
    }
    return int(number) + offsets[letter + (plus or "")]

# Function to convert climbing grade to linear scale (7a=1, 7a+=2, etc.)
def grade_to_linear_scale(grade):
    """Map a sport grade onto an evenly spaced integer ladder anchored at 7a = 1.

    Every step up the ladder (7a, 7a+, 7b, 7b+, 7c, 7c+, 8a, ...) adds one,
    so each full number grade spans six steps. Grades below 7a yield values
    of zero or less. Text that does not start with ``<digits><a|b|c>[+]``
    yields ``0``.
    """
    parsed = re.match(r"(\d+)([abc])(\+)?", grade.strip())
    if parsed is None:
        return 0

    digits, letter, plus_sign = parsed.groups()

    # Steps contributed by the letter inside one number grade (a, a+, b, b+, c, c+).
    letter_steps = {"a": 0, "b": 2, "c": 4}

    steps = 6 * (int(digits) - 7) + letter_steps[letter]
    if plus_sign:
        steps += 1

    # Shift by one so that 7a lands on 1 rather than 0.
    return steps + 1

# Helper: read the grade label of one parsed statistics row
def _row_grade(line):
    """Return the stripped text of the row's ``span.difficulty``, or None if absent."""
    grade_elem = line.find("span", {"class": "difficulty"})
    return grade_elem.text.strip() if grade_elem is not None else None


# Helper: read the ascent total of one parsed statistics row
def _row_total(line):
    """Return the integer in the last ``number-cell`` of the row's ``number-grid``.

    Returns None when the row has no ``number-grid``. A grid without cells or
    with non-numeric text raises (IndexError / ValueError), which the caller's
    retry logic treats as a failed attempt.
    """
    number_grid = line.find("div", {"class": "number-grid"})
    if number_grid is None:
        return None
    cells = number_grid.find_all("div", {"class": "number-cell"})
    return int(cells[-1].text.strip())


# Helper: best-effort switch of the statistics period dropdown to "All Time"
def _select_all_time(driver):
    """Try to set the dropdown inside ``statistics-header`` to "All Time".

    Any failure is reported and swallowed on purpose: the caller then parses
    whatever period is currently selected.
    """
    try:
        # Scope to statistics-header to avoid other dropdowns
        stats_header = driver.find_element(By.CLASS_NAME, "statistics-header")
        dropdown_input = stats_header.find_element(By.CSS_SELECTOR, "input[type='text']")
        # Both attributes may be missing; fall back to "" so the membership
        # test below cannot fail on None.
        current_value = (
            dropdown_input.get_attribute("placeholder")
            or dropdown_input.get_attribute("value")
            or ""
        )
        print(f"Current dropdown value: {current_value}")

        if "All Time" in current_value:
            print("Already on 'All Time', no change needed")
            return

        ActionChains(driver).move_to_element(dropdown_input).click().perform()
        print("Clicked dropdown input")
        time.sleep(0.5)  # Wait for options to appear

        # NOTE(review): this XPath is page-wide, not scoped to the dropdown.
        all_time_option = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'All Time')]"))
        )
        ActionChains(driver).move_to_element(all_time_option).click().perform()
        print("Selected 'All Time' option")
        time.sleep(1)  # Wait for page update
    except Exception as e:
        print(f"Error interacting with dropdown: {e}")
        print("Proceeding with current selection")


# Function to get sport climbing data: highest grade, 8c+ count, and average of the first 5 grade rows
def get_sport_climbing_data(driver, url, retries=2):
    """Load a profile's sport climbing page and extract summary statistics.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        url: Address of the profile's sport climbing statistics page.
        retries: Maximum number of attempts when loading/parsing raises.

    Returns:
        A tuple ``(highest_grade, count_8c_plus, avg_grade_linear)``:

        * ``highest_grade`` - grade text of the first statistics row (None if
          that row has no grade element).
        * ``count_8c_plus`` - sum of the row totals for every row graded 8c+
          or harder; only computed when the first row itself is 8c+ or harder.
        * ``avg_grade_linear`` - ascent-weighted mean of
          ``grade_to_linear_scale`` over the first five rows, rounded to two
          decimals (0 when those rows hold no ascents).

        ``(None, 0, 0)`` is returned when no statistics rows are found or when
        every attempt failed.
    """
    threshold = grade_to_number("8c+")

    for attempt in range(retries):
        try:
            driver.get(url)
            # Wait for the page to load key elements
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "statistics-header"))
            )
            # Scroll the page down by 300 pixels to ensure the dropdown is visible
            driver.execute_script("window.scrollTo(0, 300);")
            time.sleep(1.5)  # Brief pause for JavaScript to settle

            _select_all_time(driver)

            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "statistics-body"))
            )

            soup = BeautifulSoup(driver.page_source, "html.parser")
            stats_lines = soup.find_all("div", {"class": "statistics-line stats"})

            if not stats_lines:
                print(f"No statistics lines found for {url}")
                return None, 0, 0

            # Highest grade (first row)
            highest_grade = _row_grade(stats_lines[0])

            # Count 8c+ or above ascents; skipped entirely when the top row is
            # already below the threshold.
            count_8c_plus = 0
            if highest_grade and grade_to_number(highest_grade) >= threshold:
                for line in stats_lines:
                    grade = _row_grade(line)
                    if grade is None or grade_to_number(grade) < threshold:
                        continue
                    total = _row_total(line)
                    if total is not None:
                        count_8c_plus += total

            # Ascent-weighted average grade over the first 5 rows
            weighted_sum, total_ascents = 0, 0
            for line in stats_lines[:5]:
                grade = _row_grade(line)
                if grade is None:
                    continue
                row_total = _row_total(line)
                if row_total is None:
                    continue
                weighted_sum += grade_to_linear_scale(grade) * row_total
                total_ascents += row_total

            avg_grade_linear = weighted_sum / total_ascents if total_ascents > 0 else 0
            return highest_grade, count_8c_plus, round(avg_grade_linear, 2)

        except Exception as e:
            # One handler for all failures: the retry policy is identical,
            # only the log label depends on the exception type.
            if isinstance(e, TimeoutException):
                label = "Timeout"
            elif isinstance(e, WebDriverException):
                label = "WebDriver error"
            else:
                label = "Unexpected error"
            print(f"{label} on attempt {attempt + 1} for {url}: {e}")
            if attempt == retries - 1:
                return None, 0, 0
            time.sleep(random.uniform(2, 4))  # Back off before the next attempt

    # Only reached when retries <= 0
    return None, 0, 0

# Main script
def main():
    """Scrape every profile listed in the input CSV and write the results to CSV.

    Reads ``name``/``url`` pairs from ``8a_nu_profiles.csv``, fetches each
    profile's ``/sportclimbing`` page via ``get_sport_climbing_data`` and
    collects the highest grade, the 8c+ ascent count and the average grade.
    Intermediate results go to ``sport_climbing_data_partial.csv`` and the
    final table to ``sport_climbing_data.csv``.

    Raises:
        SystemExit: With status 1 when the WebDriver cannot be started or the
            input CSV cannot be loaded.
    """
    # Set up Selenium WebDriver with reliability options
    chrome_options = Options()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-extensions")  # Disable extensions for better stability

    # Set page load strategy to eager to speed up page load
    chrome_options.page_load_strategy = 'eager'

    chrome_options.set_capability("goog:loggingPrefs", {"browser": "ALL"})

    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        # Increase implicit wait to improve stability
        driver.implicitly_wait(10)
    except Exception as e:
        print(f"Failed to initialize WebDriver: {e}")
        raise SystemExit(1)

    output_dir = "../data/8anu_data"
    csv_path = "../data/8anu_data/8a_nu_profiles.csv"
    try:
        df = pd.read_csv(csv_path, header=None, names=["name", "url"])
        print(f"Loaded CSV with {len(df)} climbers.")
    except FileNotFoundError:
        print(f"CSV file not found at {csv_path}.")
        driver.quit()
        raise SystemExit(1)
    except Exception as e:
        print(f"Error loading CSV from {csv_path}: {e}")
        driver.quit()
        raise SystemExit(1)

    # Use a semirandom distribution of delays between requests to avoid detection
    delay_options = [3, 3.5, 4, 4.5, 5]

    # Climbers are reported (and intermediate results saved) in batches
    batch_size = 10
    total_climbers = len(df)
    filtered_data = []

    def save_partial():
        """Write everything collected so far to the partial-results CSV."""
        os.makedirs(output_dir, exist_ok=True)
        temp_output_path = os.path.join(output_dir, "sport_climbing_data_partial.csv")
        pd.DataFrame(filtered_data).to_csv(temp_output_path, index=False)
        print(f"Intermediate results saved after processing {len(filtered_data)} climbers")

    completed = False
    try:
        for index, (name, base_url) in enumerate(zip(df["name"], df["url"])):
            if index % batch_size == 0:
                batch_end = min(index + batch_size, total_climbers)
                print(f"\nProcessing batch {index // batch_size + 1} (climbers {index + 1}-{batch_end})...")

            print(f"Processing {name} ({index+1}/{total_climbers})...")
            try:
                # Strip a trailing slash so the path never contains "//"
                profile_url = str(base_url).rstrip("/")
                sportclimbing_url = f"{profile_url}/sportclimbing"
                highest_grade, count_8c_plus, avg_grade_first5 = get_sport_climbing_data(driver, sportclimbing_url)

                if highest_grade is None:
                    print(f"Could not extract sport climbing data for {name}, skipping.")
                else:
                    print(f"{name} Highest Grade: {highest_grade}, 8c+ Ascents: {count_8c_plus}, Avg Grade (First 5): {avg_grade_first5}")

                    filtered_data.append({
                        "name": name,
                        "url": base_url,
                        "highest_grade": highest_grade,
                        "count_8c_plus": count_8c_plus,
                        "avg_grade_first5": avg_grade_first5
                    })

                    # Save intermediate results every batch
                    if len(filtered_data) % batch_size == 0 or index == total_climbers - 1:
                        save_partial()

            except Exception as e:
                # One bad profile must not abort the whole run
                print(f"Error processing {name}: {e}")

            # Pause between requests (nothing to wait for after the last one)
            if index < total_climbers - 1:
                time.sleep(random.choice(delay_options))

        completed = True
    finally:
        # Runs on normal completion, on errors and on Ctrl-C alike: keep what
        # was collected and never leave a browser process behind.
        if not completed and filtered_data:
            try:
                save_partial()
            except Exception as e:
                print(f"Error saving intermediate results: {e}")
        try:
            driver.quit()
        except Exception as e:
            print(f"Error closing WebDriver: {e}")

    print("\nClimbers included in the output CSV:")
    for climber in filtered_data:
        print(f"- {climber['name']}: Highest Grade: {climber['highest_grade']}, 8c+ Ascents: {climber['count_8c_plus']}, Avg First 5: {climber['avg_grade_first5']}")

    if filtered_data:
        filtered_df = pd.DataFrame(filtered_data)
    else:
        # Keep the header row even when nothing could be scraped
        filtered_df = pd.DataFrame(columns=["name", "url", "highest_grade", "count_8c_plus", "avg_grade_first5"])

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "sport_climbing_data.csv")
    filtered_df.to_csv(output_path, index=False)
    print(f"Final results saved to {output_path}")


if __name__ == "__main__":
    main()

Loaded CSV with 200 climbers.

Processing batch 1 (climbers 1-10)...
Processing Toby Roberts (1/200)...
Current dropdown value: All Time
Already on 'All Time', no change needed
No statistics lines found for https://www.8a.nu/user/toby-roberts-e1619/sportclimbing
Could not extract sport climbing data for Toby Roberts, skipping.
Processing Anze Peharc (2/200)...
Current dropdown value: All Time
Already on 'All Time', no change needed
Anze Peharc Highest Grade: 8b, 8c+ Ascents: 0, Avg Grade (First 5): 7.78
Processing Hannes Van Duysen (3/200)...
Current dropdown value: All Time
Already on 'All Time', no change needed
Hannes Van Duysen Highest Grade: 8a, 8c+ Ascents: 0, Avg Grade (First 5): 7.0
Processing Jakob Schubert (4/200)...
Current dropdown value: All Time
Already on 'All Time', no change needed
Jakob Schubert Highest Grade: 9a+, 8c+ Ascents: 26, Avg Grade (First 5): 11.21
Processing Mejdi Schalck (5/200)...
Current dropdown value: Last 12 months
Clicked dropdown input
Selected 'All

KeyboardInterrupt: 