### Find IFSC climbers' profiles on 8a.nu using automated search

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import os
import urllib.parse
from fuzzywuzzy import fuzz

# Function to capitalize first letters only
def capitalize_name(name):
    """Return *name* with every name part capitalized and the rest lower-cased.

    Words are separated on whitespace and re-joined with single spaces, so
    leading/trailing/repeated whitespace is collapsed. Each hyphen-separated
    part of a word is capitalized on its own, so double-barrelled names keep
    their second capital ("JEAN-PIERRE" -> "Jean-Pierre", not "Jean-pierre").

    Args:
        name: The climber name as a string, in any letter case.

    Returns:
        The normalized name; an empty string if *name* has no words.
    """
    def _capitalize_word(word):
        # str.capitalize() lower-cases everything after the first character,
        # so hyphenated parts have to be handled one by one.
        return "-".join(part.capitalize() for part in word.split("-"))

    return " ".join(_capitalize_word(word) for word in name.split())

# Function to append a row to CSV
def append_to_csv(data, output_file):
    """Append a single row to the CSV file, writing the header when needed.

    Args:
        data: Mapping of column name -> value for the one row to write.
        output_file: Path of the CSV file. It is created if it does not exist.

    The header row is written only when the file is missing or empty, so a
    pre-created zero-byte file still ends up with column names.
    """
    df = pd.DataFrame([data])
    needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    # Append mode also creates the file, so one call covers both cases.
    df.to_csv(output_file, mode="a", header=needs_header, index=False)

# Optional nickname lookup table.
# Keys are looked up in search_8a_nu() using the climber name after it has
# been passed through capitalize_name(), so they must be written in that
# normalized form. Each value is a list of aliases; an alias counts as a match
# when it occurs (case-insensitively) inside a profile's displayed name, and
# such a match is scored as 100.
NICKNAME_LOOKUP = {
    "Nikolay Rusev": ["Niki Rusev"],
    "Alexander Megos": ["Alex Megos"],
    # Add more known aliases here as needed (normalized name -> list of aliases)
}

# Root of the site; profile links found in the results table are resolved against it
_8A_NU_ROOT = "https://www.8a.nu"

# Helper: read the country text and flag code from one search-result row
def _extract_country(row):
    """Return ``(country_text, country_code)`` for a result row.

    Either element is "N/A" when the row does not provide it. The code is
    taken from the span's ``f-<code>`` CSS class and upper-cased.
    """
    country_td = row.find("td", class_="col-flag")
    if not country_td:
        return "N/A", "N/A"

    found_country = country_td.text.strip() or "N/A"
    country_code = "N/A"
    country_span = country_td.find("span", class_=lambda x: x and x.startswith("f-"))
    if country_span:
        classes = country_span.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        # The flag class is not necessarily the first class on the span,
        # so look for it instead of blindly taking index 0.
        for css_class in classes:
            parts = css_class.split("-")
            if css_class.startswith("f-") and len(parts) > 1 and parts[1]:
                country_code = parts[1].upper()
                break
    return found_country, country_code

# Helper: read the ascent count from one search-result row
def _extract_ascents(row):
    """Return the row's ascent count as an int, or 0 if missing/unparseable."""
    ascent_td = row.find("td", class_="col-ascents")
    if not ascent_td:
        return 0
    # Drop every kind of whitespace (including non-breaking spaces) and
    # commas, which may be used as thousands separators.
    ascent_text = "".join(ascent_td.text.split()).replace(",", "")
    try:
        return int(ascent_text) if ascent_text else 0
    except ValueError:
        return 0

# Helper: turn the parsed results page into a list of scored candidate profiles
def _collect_candidates(soup, climber_name, country, similarity_threshold):
    """Return candidate dicts for rows that match *climber_name* and have ascents.

    A row qualifies when its profile name reaches *similarity_threshold* under
    ``fuzz.partial_ratio`` or contains a known nickname, and its ascent count
    is greater than zero.
    """
    target = climber_name.lower()
    nicknames = [nick.lower() for nick in NICKNAME_LOOKUP.get(climber_name, [])]
    # The expected country may be missing (e.g. NaN from pandas); then nothing can be verified.
    expected_country = country.strip().casefold() if isinstance(country, str) else ""

    candidates = []
    for row in soup.find_all("tr"):
        name_link = row.find("a", href=lambda href: href and "/user/" in href)
        if not name_link:
            continue

        link_text = name_link.text.strip()
        link_lower = link_text.lower()
        similarity = fuzz.partial_ratio(target, link_lower)
        is_nickname = any(nick in link_lower for nick in nicknames)
        if similarity < similarity_threshold and not is_nickname:
            continue

        # Only include profiles with ascents > 0
        ascent_count = _extract_ascents(row)
        if ascent_count <= 0:
            continue

        found_country, country_code = _extract_country(row)
        # Compare case-insensitively and never let two "N/A" placeholders count as a match.
        known_values = {
            value.casefold() for value in (found_country, country_code) if value != "N/A"
        }
        verified = bool(expected_country) and expected_country in known_values

        # urljoin keeps absolute hrefs intact and resolves relative ones against the site root.
        full_url = urllib.parse.urljoin(_8A_NU_ROOT, name_link["href"])
        score = 100 if is_nickname else similarity  # Nicknames get max score
        candidates.append({
            "url": full_url,
            "name": link_text,
            "similarity": score,
            "country": found_country,
            "country_code": country_code,
            "ascents": ascent_count,
            "verified": verified,
        })
        print(f"Found candidate profile for {climber_name}: {full_url} (Name: {link_text}, Similarity: {score}%, Country: {found_country}, Ascents: {ascent_count})")
    return candidates

# Function to search for a climber on 8a.nu and collect the highest probable profile link
def search_8a_nu(climber_name, country, driver, output_file, similarity_threshold=90):
    """Search 8a.nu for a climber and save the highest probable profile link with ascents to CSV.

    Args:
        climber_name: Climber name in any letter case; normalized with
            capitalize_name() before searching.
        country: Expected country (name or code) used to mark a candidate as
            verified. May be missing; then no candidate is verified.
        driver: Selenium WebDriver used to load the search page.
        output_file: CSV path that the selected profile row is appended to.
        similarity_threshold: Minimum ``fuzz.partial_ratio`` score (0-100) a
            profile name needs unless it matches a known nickname.

    Returns:
        ``{"name": ..., "possible_profile_link_1": ...}`` for the selected
        profile, or ``None`` when the name is unusable, nothing suitable is
        found, or any error occurs (errors are printed, not raised).

    The best candidate is the one with the highest score; ties are broken in
    favour of a matching country and then the larger ascent count.
    """
    from selenium.common.exceptions import TimeoutException

    # A missing name (e.g. NaN read from the CSV) cannot be searched for.
    if not isinstance(climber_name, str) or not climber_name.strip():
        print(f"Skipping invalid climber name: {climber_name!r}")
        return None

    # Capitalize only first letters
    climber_name = capitalize_name(climber_name)
    encoded_name = urllib.parse.quote(climber_name)
    search_url = f"{_8A_NU_ROOT}/search/users?query={encoded_name}"
    print(f"Searching for {climber_name} from {country} on 8a.nu...")

    try:
        # Navigate directly to the search URL
        driver.get(search_url)

        # Wait for search results table rows to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "tr"))
        )

        # Parse the search results page and score the rows
        soup = BeautifulSoup(driver.page_source, "html.parser")
        candidates = _collect_candidates(soup, climber_name, country, similarity_threshold)

        if not candidates:
            print(f"No profile with ascents found for {climber_name}")
            return None

        # Highest score wins; a verified country and more ascents break ties,
        # which matters because partial_ratio frequently yields equal scores.
        best_candidate = max(
            candidates,
            key=lambda c: (c["similarity"], c["verified"], c["ascents"]),
        )
        profile_link = best_candidate["url"]
        if best_candidate["verified"]:
            print(f"Selected verified profile: {profile_link} (Similarity: {best_candidate['similarity']}%)")
        else:
            print(f"Selected potential profile: {profile_link} (Similarity: {best_candidate['similarity']}%, expected country: {country})")

        # Only save to CSV if a profile with ascents is found
        data = {"name": climber_name, "possible_profile_link_1": profile_link}
        print(f"Selected 1 profile with ascents for {climber_name}")
        append_to_csv(data, output_file)
        return data

    except TimeoutException:
        # No table row appeared in time; the generic handler would print an empty message.
        print(f"No search results appeared for {climber_name} within 15 seconds")
        return None
    except Exception as e:
        # Best-effort boundary: one failed lookup must not abort the whole run.
        print(f"Error searching for {climber_name}: {e}")
        return None

# Main script to process IFSC climbers and save to CSV incrementally
def process_ifsc_climbers(ifsc_dir="../data/ifsc_data", output_dir="../data/8anu_data"):
    """Look up every IFSC climber on 8a.nu and record matches in a CSV as they are found.

    Args:
        ifsc_dir: Directory holding ``men_climbers.csv`` and ``women_climbers.csv``;
            the rows are read via their ``name`` and ``country`` columns.
        output_dir: Directory (created if needed) that receives ``8a_nu_profiles.csv``.

    A single Chrome session is shared by all searches and is always closed at
    the end, even when the loop is interrupted by an error.
    """
    # Men first, then women, re-indexed as one continuous 0..n-1 range
    source_paths = (f"{ifsc_dir}/men_climbers.csv", f"{ifsc_dir}/women_climbers.csv")
    climbers_df = pd.concat([pd.read_csv(path) for path in source_paths], ignore_index=True)
    total_climbers = len(climbers_df)

    os.makedirs(output_dir, exist_ok=True)
    output_file = f"{output_dir}/8a_nu_profiles.csv"

    # One browser instance serves the whole run
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    hits = 0
    try:
        for position, (_, record) in enumerate(climbers_df.iterrows(), start=1):
            if search_8a_nu(record["name"], record["country"], driver, output_file):
                hits += 1
            print(f"Processed {position}/{total_climbers} climbers ({hits} with profiles and ascents)")
    finally:
        # Close the browser no matter how the loop ended
        driver.quit()

# Run the script
# Entry point: the scrape starts only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    print("Starting 8a.nu scraping process...")
    process_ifsc_climbers()  # no arguments: uses the default input/output directories
    print("8a.nu scraping process completed!")