In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import os
import subprocess

# First, make sure Chrome is installed for WSL2
try:
    # Check if Chrome is installed
    chrome_check = subprocess.run(['which', 'google-chrome'], capture_output=True, text=True)
    if not chrome_check.stdout:
        print("Chrome not found. Installing Chrome dependencies for WSL...")
        %pip install --upgrade pip webdriver-manager selenium
        print("You may need to install Chrome in WSL with: sudo apt update && sudo apt install -y wget unzip fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libcairo2 libcups2 libcurl3-gnutls libdrm2 libgbm1 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 xdg-utils && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && sudo dpkg -i google-chrome-stable_current_amd64.deb && sudo apt-get -f install")
except Exception as e:
    print(f"Error checking Chrome installation: {e}")

# Ensure ChromeDriver has executable permissions after download
os.environ['WDM_LOG_LEVEL'] = '0'  # Suppress webdriver-manager logs
os.environ['WDM_PROGRESS_BAR'] = '0'  # Disable progress bar

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser window)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Check the Chrome version and install matching driver
try:
    # Get Chrome version
    chrome_version_cmd = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
    chrome_version = chrome_version_cmd.stdout.strip()
    print(f"Detected Chrome version: {chrome_version}")
except:
    print("Could not detect Chrome version. Make sure Chrome is installed.")

# Initialize the WebDriver
try:
    driver_path = ChromeDriverManager().install()
    print(f"Driver installed at: {driver_path}")
    
    # Make sure the driver is executable
    os.chmod(driver_path, 0o755)
    
    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
except Exception as e:
    print(f"Error initializing WebDriver: {e}")
    print("If you're using WSL, you may need to install Chrome in the WSL environment")

# URL to scrape
url = "https://arknights.fandom.com/wiki/Operator/6-star"

try:
    # Navigate to the page
    driver.get(url)
    time.sleep(3)  # Wait for page to load
    
    # Find all operator name elements using the XPath
    operator_elements = driver.find_elements(By.XPATH, '/html/body/div[4]/div[4]/div[2]/main/div[3]/div/div[1]/div/div/table/tbody/tr/td[2]/a')
    
    # Extract the text (operator names) into a list
    operator_names = [element.text for element in operator_elements if element.text]
    
    print("6-Star Operators:")
    for name in operator_names:
        print(name)
        
    print(f"\nTotal 6-star operators found: {len(operator_names)}")

finally:
    # Close the browser
    driver.quit()

# The operator_names list now contains all the 6-star operator names

Detected Chrome version: Google Chrome 135.0.7049.114
Driver installed at: /home/semicolon/.wdm/drivers/chromedriver/linux64/135.0.7049.114/chromedriver-linux64/chromedriver
6-Star Operators:
Aak
Angelina
Archetto
Ash
Bagpipe
Blaze
Blemishine
Carnelian
Ceobe
Ch'en
Ch'en the Holungday
Chongyue
Dorothy
Dusk
Ebenholz
Eunectes
Executor the Ex Foedere
Exusiai
Eyjafjalla
Eyjafjalla the Hvít Aska
Fartooth
Fiammetta
Flametail
Gavial the Invincible
Gladiia
Gnosis
Goldenglow
Hellagur
Ho'olheyak
Horn
Hoshiguma
Ifrit
Ines
Irene
Jessica the Liberated
Kal'tsit
Kirin R Yato
Lee
Lin
Ling
Lumen
Magallan
Mizuki
Mostima
Mountain
Mudrock
Muelsyse
Młynar
Nearl the Radiant Knight
Nian
Nightingale
Pallas
Passenger
Penance
Phantom
Pozëmka
Qiubai
Reed the Flame Shadow
Rosa
Rosmontis
Saga
Saileach
Saria
Schwarz
Shining
Siege
Silence the Paradigmatic
SilverAsh
Skadi
Skadi the Corrupting Heart
Specter the Unchained
Stainless
Surtr
Suzuran
Swire the Elegant Wit
Texas the Omertosa
Thorns
Typhon
Vigil
W
Weedy

Total

In [62]:
def extract_range_with_selenium(driver, operator_name):
    """Extract the range information using Selenium."""
    range_data = {}
    try:
        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)

        # Wait for the page to load
        driver.implicitly_wait(5)

        # Locate the range table using the provided XPath
        range_table_xpath = '//*[@id="mw-content-text"]/div[1]/table[4]'
        range_table = driver.find_element(By.XPATH, range_table_xpath)
        print("DEBUG: Range table found.")

        # Debug: Print the range table HTML
        range_table_html = range_table.get_attribute("outerHTML")
        print("DEBUG: Range table HTML:")
        print(range_table_html)

        # Find all columns (Base, Elite 1, Elite 2) using their specific XPaths
        levels = ["Base", "Elite 1", "Elite 2"]
        for i, level in enumerate(levels):
            try:
                column_xpath = f'{range_table_xpath}/tbody/tr[2]/td[{i + 1}]'
                column = driver.find_element(By.XPATH, column_xpath)
                print(f"DEBUG: Found column for {level}.")

                # Extract the grid cells
                grid_cells = column.find_elements(By.XPATH, ".//span[@style]")
                print(f"DEBUG: Found {len(grid_cells)} grid cells for {level}.")

                # Build the 2D range representation
                range_representation = []
                row = []
                for idx, cell in enumerate(grid_cells):
                    # Replace dark squares with circles
                    symbol = '○' if 'background: #27A6F3' in cell.get_attribute("style") else '□'
                    row.append(symbol)

                    # Assume a 5x5 grid (adjust if necessary)
                    if (idx + 1) % 5 == 0:
                        range_representation.append(row)
                        row = []

                # Add the last row if not empty
                if row:
                    range_representation.append(row)

                print(f"DEBUG: Range for {level}: {range_representation}")

                # Add to range data
                range_data[level] = range_representation

            except Exception as e:
                print(f"DEBUG: Error extracting range for {level}: {e}")
                range_data[level] = "N/A"

    except Exception as e:
        print(f"Error extracting range with Selenium: {e}")

    return range_data

In [63]:
def operator_info(operator_name):
    # Construct the URL
    base_url = "https://arknights.fandom.com/wiki/"
    url = f"{base_url}{operator_name.replace(' ', '_')}"

    try:
        # Initialize the WebDriver
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print("DEBUG: WebDriver initialized successfully.")

        # Navigate to the operator's page
        driver.get(url)
        print(f"DEBUG: Navigated to {url}")

        # Extract range using Selenium
        operator_data = {}
        operator_data["range"] = extract_range_with_selenium(driver, operator_name)
        print(f"DEBUG: Extracted range: {operator_data['range']}")

        # Save to JSON file
        output_dir = "operator"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{operator_name}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(operator_data, json_file, ensure_ascii=False, indent=4)

        print(f"Operator data for {operator_name} saved to {output_path}")

    except Exception as e:
        print(f"Error scraping data for {operator_name}: {e}")

    finally:
        # Ensure the WebDriver is closed
        if 'driver' in locals():
            driver.quit()

In [64]:
operator_info("Aak")

DEBUG: WebDriver initialized successfully.
DEBUG: Navigated to https://arknights.fandom.com/wiki/Aak
DEBUG: Range table found.
DEBUG: Range table HTML:
<table class="mrfz-btable" width="55%" cellpadding="5" cellspacing="0" style="table-layout:fixed; text-align:center; color:white;">
<tbody><tr>
<td colspan="3" id="Range" style="text-align:left; font-weight:bold; font-family:'Roboto Condensed'; font-size:16px;">Range
</td></tr>
<tr style="font-size:14px">
<td valign="top"><b>Base</b><div style="margin:0.5em auto; display:table;"><div><span style="display:inline-block; margin:1px; float:left; vertical-align:top; width:12px; height:12px; border:1px solid transparent; background:; color:transparent;">　</span><span style="display:inline-block; margin:1px; float:left; vertical-align:top; width:12px; height:12px; border:1px solid gray; background:none; color:transparent;">□</span><span style="display:inline-block; margin:1px; float:left; vertical-align:top; width:12px; height:12px; border:1px

In [65]:
import json
import os

def extract_and_write_stats_with_selenium(operator_name):
    """Extract the stats information using Selenium, split it by elite levels, and write it directly to JSON."""
    stats_data = {}
    driver = None  # Initialize driver variable
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize a new WebDriver instance
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print("DEBUG: WebDriver initialized successfully.")

        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)

        # Wait for the page to load
        driver.implicitly_wait(5)

        # Locate the stats table using the provided XPath
        stats_table_xpath = '//*[@id="mw-content-text"]/div[1]/table[5]'
        stats_table = driver.find_element(By.XPATH, stats_table_xpath)
        print("DEBUG: Stats table found.")

        # Extract rows from the stats table
        rows = stats_table.find_elements(By.XPATH, ".//tr")
        print(f"DEBUG: Found {len(rows)} rows in the stats table.")

        # Define the elite levels
        elite_levels = ["Base", "Max", "Elite 1", "Elite 2", "Trust"]

        # Process each row
        for row in rows[1:]:  # Skip the header row
            try:
                # Extract the attribute name (e.g., HP, ATK, DEF)
                attribute_name = row.find_element(By.XPATH, ".//th").text.strip()

                # Extract the stats for each level (Base, Max, Elite 1, Elite 2, Trust)
                stats = [cell.text.strip() for cell in row.find_elements(By.XPATH, ".//td")]

                # Split stats into a dictionary based on elite levels
                stats_data[attribute_name] = {level: stats[i] if i < len(stats) else "" for i, level in enumerate(elite_levels)}

                print(f"DEBUG: Extracted stats for {attribute_name}: {stats_data[attribute_name]}")
            except Exception as e:
                print(f"DEBUG: Error processing row: {e}")

        # Write the extracted stats data directly to JSON
        output_dir = "operator"
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{operator_name}.json")

        # Load existing data if the file exists
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as json_file:
                operator_data = json.load(json_file)
        else:
            operator_data = {}

        # Update the "stats" field with the extracted stats data
        operator_data["stats"] = stats_data

        # Write the updated data back to the JSON file
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(operator_data, json_file, ensure_ascii=False, indent=4)

        print(f"Stats data for {operator_name} saved to {file_path}")

    except Exception as e:
        print(f"Error extracting stats with Selenium: {e}")

    finally:
        # Ensure the WebDriver is closed
        if driver:
            driver.quit()

    return stats_data

In [66]:
extract_and_write_stats_with_selenium("Aak")

DEBUG: WebDriver initialized successfully.
DEBUG: Stats table found.
DEBUG: Found 9 rows in the stats table.
DEBUG: Extracted stats for HP: {'Base': '865', 'Max': '1236', 'Elite 1': '1627', 'Elite 2': '2034', 'Trust': '+300'}
DEBUG: Extracted stats for ATK: {'Base': '247', 'Max': '413', 'Elite 1': '583', 'Elite 2': '703', 'Trust': '+50'}
DEBUG: Extracted stats for DEF: {'Base': '58', 'Max': '90', 'Elite 1': '121', 'Elite 2': '152', 'Trust': ''}
DEBUG: Extracted stats for RES: {'Base': '10', 'Max': '10', 'Elite 1': '10', 'Elite 2': '10', 'Trust': ''}
DEBUG: Extracted stats for Rdpl. time: {'Base': '70s', 'Max': '70s', 'Elite 1': '70s', 'Elite 2': '70s', 'Trust': ''}
DEBUG: Extracted stats for DP cost: {'Base': '11', 'Max': '11', 'Elite 1': '13', 'Elite 2': '13', 'Trust': ''}
DEBUG: Extracted stats for Blk. cnt.: {'Base': '1', 'Max': '1', 'Elite 1': '1', 'Elite 2': '1', 'Trust': ''}
DEBUG: Extracted stats for Atk. itvl.: {'Base': '1.3s', 'Max': '1.3s', 'Elite 1': '1.3s', 'Elite 2': '1.3s

{'HP': {'Base': '865',
  'Max': '1236',
  'Elite 1': '1627',
  'Elite 2': '2034',
  'Trust': '+300'},
 'ATK': {'Base': '247',
  'Max': '413',
  'Elite 1': '583',
  'Elite 2': '703',
  'Trust': '+50'},
 'DEF': {'Base': '58',
  'Max': '90',
  'Elite 1': '121',
  'Elite 2': '152',
  'Trust': ''},
 'RES': {'Base': '10',
  'Max': '10',
  'Elite 1': '10',
  'Elite 2': '10',
  'Trust': ''},
 'Rdpl. time': {'Base': '70s',
  'Max': '70s',
  'Elite 1': '70s',
  'Elite 2': '70s',
  'Trust': ''},
 'DP cost': {'Base': '11',
  'Max': '11',
  'Elite 1': '13',
  'Elite 2': '13',
  'Trust': ''},
 'Blk. cnt.': {'Base': '1',
  'Max': '1',
  'Elite 1': '1',
  'Elite 2': '1',
  'Trust': ''},
 'Atk. itvl.': {'Base': '1.3s',
  'Max': '1.3s',
  'Elite 1': '1.3s',
  'Elite 2': '1.3s',
  'Trust': ''}}

In [67]:
import json
import os

def extract_potential_with_selenium(operator_name):
    """Extract the potential information using Selenium and write it directly to JSON."""
    potential_data = {}
    driver = None  # Initialize driver variable
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize a new WebDriver instance
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print("DEBUG: WebDriver initialized successfully.")

        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)

        # Wait for the page to load
        driver.implicitly_wait(5)

        # Locate the potential table using the provided XPath
        potential_table_xpath = '//*[@id="mw-content-text"]/div[1]/table[6]'
        potential_table = driver.find_element(By.XPATH, potential_table_xpath)
        print("DEBUG: Potential table found.")

        # Extract rows from the potential table
        rows = potential_table.find_elements(By.XPATH, ".//tr")
        print(f"DEBUG: Found {len(rows)} rows in the potential table.")

        # Create a standardized format for potentials (Pot 1, Pot 2, etc.)
        # Pot 1 is always empty as it's the base form
        potential_data["Pot 1"] = "Base form"
        
        # Process each row
        pot_counter = 1  # Start from Pot 2
        for row in rows[1:]:  # Skip the header rows
            try:
                # Skip rows that are not valid potential rows
                if "OR" in row.text and "Potential" not in row.text:
                    print("DEBUG: Skipping row containing 'OR'")
                    continue

                # Extract the potential effect (e.g., DP cost -1, Redeployment time -4 seconds)
                potential_effect = row.find_element(By.XPATH, ".//td[2]").text.strip()

                # Add to potential data with standardized key format
                potential_key = f"Pot {pot_counter}"
                potential_data[potential_key] = potential_effect
                print(f"DEBUG: Extracted {potential_key}: {potential_effect}")
                
                pot_counter += 1
            except Exception as e:
                print(f"DEBUG: Error processing row: {e}")

        # Write the extracted potential data directly to JSON
        output_dir = "operator"
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{operator_name}.json")

        # Load existing data if the file exists
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as json_file:
                operator_data = json.load(json_file)
        else:
            operator_data = {}

        # Update with the potential data
        operator_data["potential"] = potential_data

        # Write the updated data back to the JSON file
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(operator_data, json_file, ensure_ascii=False, indent=4)

        print(f"Potential data for {operator_name} saved to {file_path}")

    except Exception as e:
        print(f"Error extracting potential with Selenium: {e}")

    finally:
        # Ensure the WebDriver is closed
        if driver:
            driver.quit()

    return potential_data

In [68]:
extract_potential_with_selenium("Aak")

DEBUG: WebDriver initialized successfully.
DEBUG: Potential table found.
DEBUG: Found 8 rows in the potential table.
DEBUG: Extracted Pot 1: DP cost -1
DEBUG: Extracted Pot 2: Redeployment time -4 seconds
DEBUG: Extracted Pot 3: Max HP +150
DEBUG: Extracted Pot 4: Improves Pharmaceutical Diffusion
DEBUG: Extracted Pot 5: DP cost -1
DEBUG: Skipping row containing 'OR'
DEBUG: Skipping row containing 'OR'
Potential data for Aak saved to operator/Aak.json


{'Pot 1': 'DP cost -1',
 'Pot 2': 'Redeployment time -4 seconds',
 'Pot 3': 'Max HP +150',
 'Pot 4': 'Improves Pharmaceutical Diffusion',
 'Pot 5': 'DP cost -1'}

In [89]:
def extract_promotion_with_selenium(operator_name):
    """Extract the promotion information, including material names from images and quantities, using Selenium."""
    promotion_data = {}
    driver = None  # Initialize driver variable
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize a new WebDriver instance
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print("DEBUG: WebDriver initialized successfully.")

        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)

        # Wait for the page to load
        driver.implicitly_wait(5)

        # Define the XPaths for Elite 1 and Elite 2 toggle buttons and content
        promotion_rows = {
            "Elite 1": {
                "button": '//*[@id="mw-content-text"]/div[1]/table[7]/tbody/tr[2]/td/div/button/span',
                "content": '//*[@id="mw-content-text"]/div[1]/table[7]/tbody/tr[2]/td/div/div[@class="mw-collapsible-content"]'
            },
            "Elite 2": {
                "button": '//*[@id="mw-content-text"]/div[1]/table[7]/tbody/tr[3]/td/div/button/span',
                "content": '//*[@id="mw-content-text"]/div[1]/table[7]/tbody/tr[3]/td/div/div[@class="mw-collapsible-content"]'
            }
        }

        # Process each promotion level
        for level, xpaths in promotion_rows.items():
            try:
                # Locate and click the toggle button to expand the content
                toggle_button = driver.find_element(By.XPATH, xpaths["button"])
                driver.execute_script("arguments[0].click();", toggle_button)
                time.sleep(1)  # Wait for the content to expand
                print(f"DEBUG: Clicked toggle button for {level}")

                # Extract the expanded content
                content_div = driver.find_element(By.XPATH, xpaths["content"])
                details = content_div.text.strip()
                print(f"DEBUG: Found content div for {level}")

                # Replace \n with actual newlines
                details = details.replace("\\n", "\n")
                
                # Extract all material images
                material_images = content_div.find_elements(By.XPATH, ".//img")
                print(f"DEBUG: Found {len(material_images)} material images for {level}")
                
                # Extract all numbers that could be quantities
                # Find all numbers in the text content, focusing on patterns like "30K", "5", etc.
                text_items = details.split()
                quantities = []
                
                for item in text_items:
                    # Look for patterns like "30K", "5", "10", etc.
                    item = item.strip()
                    if item.isdigit() or (item[:-1].isdigit() and item[-1] == 'K'):
                        quantities.append(item)
                
                print(f"DEBUG: Found possible quantities: {quantities}")
                
                # Find LMD quantity (always has 'K' suffix)
                lmd_quantity = None
                other_quantities = []
                
                for quantity in quantities:
                    if quantity.endswith('K') and lmd_quantity is None:
                        lmd_quantity = quantity
                    else:
                        other_quantities.append(quantity)
                
                print(f"DEBUG: LMD quantity: {lmd_quantity}")
                print(f"DEBUG: Other quantities: {other_quantities}")
                
                # Match images with quantities
                materials = []
                
                # Get only valid material images (usually 3-4 per promotion level)
                valid_images = []
                for img in material_images:
                    # Get image name from various attributes
                    img_name = (
                        img.get_attribute("alt") or 
                        img.get_attribute("data-image-name") or
                        img.get_attribute("src").split("/")[-1]
                    )
                    
                    # Skip non-material images (like icons, etc.)
                    if not img_name or "icon" in img_name.lower():
                        continue
                    
                    # Remove file extension
                    if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                        img_name = img_name.rsplit('.', 1)[0]
                        
                    valid_images.append(img_name)
                
                print(f"DEBUG: Found valid material images: {valid_images}")
                
                # Assign quantities to materials
                # First material is always LMD
                if valid_images and lmd_quantity:
                    # First material is LMD with K quantity
                    materials.append({"name": valid_images[0], "quantity": lmd_quantity})
                    print(f"DEBUG: Matched {valid_images[0]} with LMD quantity {lmd_quantity}")
                    
                    # Remaining materials get other quantities
                    for i, img_name in enumerate(valid_images[1:]):
                        if i < len(other_quantities):
                            materials.append({"name": img_name, "quantity": other_quantities[i]})
                            print(f"DEBUG: Matched {img_name} with quantity {other_quantities[i]}")
                        else:
                            materials.append({"name": img_name, "quantity": "1"})
                            print(f"DEBUG: No quantity found for {img_name}, using default '1'")
                else:
                    # Fallback: if we can't identify LMD, just assign quantities in order
                    for i, img_name in enumerate(valid_images):
                        if i < len(quantities):
                            materials.append({"name": img_name, "quantity": quantities[i]})
                            print(f"DEBUG: Matched {img_name} with quantity {quantities[i]}")
                        else:
                            materials.append({"name": img_name, "quantity": "1"})
                            print(f"DEBUG: No quantity found for {img_name}, using default '1'")

                # Add details and materials to promotion data
                promotion_data[level] = {
                    "details": details,
                    "materials": materials
                }
                print(f"DEBUG: Extracted promotion {level} with {len(materials)} materials")
            except Exception as e:
                print(f"DEBUG: Error processing {level}: {e}")

        # Write the extracted promotion data directly to JSON
        output_dir = "operator"
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{operator_name}.json")

        # Load existing data if the file exists
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as json_file:
                operator_data = json.load(json_file)
        else:
            operator_data = {}

        # Update the "promotion" field with the extracted promotion data
        operator_data["promotion"] = promotion_data

        # Write the updated data back to the JSON file
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(operator_data, json_file, ensure_ascii=False, indent=4)

        print(f"Promotion data for {operator_name} saved to {file_path}")

    except Exception as e:
        print(f"Error extracting promotion with Selenium: {e}")

    finally:
        # Ensure the WebDriver is closed
        if driver:
            driver.quit()

    return promotion_data

In [90]:
extract_promotion_with_selenium("Aak")

DEBUG: WebDriver initialized successfully.
DEBUG: Clicked toggle button for Elite 1
DEBUG: Found content div for Elite 1
DEBUG: Found 4 material images for Elite 1
DEBUG: Found possible quantities: ['50', '30K', '5', '8', '5']
DEBUG: LMD quantity: 30K
DEBUG: Other quantities: ['50', '5', '8', '5']
DEBUG: Found valid material images: ['LMD', 'Specialist Chip', 'Sugar', 'Oriron']
DEBUG: Matched LMD with LMD quantity 30K
DEBUG: Matched Specialist Chip with quantity 50
DEBUG: Matched Sugar with quantity 5
DEBUG: Matched Oriron with quantity 8
DEBUG: Extracted promotion Elite 1 with 4 materials
DEBUG: Clicked toggle button for Elite 2
DEBUG: Found content div for Elite 2
DEBUG: Found 4 material images for Elite 2
DEBUG: Found possible quantities: ['1', '80', '180K', '4', '4', '7']
DEBUG: LMD quantity: 180K
DEBUG: Other quantities: ['1', '80', '4', '4', '7']
DEBUG: Found valid material images: ['LMD', 'Specialist Dualchip', 'D32 Steel', 'Polymerized Gel']
DEBUG: Matched LMD with LMD quantity

{'Elite 1': {'details': 'Maximum attributes increased.\nDP cost +2.\nNew skill: Type-γ Stimpack.\nDrug Cocktail Administration improved.\nRange extended.\nLevel 50\n30K\n5\n8\n5',
  'materials': [{'name': 'LMD', 'quantity': '30K'},
   {'name': 'Specialist Chip', 'quantity': '50'},
   {'name': 'Sugar', 'quantity': '5'},
   {'name': 'Oriron', 'quantity': '8'}]},
 'Elite 2': {'details': 'Maximum attributes increased.\nNew skill: Durian-Flavored Stimpack.\nNew talent: Pharmaceutical Diffusion.\nDrug Cocktail Administration improved.\nElite 1 Level 80\n180K\n4\n4\n7',
  'materials': [{'name': 'LMD', 'quantity': '180K'},
   {'name': 'Specialist Dualchip', 'quantity': '1'},
   {'name': 'D32 Steel', 'quantity': '80'},
   {'name': 'Polymerized Gel', 'quantity': '4'}]}}

In [109]:
def extract_skills_with_selenium(operator_name):
    """Extract the skills information using Selenium and write it directly to JSON."""
    skills_data = []
    driver = None
    
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize WebDriver
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print(f"Extracting skills for {operator_name}...")

        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)
        driver.implicitly_wait(5)
        
        # Find all potential skill containers
        main_content = driver.find_element(By.XPATH, '//*[@id="mw-content-text"]/div[1]')
        skill_divs = main_content.find_elements(By.XPATH, './div[contains(@class, "mw-collapsible")]')
        print(f"Found {len(skill_divs)} potential skill containers")
        
        # Process each skill div
        for skill_index, div in enumerate(skill_divs, 2):  # Start at index 2 for XPath consistency
            try:
                # Check if this is actually a skill container by looking for skill name
                try:
                    skill_name_elem = div.find_element(By.XPATH, './/b[@style="font-size:14px;"]')
                    skill_name = skill_name_elem.text.strip()
                    if not skill_name:
                        print(f"Div {skill_index} does not contain a skill name, skipping")
                        continue
                    print(f"Processing skill: {skill_name}")
                except:
                    print(f"Div {skill_index} does not contain a skill element, skipping")
                    continue
                
                # Get skill image
                try:
                    skill_image = div.find_element(By.XPATH, './/img').get_attribute('src')
                    print(f"Found skill image: {skill_image}")
                except:
                    skill_image = ""
                    print("Could not find skill image")
                
                # Get skill types using the specific XPath you provided
                types = []
                try:
                    # Using relative XPath based on your provided path
                    type_container = div.find_element(By.XPATH, './table/tbody/tr/td[2]/div')
                    type_divs = type_container.find_elements(By.XPATH, './div')
                    
                    for type_div in type_divs:
                        type_text = type_div.text.strip()
                        if type_text:
                            types.append(type_text)
                    
                    print(f"Found skill types: {types}")
                except Exception as e:
                    print(f"Error extracting skill types: {e}")
                
                # Click the button to expand the skill details using your exact XPath
                try:
                    # Using the exact button XPath you provided, but making it relative to current div
                    toggle_button = div.find_element(By.XPATH, './button/span')
                    toggle_text = toggle_button.text.strip()
                    
                    # Only click if it needs to be expanded
                    if "Show effects" in toggle_text:
                        print(f"Expanding skill details for {skill_name}")
                        driver.execute_script("arguments[0].click();", toggle_button)
                        time.sleep(1)  # Wait for expansion
                    else:
                        print(f"Skill details already expanded for {skill_name}")
                except Exception as e:
                    print(f"Error clicking toggle button: {e}")
                
                # Extract skill levels data using the tbody XPath you provided
                levels = []
                
                try:
                    # Using your tbody XPath, but making it relative to current div
                    tbody = div.find_element(By.XPATH, './div/table/tbody')
                    
                    # Get all rows except the header row
                    level_rows = tbody.find_elements(By.XPATH, './tr[position() > 1]')
                    print(f"Found {len(level_rows)} level rows")
                    
                    for row_index, row in enumerate(level_rows):
                        try:
                            # Get level number (from row index + 1)
                            if row_index < 7:  # Levels 1-7
                                level = str(row_index + 1)
                            else:  # Mastery levels
                                level = f"M{row_index - 6}"
                                
                            # Using your specific cell XPaths for the first skill, but adapting for any row
                            effect = row.find_element(By.XPATH, './td[1]').text.strip()
                            initial_sp = row.find_element(By.XPATH, './td[2]').text.strip()
                            sp_cost = row.find_element(By.XPATH, './td[3]').text.strip()
                            duration = row.find_element(By.XPATH, './td[4]').text.strip()
                            
                            # Add to levels data
                            level_data = {
                                "level": level,
                                "effect": effect,
                                "initial_sp": initial_sp,
                                "sp_cost": sp_cost,
                                "duration": duration
                            }
                            levels.append(level_data)
                            
                            print(f"Extracted Level {level}: {effect[:30]}...")
                        except Exception as e:
                            print(f"Error extracting row {row_index+1}: {e}")
                except Exception as e:
                    print(f"Error finding or processing tbody: {e}")
                
                # Only add the skill if we have level data
                if levels:
                    skill_data = {
                        "name": skill_name,
                        "image": skill_image,
                        "types": types,
                        "levels": levels
                    }
                    skills_data.append(skill_data)
                    print(f"Added skill {skill_name} with {len(levels)} levels")
                else:
                    print(f"No level data found for {skill_name}, skipping")
            
            except Exception as e:
                print(f"Error processing div #{skill_index}: {e}")
        
        # Save to JSON if skills were found
        if skills_data:
            output_dir = "operator"
            os.makedirs(output_dir, exist_ok=True)
            file_path = os.path.join(output_dir, f"{operator_name}.json")
            
            # Load existing data if available
            if os.path.exists(file_path):
                with open(file_path, "r", encoding="utf-8") as json_file:
                    operator_data = json.load(json_file)
            else:
                operator_data = {}
            
            # Update skills
            operator_data["skills"] = skills_data
            
            # Save to file
            with open(file_path, "w", encoding="utf-8") as json_file:
                json.dump(operator_data, json_file, ensure_ascii=False, indent=4)
            
            print(f"Saved {len(skills_data)} skills for {operator_name}")
        else:
            print(f"No skills were found for {operator_name}")
    
    except Exception as e:
        print(f"Error in skill extraction: {e}")
    
    finally:
        if driver:
            driver.quit()
    
    return skills_data

In [110]:
extract_skills_with_selenium("Aak")

Extracting skills for Aak...
Found 7 potential skill containers
Processing skill: Rapid Fire
Found skill image: data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D
Found skill types: ['Auto Recovery', 'Manual Trigger', '20-30 sec.']
Expanding skill details for Rapid Fire
Found 10 level rows
Extracted Level 1: ASPD +30...
Extracted Level 2: ASPD +35...
Extracted Level 3: ASPD +40...
Extracted Level 4: ASPD +50...
Extracted Level 5: ASPD +55...
Extracted Level 6: ASPD +60...
Extracted Level 7: ASPD +70...
Extracted Level M1: ASPD +80...
Extracted Level M2: ASPD +90...
Extracted Level M3: ASPD +100...
Added skill Rapid Fire with 10 levels
Processing skill: Type-γ Stimpack
Found skill image: data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D
Found skill types: ['Auto Recovery', 'Manual Trigger', '30 sec.']
Expanding skill details for Type-γ Stimpack
Found 10 level rows
Extracted Level 1: Attacks an ally within range (..

[{'name': 'Rapid Fire',
  'image': 'data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D',
  'types': ['Auto Recovery', 'Manual Trigger', '20-30 sec.'],
  'levels': [{'level': '1',
    'effect': 'ASPD +30',
    'initial_sp': '15',
    'sp_cost': '30',
    'duration': '20s'},
   {'level': '2',
    'effect': 'ASPD +35',
    'initial_sp': '15',
    'sp_cost': '30',
    'duration': '21s'},
   {'level': '3',
    'effect': 'ASPD +40',
    'initial_sp': '15',
    'sp_cost': '30',
    'duration': '22s'},
   {'level': '4',
    'effect': 'ASPD +50',
    'initial_sp': '16',
    'sp_cost': '30',
    'duration': '23s'},
   {'level': '5',
    'effect': 'ASPD +55',
    'initial_sp': '16',
    'sp_cost': '30',
    'duration': '24s'},
   {'level': '6',
    'effect': 'ASPD +60',
    'initial_sp': '16',
    'sp_cost': '30',
    'duration': '25s'},
   {'level': '7',
    'effect': 'ASPD +70',
    'initial_sp': '17',
    'sp_cost': '30',
    'duration': '26s'},
   {'level':

In [127]:
def extract_skill_upgrade_costs(operator_name):
    """Extract skill upgrade costs for an operator and add to their JSON file."""
    import json
    import os
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager
    import time
    
    upgrade_costs = {}
    mastery_costs = {
        "Skill1": {"masteries": {"M1": [], "M2": [], "M3": []}},
        "Skill2": {"masteries": {"M1": [], "M2": [], "M3": []}},
        "Skill3": {"masteries": {"M1": [], "M2": [], "M3": []}}
    }
    driver = None
    
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize WebDriver
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print(f"Extracting skill upgrade costs for {operator_name}...")

        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)
        driver.implicitly_wait(5)
        
        # Extract the regular skill upgrade costs (levels 1-7) and mastery costs
        try:
            # Find the skill upgrades section
            skill_upgrades_section = driver.find_element(By.XPATH, '//*[@id="mw-content-text"]/div[1]/div[contains(., "Skill upgrades")]')
            
            # Check if it needs to be expanded
            try:
                toggle_button = skill_upgrades_section.find_element(By.XPATH, './/button/span')
                if "Show" in toggle_button.text:
                    driver.execute_script("arguments[0].click();", toggle_button)
                    time.sleep(1)  # Wait for expansion
                    print("Expanded skill upgrades section")
            except:
                print("Skill upgrades section already expanded or no toggle button")
            
            # Find the upgrade costs table
            upgrade_table = skill_upgrades_section.find_element(By.XPATH, './/table[contains(@class, "mrfz-wtable")]')
            
            # Extract regular skill levels (2-7)
            for level in range(2, 8):  # Level 2 to Level 7
                try:
                    # Find the row for this level
                    level_xpath = f'.//tr[th//div[@title="Level {level}"] or th//img[contains(@data-image-name, "Rank_{level}")]]'
                    level_row = upgrade_table.find_element(By.XPATH, level_xpath)
                    
                    # Extract materials from the row
                    materials_cell = level_row.find_element(By.XPATH, './td')
                    materials = []
                    
                    # Find all material divs
                    material_divs = materials_cell.find_elements(By.XPATH, './/div[contains(@style, "display:inline-block; margin:3px")]')
                    
                    for div in material_divs:
                        try:
                            # Get material image and name
                            material_img = div.find_element(By.XPATH, './/img')
                            material_name = material_img.get_attribute("alt")
                            
                            # Get quantity from the div at the bottom right
                            quantity_div = div.find_element(By.XPATH, './/div[contains(@style, "position:absolute; right:0px; bottom:0px")]/div')
                            quantity = quantity_div.text.strip()
                            
                            materials.append({
                                "name": material_name,
                                "quantity": quantity
                            })
                        except Exception as e:
                            print(f"Error extracting material for level {level}: {e}")
                    
                    # Add to upgrade costs
                    upgrade_costs[str(level)] = materials
                    print(f"Extracted costs for Level {level}: {len(materials)} materials")
                    
                except Exception as e:
                    print(f"Error processing level {level}: {e}")
            
            # Find all mastery rows
            mastery_rows = upgrade_table.find_elements(By.XPATH, 
                './/tr[th//div[contains(@title, "Mastery")] or ' +
                'th//img[contains(@data-image-name, "Rank_8") or ' +
                'contains(@data-image-name, "Rank_9") or ' +
                'contains(@data-image-name, "Rank_10")]]')
            
            print(f"Found {len(mastery_rows)} mastery rows in total")
            
            if mastery_rows:
                # Identify how many mastery levels we have (should be 9 for 3 skills with M1-M3 each)
                mastery_count = len(mastery_rows)
                
                # Collect all materials from all mastery rows
                all_mastery_materials = []
                
                for row in mastery_rows:
                    try:
                        # Extract materials from this row
                        materials_cell = row.find_element(By.XPATH, './td')
                        
                        # Find all material divs
                        material_divs = materials_cell.find_elements(By.XPATH, './/div[contains(@style, "display:inline-block; margin:3px")]')
                        
                        for div in material_divs:
                            try:
                                # Get material image and name
                                material_img = div.find_element(By.XPATH, './/img')
                                material_name = material_img.get_attribute("alt")
                                
                                # Get quantity from the div at the bottom right
                                quantity_div = div.find_element(By.XPATH, './/div[contains(@style, "position:absolute; right:0px; bottom:0px")]/div')
                                quantity = quantity_div.text.strip()
                                
                                all_mastery_materials.append({
                                    "name": material_name,
                                    "quantity": quantity
                                })
                            except Exception as e:
                                print(f"Error extracting mastery material: {e}")
                    except Exception as e:
                        print(f"Error processing mastery row: {e}")
                
                print(f"Collected {len(all_mastery_materials)} total mastery materials")
                
                # For simplicity, let's assume:
                # - Each skill has 3 mastery levels
                # - Each mastery level needs 3 materials
                # - Materials are simply distributed in order
                
                # Calculate materials per mastery level
                materials_per_mastery = len(all_mastery_materials) // 9  # 9 masteries total (3 skills * 3 levels)
                print(f"Distributing {materials_per_mastery} materials per mastery level")
                
                # Distribute materials evenly across all skills and mastery levels
                material_index = 0
                skills = ["Skill1", "Skill2", "Skill3"]
                mastery_levels = ["M1", "M2", "M3"]
                
                for skill in skills:
                    for mastery in mastery_levels:
                        # Get the next batch of materials for this skill's mastery level
                        end_index = min(material_index + materials_per_mastery, len(all_mastery_materials))
                        materials_for_this_mastery = all_mastery_materials[material_index:end_index]
                        
                        # Add to mastery costs
                        mastery_costs[skill]["masteries"][mastery] = materials_for_this_mastery
                        print(f"Assigned {len(materials_for_this_mastery)} materials to {skill} {mastery}")
                        
                        # Update index for next batch
                        material_index = end_index
            
            # Save to JSON
            if upgrade_costs or any(skill["masteries"] for skill in mastery_costs.values()):
                output_dir = "operator"
                os.makedirs(output_dir, exist_ok=True)
                file_path = os.path.join(output_dir, f"{operator_name}.json")
                
                # Load existing data if available
                if os.path.exists(file_path):
                    with open(file_path, "r", encoding="utf-8") as json_file:
                        operator_data = json.load(json_file)
                else:
                    operator_data = {}
                
                # Update skill upgrade costs
                operator_data["skill_upgrade_costs"] = upgrade_costs
                
                # Remove empty skill entries from mastery_costs
                clean_mastery_costs = {}
                for skill_key, skill_data in mastery_costs.items():
                    if any(skill_data["masteries"].values()):
                        clean_mastery_costs[skill_key] = skill_data
                
                # Add mastery costs
                operator_data["mastery_costs"] = clean_mastery_costs
                
                # Save to file
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(operator_data, json_file, ensure_ascii=False, indent=4)
                
                print(f"Saved skill upgrade costs for {operator_name}")
                print(f"Regular levels: {len(upgrade_costs)}, Skills with mastery costs: {len(clean_mastery_costs)}")
                
        except Exception as e:
            print(f"Error finding or processing skill upgrades section: {e}")
    
    except Exception as e:
        print(f"Error in skill upgrade costs extraction: {e}")
    
    finally:
        if driver:
            driver.quit()
    
    return {"regular": upgrade_costs, "mastery": mastery_costs}

In [128]:
extract_skill_upgrade_costs("Aak")

Extracting skill upgrade costs for Aak...
Expanded skill upgrades section
Extracted costs for Level 2: 1 materials
Extracted costs for Level 3: 3 materials
Extracted costs for Level 4: 2 materials
Extracted costs for Level 5: 3 materials
Extracted costs for Level 6: 2 materials
Extracted costs for Level 7: 3 materials
Found 3 mastery rows in total
Collected 27 total mastery materials
Distributing 3 materials per mastery level
Assigned 3 materials to Skill1 M1
Assigned 3 materials to Skill1 M2
Assigned 3 materials to Skill1 M3
Assigned 3 materials to Skill2 M1
Assigned 3 materials to Skill2 M2
Assigned 3 materials to Skill2 M3
Assigned 3 materials to Skill3 M1
Assigned 3 materials to Skill3 M2
Assigned 3 materials to Skill3 M3
Saved skill upgrade costs for Aak
Regular levels: 6, Skills with mastery costs: 3


{'regular': {'2': [{'name': 'Skill Summary - 1', 'quantity': '5'}],
  '3': [{'name': 'Skill Summary - 1', 'quantity': '5'},
   {'name': 'Sugar Substitute', 'quantity': '5'},
   {'name': 'Diketon', 'quantity': '4'}],
  '4': [{'name': 'Skill Summary - 2', 'quantity': '8'},
   {'name': 'Polyester', 'quantity': '5'}],
  '5': [{'name': 'Skill Summary - 2', 'quantity': '8'},
   {'name': 'Oriron', 'quantity': '4'},
   {'name': 'Sugar', 'quantity': '3'}],
  '6': [{'name': 'Skill Summary - 2', 'quantity': '8'},
   {'name': 'Incandescent Alloy', 'quantity': '6'}],
  '7': [{'name': 'Skill Summary - 3', 'quantity': '8'},
   {'name': 'Aketon', 'quantity': '3'},
   {'name': 'Loxic Kohl', 'quantity': '6'}]},
 'mastery': {'Skill1': {'masteries': {'M1': [{'name': 'Skill Summary - 3',
      'quantity': '8'},
     {'name': 'Polymerized Gel', 'quantity': '4'},
     {'name': 'Orirock Cluster', 'quantity': '11'}],
    'M2': [{'name': 'Skill Summary - 3', 'quantity': '8'},
     {'name': 'Oriron Block', 'quan

In [133]:
import re

def extract_talents_with_selenium(operator_name):
    """Extract the talent information using Selenium and write it directly to JSON."""
    talents_data = []
    driver = None
    
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize WebDriver
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        print(f"Extracting talents for {operator_name}...")

        # Navigate to the operator's page
        base_url = "https://arknights.fandom.com/wiki/"
        url = f"{base_url}{operator_name.replace(' ', '_')}"
        driver.get(url)
        driver.implicitly_wait(5)
        
        # Find all talent divs (they have a specific class/structure)
        talent_divs = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div[1]/div[contains(@class, "otherskill")]')
        
        print(f"Found {len(talent_divs)} potential talent sections")
        
        # Process each talent div
        for talent_index, div in enumerate(talent_divs):
            try:
                # Get talent name from the th element
                try:
                    talent_name = div.find_element(By.XPATH, './/th').text.strip()
                    print(f"Processing talent: {talent_name}")
                except:
                    talent_name = f"Talent {talent_index + 1}"
                    print(f"Could not find talent name, using default: {talent_name}")
                
                # Find the inner div that contains the levels
                inner_div = div.find_element(By.XPATH, './/div[contains(@class, "otherskill-inner")]')
                
                # Extract all levels of the talent
                elite_levels = []
                potential_levels = []
                
                # Find all rows in the talent table
                rows = inner_div.find_elements(By.XPATH, './/tr')
                
                # Process each row (represents different elite or potential levels)
                for row_index, row in enumerate(rows):
                    try:
                        # Get the cell with the level indicator (either elite level or potential)
                        level_cell = row.find_element(By.XPATH, './/td[1]')
                        
                        # Get the description cell
                        description_cell = row.find_element(By.XPATH, './/td[2]')
                        description = description_cell.text.strip()
                        
                        # Check if this is a potential upgrade
                        is_potential = False
                        potential_text = ""
                        
                        # Method 1: Check for potential text in description or row text
                        row_text = row.text.lower()
                        if "potential" in row_text:
                            is_potential = True
                            # Extract potential level if possible
                            potential_matches = re.findall(r"potential\s+(\d+)", row_text.lower())
                            potential_text = f"Potential {potential_matches[0]}" if potential_matches else "Potential Upgrade"
                        
                        # Method 2: Look for potential icon
                        try:
                            # Try to find potential icon in the level cell
                            img_elements = level_cell.find_elements(By.XPATH, './/img')
                            for img in img_elements:
                                img_src = img.get_attribute("src") or ""
                                img_alt = img.get_attribute("alt") or ""
                                img_title = img.get_attribute("title") or ""
                                
                                # Check if this is a potential icon
                                if ("potential" in img_src.lower() or 
                                    "potential" in img_alt.lower() or 
                                    "potential" in img_title.lower()):
                                    is_potential = True
                                    potential_text = img_title or img_alt or "Potential Upgrade"
                                    break
                        except:
                            pass
                        
                        # Check if this row is an "Elite X with Potential Y" combination
                        elite_with_potential = False
                        elite_level_text = ""
                        
                        if is_potential:
                            # This is a potential upgrade
                            potential_levels.append({
                                "level": potential_text,
                                "description": description
                            })
                            print(f"Extracted potential upgrade {potential_text}: {description[:30]}...")
                        else:
                            # This is an elite level upgrade
                            try:
                                level_img = level_cell.find_element(By.XPATH, './/img')
                                level_title = level_img.get_attribute("title") or level_img.get_attribute("alt") or "Base"
                                
                                # Add to elite levels
                                elite_levels.append({
                                    "level": level_title,
                                    "description": description
                                })
                                print(f"Extracted elite level {level_title}: {description[:30]}...")
                            except:
                                print(f"Could not determine elite level in row: {row.text[:30]}...")
                        
                    except Exception as e:
                        print(f"Error processing talent level row {row_index}: {e}")
                
                # Only add the talent if we have level data
                if elite_levels or potential_levels:
                    talent_data = {
                        "name": talent_name,
                        "elite_levels": elite_levels,
                    }
                    
                    # Only add potential_levels if they exist
                    if potential_levels:
                        talent_data["potential_upgrades"] = potential_levels
                    
                    talents_data.append(talent_data)
                    print(f"Added talent {talent_name} with {len(elite_levels)} elite levels and {len(potential_levels)} potential upgrades")
                else:
                    print(f"No level data found for talent {talent_name}, skipping")
            
            except Exception as e:
                print(f"Error processing talent div #{talent_index}: {e}")
        
        # Save to JSON if talents were found
        if talents_data:
            output_dir = "operator"
            os.makedirs(output_dir, exist_ok=True)
            file_path = os.path.join(output_dir, f"{operator_name}.json")
            
            # Load existing data if available
            if os.path.exists(file_path):
                with open(file_path, "r", encoding="utf-8") as json_file:
                    operator_data = json.load(json_file)
            else:
                operator_data = {}
            
            # Update talents
            operator_data["talents"] = talents_data
            
            # Save to file
            with open(file_path, "w", encoding="utf-8") as json_file:
                json.dump(operator_data, json_file, ensure_ascii=False, indent=4)
            
            print(f"Saved {len(talents_data)} talents for {operator_name}")
        else:
            print(f"No talents were found for {operator_name}")
    
    except Exception as e:
        print(f"Error in talent extraction: {e}")
    
    finally:
        if driver:
            driver.quit()
    
    return talents_data

In [134]:
extract_talents_with_selenium("Aak")

Extracting talents for Aak...
Found 4 potential talent sections
Processing talent: Drug Cocktail Administration
Extracted elite level Base: Each attack will randomly trig...
Extracted elite level Elite 1: Each attack will randomly trig...
Extracted elite level Elite 2: Each attack will randomly trig...
Added talent Drug Cocktail Administration with 3 elite levels and 0 potential upgrades
Processing talent: Pharmaceutical Diffusion
Extracted elite level Elite 2: Increases healing effects on t...
Extracted potential upgrade Potential 5: Increases healing effects on t...
Added talent Pharmaceutical Diffusion with 1 elite levels and 1 potential upgrades
Processing talent: Neuroticism
Error processing talent level row 0: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//td[2]"}
  (Session info: chrome=135.0.7049.114); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-excep

[{'name': 'Drug Cocktail Administration',
  'elite_levels': [{'level': 'Base',
    'description': 'Each attack will randomly trigger one of the following effects: Restores own HP by 13%, that attack deals 130% damage, Slows the enemy for 1 second, or Stuns the enemy for 0.6 seconds'},
   {'level': 'Elite 1',
    'description': 'Each attack will randomly trigger one of the following effects: Restores own HP by 14%, that attack deals 140% damage, Slows the enemy for 1.2 second, or Stuns the enemy for 0.8 seconds'},
   {'level': 'Elite 2',
    'description': 'Each attack will randomly trigger one of the following effects: Restores own HP by 15%, that attack deals 150% damage, Slows the enemy for 1.4 second, or Stuns the enemy for 1 second'}]},
 {'name': 'Pharmaceutical Diffusion',
  'elite_levels': [{'level': 'Elite 2',
    'description': 'Increases healing effects on this unit by 20%'}],
  'potential_upgrades': [{'level': 'Potential 5',
    'description': 'Increases healing effects on th

In [None]:
# def extract_upgraded_modules(driver, operator_name):
#     """Extract the upgraded module information (Stages 1-3) without relying on specific module names."""
#     modules = []
    
#     try:
#         print(f"Looking for upgraded modules for {operator_name}...")
        
#         # Try different approaches to find upgraded module sections
#         module_divs = []
        
#         # Approach 1: Look for Stage icons in tables
#         try:
#             divs = driver.find_elements(By.XPATH, 
#                 '//div[contains(@class, "mw-collapsible")]//table[.//img[contains(@data-image-name, "Stage_") or contains(@alt, "Stage ")]]/..')
#             if divs:
#                 print(f"Found {len(divs)} module divs with stage icons")
#                 module_divs.extend(divs)
#         except Exception as e:
#             print(f"Error with approach 1: {e}")
        
#         # Approach 2: Direct XPath for upgraded module
#         try:
#             divs = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div[1]/div[12]')
#             if divs:
#                 print(f"Found {len(divs)} module divs with direct XPath (div[12])")
#                 for div in divs:
#                     if div not in module_divs:
#                         module_divs.append(div)
#         except Exception as e:
#             print(f"Error with approach 2: {e}")
        
#         # Process each potential upgraded module div
#         for div_index, div in enumerate(module_divs):
#             try:
#                 # First try to find the module name before expanding
#                 # Use the specific XPath for the module name
#                 module_name = f"{operator_name}'s Upgraded Module"  # Default fallback
                
#                 try:
#                     # Check if there's a div with a table 
#                     module_table_xpath = './/table[1]'
#                     module_tables = div.find_elements(By.XPATH, module_table_xpath)
                    
#                     if module_tables:
#                         module_table = module_tables[0]
                        
#                         # Try to get the module name using the exact XPath you provided
#                         try:
#                             name_div = module_table.find_element(By.XPATH, './tbody/tr[1]/th[1]/div[2]')
#                             if name_div:
#                                 extracted_name = name_div.text.strip()
#                                 if extracted_name:
#                                     module_name = extracted_name
#                                     print(f"Found module name with specific XPath: {module_name}")
#                         except Exception as e:
#                             print(f"Could not extract module name with specific XPath before expanding: {e}")
#                 except Exception as e:
#                     print(f"Error locating module table before expanding: {e}")
                
#                 # Now check if we need to expand the content
#                 try:
#                     toggle_buttons = div.find_elements(By.XPATH, './/button[contains(@class, "mw-collapsible-toggle")]/span[contains(text(), "Show")]')
#                     if toggle_buttons:
#                         print("Found collapsed module, expanding it")
#                         driver.execute_script("arguments[0].click();", toggle_buttons[0])
#                         time.sleep(1)  # Wait for content to expand
#                 except Exception as e:
#                     print(f"Error checking/expanding module: {e}")
                
#                 # Find the module table
#                 module_table = None
#                 try:
#                     # Try direct table in div
#                     tables = div.find_elements(By.XPATH, './/table')
#                     if tables:
#                         # Verify this is an upgraded module table by looking for stage icons
#                         for table in tables:
#                             stage_icons = table.find_elements(By.XPATH, './/img[contains(@data-image-name, "Stage_") or contains(@alt, "Stage ")]')
#                             if stage_icons:
#                                 module_table = table
#                                 print(f"Found upgraded module table with {len(stage_icons)} stage icons")
#                                 break
#                 except Exception as e:
#                     print(f"Error finding module table: {e}")
                
#                 if not module_table:
#                     print("Could not find upgraded module table, skipping")
#                     continue
                
#                 # Try again to extract module name after expansion using the specific XPath if we didn't get it before
#                 if module_name == f"{operator_name}'s Upgraded Module":
#                     try:
#                         # Try your specific XPath adapted to our context
#                         name_div = module_table.find_element(By.XPATH, './tbody/tr[1]/th[1]/div[2]')
#                         if name_div:
#                             extracted_name = name_div.text.strip()
#                             if extracted_name:
#                                 module_name = extracted_name
#                                 print(f"Found module name with specific XPath after expanding: {module_name}")
#                     except Exception as e:
#                         print(f"Could not extract module name with specific XPath after expanding: {e}")
                        
#                         # Try alternative approaches
#                         try:
#                             name_div = module_table.find_element(By.XPATH, './/div[contains(@style, "padding-top:5px")]')
#                             if name_div:
#                                 extracted_name = name_div.text.strip()
#                                 if extracted_name:
#                                     module_name = extracted_name
#                                     print(f"Found module name with alternative XPath: {module_name}")
#                         except Exception as e:
#                             print(f"Could not extract module name with alternative XPath: {e}")
                
#                 # Get module image URL
#                 image_url = ""
#                 try:
#                     module_img = module_table.find_element(By.XPATH, './/figure//img')
#                     image_url = module_img.get_attribute("src")
#                     print(f"Found module image URL: {image_url}")
#                 except Exception as e:
#                     print(f"Could not extract module image URL: {e}")
                
#                 # Process each stage (Stage 1, 2, 3)
#                 stage_rows = module_table.find_elements(By.XPATH, './tbody/tr')
                
#                 # Skip first row if it contains the module name/image (has rowspan)
#                 first_row_is_header = False
#                 try:
#                     if len(stage_rows) > 0 and stage_rows[0].find_elements(By.XPATH, './/th[@rowspan]'):
#                         first_row_is_header = True
#                         print("First row is a header row with rowspan")
#                 except:
#                     pass
                
#                 stage_data_rows = stage_rows[1:] if first_row_is_header else stage_rows
                
#                 # Process each stage row
#                 for row_index, row in enumerate(stage_data_rows):
#                     try:
#                         # Extract stage number
#                         stage_title = f"Stage {row_index + 1}"  # Default
#                         try:
#                             stage_cell = row.find_element(By.XPATH, './th')
#                             stage_img = stage_cell.find_element(By.XPATH, './/img')
#                             stage_title = stage_img.get_attribute("title") or stage_img.get_attribute("alt") or stage_title
#                             print(f"Processing {stage_title}")
#                         except Exception as e:
#                             print(f"Could not determine stage, using default: {stage_title}")
                        
#                         # Extract stage details
#                         details_text = ""
#                         trait_text = ""
#                         improved_text = ""
#                         attributes = {}
                        
#                         try:
#                             details_td = row.find_element(By.XPATH, './td')
#                             details_text = details_td.text.strip()
                            
#                             # Parse effects and attributes
#                             trait_match = re.search(r'New trait:\s*(.*?)(?=\n|$)', details_text)
#                             trait_text = trait_match.group(1).strip() if trait_match else ""
                            
#                             improved_match = re.search(r'improved:\s*(.*?)(?=\n|Attributes increased|$)', details_text, re.DOTALL)
#                             improved_text = improved_match.group(1).strip() if improved_match else ""
                            
#                             # Extract attribute increases
#                             attr_matches = re.finditer(r'([\w\s\.]+)\s+\+([\d]+)', details_text)
#                             for match in attr_matches:
#                                 attr_name = match.group(1).strip()
#                                 attr_value = match.group(2).strip()
#                                 attributes[attr_name] = f"+{attr_value}"
                            
#                             print(f"Extracted details for {stage_title} ({len(details_text)} chars)")
#                             if trait_text:
#                                 print(f"New trait: {trait_text[:30]}...")
#                             if improved_text:
#                                 print(f"Improved skill: {improved_text[:30]}...")
#                             if attributes:
#                                 print(f"Attributes: {attributes}")
#                         except Exception as e:
#                             print(f"Could not extract stage details: {e}")
                        
#                         # Create module data for this stage
#                         module_data = {
#                             "type": "Upgraded",
#                             "name": module_name,
#                             "image_url": image_url,
#                             "stage": stage_title,
#                             "details": details_text,
#                             "new_trait": trait_text,
#                             "improved_skill": improved_text,
#                             "attributes": attributes
#                         }
                        
#                         # Add to modules list
#                         modules.append(module_data)
#                         print(f"Added upgraded module stage: {stage_title}")
                        
#                     except Exception as e:
#                         print(f"Error processing stage row #{row_index}: {e}")
                
#             except Exception as e:
#                 print(f"Error processing upgraded module div #{div_index}: {e}")
        
#     except Exception as e:
#         print(f"Error extracting upgraded modules: {e}")
    
#     return modules

In [148]:
# extract_modules_with_selenium("Aak")

Extracting modules for Aak...
Looking for original module for Aak...
Found 1 divs with 'Original' text in header
Found 1 divs with direct XPath (div[11])
Could not find module table, skipping
Found collapsed module, expanding it
Found table directly in div
Found module type: Original
Could not extract module name, using default: Aak's Badge
Could not extract module description: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//td[@valign="top"]"}
  (Session info: chrome=135.0.7049.114); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x55fbbbdbd75a <unknown>
#1 0x55fbbb8704b0 <unknown>
#2 0x55fbbb8c19b3 <unknown>
#3 0x55fbbb8c1ba1 <unknown>
#4 0x55fbbb8b5476 <unknown>
#5 0x55fbbb8e75bd <unknown>
#6 0x55fbbb8b536a <unknown>
#7 0x55fbbb8e775e <unknown>
#8 0x55fbbb90d5e0 <unknown>
#9 0x55fbbb8e7363 <unknown>
#10 0x55fbbb8b3d63 <unknown>
#11 0x5

[{'type': 'Upgraded',
  'name': "Aak's Upgraded Module",
  'image_url': 'data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D',
  'stage': 'Stage 2',
  'details': '',
  'new_trait': '',
  'improved_skill': '',
  'attributes': {}},
 {'type': 'Upgraded',
  'name': "Aak's Upgraded Module",
  'image_url': 'data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D',
  'stage': 'Stage 3',
  'details': '',
  'new_trait': '',
  'improved_skill': '',
  'attributes': {}}]