In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import os
import subprocess

# First, make sure Chrome is installed for WSL2
try:
    # Check if Chrome is installed
    chrome_check = subprocess.run(['which', 'google-chrome'], capture_output=True, text=True)
    if not chrome_check.stdout:
        print("Chrome not found. Installing Chrome dependencies for WSL...")
        %pip install --upgrade pip webdriver-manager selenium
        print("You may need to install Chrome in WSL with: sudo apt update && sudo apt install -y wget unzip fonts-liberation libasound2 libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libcairo2 libcups2 libcurl3-gnutls libdrm2 libgbm1 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 xdg-utils && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && sudo dpkg -i google-chrome-stable_current_amd64.deb && sudo apt-get -f install")
except Exception as e:
    print(f"Error checking Chrome installation: {e}")

# Ensure ChromeDriver has executable permissions after download
os.environ['WDM_LOG_LEVEL'] = '0'  # Suppress webdriver-manager logs
os.environ['WDM_PROGRESS_BAR'] = '0'  # Disable progress bar

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser window)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Check the Chrome version and install matching driver
try:
    # Get Chrome version
    chrome_version_cmd = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
    chrome_version = chrome_version_cmd.stdout.strip()
    print(f"Detected Chrome version: {chrome_version}")
except:
    print("Could not detect Chrome version. Make sure Chrome is installed.")

# Initialize the WebDriver
try:
    driver_path = ChromeDriverManager().install()
    print(f"Driver installed at: {driver_path}")
    
    # Make sure the driver is executable
    os.chmod(driver_path, 0o755)
    
    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
except Exception as e:
    print(f"Error initializing WebDriver: {e}")
    print("If you're using WSL, you may need to install Chrome in the WSL environment")

# URL to scrape
url = "https://arknights.fandom.com/wiki/Operator/6-star"

try:
    # Navigate to the page
    driver.get(url)
    time.sleep(3)  # Wait for page to load
    
    # Find all operator name elements using the XPath
    operator_elements = driver.find_elements(By.XPATH, '/html/body/div[4]/div[4]/div[2]/main/div[3]/div/div[1]/div/div/table/tbody/tr/td[2]/a')
    
    # Extract the text (operator names) into a list
    operator_names = [element.text for element in operator_elements if element.text]
    
    print("6-Star Operators:")
    for name in operator_names:
        print(name)
        
    print(f"\nTotal 6-star operators found: {len(operator_names)}")

finally:
    # Close the browser
    driver.quit()

# The operator_names list now contains all the 6-star operator names

Detected Chrome version: Google Chrome 135.0.7049.114
Driver installed at: /home/semicolon/.wdm/drivers/chromedriver/linux64/135.0.7049.114/chromedriver-linux64/chromedriver
6-Star Operators:
Aak
Angelina
Archetto
Ash
Bagpipe
Blaze
Blemishine
Carnelian
Ceobe
Ch'en
Ch'en the Holungday
Chongyue
Dorothy
Dusk
Ebenholz
Eunectes
Executor the Ex Foedere
Exusiai
Eyjafjalla
Eyjafjalla the Hvít Aska
Fartooth
Fiammetta
Flametail
Gavial the Invincible
Gladiia
Gnosis
Goldenglow
Hellagur
Ho'olheyak
Horn
Hoshiguma
Ifrit
Ines
Irene
Jessica the Liberated
Kal'tsit
Kirin R Yato
Lee
Lin
Ling
Lumen
Magallan
Mizuki
Mostima
Mountain
Mudrock
Muelsyse
Młynar
Nearl the Radiant Knight
Nian
Nightingale
Pallas
Passenger
Penance
Phantom
Pozëmka
Qiubai
Reed the Flame Shadow
Rosa
Rosmontis
Saga
Saileach
Saria
Schwarz
Shining
Siege
Silence the Paradigmatic
SilverAsh
Skadi
Skadi the Corrupting Heart
Specter the Unchained
Stainless
Surtr
Suzuran
Swire the Elegant Wit
Texas the Omertosa
Thorns
Typhon
Vigil
W
Weedy

Total

In [39]:
def extract_range(soup):
    """Extract the range information from the operator's page."""
    range_data = {}
    range_table = soup.find('table', {'class': 'mrfz-btable'})
    if range_table:
        columns = range_table.find_all('td', valign="top")
        for col in columns:
            # Extract the title (e.g., "Base", "Elite 1", "Elite 2")
            title_element = col.find('b')
            if not title_element:
                continue
            title = title_element.text.strip()

            # Extract the grid cells
            grid_cells = col.find_all('span', style=True)
            range_representation = ''.join(
                '■' if 'background: #27A6F3' in cell['style'] else '□'
                for cell in grid_cells
            )

            # Add to range data
            range_data[title] = range_representation
    return range_data

In [40]:
def operator_info(operator_name):
    # Construct the URL
    base_url = "https://arknights.fandom.com/wiki/"
    url = f"{base_url}{operator_name.replace(' ', '_')}"

    try:
        # Fetch the page content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for HTTP issues
        soup = BeautifulSoup(response.text, 'html.parser')

        # Scrape operator information
        operator_data = {}

        # Operator name
        name_element = soup.find('h2', {'data-source': 'name'})
        operator_data["name"] = name_element.text if name_element else "N/A"

        # File number
        file_no_element = soup.find('div', {'data-source': 'fileno'})
        operator_data["file_no"] = file_no_element.div.text if file_no_element else "N/A"

        # Record
        record_element = soup.find('div', {'data-source': 'record'})
        operator_data["record"] = record_element.a.text if record_element and record_element.a else "N/A"

        # Paradox Simulation
        simulation_element = soup.find('div', {'data-source': 'simulation'})
        operator_data["paradox_simulation"] = simulation_element.a.text if simulation_element and simulation_element.a else "N/A"

        # Illustrator
        illustrator_element = soup.find('div', {'data-source': 'illustrator'})
        operator_data["illustrator"] = illustrator_element.div.text if illustrator_element else "N/A"

        # Basic Information
        basic_info = {}
        for field in ["gender", "experience", "birthplace", "birthdate", "race", "height", "oripathy"]:
            element = soup.find('div', {'data-source': field})
            basic_info[field] = element.div.text if element else "N/A"
        operator_data["basic_info"] = basic_info

        # Other Names
        other_names = {}
        for lang in ["cnname", "jpname", "krname"]:
            element = soup.find('div', {'data-source': lang})
            other_names[lang] = element.div.text if element else "N/A"
        operator_data["other_names"] = other_names

        # Character Voices
        character_voices = {}
        for lang in ["jpcv", "cncv", "encv", "krcv"]:
            element = soup.find('div', {'data-source': lang})
            character_voices[lang] = element.div.text if element else "N/A"
        operator_data["character_voices"] = character_voices

        # Physical Examination
        physical_exam = {}
        for attribute in ["strength", "mobility", "endurance", "tactical", "skill", "originium"]:
            element = soup.find('div', {'data-source': attribute})
            physical_exam[attribute] = element.div.text if element else "N/A"
        operator_data["physical_examination"] = physical_exam

        # Related Characters
        related_characters = []
        related_elements = soup.select("td.pi-horizontal-group-item.pi-data-value.pi-font.pi-border-color.pi-item-spacing div div a")
        related_characters = [element['title'] for element in related_elements if 'title' in element.attrs]
        operator_data["related_characters"] = related_characters

        # Range Extraction
        operator_data["range"] = extract_range(soup)

        # Save to JSON file
        output_dir = "operator"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"{operator_name}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(operator_data, json_file, ensure_ascii=False, indent=4)

        print(f"Operator data for {operator_name} saved to {output_path}")

    except Exception as e:
        print(f"Error scraping data for {operator_name}: {e}")

In [None]:
operator_info("Aak")
# for op in operator_names:
#     try:
#         result = operator_info(op)
#         if "error" in result:
#             print(f"Error for {op}: {result['error']}")
#         else:
#             print(f"Successfully retrieved data for {op}")
#     except Exception as e:
#         print(f"Failed to retrieve data for {op}: {e}")

Operator data for Aak saved to operator/Aak.json
Failed to retrieve data for Aak: argument of type 'NoneType' is not iterable
Operator data for Angelina saved to operator/Angelina.json
Failed to retrieve data for Angelina: argument of type 'NoneType' is not iterable
Operator data for Archetto saved to operator/Archetto.json
Failed to retrieve data for Archetto: argument of type 'NoneType' is not iterable
Operator data for Ash saved to operator/Ash.json
Failed to retrieve data for Ash: argument of type 'NoneType' is not iterable
Operator data for Bagpipe saved to operator/Bagpipe.json
Failed to retrieve data for Bagpipe: argument of type 'NoneType' is not iterable
Operator data for Blaze saved to operator/Blaze.json
Failed to retrieve data for Blaze: argument of type 'NoneType' is not iterable
Operator data for Blemishine saved to operator/Blemishine.json
Failed to retrieve data for Blemishine: argument of type 'NoneType' is not iterable
Operator data for Carnelian saved to operator/Car