# Selenium

Using Selenium for web scraping from dynamic websites.

Press 6 + Enter in the launch menu to start a Docker container running Selenium.

![](../../install/figures/rsm-launch-menu-macos-arm.png)

In [1]:
import os
import signal
import sys
import time

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class WebDriverManager:
    """Create and own a remote, headless Firefox Selenium session.

    On construction the manager builds the browser options, installs
    SIGINT/SIGTERM handlers that quit the driver before exiting, and opens a
    ``webdriver.Remote`` session on the first endpoint that answers an HTTP
    request.

    The manager can also be used as a context manager so the session is
    always released::

        with WebDriverManager() as manager:
            manager.get_driver().get(url)

    Args:
        endpoints: Optional iterable of Selenium base URLs to try, in order.
            Defaults to ``DEFAULT_ENDPOINTS``.

    Raises:
        ConnectionError: If none of the endpoints responds.
        Exception: Any error raised while creating or configuring the remote
            session is re-raised unchanged after the driver has been quit.
    """

    # Base URLs probed in order; "/wd/hub" is appended when connecting.
    DEFAULT_ENDPOINTS = (
        "http://rsm-selenium0:4444",
        "http://rsm-selenium:4444",
        "http://127.0.0.1:4444",
    )

    def __init__(self, endpoints=None):
        self.driver = None
        self.endpoints = (
            tuple(endpoints) if endpoints is not None else self.DEFAULT_ENDPOINTS
        )
        self.options = self._setup_options()
        self._setup_signal_handlers()
        self._setup_driver()

    def __enter__(self):
        """Return the manager itself; use ``get_driver()`` for the session."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the driver on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def _setup_options(self):
        """Build the Firefox ``Options`` used for every session."""
        options = Options()
        options.add_argument("--headless")
        # NOTE(review): the next three look like Chromium flags; confirm that
        # the Firefox image actually needs (or honours) them.
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        return options

    def _setup_signal_handlers(self):
        """Install SIGINT/SIGTERM handlers that quit the driver, then exit(1).

        This replaces whatever handlers were installed before, process-wide.
        """

        def signal_handler(signum, frame):
            # close() is best-effort, so sys.exit() is always reached.
            self.close()
            sys.exit(1)

        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                signal.signal(sig, signal_handler)
            except ValueError:
                # signal.signal() raises ValueError when called outside the
                # main thread; the manager is still usable without handlers.
                break

    def _check_selenium_endpoint(self, url):
        """Return True if ``url`` answers a GET within 2 seconds.

        The HTTP status code is not inspected: any response counts.
        """
        try:
            requests.get(url, timeout=2)
            return True
        except requests.RequestException:
            return False

    def _cleanup_existing_sessions(self, url):
        """Send ``DELETE {url}/wd/hub/session`` and pause for one second.

        Returns True if the request completed, False if it failed.

        NOTE(review): the W3C WebDriver protocol deletes sessions by id
        (``DELETE /session/{id}``); confirm that this id-less request has any
        effect on the target Selenium server.
        """
        try:
            requests.delete(f"{url}/wd/hub/session", timeout=2)
            time.sleep(1)
            return True
        except requests.RequestException:
            return False

    def _setup_driver(self):
        """Connect to the first responsive endpoint and configure timeouts.

        Raises:
            ConnectionError: If no endpoint responds.
        """
        try:
            for url in self.endpoints:
                if not self._check_selenium_endpoint(url):
                    continue

                # Cleanup any existing sessions
                self._cleanup_existing_sessions(url)

                self.driver = webdriver.Remote(
                    command_executor=f"{url}/wd/hub", options=self.options
                )

                self.driver.set_script_timeout(20)  # seconds
                self.driver.set_page_load_timeout(30)  # seconds
                self.driver.implicitly_wait(10)  # seconds

                return  # Successfully created driver

            raise ConnectionError("No Selenium endpoint available")

        except Exception:
            # Release a half-configured session without masking the original
            # error, and re-raise it with its traceback intact.
            self.close()
            raise

    def close(self):
        """Quit the driver if there is one; safe to call more than once.

        Errors raised by ``driver.quit()`` are deliberately ignored: this is
        cleanup code and the session may already be gone.
        """
        # getattr guards against a partially initialised instance (__del__).
        driver = getattr(self, "driver", None)
        self.driver = None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            pass

    def reset_selenium(self):
        """Quit the current session, wait two seconds and open a new one.

        Returns:
            True if a driver is available afterwards.
        """
        self.close()
        time.sleep(2)
        self._setup_driver()
        return self.driver is not None

    def __del__(self):
        self.close()

    def get_driver(self):
        """Return the managed driver (``None`` after ``close()``)."""
        return self.driver


def start_driver():
    """Build a ``WebDriverManager`` and hand back the driver it created.

    Only the driver is returned, not the manager; the notebook cells that
    call this function release the session themselves with ``driver.quit()``.
    """
    return WebDriverManager().get_driver()

In [2]:
# Start a driver and quit it straight away -- presumably a quick check that
# the Selenium container is reachable before running the examples below.
driver = start_driver()
driver.quit()

In [3]:
# Address of the page to scrape. Exactly one `URL = ...` line should be active;
# the commented-out lines are alternatives for other setups.

# if running without docker
# URL = "http://127.0.0.1:8123"

# if running in docker, use the container name as the hostname
# this uses the shared 'rsm-docker' network to connect
# URL = "http://rsm-msba-k8s-latest:8123"

URL = "https://rsm-shiny-02.ucsd.edu/selenium/"

In [4]:
print("=== BeautifulSoup results ===")
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the demo page.
response = requests.get(URL, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# BeautifulSoup only sees the static HTML returned by the server (e.g. the
# title); it does not run JavaScript, so dynamically filled elements are empty.
title_tag = soup.find("title")
print(
    "Title found by BeautifulSoup:",
    title_tag.text if title_tag else "No title found",
)
print("Button found by BeautifulSoup:", bool(soup.find("button", id="showText")))

# The placeholder element exists in the static HTML, but it has no text yet.
dynamic_element = soup.find(id="dynamicText")
print("Dynamic text element found by BeautifulSoup:", bool(dynamic_element))
print("Can BeautifulSoup find the dynamic text?", dynamic_element)

=== BeautifulSoup results ===
Title found by BeautifulSoup: Scraping Demo Page
Button found by BeautifulSoup: True
Dynamic text element found by BeautifulSoup: True
Can BeautifulSoup find the dynamic text? <div class="shiny-text-output" id="dynamicText"></div>


In [5]:
driver = start_driver()
try:
    print("=== Selenium Results ===")

    driver.get(URL)

    # Explicit wait helper: polls a condition for up to 5 seconds.
    wait = WebDriverWait(driver, 5)

    # Check for title
    print("Title found by Selenium:", driver.title)

    # Look for the button and make sure it can actually be clicked
    button = wait.until(EC.element_to_be_clickable((By.ID, "showText")))
    print("Button found by Selenium:", bool(button))

    # Click the button
    button.click()

    # The #dynamicText element already exists (empty) in the static HTML, so
    # waiting for its *presence* returns immediately. Wait until it actually
    # contains text instead of sleeping for a fixed time; the element is
    # looked up on every poll so a re-rendered node is picked up as well.
    dynamic_text = wait.until(
        lambda drv: drv.find_element(By.ID, "dynamicText").text.strip()
    )
    print("Dynamic text after click:", dynamic_text)
finally:
    # Always release the remote browser session, even if a step above failed.
    driver.quit()

=== Selenium Results ===
Title found by Selenium: Scraping Demo Page
Button found by Selenium: True
Dynamic text after click: This text was dynamically generated!


The example below requires a `.env` file that defines the following variables with your UCSD username and password. The most common place for this file is your home directory (e.g., `~/.env`):

* `SELENIUM_USERNAME=<your_username>`
* `SELENIUM_PASSWORD=<your_password>`

In [6]:
URL = "https://rsm-shiny-02.ucsd.edu/selenium_auth/"
load_dotenv()

driver = start_driver()
try:
    print("=== Selenium results with authentication ===")

    driver.get(URL)
    login_wait = WebDriverWait(driver, 5)
    # Slightly longer budget for the app to load after the login redirect;
    # this replaces the fixed sleeps that used to follow each click.
    wait = WebDriverWait(driver, 10)

    # Check if we need to log in. Only a timeout means "no login form"; any
    # other error (for example missing credentials) must not be hidden.
    try:
        login_button = login_wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "input[type='submit'][value='Log in']")
            )
        )
    except TimeoutException:
        login_button = None
        print("No login required or already logged in")

    if login_button is not None:
        username = os.getenv("SELENIUM_USERNAME")
        password = os.getenv("SELENIUM_PASSWORD")

        if not username or not password:
            raise ValueError("Missing credentials in .env file")

        # find username and password fields and enter credentials
        username_field = driver.find_element(By.CSS_SELECTOR, "input[type='text']")
        password_field = driver.find_element(
            By.CSS_SELECTOR, "input[type='password']"
        )
        username_field.send_keys(username)
        password_field.send_keys(password)
        login_button.click()

    button = wait.until(EC.element_to_be_clickable((By.ID, "showText")))
    print("Button found by Selenium:", bool(button))

    button.click()

    print("Title found by selenium:", driver.title)

    # Wait for the element to contain text rather than merely exist, since
    # the text is only filled in after the click.
    dynamic_text = wait.until(
        lambda drv: drv.find_element(By.ID, "dynamicText").text.strip()
    )
    print("Dynamic text after click:", dynamic_text)
finally:
    # Always release the remote browser session, even if a step above failed.
    driver.quit()

=== Selenium results with authentication ===
Button found by Selenium: True
Title found by selenium: Scraping Demo Page
Dynamic text after click: This text was dynamically generated!
