# Selenium 4 Python Cheat Sheet for Web Scraping

## 1. Setup & Configuration

This section covers the initial setup of the Selenium WebDriver for Chrome, including using the `Service` object for modern initialization and `Options` to customize the browser session.

In [None]:
# Import necessary classes from the selenium library
from selenium import webdriver
# Import the Service class to manage the browser driver
from selenium.webdriver.chrome.service import Service
# Import the Options class to configure the browser
from selenium.webdriver.chrome.options import Options
# Import By to locate elements
from selenium.webdriver.common.by import By
# Import webdriver-manager to automatically manage the driver executable
from webdriver_manager.chrome import ChromeDriverManager

# Command-line switches for the Chrome session: no visible UI, a fixed
# window size, and a custom user-agent string.
CHROME_ARGUMENTS = (
    "--headless",
    "--window-size=1920,1080",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
)

# Collect the switches on a single Options object.
chrome_options = Options()
for chrome_argument in CHROME_ARGUMENTS:
    chrome_options.add_argument(chrome_argument)

# webdriver-manager installs the chromedriver executable and returns its
# path, which the Service object then uses to launch the driver.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

# Start the browser session with the configured service and options.
driver = webdriver.Chrome(options=chrome_options, service=service)

print("WebDriver initialized successfully in headless mode.")

## 2. Navigation

Basic browser navigation commands.

In [None]:
# Two pages on the demo site, so the browser has a history to move through.
url = "http://quotes.toscrape.com/"
login_url = "http://quotes.toscrape.com/login"

# Load the home page and report where the browser ended up.
driver.get(url)
print(f"Current URL: {driver.current_url}")
print(f"Page Title: {driver.title}")

# Each step changes the current page; the URL is reported after every step
# so the effect of get/back/forward is visible.
history_steps = (
    ("Navigated to", lambda: driver.get(login_url)),
    ("After going back", driver.back),
    ("After going forward", driver.forward),
)
for step_label, perform_step in history_steps:
    perform_step()
    print(f"{step_label}: {driver.current_url}")

# Reload whatever page is currently shown.
driver.refresh()
print("Page refreshed.")

## 3. Locating Elements

Selenium provides various strategies to locate elements on a page. Using the `By` class is the modern and recommended approach.

In [None]:
# Start from the home page before locating anything.
home_url = "http://quotes.toscrape.com/"
driver.get(home_url)

# --- Single element: find_element with a (strategy, value) locator ---

# CSS selector strategy: an element with class "text".
quote_locator = (By.CSS_SELECTOR, ".text")
first_quote = driver.find_element(*quote_locator)
print(f"First Quote (by CSS Selector): {first_quote.text}")

# XPath strategy: a <small> element whose class attribute is "author".
author_locator = (By.XPATH, "//small[@class='author']")
first_author = driver.find_element(*author_locator)
print(f"First Author (by XPath): {first_author.text}")

# --- Multiple elements: find_elements returns a list ---

# Class-name strategy: every element with class "tag".
tags = driver.find_elements(By.CLASS_NAME, "tag")
tag_count = len(tags)
print(f"Found {tag_count} tags (by Class Name):")
for tag_element in tags:
    print(f"- {tag_element.text}")

# Tag-name strategy: every <div> on the page (not only the quote containers).
divs = driver.find_elements(By.TAG_NAME, "div")
div_count = len(divs)
print(f"Found {div_count} div elements on the page (by Tag Name).")

## 4. Interaction

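How to interact with web elements: typing text into inputs, clearing them, and clicking buttons. `Select` (for dropdowns) and `Keys` (for special keyboard input) are imported for reference, but the login page used below does not exercise them.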

In [None]:
# Import the Select class for handling dropdowns
from selenium.webdriver.support.ui import Select
# Import Keys for sending special keyboard inputs
from selenium.webdriver.common.keys import Keys

# The login page provides a small form to practise on.
driver.get("http://quotes.toscrape.com/login")

# Element id -> text to type. Both inputs are located by ID and filled the
# same way, so they are handled in one loop.
form_values = {"username": "myusername", "password": "mypassword"}
form_fields = {}
for field_id, typed_text in form_values.items():
    field = driver.find_element(By.ID, field_id)
    field.send_keys(typed_text)
    print(f"Typed '{typed_text}' into the {field_id} field.")
    form_fields[field_id] = field

username_field = form_fields["username"]
password_field = form_fields["password"]

# Empty the username input, then type a different value into it.
username_field.clear()
print("Cleared the username field.")
username_field.send_keys("another_user")

# Locate the submit input with a CSS attribute selector and click it.
login_button = driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
login_button.click()
print("Clicked the login button.")

## 5. Waiting Strategies

Waiting is crucial for scraping dynamic web pages. Modern web pages load data asynchronously. If you try to access an element before it's loaded, your script will fail.

**Implicit Waits:** Tell the WebDriver to keep polling the DOM for up to a set amount of time whenever an element it is asked to find is not immediately available. This is a global setting that applies to every element lookup, and it is generally discouraged because it can make scripts slow and unreliable. It also only waits for an element to be present, so it can't handle cases where an element is present but not yet interactive.

**Explicit Waits:** The best approach. You tell the WebDriver to wait for a certain condition to be met before proceeding. This is more precise and reliable.

In [None]:
# Import WebDriverWait for explicit waits
from selenium.webdriver.support.ui import WebDriverWait
# Import expected_conditions to define wait conditions
from selenium.webdriver.support import expected_conditions as EC
# Import the exception WebDriverWait.until raises when the timeout expires
from selenium.common.exceptions import TimeoutException

# Navigate to a page that might have dynamically loading content
driver.get("http://quotes.toscrape.com/")

# --- Explicit Wait Example ---
# Create a WebDriverWait instance, waiting up to 10 seconds
wait = WebDriverWait(driver, 10)
try:
    # Block until an element with class 'author' is present in the DOM.
    # Only this call sits inside the try, so unrelated errors are not
    # mistaken for a wait failure.
    author_element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "author"))
    )
except TimeoutException as e:
    # The condition was not met within the timeout. Any other exception
    # (e.g. a broken session) is deliberately left to propagate.
    print(f"Explicit wait failed: {e}")
else:
    # The wait returned the located element; print its text
    print(f"Explicit wait successful. First author found: {author_element.text}")

# --- Common Expected Conditions ---
# element_to_be_clickable((By.ID, 'my-button'))
# visibility_of_element_located((By.CSS_SELECTOR, '.my-class'))
# text_to_be_present_in_element((By.ID, 'element-id'), 'expected text')

## 6. Data Extraction

Once you've located elements, you need to extract data from them, like text or attributes.

In [None]:
# Navigate to the main page
driver.get("http://quotes.toscrape.com/")

# Find all quote container elements
quotes = driver.find_elements(By.CLASS_NAME, "quote")

# Loop through the first 5 quotes (fewer if the page has fewer)
for quote in quotes[:5]:
    # Search inside this quote element only, so text and author stay paired
    text = quote.find_element(By.CLASS_NAME, "text").text
    author = quote.find_element(By.CLASS_NAME, "author").text
    # Interpolate the extracted `text` variable; the previous version
    # interpolated the string literal "text" and so printed the word
    # "text" for every quote.
    print(f'"{text}" - {author}')

# Find the "About" link
# NOTE(review): By.LINK_TEXT looks for a link with this visible text; confirm
# the page actually has a link labelled "About", otherwise this lookup raises.
about_link = driver.find_element(By.LINK_TEXT, "About")
# Get the value of the 'href' attribute
href = about_link.get_attribute("href")
# Print the extracted URL
print(f"'About' link points to: {href}")

## 7. Advanced Handling

Handling JavaScript alerts, iframes, and executing custom JavaScript.

In [None]:
# --- JavaScript Execution ---
# A script that ends in `return` hands its value back to Python; here it is
# the document title.
title_script = "return document.title;"
js_title = driver.execute_script(title_script)
print(f"Page title from JS: {js_title}")

# A script run only for its effect: scroll to the full height of the body.
scroll_script = "window.scrollTo(0, document.body.scrollHeight);"
driver.execute_script(scroll_script)
print("Scrolled to the bottom of the page using JavaScript.")

# --- Handling Alerts (Example) --- 
# Note: quotes.toscrape.com doesn't have alerts, this is a template.
# driver.get("page_with_alert.html")
# driver.find_element(By.TAG_NAME, 'button').click()
# alert = driver.switch_to.alert
# print(f"Alert text: {alert.text}")
# alert.accept() # Or alert.dismiss()

# --- Handling iFrames (Example) --- 
# Note: This is also a template.
# driver.get("page_with_iframe.html")
# Switch to the iframe by its ID or name
# driver.switch_to.frame("my_iframe")
# Now you can find elements inside the iframe
# iframe_element = driver.find_element(By.ID, "inside_iframe_id")
# To switch back to the main content
# driver.switch_to.default_content()

## 8. Cleanup

It's important to properly close the browser session to free up resources.
- `driver.close()`: Closes the current browser window.
- `driver.quit()`: Closes all browser windows and safely ends the WebDriver session.

In [None]:
# End the WebDriver session; quit() closes every browser window, not just
# the current one.
driver.quit()

# Confirm that the session is over.
print("WebDriver session has been closed and resources released.")