**Python Web Scraper Overview (spunout.ie)**

This script scrapes articles from **spunout.ie** using a **Two-Pass System**: the first pass collects article URLs and metadata, and the second pass extracts the full article text.

**Libraries Used**

1. **Selenium** – Automates a real browser to load JavaScript-rendered site content.
2. **Pandas** – Structures scraped data and exports to CSV.
3. **time** – Adds short fixed delays (`time.sleep`) so pages have time to load before they are read.

**How It Works**

1. **Setup**

   * Launches a **headless Chrome** browser with stability flags for server-safe execution.
2. **URL Discovery**

   * Starts on `/information`, finds all `/category/` links, removes duplicates.
3. **Metadata Collection**

   * Iterates category pages, handles pagination, extracts **Title** and **URL**
   * Determines **Category** and **Topic** from URL format
   * Uses a tracking set to avoid duplicate URLs
4. **Content Extraction**

   * Opens each article
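   * Waits up to 10 seconds for the `content_inner` element to appear
   * Collapses all runs of whitespace in the extracted text into single spaces, and records `N/A` if extraction fails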
5. **Export**

   * Saves results to `./data/spunout_data.csv` using Pandas.


In [None]:
import time
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser configuration: assemble the Chrome flags before the driver is created.
options = Options()
for chrome_flag in (
    "--headless",               # run without a visible browser window
    "--no-sandbox",             # bypass the OS security model (often needed in Docker-like environments)
    "--disable-dev-shm-usage",  # work around limited shared memory in containers
):
    options.add_argument(chrome_flag)

# Launch Chrome with the flags configured above.
driver = webdriver.Chrome(options=options)

# Result columns, filled in lockstep: one entry per discovered article.
titles = []
categories = []
topics = []
texts = []
urls_to_visit = []

# Article URLs that have already been recorded, used to skip duplicates.
seen_urls = set()

# Phase 1: Discover all listing pages (main categories and sub-categories)
print("Step 1: Finding all listing pages...")
# Navigate to the main information hub of the website
driver.get("https://spunout.ie/information")
# Fixed pause to give the page time to finish loading before it is queried
time.sleep(3)
# Find all anchor elements that contain '/category/' in their href attribute
all_links = driver.find_elements(By.XPATH, "//a[contains(@href, '/category/')]")
# De-duplicate the hrefs while keeping the order in which they were found.
# A plain set() would iterate in a hash-dependent order that changes from run
# to run, making the crawl order (and the final CSV row order) unreproducible.
# Empty/None hrefs are dropped so later driver.get() calls never receive them.
raw_listing_urls = list(
    dict.fromkeys(
        href
        for href in (el.get_attribute("href") for el in all_links)
        if href
    )
)

def _slug_to_label(slug):
    """Convert a URL slug such as 'mental-health' into 'Mental_Health'."""
    return slug.replace('-', ' ').title().replace(' ', '_')


def _category_and_topic(article_url):
    """Derive (category, topic) labels from the path of an article URL.

    The category is the first path segment. The topic is the second segment,
    but only when a third segment (the article slug) exists; otherwise it is
    an empty string. Only the URL path is inspected, so the scheme, host
    spelling (e.g. a 'www.' prefix), query string and fragment do not affect
    the result.
    """
    path_parts = urlparse(article_url).path.strip('/').split('/')
    category = _slug_to_label(path_parts[0])
    topic = _slug_to_label(path_parts[1]) if len(path_parts) >= 3 else ""
    return category, topic


# Phase 2: Iterate through each listing page to find article URLs and metadata
for listing_url in raw_listing_urls:
    print(f"\n[SCANNING LISTING] {listing_url}")
    # Load the specific category or sub-category page
    driver.get(listing_url)

    # Article links found on the previous page of this listing; used to detect
    # a 'Next' click that did not actually advance to a new page.
    previous_page_links = None

    # Walk through the paginated listing
    while True:
        # Fixed pause to let the page settle after navigation / a 'Next' click
        time.sleep(2)
        # Locate individual article blocks on the listing page
        blocks = driver.find_elements(By.CLASS_NAME, "news_list_single")
        # No blocks means there is nothing (more) to read in this listing
        if not blocks:
            break

        page_links = []
        for block in blocks:
            try:
                # The article URL comes from the first anchor inside the block
                article_url = block.find_element(By.TAG_NAME, "a").get_attribute("href")
                if not article_url:
                    continue
                page_links.append(article_url)
                # Skip articles already recorded from an earlier page or listing
                if article_url in seen_urls:
                    continue
                # Extract the visible title of the article
                title = block.find_element(By.TAG_NAME, "h5").text
            except WebDriverException:
                # Malformed or stale block: skip it and carry on with the next.
                # Only Selenium errors are swallowed, so Ctrl-C and genuine
                # programming errors still surface.
                continue

            final_cat, final_topic = _category_and_topic(article_url)

            # Log the extracted data for verification purposes
            print(f"  [ADDED] {article_url}")
            print(f"   -> Category: {final_cat} | Topic: {final_topic if final_topic else '[Empty]'}")

            # Append to the parallel result lists and mark the URL as seen
            titles.append(title)
            categories.append(final_cat)
            topics.append(final_topic)
            urls_to_visit.append(article_url)
            seen_urls.add(article_url)

        # Guard against an endless loop: if this page shows exactly the same
        # article links as the previous one, the last 'Next' click did not
        # advance (e.g. a 'Next' link that is still present on the final page).
        if page_links == previous_page_links:
            break
        previous_page_links = page_links

        # Pagination: try to find and click the 'Next' button
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "a.next")
            # Click through JavaScript so the event fires even when the
            # element is obscured or fails Selenium's visibility checks.
            driver.execute_script("arguments[0].click();", next_btn)
        except WebDriverException:
            # No usable 'Next' button: this listing is finished
            break

# Phase 3: Content Extraction (visit each discovered URL to get the full text)
print(f"\nStep 3: Extracting text from {len(urls_to_visit)} articles...")
try:
    for i, url in enumerate(urls_to_visit):
        try:
            # Navigate to the specific article page
            driver.get(url)
            # Wait (up to 10 s) until the content container is present in the DOM
            content = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "content_inner"))
            ).text
            # Collapse every run of whitespace (newlines, tabs, ...) to one space
            texts.append(" ".join(content.split()))
        except WebDriverException:
            # Load/timeout/lookup failure: record a placeholder so `texts`
            # stays aligned with the other result lists. Only Selenium errors
            # are caught, so Ctrl-C still interrupts the run instead of being
            # recorded as "N/A".
            texts.append("N/A")
        # Report progress every 10 articles, whether or not this one succeeded
        if (i + 1) % 10 == 0:
            print(f"Progress: {i+1}/{len(urls_to_visit)}")
finally:
    # Always shut the browser down, even if extraction is interrupted,
    # so no headless Chrome process is left running.
    driver.quit()

# Phase 4: Final CSV Export
# Build a DataFrame from the parallel result lists (one row per article)
df = pd.DataFrame({
    "Title": titles,
    "Category": categories,
    "Topic": topics,
    "Content": texts,
    "URL": urls_to_visit,
})
# Make sure the output directory exists first: to_csv() does not create
# missing directories, and failing here would discard the whole scrape.
output_path = Path("./data/spunout_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False omits row numbers; 'utf-8-sig' adds a BOM so Excel detects UTF-8
df.to_csv(output_path, index=False, encoding='utf-8-sig')
print("\nSuccess! CSV saved.")