In [1]:
# Import libraries
import time
import csv
import logging
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

# -------------------------
# Logging: INFO-and-above records go to scraping_errors.log
# -------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraping_errors.log',
)

# -------------------------
# Selenium WebDriver: headless Chrome, driver binary supplied by webdriver_manager
# -------------------------
chrome_options = Options()
chrome_options.add_argument('--headless')  # no visible browser window; comment out when debugging

# Build the driver service separately so the install step is easy to spot.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Shared explicit wait used by later cells; gives up after 20 seconds.
wait = WebDriverWait(driver, 20)

In [2]:
# -------------------------
# Helper Function: Safe extraction for Selenium elements using XPath
# -------------------------
def safe_extract(driver, xpath, default="0"):
    """Return the stripped text of the first element matching ``xpath``.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        xpath: XPath expression identifying the element.
        default: Value returned when no element matches, or when the
            element's text is empty after stripping.

    Returns:
        The element's text with surrounding whitespace removed, or
        ``default``. A missing element is logged as an error, not raised.
    """
    try:
        stripped = driver.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        logging.error(f"Element not found for XPath: {xpath}")
        return default
    # An empty string falls through to the default.
    return stripped or default

def safe_extract_attribute(driver, xpath, attribute, default="0"):
    """Return the stripped value of ``attribute`` on the first element matching ``xpath``.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        xpath: XPath expression identifying the element.
        attribute: Name of the attribute to read (e.g. ``"href"``).
        default: Value returned when no element matches or the attribute
            value is falsy (``None`` or empty string).

    Returns:
        The attribute value with surrounding whitespace removed, or
        ``default``. A missing element is logged as an error, not raised.
    """
    try:
        raw_value = driver.find_element(By.XPATH, xpath).get_attribute(attribute)
    except NoSuchElementException:
        logging.error(f"Element not found for XPath (attribute {attribute}): {xpath}")
        return default
    if not raw_value:
        return default
    # Stripping happens after the falsy check, so a whitespace-only value yields "".
    return raw_value.strip()


In [3]:
# -------------------------
# Main Scraper Configuration
# -------------------------
base_exhibitors_url = "https://kbis2025.smallworldlabs.com/exhibitors"
# NOTE(review): machine-specific absolute path; point this at a configurable
# output directory before sharing the notebook.
output_csv = r"C:\Users\jchan\csi360_fire_police\cabinet_vendors_list\cabinets\resources\output\vendor_list.csv"
exhibitor_data = []  # one dict per exhibitor; written to CSV by the next cell

# Anchors whose href contains '/co/' are treated as exhibitor detail links.
exhibitor_link_xpath = "//a[contains(@href, '/co/')]"

try:
    # 1. Load the exhibitors page and wait until at least one exhibitor link exists.
    logging.info(f"Loading exhibitors page: {base_exhibitors_url}")
    driver.get(base_exhibitors_url)
    wait.until(EC.presence_of_all_elements_located((By.XPATH, exhibitor_link_xpath)))

    # 2. Collect unique exhibitor URLs, keeping first-seen order.
    #    dict.fromkeys de-duplicates in O(n) instead of repeated list scans.
    exhibitor_elements = driver.find_elements(By.XPATH, exhibitor_link_xpath)
    exhibitor_urls = list(dict.fromkeys(
        href
        for href in (elem.get_attribute('href') for elem in exhibitor_elements)
        if href
    ))

    logging.info(f"Found {len(exhibitor_urls)} exhibitor links.")

    # 3. Visit each exhibitor page and extract its fields.
    for url in exhibitor_urls:
        logging.info(f"Processing exhibitor page: {url}")

        try:
            # driver.get is inside the try so a page-load timeout skips only
            # this exhibitor instead of aborting the whole crawl.
            driver.get(url)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        except TimeoutException:
            logging.error(f"Timeout loading page: {url}")
            continue  # Skip this exhibitor if the page doesn't load in time

        # Each helper returns "0" when the element is missing or empty.
        data = {
            "Company Name": safe_extract(driver, "//h1"),
            "Description": safe_extract(driver, "//*[contains(text(),'What They Do')]/following-sibling::*"),
            "Founded": safe_extract(driver, "//*[contains(text(),'Founded')]/following-sibling::*"),
            "Website": safe_extract_attribute(driver, "//*[contains(text(),'Website')]/following-sibling::a", "href"),
            "Categories": safe_extract(driver, "//*[contains(text(),'Categories')]/following-sibling::*"),
            "Key Words": safe_extract(driver, "//*[contains(text(),'Key Words')]/following-sibling::*")
        }
        exhibitor_data.append(data)

        # 4. Wait 2 seconds between requests to avoid hammering the site.
        time.sleep(2)

except Exception as e:
    # logging.exception records the traceback as well as the message, so the
    # log shows where the failure happened. Rows collected so far are kept.
    logging.exception(f"An error occurred during scraping: {e}")
finally:
    # Always release the browser, even after a failure.
    driver.quit()

In [4]:
# -------------------------
# Save extracted data to CSV
# -------------------------
# Column order of the CSV; keys match the dicts built by the scraping cell.
csv_headers = ["Company Name", "Description", "Founded", "Website", "Categories", "Key Words"]

# newline='' lets the csv module control line endings itself.
with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_headers)
    writer.writeheader()
    writer.writerows(exhibitor_data)

# The scraping cell logs failures instead of raising, so an empty result would
# otherwise be reported as a plain success. Make that case visible.
if not exhibitor_data:
    empty_message = (
        f"No exhibitor data was collected; {output_csv} contains only the header row. "
        "Check scraping_errors.log for details."
    )
    logging.warning(empty_message)
    print(empty_message)

completion_message = f"Scraping completed. Data saved to {output_csv}."
logging.info(completion_message)
print(completion_message)


Scraping completed. Data saved to C:\Users\jchan\csi360_fire_police\cabinet_vendors_list\cabinets\resources\output\vendor_list.csv.
