# City Bid Tracker - Westminster

Automated scraper for public procurement (RFP/bid) opportunities listed on the City of Westminster's official website (westminster-ca.gov).

## Purpose
Helps contractors and vendors discover bidding opportunities by extracting:
- RFP numbers and titles
- Starting and closing dates
- Bid status information
- Direct links to full documentation

## Setup & Usage
1. Run the dependency installation cell
2. Execute the crawler class definition
3. Run the final execution cell
4. The CSV file is downloaded automatically when the run finishes

## Output
Creates `westminster_bids.csv` on the first run; later runs append only bids that are not already in the file (matched by RFP number and title).

## Technical Notes
This crawler implements enhanced anti-blocking measures due to the website's protection systems, including realistic browser headers and retry logic for access denied responses.

In [None]:
!pip install selenium webdriver_manager pandas
# Install Chrome and ChromeDriver
!apt-get update
!apt install chromium-chromedriver

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import csv
import os
import time
from google.colab import files
import random

class WestminsterBidsCrawler:
    """Scrape the Westminster RFP listing page into a CSV file.

    Constructing an instance starts a headless Chrome session. Call
    ``run()`` to load the listing page, parse the bid table, append bids
    that are not yet in ``output_file`` and trigger a Colab download. The
    browser is shut down when ``run()`` finishes.
    """

    # Desktop Chrome user agent, applied both as a launch switch and via CDP.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    def __init__(self):
        """Set the crawl configuration and start the Chrome driver."""
        self.base_url = "https://www.westminster-ca.gov/departments/advanced-components/list-detail-pages/rfp-posts-list"
        self.output_file = "westminster_bids.csv"
        # Column order of the CSV; also the keys of every parsed bid dict.
        self.fieldnames = [
            "RFP Number",
            "Title",
            "Starting Date",
            "Closing Date",
            "Status",
            "Details URL",
            "Last Updated"
        ]
        self.max_retries = 3
        self.setup_driver()

    def setup_driver(self):
        """Start headless Chrome and store it on ``self.driver``.

        The system chromedriver is tried first; if that fails, a driver
        obtained through ``ChromeDriverManager`` is used instead. Also
        creates ``self.wait`` (15 s explicit wait).

        Raises:
            Exception: whatever the fallback driver start or the post-launch
                configuration raises. In the latter case the browser is quit
                first so no Chrome process is left behind.
        """
        chrome_options = Options()
        for switch in (
            '--headless',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
            # Realistic browser identity
            f'--user-agent={self.USER_AGENT}',
            '--accept-language=en-US,en;q=0.9',
            '--disable-blink-features=AutomationControlled',
        ):
            chrome_options.add_argument(switch)

        # Hide the usual automation markers.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        try:
            print("Attempting to use system chromedriver...")
            self.driver = webdriver.Chrome(options=chrome_options)
        except Exception as e:
            print(f"System chromedriver failed: {str(e)}")
            print("Attempting to use ChromeDriverManager...")
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            self.driver.set_page_load_timeout(30)
            # Repeat the user agent via CDP as well as the launch switch.
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
                "userAgent": self.USER_AGENT
            })
        except Exception:
            # __init__ is about to fail, so run() (which normally quits the
            # driver) will never be reached: shut the browser down here.
            self.driver.quit()
            raise

        self.wait = WebDriverWait(self.driver, 15)
        print("Chrome driver initialized successfully")

    def random_delay(self):
        """Sleep for a random 2-5 seconds to appear more human-like."""
        time.sleep(random.uniform(2, 5))

    def parse_bid_item(self, row):
        """Convert one table row element into a bid dict.

        Args:
            row: a ``<tr>`` element; its first five ``<td>`` cells are read
                as RFP number, title (with link), starting date, closing
                date and status.

        Returns:
            A dict keyed by ``self.fieldnames``, or ``None`` when the row
            has fewer than five cells, has no title, or parsing fails.
        """
        try:
            print("\nParsing new row...")
            cells = row.find_elements(By.TAG_NAME, "td")
            print(f"Found {len(cells)} cells in row")

            if len(cells) < 5:
                return None

            bid_data = {
                "RFP Number": cells[0].text.strip(),
                "Title": "",
                "Starting Date": cells[2].text.strip(),
                "Closing Date": cells[3].text.strip(),
                "Status": cells[4].text.strip(),
                "Details URL": "",
                "Last Updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

            # Title and details URL both come from the link in the second
            # cell; a row without that link ends up with an empty title.
            try:
                title_link = cells[1].find_element(By.TAG_NAME, "a")
                bid_data["Title"] = title_link.text.strip()
                bid_data["Details URL"] = title_link.get_attribute("href")
            except Exception as e:
                print(f"Error extracting title/URL: {str(e)}")

            print(f"Parsed bid: {bid_data['Title']}")
            return bid_data if bid_data["Title"] else None

        except Exception as e:
            print(f"Error parsing bid item: {str(e)}")
            return None

    @staticmethod
    def _bid_key(record):
        """Return the (RFP number, title) pair that identifies a bid."""
        return (record["RFP Number"], record["Title"])

    def _csv_needs_header(self):
        """Return True when the output file is missing or empty."""
        return (
            not os.path.exists(self.output_file)
            or os.path.getsize(self.output_file) == 0
        )

    def setup_csv(self):
        """Make sure the output CSV exists and starts with a header row."""
        try:
            if self._csv_needs_header():
                with open(self.output_file, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=self.fieldnames)
                    writer.writeheader()
                print(f"Created new CSV file: {self.output_file}")
            else:
                print(f"CSV file already exists: {self.output_file}")
        except Exception as e:
            print(f"Error setting up CSV: {str(e)}")

    def get_page_with_retry(self):
        """Load ``self.base_url``, retrying up to ``self.max_retries`` times.

        An attempt fails when the load raises or the page source contains
        "Access Denied". Between failed attempts the method waits 5 s, 10 s,
        ... (5 s times the attempt number).

        Returns:
            True once a page without "Access Denied" is loaded, False when
            every attempt failed.
        """
        for attempt in range(self.max_retries):
            try:
                print(f"\nAttempt {attempt + 1} to load page...")
                self.driver.get(self.base_url)
                self.random_delay()

                if "Access Denied" not in self.driver.page_source:
                    print("Page loaded successfully")
                    return True
                print("Access Denied detected, retrying...")
            except Exception as e:
                print(f"Error loading page: {str(e)}")

            # Back off after either kind of failure, but not after the last
            # attempt, where no retry follows.
            if attempt < self.max_retries - 1:
                wait_time = (attempt + 1) * 5
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
        return False

    def get_bid_listings(self):
        """Load the listing page and parse every row of the bid table.

        Returns:
            A list of bid dicts (see ``parse_bid_item``); empty when the
            page could not be loaded, the table was not found, or an
            unexpected error occurred.
        """
        try:
            if not self.get_page_with_retry():
                print("Failed to load page after all retries")
                return []

            print("Looking for bid table...")
            try:
                # Explicit wait: the table may appear after the initial load.
                table = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "table.responsive-table-data-mb"))
                )
            except Exception as e:
                print(f"Error finding table: {str(e)}")
                print("Page source preview:")
                print(self.driver.page_source[:500])
                return []

            rows = table.find_elements(By.XPATH, ".//tbody/tr")
            print(f"Found {len(rows)} rows in table")

            # Rows are read from the page that is already loaded, so no
            # request goes to the site here and no per-row delay is needed.
            bids = []
            for row in rows:
                bid_data = self.parse_bid_item(row)
                if bid_data:
                    bids.append(bid_data)

            print(f"Successfully parsed {len(bids)} bids")
            return bids

        except Exception as e:
            print(f"Error fetching bid listings: {str(e)}")
            return []

    def save_bids(self, bids):
        """Append bids not yet in the CSV, then trigger the Colab download.

        A bid is a duplicate when its (RFP number, title) pair is already in
        the file or occurred earlier in ``bids``. A header row is written
        first when the file is missing or empty. Errors are printed, not
        raised; the download is skipped when saving failed or ``bids`` is
        empty.

        Args:
            bids: list of bid dicts keyed by ``self.fieldnames``.
        """
        if not bids:
            print("No bids to save")
            return

        try:
            seen = set()
            if os.path.exists(self.output_file):
                with open(self.output_file, 'r', newline='', encoding='utf-8') as f:
                    seen = {self._bid_key(row) for row in csv.DictReader(f)}

            new_bids = []
            for bid in bids:
                key = self._bid_key(bid)
                if key not in seen:
                    seen.add(key)  # also filters repeats within this batch
                    new_bids.append(bid)

            if new_bids:
                # Decide before opening: append mode creates a missing file.
                write_header = self._csv_needs_header()
                with open(self.output_file, 'a', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=self.fieldnames)
                    if write_header:
                        writer.writeheader()
                    writer.writerows(new_bids)
                print(f"Added {len(new_bids)} new bids")
            else:
                print("No new bids to add")
        except Exception as e:
            print(f"Error saving bids: {str(e)}")
            return

        # Reported separately so a failed download is not mistaken for a
        # failed save: the CSV on disk is already up to date at this point.
        try:
            files.download(self.output_file)
        except Exception as e:
            print(f"Error downloading CSV: {str(e)}")

    def run(self):
        """Run one full crawl: prepare the CSV, scrape, save, quit Chrome."""
        try:
            print(f"Starting Westminster bids crawler at {datetime.now()}")
            self.setup_csv()
            bids = self.get_bid_listings()
            self.save_bids(bids)
            print("Crawler execution completed")
        finally:
            # Always release the browser, even if the crawl raised.
            if hasattr(self, 'driver'):
                self.driver.quit()

In [None]:
# Build the crawler (its constructor starts the Chrome driver), then run one
# scrape; run() quits the driver in its finally block when it finishes.
crawler = WestminsterBidsCrawler()
crawler.run()

Attempting to use system chromedriver...
Chrome driver initialized successfully
Starting Westminster bids crawler at 2025-02-10 19:55:09.023492
CSV file already exists: westminster_bids.csv

Attempt 1 to load page...
Page loaded successfully
Looking for bid table...
Found 4 rows in table

Parsing new row...
Found 5 cells in row
Parsed bid: Workers' Compensation Claims Administration Services

Parsing new row...
Found 5 cells in row
Parsed bid: CITYWIDE SLURRY SEAL IMPROVEMENTS

Parsing new row...
Found 5 cells in row
Parsed bid: GROCERY GIFT CARD PURCHASE- REQUEST FOR PROPOSAL(RFP)

Parsing new row...
Found 5 cells in row
Parsed bid: Basic Life Support (BLS) Ambulance Transport Services
Successfully parsed 4 bids
Added 4 new bids


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Crawler execution completed


## Disclaimer
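This tool accesses only publicly available information from official government websites. The crawler does not read robots.txt itself, so confirm that your use complies with the site's robots.txt and terms of service before running it. It loads a single listing page per run and pauses between load attempts.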