In [None]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Additional library for handling dynamic content, such as Google ad pop-ups
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

# Configure the Chrome WebDriver options. The "--headless" argument is passed
# so Chrome runs without a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Comment out this line to watch the browser actions

# Start the Chrome WebDriver. The path returned by ChromeDriverManager().install()
# is handed to Service (presumably webdriver_manager locates or downloads a
# chromedriver binary matching the installed Chrome -- confirm against its docs).
# `driver` is the module-level browser session used by close_google_ad() and
# extract_data_from_page() below.
# NOTE(review): no driver.quit() is visible in this part of the notebook; make
# sure the session is closed once scraping is finished.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)


In [None]:
# Cell 2: Define Helper Functions for Data Formatting and Error Handling

# Lower-cased full state name -> two-letter USPS abbreviation (50 states + DC).
# Built once at definition time instead of on every call.
_STATE_ABBREVIATIONS = {
    "alabama": "AL",
    "alaska": "AK",
    "arizona": "AZ",
    "arkansas": "AR",
    "california": "CA",
    "colorado": "CO",
    "connecticut": "CT",
    "delaware": "DE",
    "district of columbia": "DC",
    "florida": "FL",
    "georgia": "GA",
    "hawaii": "HI",
    "idaho": "ID",
    "illinois": "IL",
    "indiana": "IN",
    "iowa": "IA",
    "kansas": "KS",
    "kentucky": "KY",
    "louisiana": "LA",
    "maine": "ME",
    "maryland": "MD",
    "massachusetts": "MA",
    "michigan": "MI",
    "minnesota": "MN",
    "mississippi": "MS",
    "missouri": "MO",
    "montana": "MT",
    "nebraska": "NE",
    "nevada": "NV",
    "new hampshire": "NH",
    "new jersey": "NJ",
    "new mexico": "NM",
    "new york": "NY",
    "north carolina": "NC",
    "north dakota": "ND",
    "ohio": "OH",
    "oklahoma": "OK",
    "oregon": "OR",
    "pennsylvania": "PA",
    "rhode island": "RI",
    "south carolina": "SC",
    "south dakota": "SD",
    "tennessee": "TN",
    "texas": "TX",
    "utah": "UT",
    "vermont": "VT",
    "virginia": "VA",
    "washington": "WA",
    "west virginia": "WV",
    "wisconsin": "WI",
    "wyoming": "WY",
}


def format_state(state_name):
    """Convert a full U.S. state name to its two-letter abbreviation.

    The lookup ignores surrounding whitespace and letter case, so
    ``" california "`` and ``"California"`` both yield ``"CA"``.

    Args:
        state_name: Full state name, e.g. ``"Colorado"``.

    Returns:
        The two-letter abbreviation when the name is recognised. Any other
        value (unknown names, existing abbreviations, non-string input such as
        ``None``) is returned unchanged.
    """
    if not isinstance(state_name, str):
        # Nothing to normalise; mirror the "return the input" fallback.
        return state_name
    lookup_key = state_name.strip().lower()
    return _STATE_ABBREVIATIONS.get(lookup_key, state_name)

def format_phone(phone):
    """Format a phone number as ``(XXX) XXX-XXXX``.

    All non-digit characters are ignored. A leading ``1`` country code on an
    11-digit number is dropped before formatting, so ``"+1 (555) 123-4567"``
    becomes ``"(555) 123-4567"`` rather than ``"(155) 512-3456"``. For any
    other input with ten or more digits, the first ten digits are used.

    Args:
        phone: Raw phone number text.

    Returns:
        The formatted number, or ``phone`` unchanged when it is not a string
        or contains fewer than ten digits.
    """
    if not isinstance(phone, str):
        # re.sub would raise TypeError on None / numbers; pass them through.
        return phone

    digits = re.sub(r"\D", "", phone)  # Remove non-digit characters

    # Area codes never start with 1, so a leading 1 on an 11-digit number is
    # the country code, not part of the subscriber number.
    if len(digits) == 11 and digits.startswith("1"):
        digits = digits[1:]

    if len(digits) < 10:
        return phone
    return f"({digits[:3]}) {digits[3:6]}-{digits[6:10]}"


In [None]:
# Empty results table: one column per field collected for a scraped page.
columns = [
    "department_name",
    "title",
    "first_name",
    "last_name",
    "building_name",
    "address",
    "city",
    "state",
    "zip_code",
    "county",
]
data = pd.DataFrame(data=None, columns=columns)


In [None]:
# Cell 4: Function to Close Google Ads

def close_google_ad():
    """Dismiss an ad dialog if one is currently present on the page.

    Uses the module-level ``driver`` to look up the first ``button`` inside an
    element with ``role='dialog'`` and clicks it. If no such element is found
    the function does nothing.
    """
    dialog_button_css = "div[role='dialog'] button"
    try:
        driver.find_element(By.CSS_SELECTOR, dialog_button_css).click()
    except NoSuchElementException:
        # No dialog button on the page: nothing to dismiss.
        return

In [None]:
# Extract Data from Target Page

def _node_text(node):
    """Return the stripped text of a parsed element, or "" when it is missing."""
    return node.get_text(strip=True) if node is not None else ""


def _split_city_state_zip(line):
    """Split a "City, State ZIP" line into a (city, state, zip_code) tuple.

    Any part that cannot be identified is returned as "". The state may span
    several words ("New York"); the ZIP is only accepted when the last token
    looks like a 5-digit or ZIP+4 code.
    """
    city, comma, remainder = line.partition(",")
    if not comma:
        # Without a comma there is no reliable city/state boundary; only a
        # trailing ZIP code can still be recognised.
        city, remainder = "", line

    # Treat any further commas ("Denver, Colorado, 80202") as plain separators.
    tokens = remainder.replace(",", " ").split()

    zip_code = ""
    if tokens and re.fullmatch(r"\d{5}(?:-\d{4})?", tokens[-1]):
        zip_code = tokens.pop()

    state = " ".join(tokens) if comma else ""
    return city.strip(), state, zip_code


def extract_data_from_page(url):
    """Scrape one department page and append its fields to the global ``data``.

    The page is loaded with the module-level Selenium ``driver``. If the
    resulting URL contains ``#google_vignette``, ``close_google_ad()`` is
    called before the page source is parsed with BeautifulSoup.

    Fields are read as follows:
      * ``department_name`` from ``<h1 class="departmentname">``
      * ``title`` from ``<div class="title">``
      * name, building, address and "City, State ZIP" from the first four
        non-blank text lines of ``<div class="departmentinf">``
      * ``county`` from the first ``<a>`` inside that div

    Any field that cannot be found is stored as an empty string.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. One row is appended to the module-level ``data`` DataFrame.
    """
    global data

    # Open the URL
    driver.get(url)

    # Check and close Google ad if present
    if "#google_vignette" in driver.current_url:
        close_google_ad()

    # Parse page content
    soup = BeautifulSoup(driver.page_source, "html.parser")

    department_name = _node_text(soup.find("h1", class_="departmentname"))
    title = _node_text(soup.find("div", class_="title"))

    department_info = soup.find("div", class_="departmentinf")
    if department_info is not None:
        raw_lines = department_info.get_text(separator="\n").splitlines()
        # Indented HTML yields blank / padded lines; drop them so the
        # positional indexes below refer to real content.
        info_lines = [line.strip() for line in raw_lines if line.strip()]
        county = _node_text(department_info.find("a"))
    else:
        info_lines = []
        county = ""

    # Pad to four entries so every missing line becomes "".
    name_line, building_name, address, city_state_zip = (info_lines + [""] * 4)[:4]

    # NOTE(review): the second word is used as the last name, as before, so
    # "John Q. Smith" gives "Q." -- confirm the expected name format.
    name_parts = name_line.split()
    first_name = name_parts[0] if name_parts else ""
    last_name = name_parts[1] if len(name_parts) > 1 else ""

    city, raw_state, zip_code = _split_city_state_zip(city_state_zip)
    state = format_state(raw_state) if raw_state else ""

    # Append to DataFrame
    record = {
        "department_name": department_name,
        "title": title,
        "first_name": first_name,
        "last_name": last_name,
        "building_name": building_name,
        "address": address,
        "city": city,
        "state": state,
        "zip_code": zip_code,
        "county": county,
    }
    data = pd.concat([data, pd.DataFrame([record])], ignore_index=True)
