In [None]:
pip install apscheduler cloudscraper pandas beautifulsoup4



In [None]:
import cloudscraper
import time
import csv
from bs4 import BeautifulSoup
from apscheduler.schedulers.background import BackgroundScheduler

# URL template for the agent listing; the "{}" placeholder is filled with the
# page number via base_url.format(page) inside scrape_agents().
base_url = "https://www.realtor.com/realestateagents/columbus_oh?page={}"

# Single module-level cloudscraper session; scrape_agents() issues every page
# request through it.
scraper = cloudscraper.create_scraper()

def scrape_agents(max_pages=200, csv_filename="real_estate_agents.csv",
                  delay_seconds=2, request_timeout=30):
    """Scrape the paginated agent listing and save the results to a CSV file.

    Pages are fetched in order starting at 1. Scraping stops at the first
    page that cannot be fetched, returns a non-200 status, or contains no
    agent cards. Whatever was collected up to that point is still written.

    Args:
        max_pages: Highest page number to request.
        csv_filename: Path of the CSV file to (over)write.
        delay_seconds: Pause between consecutive page requests.
        request_timeout: Timeout in seconds passed to each HTTP request.

    Returns:
        None. Results are written to ``csv_filename``; if nothing was
        scraped, the existing file is left untouched.
    """
    fieldnames = ["Name", "Agency", "Experience", "GCI (Activity Range)"]
    agents_data = []

    for page in range(1, max_pages + 1):
        url = base_url.format(page)
        print(f"Scraping page {page}: {url}")  # Debugging: Show current page

        # Broad catch on purpose: this is the job boundary, and a transport
        # or anti-bot failure on one page must not discard rows already
        # collected from earlier pages.
        try:
            response = scraper.get(url, timeout=request_timeout)
        except Exception as exc:
            print(f"Error: Request for page {page} failed ({exc}), stopping.")
            break

        if response.status_code != 200:
            print("Error: Failed to retrieve page, stopping.")
            break  # Stop if there's an error (e.g., no more pages)

        soup = BeautifulSoup(response.text, "html.parser")

        # Find all agent cards on the page
        realtor_cards = soup.find_all("div", class_="jsx-3873707352 agent-list-card clearfix")
        if not realtor_cards:  # Stop when no more agent cards are found
            print("No more agents found, stopping.")
            break

        agents_data.extend(_parse_agent_card(card) for card in realtor_cards)

        # Delay to prevent blocking; pointless after the final page.
        if page < max_pages:
            time.sleep(delay_seconds)

    # A run that collected nothing (e.g. page 1 failed) must not replace a
    # previous run's data with a header-only file.
    if not agents_data:
        print(f"No agents scraped; leaving {csv_filename} untouched.")
        return

    # Save to CSV
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(agents_data)

    print(f"Scraping completed! Data saved to {csv_filename}")


def _text_or_na(tag):
    """Return the stripped text of *tag*, or "N/A" when no tag was found."""
    return tag.text.strip() if tag is not None else "N/A"


def _parse_agent_card(card):
    """Extract name, agency, experience and GCI from one agent card element."""
    # The GCI value is looked up inside the card's first detail-item
    # container rather than directly on the card.
    gci_container = card.find("div", class_="jsx-3873707352 agent-detail-item")
    gci_tag = (
        gci_container.find("span", class_="jsx-3873707352 bold-text")
        if gci_container is not None
        else None
    )

    return {
        "Name": _text_or_na(card.find("span", class_="text-bold")),
        "Agency": _text_or_na(card.find("div", class_="agent-group")),
        "Experience": _text_or_na(card.find("span", class_="bold-text")),
        "GCI (Activity Range)": _text_or_na(gci_tag),
    }

# APScheduler setup: register scrape_agents as a recurring job and start the
# scheduler.
# NOTE(review): no next_run_time is passed, so the first run is presumably one
# full interval (6 h) after start() rather than immediately — confirm against
# the APScheduler docs if an immediate first scrape is expected.
scheduler = BackgroundScheduler()
scheduler.add_job(scrape_agents, 'interval', hours=6)  # Runs every 6 hours
scheduler.start()

# Keep script running: the main thread only idles in a sleep loop until it is
# interrupted, at which point the scheduler is shut down.
try:
    print("⏳ Scheduler is running... Press Ctrl+C to exit.")
    while True:
        time.sleep(10)  # Idle wait; no work is done on this thread
except (KeyboardInterrupt, SystemExit):
    scheduler.shutdown()
    print("❌ Scheduler stopped.")




⏳ Scheduler is running... Press Ctrl+C to exit.
❌ Scheduler stopped.
