In [None]:
pip install apscheduler cloudscraper pandas beautifulsoup4



In [None]:
from apscheduler.schedulers.background import BackgroundScheduler
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scraping function
def _text_or_na(tag):
    """Return the stripped text of a BeautifulSoup tag, or "N/A" if the tag is missing."""
    return tag.text.strip() if tag is not None else "N/A"


def _parse_contractor(card):
    """Extract one result card's fields into a dict keyed by CSV column name."""
    heading = card.find("h3")
    name_link = heading.find("a", class_="text-blue-medium") if heading is not None else None

    # The phone number is carried in the link target ("tel:..."), not its text.
    phone_link = card.find("a", class_="text-black")
    href = phone_link.get("href") if phone_link is not None else None

    return {
        "Name": _text_or_na(name_link),
        "Company": _text_or_na(card.find("span")),
        "Industry": _text_or_na(card.find("p", class_="bds-body text-size-4 text-gray-70")),
        "Contact": href.replace("tel:", "").strip() if href is not None else "N/A",
        "Location": _text_or_na(card.find("p", class_="bds-body text-size-5 text-gray-70")),
    }


def scrape_bbb(max_pages=5, output_path="bbb_contractors_scheduled.csv", timeout=30):
    """Scrape BBB "General Contractor" search results for New York into a CSV file.

    Pages are fetched sequentially, starting at page 1, until ``max_pages`` is
    reached, a request fails, or a page contains no result cards. Missing
    fields on a card are recorded as "N/A".

    Args:
        max_pages: Maximum number of result pages to request.
        output_path: Destination CSV path; overwritten when at least one
            contractor was scraped.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the scheduled job indefinitely.

    Returns:
        None. Results are written to ``output_path`` and progress is printed.
    """
    print("🚀 Starting BBB scraping task...")

    # Base URL with pagination
    base_url = "https://www.bbb.org/search?city=new-york&find_country=USA&find_entity=10035-000&find_id=1304_3100-9100&find_latlng=40.762801%2C-73.977818&find_loc=New+York%2C+NY&find_text=General+Contractor&find_type=Category&page={}&sort=Distance&state=NY"

    scraper = cloudscraper.create_scraper()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    all_data = []

    for page in range(1, max_pages + 1):
        print(f"📄 Scraping page {page}...")

        try:
            response = scraper.get(base_url.format(page), headers=headers, timeout=timeout)
        except OSError as exc:
            # requests' network errors (connection failure, timeout, ...) derive
            # from OSError. Stop paginating but keep the rows collected so far.
            print(f"❌ Failed to fetch page {page}. Error: {exc}")
            break

        if response.status_code != 200:
            print(f"❌ Failed to fetch page {page}. Status Code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        main_div = soup.find("div", class_="stack stack-space-20")
        if not main_div:
            print("⚠️ No more results found. Stopping.")
            break

        contractors = main_div.find_all("div", class_="card result-card")
        if not contractors:
            print("⚠️ No more contractors found. Ending pagination.")
            break

        all_data.extend(_parse_contractor(card) for card in contractors)

        # Pause between requests to prevent getting blocked; no request
        # follows the final page, so there is nothing to wait for there.
        if page < max_pages:
            time.sleep(2)

    if not all_data:
        # Writing an empty frame would wipe the results of a previous good run.
        print(f"⚠️ No contractors scraped. '{output_path}' was left untouched.")
        return

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Scraped {len(df)} contractors. Data saved to '{output_path}'.")

# Register the scraper with a background scheduler so it repeats every 6 hours.
# NOTE(review): no start time is given, so the first run presumably happens
# only after one full interval has elapsed — confirm against APScheduler docs.
scheduler = BackgroundScheduler()
scheduler.add_job(func=scrape_bbb, trigger='interval', hours=6)
scheduler.start()

# The jobs run off the main thread, so the main thread just idles here until
# the user interrupts it; the scheduler is then shut down before exiting.
try:
    print("⏳ Scheduler is running... Press Ctrl+C to exit.")
    while True:
        time.sleep(10)
except (KeyboardInterrupt, SystemExit):
    scheduler.shutdown(wait=True)
    print("❌ Scheduler stopped.")




⏳ Scheduler is running... Press Ctrl+C to exit.
❌ Scheduler stopped.
