In [159]:
# Scraping of Computers and Computer Parts

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import csv
import time


In [161]:
CHROMEDRIVER_PATH = '/Users/shan/Desktop/Study/SpringB2025/SocialMedia&TextAnalytics/GroupProject/chromedriver-mac-arm64/chromedriver'


In [163]:
# Scraping Computers
BASE_URL = "https://rochester.craigslist.org/search/sya"


In [165]:
def get_listings_with_selenium(max_pages=1):
    listings = []

    service = Service(CHROMEDRIVER_PATH)
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # For debugging
    options.add_argument("--window-size=1280,800")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/123.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(service=service, options=options)

    post_links = []

    # STEP 1: Extract all post titles and URLs
    for page in range(0, max_pages * 120, 120):
        page_url = f"{BASE_URL}?s={page}"
        print(f"\n🔍 Scraping search page: {page_url}")
        driver.get(page_url)
        time.sleep(3)
        driver.save_screenshot(f"craigslist_page_{page}.png")

        results = driver.find_elements(By.CSS_SELECTOR, "div.cl-search-result.cl-search-view-mode-gallery")
        print(f" Found {len(results)} posts on page {page // 120 + 1}")

        for item in results:
            try:
                link_elem = item.find_element(By.CSS_SELECTOR, "a.cl-app-anchor")
                link = link_elem.get_attribute("href")
                title = link_elem.text.strip()
                post_links.append({"title": title, "link": link})
            except Exception as e:
                print(f" Skipped broken result: {e}")
                continue

    print(f"\n Collected {len(post_links)} post URLs. Now extracting descriptions...")

    # STEP 2: Visit each link separately
    for post in post_links:
        try:
            print(f" Visiting: {post['link']}")
            driver.get(post['link'])
            time.sleep(2)

            try:
                desc_elem = driver.find_element(By.ID, "postingbody")
                raw_text = desc_elem.text.strip()
                lines = raw_text.split("\n")
                cleaned = [line for line in lines if "QR Code" not in line and line.strip()]
                description = " ".join(cleaned)
            except:
                description = "N/A"

            listings.append({
                "title": post['title'],
                "description": description
            })

        except Exception as e:
            print(f" Failed to scrape {post['link']}: {e}")
            continue

    driver.quit()
    return listings

def save_to_csv(data, filename="computers.csv"):
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "description"])
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    print(f"\n Saved {len(data)} listings to {filename}")


In [167]:
# === RUN ===
if __name__ == "__main__":
    listings = get_listings_with_selenium(max_pages=2)
    save_to_csv(listings, filename="computers.csv")



🔍 Scraping search page: https://rochester.craigslist.org/search/sya?s=0
 Found 200 posts on page 1

🔍 Scraping search page: https://rochester.craigslist.org/search/sya?s=120
 Found 200 posts on page 2

 Collected 400 post URLs. Now extracting descriptions...
 Visiting: https://rochester.craigslist.org/sys/d/rochester-dell-coffee-lake-quad-core/7845317539.html
 Visiting: https://rochester.craigslist.org/sys/d/rochester-microsoft-surface-10-touch/7845315379.html
 Visiting: https://rochester.craigslist.org/sys/d/rochester-apple-imac-5k-intel-quad-core/7845305258.html
 Visiting: https://rochester.craigslist.org/sys/d/rochester-apple-macbook-pro-core-i9/7842733517.html
 Visiting: https://rochester.craigslist.org/sys/d/rochester-apple-imac-core-i7-4k-display/7845302514.html
 Visiting: https://rochester.craigslist.org/sys/d/rochester-amd-ryzen-hexacore-budget/7843526842.html
 Visiting: https://rochester.craigslist.org/sys/d/rochester-apple-mac-mini-m1-core-mac-os/7845032397.html
 Visiting: h

In [169]:
# Scraping Computer Parts
BASE_URL = "https://rochester.craigslist.org/search/sop"


In [170]:
def get_listings_with_selenium(max_pages=2):
    listings = []

    service = Service(CHROMEDRIVER_PATH)
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # For debugging
    options.add_argument("--window-size=1280,800")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/123.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(service=service, options=options)

    post_links = []

    # STEP 1: Extract all titles and links first
    for page in range(0, max_pages * 120, 120):
        page_url = f"{BASE_URL}?s={page}"
        print(f"\n🔍 Scraping search page: {page_url}")
        driver.get(page_url)
        time.sleep(3)
        driver.save_screenshot(f"computer_parts_page_{page}.png")

        results = driver.find_elements(By.CSS_SELECTOR, "div.cl-search-result.cl-search-view-mode-gallery")
        print(f" Found {len(results)} posts on page {page // 120 + 1}")

        for item in results:
            try:
                link_elem = item.find_element(By.CSS_SELECTOR, "a.cl-app-anchor")
                link = link_elem.get_attribute("href")
                title = link_elem.text.strip()
                post_links.append({"title": title, "link": link})
            except Exception as e:
                print(f" Skipped a broken result: {e}")
                continue

    print(f"\n Collected {len(post_links)} post URLs. Now extracting descriptions...")

    # STEP 2: Visit each link and extract description
    for post in post_links:
        try:
            print(f" Visiting: {post['link']}")
            driver.get(post['link'])
            time.sleep(2)

            try:
                desc_elem = driver.find_element(By.ID, "postingbody")
                raw_text = desc_elem.text.strip()
                lines = raw_text.split("\n")
                cleaned = [line for line in lines if "QR Code" not in line and line.strip()]
                description = " ".join(cleaned)
            except:
                description = "N/A"

            listings.append({
                "title": post['title'],
                "description": description
            })

        except Exception as e:
            print(f" Failed to scrape {post['link']}: {e}")
            continue

    driver.quit()
    return listings

def save_to_csv(data, filename="computer_parts.csv"):
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "description"])
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    print(f"\n Saved {len(data)} listings to {filename}")


In [171]:
# === RUN SCRIPT ===
if __name__ == "__main__":
    listings = get_listings_with_selenium(max_pages=3)
    save_to_csv(listings)



🔍 Scraping search page: https://rochester.craigslist.org/search/sop?s=0
 Found 114 posts on page 1

🔍 Scraping search page: https://rochester.craigslist.org/search/sop?s=120
 Found 114 posts on page 2

🔍 Scraping search page: https://rochester.craigslist.org/search/sop?s=240
 Found 114 posts on page 3

 Collected 342 post URLs. Now extracting descriptions...
 Visiting: https://rochester.craigslist.org/sop/d/fairport-hp-cb337w-tricolor-ink/7827954104.html
 Visiting: https://rochester.craigslist.org/sop/d/rochester-keyboard-model-a025-kfrmb2/7831101735.html
 Visiting: https://rochester.craigslist.org/sop/d/rochester-50-7b-dell-6ft-black-dvi/7827857144.html
 Visiting: https://rochester.craigslist.org/sop/d/rochester-surface-book-ac-adapter/7827857051.html
 Visiting: https://rochester.craigslist.org/sop/d/rochester-laptop-bag/7829851702.html
 Visiting: https://rochester.craigslist.org/sop/d/rochester-gigabyte-aorus-master-rtx/7844515364.html
 Visiting: https://rochester.craigslist.org/sop

In [172]:
# --