In [2]:
# 1. Install the library that bypasses bot protection
!pip install cloudscraper pandas bs4

import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
import os
import random

# --- CONFIGURATION ---
# Search-results endpoint; get_links_from_page appends "?page=<n>" to it.
BASE_URL = "https://riyasewana.com/search/cars"
# Number of result pages walked in phase 1 (pages 1..N).
TOTAL_PAGES_TO_SCRAPE = 3  # Start small (3 pages) to test!
# Worker-thread count used for the phase 2 ad-detail scrape.
MAX_THREADS = 5            # Reduced to 5 to avoid triggering the firewall again
# CSV file that scraped ad records are saved to after each batch.
CSV_FILENAME = 'riyasewana_cloudscraper.csv'

# Single module-level Cloudscraper session, reused by get_soup for every request
# (used in place of plain requests to get past the site's bot protection).
# NOTE(review): this one session is shared by all worker threads — confirm it
# is safe for concurrent use.
scraper = cloudscraper.create_scraper()

def get_soup(url):
    """Fetch *url* with the shared Cloudscraper session and parse the HTML.

    Returns a BeautifulSoup tree when the server answers with HTTP 200.
    For any other status, or when the request/parsing raises, a diagnostic
    line is printed and None is returned.
    """
    try:
        # scraper.get() stands in for requests.get()
        page = scraper.get(url, timeout=15)

        # Anything other than 200 is treated as a block/error
        if page.status_code != 200:
            print(f"  [!] Blocked or Error: Status {page.status_code} for {url}")
            return None

        return BeautifulSoup(page.content, 'html.parser')
    except Exception as err:
        print(f"  [!] Connection error for {url}: {err}")
        return None

def get_links_from_page(page_num):
    """Collect the ad URLs listed on one search-result page.

    Args:
        page_num: Page number placed in the ``page`` query parameter.

    Returns:
        A list of ad URLs resolved against the page URL. The list is empty
        when the page could not be fetched or no anchor matched the selector.
    """
    # Local import: the file's import block does not bring in urllib.parse.
    from urllib.parse import urljoin

    url = f"{BASE_URL}?page={page_num}"
    soup = get_soup(url)
    links = []
    if soup:
        # Selector for the ad links
        for anchor in soup.select('li.item h2.more a'):
            href = anchor.get('href')
            # An anchor without an href would otherwise add None to the list,
            # which would later be handed to the detail scraper as a URL.
            if not href:
                continue
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            links.append(urljoin(url, href))

    # Random sleep (even after a failed fetch) to look like a human
    time.sleep(random.uniform(0.5, 1.5))
    return links

def scrape_ad_details(ad_url):
    """Download one ad page and flatten its fields into a dict.

    Args:
        ad_url: URL of the ad detail page.

    Returns:
        None when the page could not be fetched. Otherwise a dict that always
        contains 'URL', 'Title', 'Contact' and 'Price' ('N/A' when the page
        does not provide them), plus one entry per non-empty label/value pair
        found in the ``table.moret`` details table. Parsing errors are printed
        and whatever was collected up to that point is still returned.
    """
    soup = get_soup(ad_url)
    if not soup:
        return None

    # Seed the fixed keys so every record has the same core columns, instead
    # of 'Contact'/'Price' silently disappearing on pages that lack them.
    data = {'URL': ad_url, 'Title': 'N/A', 'Contact': 'N/A', 'Price': 'N/A'}

    try:
        # 1. Title
        title = soup.find('h1')
        if title:
            data['Title'] = title.get_text(strip=True)

        # 2. Contact & Price (first two span.moreph elements, in that order)
        spans = soup.find_all('span', class_='moreph')
        if len(spans) >= 2:
            data['Contact'] = spans[0].get_text(strip=True)
            data['Price'] = spans[1].get_text(strip=True)

        # 3. Table Details: rows hold one or two label/value pairs
        table = soup.find('table', class_='moret')
        rows = table.find_all('tr') if table else []
        for row in rows:
            texts = [cell.get_text(strip=True) for cell in row.find_all('td')]
            if len(texts) not in (2, 4):
                continue
            for label, value in zip(texts[0::2], texts[1::2]):
                # A blank label would create a meaningless '' key/column.
                if label:
                    data[label] = value
    except Exception as e:
        # Best effort: report the problem but keep the partial record.
        print(f"Error parsing details for {ad_url}: {e}")

    return data

# --- MAIN EXECUTION ---
# Phase 1 walks the search-result pages one by one and gathers ad URLs.
# Phase 2 scrapes the ad pages in batches on a thread pool and saves a CSV
# backup after every batch.
if __name__ == "__main__":
    start_time = time.time()
    all_links = []

    print(f"--- PHASE 1: Collecting Links from {TOTAL_PAGES_TO_SCRAPE} Pages ---")

    # We do pages strictly sequentially first to ensure we don't get IP banned
    for page_num in range(1, TOTAL_PAGES_TO_SCRAPE + 1):
        print(f"Scanning Page {page_num}...", end='\r')
        all_links.extend(get_links_from_page(page_num))
        # Sleep slightly to be polite
        time.sleep(0.5)

    print(f"\nTotal Links Found: {len(all_links)}")

    if all_links:
        print(f"--- PHASE 2: Scraping Details (Threads: {MAX_THREADS}) ---")
        scraped_data = []
        BATCH_SIZE = 50
        chunks = [all_links[i:i + BATCH_SIZE] for i in range(0, len(all_links), BATCH_SIZE)]

        # One pool for the whole run; batches only control how often we save.
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            for batch_no, chunk in enumerate(chunks, start=1):
                print(f"Processing Batch {batch_no}/{len(chunks)} ({len(chunk)} ads)...")

                results = list(executor.map(scrape_ad_details, chunk))
                scraped_data.extend(r for r in results if r)

                # Save Backup: rewrite the file from everything collected so
                # far. Ads carry different table fields, so appending each
                # batch without a header would put values under the wrong
                # columns; a full rewrite keeps header and rows consistent and
                # never mixes in rows left over from an earlier run.
                if scraped_data:
                    pd.DataFrame(scraped_data).to_csv(CSV_FILENAME, index=False)

                time.sleep(1)

        end_time = time.time()
        print(f"\nDONE! Scraped {len(scraped_data)} ads in {end_time - start_time:.2f} seconds.")

        if scraped_data:
            print(f"Data saved to {CSV_FILENAME}")

            # Display Preview (read back from disk to confirm the file is valid)
            df = pd.read_csv(CSV_FILENAME)
            print(df.head())
        else:
            # Nothing was written, so reading the CSV back would fail.
            print("[!] No ad details could be scraped; no CSV was written.")
    else:
        print("\n[!] Still getting 0 links? The site might require the Selenium approach.")

Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.7/99.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, cloudscraper
Successfully installed bs4-0.0.2 cloudscraper-1.2.71
--- PHASE 1: Collecting Links from 3 Pages ---
  [!] Blocked or Error: Status 403 for https://riyasewana.com/search/cars?page=1
  [!] Blocked or Error: Status 403 for https://riyasewana.com/search/cars?page=2
  [!] Blocked or Error: Status 403 for https://riyasewana.com/search/cars?page=3

Total Links Found: 0

[!] Still getting 0 links? The site might require the Selenium approach.
