# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
import random
import os

# --- CONFIGURATION ---
# Search-results listing; individual result pages are requested by appending
# a `page` query parameter to this URL.
BASE_URL = "https://riyasewana.com/search/cars"
# Number of listing pages to crawl, starting at page 1. Every ad found on a
# page triggers one more request, so raise this value with care.
TOTAL_PAGES_TO_SCRAPE = 2
MAX_THREADS = 10  # Worker threads per pool; keep this modest to limit load on the site
CSV_FILENAME = 'riyasewana_fast_scrape.csv'

# Request headers sent with every download. The browser-like User-Agent is
# meant to reduce the chance of the scraper being blocked.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

def get_soup(url):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: URL to fetch; the request is sent with the module-level
            ``HEADERS`` and a 10 second timeout.

    Returns:
        A ``BeautifulSoup`` object built with ``html.parser`` when the server
        answers with HTTP 200, otherwise ``None`` (non-200 status, or any
        error raised while fetching or parsing). Failures are printed.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            # Report rejected requests instead of dropping them silently, so
            # blocks or rate limits (e.g. 403/429) show up in the run log.
            print(f"Request failed for {url}: HTTP {response.status_code}")
            return None
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        # Deliberately broad: this is called from worker threads and a single
        # bad page must not abort the whole crawl.
        print(f"Request error for {url}: {e}")
    return None

def scrape_ad_details(ad_url):
    """Scrape the details of a single ad page into a flat dict.

    Args:
        ad_url: URL of the ad page to download.

    Returns:
        ``None`` when the page could not be downloaded. Otherwise a dict that
        always contains the keys ``URL``, ``Title``, ``Contact`` and ``Price``
        (the last three are ``None`` when the page lacks them), plus one
        entry per label/value pair found in the ``table.moret`` details table.
    """
    soup = get_soup(ad_url)
    if not soup:
        return None

    title = soup.find('h1')
    # Seed every fixed field up front so all records share the same leading
    # keys, even when a page has no contact/price pair.
    data = {
        'URL': ad_url,
        'Title': title.get_text(strip=True) if title else None,
        'Contact': None,
        'Price': None,
    }

    # The first two `span.moreph` elements are read as contact and price; with
    # fewer than two the mapping is ambiguous, so both stay None.
    spans = soup.find_all('span', class_='moreph')
    if len(spans) >= 2:
        data['Contact'] = spans[0].get_text(strip=True)
        data['Price'] = spans[1].get_text(strip=True)

    table = soup.find('table', class_='moret')
    if table:
        for row in table.find_all('tr'):
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            # Only rows laid out as one or two label/value pairs are used.
            if len(cells) not in (2, 4):
                continue
            for label, value in zip(cells[0::2], cells[1::2]):
                # Padding cells have no label; storing them would create a
                # meaningless '' key in the record.
                if label:
                    data[label] = value

    return data

def get_links_from_page(page_num):
    """Collect the ad URLs listed on one search-result page.

    Args:
        page_num: Page number placed in the ``page`` query parameter of
            ``BASE_URL``.

    Returns:
        A list with the ``href`` of every anchor matched by
        ``li.item h2.more a``, in document order. Anchors without an ``href``
        are skipped. The list is empty when the page could not be downloaded.
    """
    url = f"{BASE_URL}?page={page_num}"
    soup = get_soup(url)
    if not soup:
        return []

    hrefs = (anchor.get('href') for anchor in soup.select('li.item h2.more a'))
    # `.get('href')` yields None for anchors lacking the attribute; passing
    # that on would only produce a failing request later.
    return [href for href in hrefs if href]

def append_checkpoint(rows, csv_path):
    """Append scraped records to a CSV file, aligning columns by name.

    Ads expose different detail fields, so each batch can have its own set
    and order of columns. Appending rows positionally without a header would
    put values under the wrong column; here the existing file is read back,
    merged with the new rows by column name, and rewritten with one header.

    Args:
        rows: List of dicts, one per scraped ad. An empty list is a no-op.
        csv_path: Path of the CSV file to create or extend.

    Returns:
        The number of rows added by this call.

    Raises:
        pandas.errors.ParserError: If an existing file at ``csv_path`` is not
            a well-formed CSV.
    """
    if not rows:
        # Nothing to write; also avoids creating a header-less file.
        return 0

    merged = pd.DataFrame(rows)
    if os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0:
        try:
            # Read everything back as text so values such as phone numbers
            # keep their leading zeros and literal "NA" strings survive.
            existing = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            existing = None
        if existing is not None and not existing.empty:
            merged = pd.concat([existing, merged], ignore_index=True, sort=False)

    merged.to_csv(csv_path, index=False)
    return len(rows)


# --- MAIN EXECUTION ---
if __name__ == "__main__":
    start_time = time.time()
    all_links = []

    print(f"--- PHASE 1: Collecting Links from {TOTAL_PAGES_TO_SCRAPE} Pages ---")

    # Search pages are independent of each other, so fetch them in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        future_to_page = {
            executor.submit(get_links_from_page, page): page
            for page in range(1, TOTAL_PAGES_TO_SCRAPE + 1)
        }

        # Count completed pages from 1 so the progress line reports how many
        # pages are actually done; always report once at the end.
        completed = concurrent.futures.as_completed(future_to_page)
        for done, future in enumerate(completed, start=1):
            all_links.extend(future.result())
            if done % 50 == 0 or done == len(future_to_page):
                print(f"Processed {done} search pages... (Total links so far: {len(all_links)})")

    # Drop missing links and duplicates (the same ad can be listed on several
    # pages) while keeping first-seen order.
    all_links = list(dict.fromkeys(link for link in all_links if link))

    print(f"\nTotal Links Found: {len(all_links)}")
    print(f"--- PHASE 2: Scraping Details (Threads: {MAX_THREADS}) ---")

    scraped_data = []

    # Process links in batches so progress can be saved incrementally.
    BATCH_SIZE = 100
    chunks = [all_links[i:i + BATCH_SIZE] for i in range(0, len(all_links), BATCH_SIZE)]
    total_chunks = len(chunks)

    for i, chunk in enumerate(chunks):
        print(f"Processing Batch {i+1}/{total_chunks}...")

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            results = list(executor.map(scrape_ad_details, chunk))

        # Failed scrapes come back as None; keep only real records.
        valid_results = [r for r in results if r]
        scraped_data.extend(valid_results)

        # Checkpoint after every batch so a crash loses at most one batch.
        append_checkpoint(valid_results, CSV_FILENAME)

        # Pause between batches to go easy on the server; no need to wait
        # after the final one.
        if i + 1 < total_chunks:
            time.sleep(1)

    end_time = time.time()
    duration = end_time - start_time

    print(f"\nDONE! Scraped {len(scraped_data)} ads in {duration:.2f} seconds.")
    print(f"Data saved to {CSV_FILENAME}")