In [None]:
# ==========================================
# 1. SETUP: Install Google Chrome Stable (The one that works!)
# ==========================================
# This cell must be re-run after every Colab disconnect, because the runtime
# is reset and the packages installed below are lost.

# Register Google's Linux package signing key with apt.
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
# Add the Chrome apt repository.
# NOTE(review): `>>` appends, so re-running this cell within the same runtime
# adds a duplicate entry to google-chrome.list.
!sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
# Refresh the package index so the new repository is visible, then install Chrome.
!apt-get -y update
!apt-get install -y google-chrome-stable

# Install the Python libraries used by the scraper.
# NOTE(review): bs4 (BeautifulSoup) is imported further down but is not
# installed here -- presumably it is preinstalled on Colab; confirm.
!pip install selenium webdriver-manager pandas

# ==========================================
# 2. THE OPTIMIZED SCRAPER
# ==========================================
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import os

# --- CONFIGURATION ---
# Listing URL; the crawler appends `?page=N` to walk the search results.
BASE_URL = "https://riyasewana.com/search/cars"
# Inclusive range of listing pages to crawl.
START_PAGE = 2
END_PAGE = 4   # Set this to 664 to do the whole site
# Derived from the page range so that changing START_PAGE/END_PAGE never
# appends results from a different range into a misleadingly named file.
CSV_FILENAME = f'riyasewana_{START_PAGE}to{END_PAGE}.csv'

def setup_driver():
    """Create and return a headless Chrome WebDriver with image loading turned off.

    The matching chromedriver binary is resolved through webdriver-manager.

    Returns:
        The `webdriver.Chrome` instance; the caller is responsible for
        calling `quit()` on it.
    """
    launch_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    # Speed-up: tell Chrome not to load images at all.
    opts.add_experimental_option(
        "prefs",
        {"profile.managed_default_content_settings.images": 2},
    )

    # Present a regular desktop browser user agent.
    opts.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    chromedriver = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver, options=opts)

def scrape_vehicle_details(url, driver):
    """Load one ad page and extract its fields.

    Args:
        url: Address of the ad page to open.
        driver: WebDriver used to load the page; only `get()` and
            `page_source` are used.

    Returns:
        A dict with 'URL', 'Title', 'Contact' and 'Price' plus one entry per
        label/value pair found in the `table.moret` details table, or None
        if anything goes wrong (the error is printed, not raised).
    """
    try:
        driver.get(url)
        # Short fixed pause; it can be brief because images are not loaded.
        time.sleep(1.5)

        page = BeautifulSoup(driver.page_source, 'html.parser')

        heading = page.find('h1')
        record = {
            'URL': url,
            'Title': heading.get_text(strip=True) if heading else 'N/A',
        }

        # Contact and price are the first two `span.moreph` elements; both
        # fall back to 'N/A' unless at least two such spans are present.
        info_spans = page.find_all('span', class_='moreph')
        if len(info_spans) >= 2:
            record['Contact'] = info_spans[0].get_text(strip=True)
            record['Price'] = info_spans[1].get_text(strip=True)
        else:
            record['Contact'] = 'N/A'
            record['Price'] = 'N/A'

        # Detail rows hold either one or two label/value pairs (2 or 4 cells);
        # rows of any other width are ignored.
        table = page.find('table', class_='moret')
        rows = table.find_all('tr') if table else []
        for tr in rows:
            tds = tr.find_all('td')
            if len(tds) not in (2, 4):
                continue
            for label_cell, value_cell in zip(tds[0::2], tds[1::2]):
                record[label_cell.get_text(strip=True)] = value_cell.get_text(strip=True)

        return record
    except Exception as e:
        print(f"Error on {url}: {e}")
        return None

# --- MAIN EXECUTION ---
def append_batch_to_csv(records, csv_path):
    """Append scraped records to a CSV while keeping columns aligned.

    Different ads expose different detail labels, so a batch may have
    columns in another order than the file header, or columns the file has
    never seen. Appending such a batch without a header would silently put
    values under the wrong column, so:

    * if the file is missing or empty, the batch is written with a header;
    * if every batch column is already in the header, the batch is
      reordered to the header and appended;
    * otherwise the file is rewritten with the header extended by the new
      columns (older rows get empty cells for them).

    Args:
        records: List of dicts, one per scraped ad. An empty list is a no-op.
        csv_path: Path of the CSV file to create or extend.
    """
    import csv  # local import: only this helper needs the csv module

    if not records:
        return

    batch = pd.DataFrame(records)

    # Read the raw header with the csv module so the column names come back
    # exactly as written (pandas would rename blank header cells).
    header = []
    if os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0:
        with open(csv_path, newline='', encoding='utf-8') as fh:
            header = next(csv.reader(fh), [])

    if not header:
        batch.to_csv(csv_path, index=False)
        return

    new_columns = [col for col in batch.columns if col not in header]
    if not new_columns:
        batch.reindex(columns=header).to_csv(
            csv_path, mode='a', header=False, index=False
        )
        return

    # Schema grew: reload the existing rows as plain strings (keeping literal
    # values such as 'N/A' intact) and rewrite the file with the wider header.
    previous = pd.read_csv(
        csv_path, header=0, names=header, dtype=str, keep_default_na=False
    )
    combined = pd.concat([previous, batch], ignore_index=True)
    combined.reindex(columns=header + new_columns).to_csv(csv_path, index=False)


if __name__ == "__main__":
    BATCH_SIZE = 10  # number of scraped ads between CSV checkpoints

    driver = setup_driver()
    all_ad_links = []
    seen_links = set()  # the same ad can be listed on more than one page

    try:
        # --- PHASE 1: Collect Links ---
        print(f"--- PHASE 1: Collecting links from page {START_PAGE} to {END_PAGE} ---")
        for page_num in range(START_PAGE, END_PAGE + 1):
            search_url = f"{BASE_URL}?page={page_num}"
            print(f"Scanning Page {page_num}...", end="\r")

            driver.get(search_url)
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.select('li.item h2.more a'):
                href = link.get('href')
                # Skip anchors without a target and ads already queued.
                if href and href not in seen_links:
                    seen_links.add(href)
                    all_ad_links.append(href)

        print(f"\nTotal Ads Found: {len(all_ad_links)}")

        # --- PHASE 2: Scrape & Save Incrementally ---
        print("--- PHASE 2: Scraping Details ---")

        scraped_buffer = []

        for i, ad_url in enumerate(all_ad_links):
            print(f"Scraping {i+1}/{len(all_ad_links)}: {ad_url}")
            data = scrape_vehicle_details(ad_url, driver)

            if data:
                scraped_buffer.append(data)

            # --- CHECKPOINT: flush to disk every BATCH_SIZE ads ---
            if len(scraped_buffer) >= BATCH_SIZE:
                append_batch_to_csv(scraped_buffer, CSV_FILENAME)
                print(f"  [Saved {len(scraped_buffer)} records to {CSV_FILENAME}]")
                scraped_buffer = []  # Clear buffer

        # Save any remaining data in the buffer
        if scraped_buffer:
            append_batch_to_csv(scraped_buffer, CSV_FILENAME)
            print(f"  [Saved final records to {CSV_FILENAME}]")

        print("\n--- DONE! ---")

        # Verify
        if os.path.isfile(CSV_FILENAME):
            full_df = pd.read_csv(CSV_FILENAME)
            print(f"Total rows in CSV: {len(full_df)}")
            print(full_df.head())

    except Exception as e:
        print(f"\nCRITICAL ERROR: {e}")
    finally:
        # Always shut the browser down, even after a failure.
        driver.quit()