In [2]:
# ==========================================
# 1. SETUP: Install Google Chrome & Dependencies
# ==========================================
# Force-install the official Google Chrome (bypassing the broken Snap version)
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
!sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
!apt-get -y update
!apt-get install -y google-chrome-stable

# Install Python libraries
!pip install selenium webdriver-manager pandas

# ==========================================
# 2. THE SCRAPER SCRIPT
# ==========================================
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd

# Step 2: Set up the Chrome WebDriver (Corrected for Colab)
def setup_driver():
    """Build a headless Chrome WebDriver and return it.

    The browser is started with a fixed set of command-line switches
    (headless, no sandbox, no /dev/shm usage, no GPU) and a hard-coded
    desktop user-agent string. The chromedriver binary is obtained through
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    browser_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    # webdriver_manager resolves (and, if needed, downloads) a chromedriver binary
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=opts)

# Step 3: Function to scrape a single vehicle page
def scrape_vehicle_details(url, driver):
    """Load one vehicle ad in the browser and extract its fields.

    Args:
        url: Address of the ad page; stored in the result under ``'URL'``.
        driver: WebDriver used to load the page and read ``page_source``.

    Returns:
        A dict with ``'URL'``, ``'Title'``, ``'Contact'`` and ``'Price'``
        (``'N/A'`` where the element is missing), plus one entry per
        non-empty label found in the ``table.moret`` details table.
        Returns ``None`` if any exception is raised while loading or
        parsing; the error is printed instead of propagated.
    """
    try:
        driver.get(url)
        # Fixed pause to let the page finish rendering before reading the DOM
        time.sleep(2)
        page = BeautifulSoup(driver.page_source, 'html.parser')

        # --- Title: text of the first <h1>, if there is one ---
        heading = page.find('h1')
        record = {
            'URL': url,
            'Title': heading.get_text(strip=True) if heading else 'N/A',
        }

        # --- Contact & Price: first two span.moreph elements ---
        # Both fields are filled only when at least two such spans exist.
        highlights = page.find_all('span', class_='moreph')
        if len(highlights) >= 2:
            record['Contact'] = highlights[0].get_text(strip=True)
            record['Price'] = highlights[1].get_text(strip=True)
        else:
            record['Contact'] = 'N/A'
            record['Price'] = 'N/A'

        # --- Details table: rows hold one or two (label, value) cell pairs ---
        table = page.find('table', class_='moret')
        if not table:
            return record

        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) not in (2, 4):
                # Rows with any other cell count carry no label/value pair
                continue
            texts = [td.get_text(strip=True) for td in tds]
            # Even positions are labels, odd positions are their values
            for label, value in zip(texts[0::2], texts[1::2]):
                if label:
                    record[label] = value
        return record
    except Exception as e:
        print(f"  - Error scraping {url}: {e}")
        return None

# --- Main Scraping Logic ---
if __name__ == "__main__":
    # Script entry point: collect ad links from the search result pages,
    # scrape every ad, and write the combined table to a CSV file.

    # Imported here because only this entry-point block needs it.
    from urllib.parse import urljoin

    base_url = "https://riyasewana.com/search/cars"
    PAGES_TO_SCRAPE = 3

    all_ad_links = []   # unique ad URLs, in the order first encountered
    seen_links = set()  # fast membership test for de-duplication

    # Initialize the driver using our corrected setup
    print("Initializing Google Chrome Driver...")
    driver = setup_driver()

    try:
        # Part 1: Collect all ad links
        print(f"--- Starting Part 1: Collecting ad links from {PAGES_TO_SCRAPE} pages ---")
        for page_num in range(1, PAGES_TO_SCRAPE + 1):
            search_url = f"{base_url}?page={page_num}"
            print(f"Fetching links from page {page_num}: {search_url}")
            driver.get(search_url)
            # Fixed pause to let the listing render before reading the DOM
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Anchor inside each listing item's heading
            links_on_page = soup.select('li.item h2.more a')

            for link in links_on_page:
                href = link.get('href')
                if not href:
                    continue
                # Resolve relative hrefs against the page URL; absolute hrefs
                # pass through urljoin unchanged.
                absolute_url = urljoin(search_url, href)
                # Skip ads already collected (e.g. the same ad linked from
                # several result pages) so each ad is scraped only once.
                if absolute_url in seen_links:
                    continue
                seen_links.add(absolute_url)
                all_ad_links.append(absolute_url)
            # The running total counts unique links only
            print(f"  - Found {len(links_on_page)} links. Total links collected: {len(all_ad_links)}")

        # Part 2: Scrape the details
        print(f"\n--- Starting Part 2: Scraping details for {len(all_ad_links)} ads ---")
        all_vehicles_data = []
        for i, ad_url in enumerate(all_ad_links):
            print(f"Scraping ad {i+1}/{len(all_ad_links)}: {ad_url}")
            data = scrape_vehicle_details(ad_url, driver)
            # scrape_vehicle_details returns None on failure; skip those ads
            if data:
                all_vehicles_data.append(data)

        # Part 3: Save results
        print("\n--- Scraping Complete ---")
        if all_vehicles_data:
            # Ads may expose different detail labels; missing columns become NaN
            df = pd.DataFrame(all_vehicles_data)
            print(f"Successfully scraped {len(df)} vehicle ads.")
            print("Displaying the first 5 rows:")
            print(df.head())

            # Save to CSV
            csv_filename = 'riyasewana_car_listings.csv'
            df.to_csv(csv_filename, index=False)
            print(f"\nData has been saved to '{csv_filename}'")
        else:
            print("No data was scraped.")

    except Exception as e:
        # Top-level boundary: report the failure, then fall through to cleanup
        print(f"An unexpected error occurred: {e}")
    finally:
        # Always release the browser process, even after an error
        driver.quit()
        print("\nBrowser session closed.")

OK
Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Get:3 http://dl.google.com/linux/chrome/deb stable InRelease [1,825 B]
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://dl.google.com/linux/chrome/deb stable/main amd64 Packages [1,209 B]
Hit:13 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 3,034 B in 2s (1,740 B/s)
Reading package lists... Done
W: http://dl