In [None]:
# -*- coding: utf-8 -*-
"""
FINAL SCRIPT: This script scrapes all vehicle listings from the Riyasewana 'cars' category.
It has been corrected to use the proper CSS selector for finding ad links on search pages.
"""

# Step 1: Install and set up necessary libraries
!pip install selenium pandas
!apt-get update
!apt-get install -y chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd

# Step 2: Set up the Chrome WebDriver for Colab
def setup_driver():
    """Create a Chrome WebDriver configured with this script's launch flags.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    # Flags are applied in this order; the last one overrides the browser's
    # user-agent string with a desktop Chrome value.
    launch_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

# Step 3: Function to scrape the details of a single vehicle ad page
def scrape_vehicle_details(url, driver):
    """Load one vehicle ad page and collect its fields into a dict.

    Args:
        url: Address of the ad page to open.
        driver: Object providing ``get(url)`` and ``page_source``
            (the Selenium WebDriver created by this script).

    Returns:
        A dict with 'URL', 'Title', 'Contact' and 'Price' plus one entry per
        non-empty label found in the ``table.moret`` details table, or
        ``None`` if any exception is raised while loading or parsing.
    """
    try:
        driver.get(url)
        time.sleep(3)  # fixed pause before reading page_source
        page = BeautifulSoup(driver.page_source, 'html.parser')

        heading = page.find('h1')
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = 'N/A'

        # Contact and price are read from the first two 'moreph' spans;
        # both fall back to 'N/A' unless at least two spans are present.
        highlighted = page.find_all('span', class_='moreph')
        if len(highlighted) >= 2:
            contact = highlighted[0].get_text(strip=True)
            price = highlighted[1].get_text(strip=True)
        else:
            contact = 'N/A'
            price = 'N/A'

        record = {'URL': url, 'Title': title, 'Contact': contact, 'Price': price}

        spec_table = page.find('table', class_='moret')
        spec_rows = spec_table.find_all('tr') if spec_table else []
        for tr in spec_rows:
            texts = [td.get_text(strip=True) for td in tr.find_all('td')]
            # Only rows holding one or two label/value pairs are used.
            if len(texts) not in (2, 4):
                continue
            for label, value in zip(texts[0::2], texts[1::2]):
                if label:
                    record[label] = value

        return record
    except Exception as e:
        print(f"  - Error scraping {url}: {e}")
        return None

# --- Main Scraping Logic ---
if __name__ == "__main__":
    # Local import: only this entry point needs urljoin, so the file's
    # top-level import block stays untouched.
    from urllib.parse import urljoin

    base_url = "https://riyasewana.com/search/cars"
    PAGES_TO_SCRAPE = 3  # Starting with 3 pages for testing

    all_ad_links = []
    seen_links = set()  # ad URLs already queued, so each ad is scraped only once
    driver = setup_driver()

    try:
        # Part 1: Collect all ad links from the search result pages
        print(f"--- Starting Part 1: Collecting ad links from {PAGES_TO_SCRAPE} pages ---")
        for page_num in range(1, PAGES_TO_SCRAPE + 1):
            search_url = f"{base_url}?page={page_num}"
            print(f"Fetching links from page {page_num}: {search_url}")
            try:
                driver.get(search_url)
                time.sleep(4)  # fixed pause before reading page_source
                soup = BeautifulSoup(driver.page_source, 'html.parser')
            except Exception as e:
                # A single bad search page should not discard the links
                # already collected; report it and move on to the next page.
                print(f"  - Error fetching {search_url}: {e}")
                continue

            # Ad links are the anchors matched by 'li.item h2.more a'.
            links_on_page = soup.select('li.item h2.more a')

            for link in links_on_page:
                href = link.get('href')
                if not href:
                    continue
                # urljoin leaves absolute URLs as they are and resolves
                # relative ones against the search page address.
                full_url = urljoin(search_url, href.strip())
                if full_url not in seen_links:
                    seen_links.add(full_url)
                    all_ad_links.append(full_url)
            print(f"  - Found {len(links_on_page)} links. Total links collected: {len(all_ad_links)}")

        # Part 2: Scrape the details from each collected link
        print(f"\n--- Starting Part 2: Scraping details for {len(all_ad_links)} ads ---")
        all_vehicles_data = []
        for i, ad_url in enumerate(all_ad_links):
            print(f"Scraping ad {i+1}/{len(all_ad_links)}: {ad_url}")
            data = scrape_vehicle_details(ad_url, driver)
            if data:
                all_vehicles_data.append(data)

        # Part 3: Create a DataFrame and save/display the results
        print("\n--- Scraping Complete ---")
        if all_vehicles_data:
            df = pd.DataFrame(all_vehicles_data)
            print(f"Successfully scraped {len(df)} vehicle ads.")

            # Save first, so a failure while rendering the preview cannot
            # lose the scraped data.
            df.to_csv('riyasewana_car_listings.csv', index=False)
            print("\nData has been saved to 'riyasewana_car_listings.csv'")

            print("Displaying the first 5 rows of the dataset:")
            try:
                display(df.head())
            except NameError:
                # display() is only defined inside IPython/Colab sessions;
                # fall back to plain text when run as a regular script.
                print(df.head())
        else:
            print("No data was scraped. Please check for website structure changes or network issues.")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        driver.quit()
        print("\nBrowser session closed.")