In [None]:
#pip install selenium beautifulsoup4 pandas webdriver-manager category_encoders scikit-learn numpy matplotlib catboost

In [None]:
import json
import os
import random
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
# --- scraper configuration -------------------------------------------------

# Listing page that the scraper opens first.
BASE_URL = 'https://www.cardekho.com/used-cars+5-lakh-to-10-lakh+in+new-delhi'

# Pause setting for scrolling (presumably seconds).
scroll_pause_time = 2

# Upper bound on scroll attempts; controls number of listings loaded.
max_scrolls = 10

# CSV file the scraped records are written to.
output_file = 'data/cardekho_used_cars.csv'

# Start from previously saved records when the CSV exists, otherwise from an
# empty frame.
existing_df = pd.read_csv(output_file) if os.path.exists(output_file) else pd.DataFrame()

In [None]:
# setting up driver

def get_driver():
    """Create a Chrome WebDriver with the scraper's command-line flags applied.

    The chromedriver executable path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    chrome_flags = (
        "--start-maximized",
        "--disable-blink-features=AutomationControlled",
        "--disable-notifications",
        "--disable-infobars",
        "--disable-extensions",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)

In [None]:
# scroll page

def scroll_page(driver, scroll_limit=None, pause_range=None):
    """Scroll to the bottom of the page repeatedly until it stops growing.

    After each scroll the function waits a random amount of time and then
    compares ``document.body.scrollHeight`` with the previous value; it stops
    as soon as the height is unchanged or the scroll limit is reached.

    Args:
        driver: Object exposing ``execute_script(js)`` (e.g. a Selenium
            WebDriver) with the listing page already loaded.
        scroll_limit: Maximum number of scroll attempts. When ``None`` the
            notebook-level ``max_scrolls`` setting is used.
        pause_range: ``(low, high)`` bounds, in seconds, of the random wait
            after each scroll. When ``None`` it is derived from the
            notebook-level ``scroll_pause_time`` setting as
            ``(scroll_pause_time, scroll_pause_time + 1)``.

    Returns:
        None.
    """
    # Resolve defaults at call time so edits to the config cell take effect
    # without redefining this function.
    if scroll_limit is None:
        scroll_limit = max_scrolls
    if pause_range is None:
        pause_range = (scroll_pause_time, scroll_pause_time + 1)
    pause_low, pause_high = pause_range

    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(scroll_limit):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Randomised wait gives newly requested content time to render.
        time.sleep(random.uniform(pause_low, pause_high))
        new_height = driver.execute_script("return document.body.scrollHeight")

        # Unchanged height means the last scroll loaded nothing new.
        if new_height == last_height:
            break
        last_height = new_height

In [None]:
# Collect car links

def get_car_links(driver):
    """Collect the unique car-detail URLs present in the currently loaded page.

    Every ``<a>`` whose ``href`` contains ``/used-car-details/`` is resolved
    against the site root, so relative and already-absolute hrefs both yield a
    valid absolute URL.

    Args:
        driver: Object exposing ``page_source`` (the HTML of the loaded page).

    Returns:
        list[str]: Absolute URLs, de-duplicated, in the order they first
        appear in the page. The stable order keeps positional slices of the
        result repeatable from one run to the next.
    """
    site_root = "https://www.cardekho.com"
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # A dict keeps insertion order while dropping duplicates; a set would
    # return the links in an order that can differ between interpreter runs.
    links = {}
    for a in soup.select("a[href*='/used-car-details/']"):
        href = a.get("href")
        if href:
            # urljoin leaves absolute hrefs untouched and fixes relative ones.
            links[urljoin(site_root, href.strip())] = None

    return list(links)

In [None]:
# scrape car details

def scrape_car_page(driver, url):
    """Load one car-detail page and return its fields as a flat dict.

    Args:
        driver: Browser driver used to open ``url`` and read ``page_source``.
        url: Address of the car-detail page.

    Returns:
        dict: ``url``, ``title`` and ``price`` plus the specification columns
        listed in ``spec_columns`` below. A field that is not found on the
        page is ``None``.

    Side effects:
        Sleeps 2-4 seconds after loading the page and overwrites
        ``debug_page.html`` in the working directory with the prettified HTML
        of the page.
    """
    driver.get(url)
    time.sleep(random.uniform(2, 4))

    page = BeautifulSoup(driver.page_source, "html.parser")

    # for debugging: keep a copy of the most recently scraped page
    with open('debug_page.html', 'w', encoding='utf-8') as dump:
        dump.write(page.prettify())

    def text_of(tag, class_name):
        """Stripped text of the first matching element, or None if absent."""
        node = page.find(tag, class_=class_name)
        if not node:
            return None
        return node.text.strip()

    # Pair each label with the value at the same position inside its card;
    # a label seen again in a later card replaces the earlier value.
    specs = {}
    for card in page.find_all('div', class_='outer-card-container posR'):
        names = card.find_all('div', class_='label')
        readings = card.find_all('span', class_='value-text')
        specs.update(
            (name.text.strip(), reading.text.strip())
            for name, reading in zip(names, readings)
        )

    # Output column -> label text looked up in the scraped specs.
    spec_columns = (
        ("registration_yr", 'Registration Year'),
        ("insurance", 'Insurance'),
        ("fuel_type", 'Fuel Type'),
        ("seats", 'Seats'),
        ("km_driven", 'Kms Driven'),
        ("ownership", 'Ownership'),
        ("engine_displacement", 'Engine Displacement'),
        ("transmission", 'Transmission'),
        ("manufacture_yr", 'Year of Manufacture'),
    )

    record = {
        "url": url,
        "title": text_of("div", 'vehicleName'),
        "price": text_of("div", 'vehiclePrice'),
    }
    for column, spec_label in spec_columns:
        record[column] = specs.get(spec_label)

    return record

In [None]:
# main

def main(batch_start=60, batch_stop=120):
    """Scrape one batch of listings and append the records to the output CSV.

    Opens ``BASE_URL``, scrolls to load listings, collects the detail-page
    links, scrapes ``car_links[batch_start:batch_stop]`` and writes the
    previously saved records plus the new ones to ``output_file``.

    Args:
        batch_start: Index of the first collected link to scrape.
        batch_stop: Index one past the last collected link to scrape.

    Returns:
        None. Progress and a summary are printed.

    Raises:
        Errors from starting the browser, loading the listing page or
        reading/writing the CSV propagate. A failure on an individual car
        page is printed and that page is skipped.
    """
    # Read the saved records from disk now rather than relying on the
    # `existing_df` snapshot taken when the config cell ran: a second run in
    # the same kernel would otherwise overwrite the rows the first run saved.
    # Doing it before scraping also surfaces CSV problems early.
    if os.path.exists(output_file):
        previous_df = pd.read_csv(output_file)
    else:
        previous_df = existing_df

    # Make sure the destination directory exists before spending time
    # scraping, so the final write cannot fail on a missing folder.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    all_data = []
    driver = get_driver()
    try:
        driver.set_page_load_timeout(60)
        time.sleep(5)
        driver.get(BASE_URL)
        time.sleep(5)

        print("Scrolling page to load cars...")
        scroll_page(driver)

        print("Collecting car links...")
        car_links = get_car_links(driver)
        print(f"Found {len(car_links)} listings")

        batch = car_links[batch_start:batch_stop]
        if not batch:
            print(f"No listings in range [{batch_start}:{batch_stop}]; nothing to scrape")

        for i, link in enumerate(batch, start=1):
            # Progress is reported against the batch actually being scraped.
            print(f"[{i}/{len(batch)}] Scraping: {link}")
            try:
                car_data = scrape_car_page(driver, link)
                all_data.append(car_data)
            except Exception as e:
                # Best effort: one bad page must not abort the whole batch.
                print(f"Failed on {link}: {e}")
            time.sleep(random.uniform(2, 4))
    finally:
        # Always close the browser, even when loading or scrolling fails.
        driver.quit()

    new_df = pd.DataFrame(all_data)

    if not previous_df.empty:
        final_df = pd.concat([previous_df, new_df], ignore_index=True)
    else:
        final_df = new_df

    final_df.to_csv(output_file, index=False)

    print(f"\n Data saved to {output_file}")
    print(f"Total records: {len(final_df)}")

In [None]:
# Entry point: start the scrape only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()