1st stage of getting scrap dataset

In [1]:
#!/usr/bin/env python3
"""
cardekho_used_nagpur_scraper_with_mileage.py

Scrapes https://www.cardekho.com/used-cars+in+nagpur and writes CSV + Excel.
Extracted columns:
Car_name, brand, model, kms_driven, mileage, transmission, fuel_type, year_of_manufacture, price, detail_page

Requires: selenium, beautifulsoup4, pandas, openpyxl, webdriver-manager
"""

import re
import time
import random
import math
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# -------- CONFIG ----------
# START_URL = "https://www.cardekho.com/used-cars+in+hyderabad"
START_URL = "https://www.cardekho.com/used-cars+in+nagpur"   # URL for nagpur
OUTPUT_CSV = "cardekho_used_cars_nagpur_with_mileage.csv"
OUTPUT_XLSX = "cardekho_used_cars_nagpur_with_mileage.xlsx"

HEADLESS = True                 # Set False to watch browser
MAX_PAGES_OVERRIDE = None       # Set to int to force how many result pages to scan; None -> auto-detect
MAX_SCROLLS = 40                # scroll rounds per search-result page
SCROLL_PAUSE = 0.7
PAGE_PAUSE = (0.6, 1.4)
VISIT_DETAIL_PAGES = True       # True -> open each listing detail page for mileage/transmission accuracy
DETAIL_PAUSE = (0.6, 1.2)
MAX_DETAIL_RETRIES = 1         # retry detail page once on failure
# Brands to help split brand/model
BRANDS = [
    "Maruti", "Hyundai", "Tata", "Honda", "Toyota", "Mahindra", "Kia",
    "BMW", "Audi", "Mercedes-Benz", "Mercedes", "Renault", "MG", "Skoda",
    "Volkswagen", "Ford", "Nissan", "Jeep", "Volvo", "Land Rover", "Jaguar",
    "Isuzu", "Datsun", "Chevrolet", "Opel"
]
# --------------------------

# Helper parsers
def guess_brand_and_model(name):
    if not name:
        return "", ""
    for b in BRANDS:
        if b.lower() in name.lower():
            model = re.sub(re.escape(b), "", name, flags=re.IGNORECASE).strip()
            model = re.sub(r'^[\-\:\–\—\s]+', '', model)
            return b, model or name
    parts = name.split()
    return (parts[0], " ".join(parts[1:]) if len(parts) > 1 else "") if parts else ("","")

def extract_kms(text):
    m = re.search(r'([\d,\.]+)\s*(?:kms|km)\b', text, flags=re.I)
    if m:
        return m.group(1).replace(",", "")
    return ""

def extract_fuel(text):
    for f in ["Petrol", "Diesel", "CNG", "LPG", "Electric", "Hybrid"]:
        if re.search(r'\b' + re.escape(f) + r'\b', text, flags=re.I):
            return f
    return ""

def extract_price(text):
    m = re.search(r'₹\s*[\d\.,\sA-Za-z]+', text)
    if m:
        return m.group(0).strip()
    m2 = re.search(r'[\d\.,]+\s*(Lakh|lakh|Lakhs|lakhs|Crore|crore|Cr)\b', text)
    if m2:
        return m2.group(0)
    return ""

def extract_year(text):
    m = re.search(r'\b(19|20)\d{2}\b', text)
    return m.group(0) if m else ""

# mileage pattern examples:
# 18.5 kmpl, 22 km/kg, 120 km/kWh, 18 kmpl (ARAI), 25.6 kmpl
MILEAGE_REGEXES = [
    re.compile(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|km/kg|km/kwh|km/l|kmperlitre|km/gal|km/100km|kml|kpl)\b', flags=re.I),
    re.compile(r'([\d]{1,3}(?:\.\d+)?)\s*(km(?:/kwh|/kg|pl)?)\b', flags=re.I),
    re.compile(r'([\d]{1,3}(?:\.\d+)?)\s*(mpg|mpg\;)', flags=re.I)
]

def extract_mileage(text):
    txt = text.replace("\xa0"," ").strip()
    for rx in MILEAGE_REGEXES:
        m = rx.search(txt)
        if m:
            val = m.group(1)
            unit = m.group(2)
            return f"{val} {unit}".strip()
    # sometimes written like "Mileage: 18.5 kmpl" or "18.5kmpl"
    m2 = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|km/kg|km/kwh|km|kpl|km/l)\b', txt, flags=re.I)
    if m2:
        return f"{m2.group(1)} {m2.group(2)}"
    return ""

# transmission extraction
def extract_transmission(text):
    for t in ["Manual", "Automatic", "CVT", "AMT", "DCT", "AT", "MT"]:
        if re.search(r'\b' + re.escape(t) + r'\b', text, flags=re.I):
            # normalize common variants
            if t.upper() in ("AT","AMT","CVT","DCT","MT"):
                return t.upper()
            return t.title()
    return ""

# safe text extractor for BeautifulSoup element
def text_of(elem):
    return elem.get_text(" ", strip=True) if elem else ""

# ---------- main ----------
def main():
    # Setup Selenium Chrome
    chrome_opts = Options()
    if HEADLESS:
        chrome_opts.add_argument("--headless=new")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--disable-dev-shm-usage")
    chrome_opts.add_argument("--window-size=1920,1080")
    chrome_opts.add_argument("--disable-gpu")
    chrome_opts.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_opts)

    try:
        # load first page and detect pages count
        driver.get(START_URL)
        time.sleep(2.0)
        soup0 = BeautifulSoup(driver.page_source, "html.parser")
        page_text = soup0.get_text(" ", strip=True)

        total_listings = None
        m = re.search(r'([\d,]{2,})\s+Second Hand Cars in Nagpur', page_text, flags=re.I)
        if not m:
            m = re.search(r'of\s+([\d,]+)\s+results', page_text, flags=re.I)
        if m:
            total_listings = int(m.group(1).replace(",", ""))

        per_page_guess = max(1, len(soup0.find_all("h3")))
        estimated_pages = math.ceil(total_listings / per_page_guess) if total_listings else None
        total_pages = int(estimated_pages) if estimated_pages else 200
        if MAX_PAGES_OVERRIDE:
            total_pages = MAX_PAGES_OVERRIDE

        print(f"Detected total_listings={total_listings}, per_page_guess={per_page_guess}, total_pages={total_pages}")

        rows = []
        seen_keys = set()
        detail_links = []

        # iterate search result pages
        for p in range(1, total_pages + 1):
            page_url = START_URL.rstrip("/") + "?page=" + str(p)
            try:
                driver.get(page_url)
            except Exception:
                time.sleep(1.0)
                driver.get(page_url)

            # aggressively scroll to let lazy content load
            last_h = driver.execute_script("return document.body.scrollHeight")
            scroll_count = 0
            while scroll_count < MAX_SCROLLS:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE)
                new_h = driver.execute_script("return document.body.scrollHeight")
                if new_h == last_h:
                    # wiggle
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight-400);")
                    time.sleep(0.4)
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(0.4)
                    new_h = driver.execute_script("return document.body.scrollHeight")
                    if new_h == last_h:
                        break
                last_h = new_h
                scroll_count += 1

            page_soup = BeautifulSoup(driver.page_source, "html.parser")

            # find title nodes (h3) and extract card container text
            titles = page_soup.find_all("h3")
            for h in titles:
                title = text_of(h)
                if not title:
                    continue

                # climb up to find a container with price or kms present
                container = h
                card_text = ""
                link = ""
                for _ in range(6):
                    if container is None:
                        break
                    card_text = text_of(container)
                    if "₹" in card_text or re.search(r'\b\d+\s*(?:kms|km)\b', card_text, flags=re.I):
                        # find anchor inside container for detail link if present
                        a = container.find("a", href=True)
                        if a:
                            href = a["href"]
                            link = href if "cardekho.com" in href else urljoin(page_url, href)
                            link = link.split("#")[0].split("?utm")[0]
                        break
                    container = container.parent

                if not card_text:
                    # fallback to grabbing parent text
                    parent = h.parent
                    card_text = text_of(parent) if parent else title

                # check minimal heuristics: price or kms exist
                if ("₹" not in card_text) and (re.search(r'\b\d+\s*(?:kms|km)\b', card_text, flags=re.I) is None):
                    continue

                # dedupe by title+price
                price_snip = extract_price(card_text)
                key = (title + "||" + price_snip).strip()
                if key in seen_keys:
                    continue
                seen_keys.add(key)

                kms = extract_kms(card_text)
                fuel = extract_fuel(card_text)
                year = extract_year(title) or extract_year(card_text)
                brand, model = guess_brand_and_model(title)
                price = price_snip

                # prepare base row (mileage/transmission may be blank now; fill from detail page if VISIT_DETAIL_PAGES)
                rows.append({
                    "Car_name": title,
                    "brand": brand,
                    "model": model,
                    "kms_driven": kms,
                    "mileage": "",             # to fill
                    "transmission": "",       # to fill
                    "fuel_type": fuel,
                    "year_of_manufacture": year,
                    "price": price,
                    "detail_page": link
                })

                if link:
                    detail_links.append(link)

            # jitter between pages
            time.sleep(random.uniform(*PAGE_PAUSE))

            # early stop if found >= total_listings
            if total_listings and len(seen_keys) >= total_listings:
                print("Reached detected total listings; stopping page scan.")
                break

            # small progress print
            if p % 10 == 0:
                print(f"Scanned page {p}; collected rows so far: {len(rows)}")

        print(f"Collected {len(rows)} card-level rows, discovered {len(set(detail_links))} detail links.")

        # Optionally visit detail pages for mileage + transmission (more accurate)
        if VISIT_DETAIL_PAGES and detail_links:
            unique_detail_links = []
            seen_dl = set()
            for u in detail_links:
                if u and u not in seen_dl:
                    seen_dl.add(u)
                    unique_detail_links.append(u)

            # map detail_link -> parsed fields
            detail_map = {}

            for i, dl in enumerate(unique_detail_links):
                # small polite wait
                if i > 0:
                    time.sleep(random.uniform(*DETAIL_PAUSE))

                # retry loop
                attempt = 0
                success = False
                while attempt <= MAX_DETAIL_RETRIES and not success:
                    try:
                        driver.get(dl)
                        time.sleep(1.0 + random.random()*0.8)  # allow JS
                        dsoup = BeautifulSoup(driver.page_source, "html.parser")
                        page_text = dsoup.get_text(" ", strip=True)

                        # Try structured lookups:
                        # 1) meta-line near the H1/H2 (often contains "kms • Petrol • 2019 • 18.5 kmpl • Automatic")
                        h_tag = dsoup.find(["h1","h2"])
                        meta_line = ""
                        if h_tag:
                            nxt = h_tag.find_next()
                            checks = 0
                            while nxt and checks < 8:
                                t = text_of(nxt)
                                if t and (("kms" in t.lower()) or ("₹" in t) or re.search(r'\b\d+\s*(?:kmpl|km/kg|km/kwh|km/l|kpl)\b', t, flags=re.I)):
                                    meta_line = t
                                    break
                                nxt = nxt.find_next()
                                checks += 1

                        # 2) labeled field lookup: look for text nodes like 'Mileage' or 'Transmission' and read siblings
                        # Mileage
                        mileage_val = ""
                        # find text nodes "Mileage" or "Avg. Mileage" etc.
                        mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
                        if mnode:
                            try:
                                # sibling / next element may contain value
                                parent = mnode.parent
                                sib = parent.find_next_sibling()
                                if sib:
                                    mileage_val = text_of(sib)
                            except:
                                mileage_val = ""
                        if not mileage_val:
                            # try meta_line and page_text
                            mileage_val = extract_mileage(meta_line or page_text) or extract_mileage(page_text)

                        # Transmission
                        trans_val = ""
                        tnode = dsoup.find(text=re.compile(r'\bTransmission\b|\bGearbox\b', flags=re.I))
                        if tnode:
                            try:
                                parent = tnode.parent
                                sib = parent.find_next_sibling()
                                if sib:
                                    trans_val = text_of(sib)
                            except:
                                trans_val = ""
                        if not trans_val:
                            trans_val = extract_transmission(meta_line or page_text)

                        # Normalize/massage values
                        mileage_clean = extract_mileage(mileage_val or "")
                        transmission_clean = extract_transmission(trans_val or "")

                        # fallback: sometimes card-level has mileage like '18 kmpl' in small text — try to extract from page text
                        if not mileage_clean:
                            mileage_clean = extract_mileage(page_text)

                        # store
                        detail_map[dl] = {
                            "mileage": mileage_clean,
                            "transmission": transmission_clean
                        }
                        success = True
                    except Exception as e:
                        attempt += 1
                        if attempt > MAX_DETAIL_RETRIES:
                            # give empty values on failure
                            detail_map[dl] = {"mileage": "", "transmission": ""}
                            success = True
                        else:
                            time.sleep(0.8)

                # progress print occasionally
                if (i+1) % 100 == 0:
                    print(f"Processed {i+1} detail pages...")

            # Now merge detail_map into rows
            for r in rows:
                link = r.get("detail_page", "")
                if link and link in detail_map:
                    r["mileage"] = detail_map[link]["mileage"]
                    r["transmission"] = detail_map[link]["transmission"]
                else:
                    # attempt to find mileage/trans in the card text (already tried earlier)
                    # leave empty if not found
                    if not r["mileage"]:
                        # try to infer from model/name
                        r["mileage"] = ""
                    if not r["transmission"]:
                        r["transmission"] = ""

        # build DataFrame and clean
        df = pd.DataFrame(rows, columns=[
            "Car_name", "brand", "model", "kms_driven", "mileage", "transmission",
            "fuel_type", "year_of_manufacture", "price", "detail_page"
        ])

        # normalize
        df["kms_driven"] = df["kms_driven"].fillna("").astype(str).apply(lambda x: re.sub(r'[^\d\.]', '', x))
        df["mileage"] = df["mileage"].fillna("").astype(str).apply(lambda x: x.strip())
        df["transmission"] = df["transmission"].fillna("").astype(str).apply(lambda x: x.strip().title())
        df["year_of_manufacture"] = df["year_of_manufacture"].fillna("").astype(str).apply(lambda x: (re.search(r'\b(19|20)\d{2}\b', x).group(0) if re.search(r'\b(19|20)\d{2}\b', x) else ""))
        df["price"] = df["price"].fillna("").astype(str).apply(lambda x: x.strip())
        df["fuel_type"] = df["fuel_type"].fillna("").astype(str).apply(lambda x: x.strip().title())

        # dedupe by detail_link if present else by Car_name+price
        if df["detail_page"].notnull().sum() > 0:
            df = df.drop_duplicates(subset=["detail_page"]).reset_index(drop=True)
        else:
            df = df.drop_duplicates(subset=["Car_name", "price"]).reset_index(drop=True)

        # save
        df.to_csv(OUTPUT_CSV, index=False)
        df.to_excel(OUTPUT_XLSX, index=False)
        print(f"Done. Collected {len(df)} rows. Saved to {OUTPUT_CSV} and {OUTPUT_XLSX}")

    finally:
        try:
            driver.quit()
        except:
            pass

if __name__ == "__main__":
    main()


Detected total_listings=600, per_page_guess=64, total_pages=10
Scanned page 10; collected rows so far: 517
Collected 517 card-level rows, discovered 504 detail links.


  mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
  tnode = dsoup.find(text=re.compile(r'\bTransmission\b|\bGearbox\b', flags=re.I))


Processed 100 detail pages...
Processed 200 detail pages...
Processed 300 detail pages...
Processed 400 detail pages...
Processed 500 detail pages...
Done. Collected 505 rows. Saved to cardekho_used_cars_nagpur_with_mileage.csv and cardekho_used_cars_nagpur_with_mileage.xlsx


2nd Stage of Web Scrapping

In [3]:
#!/usr/bin/env python3
"""
cardekho_with_price_fix.py

Improved scraper for CarDekho used cars (Nagpur) with robust price extraction.
Requires: selenium, webdriver-manager, beautifulsoup4, pandas, openpyxl
"""

import re
import time
import random
import math
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# -------- CONFIG ----------
START_URL = "https://www.cardekho.com/used-cars+in+nagpur"
OUTPUT_CSV = "cardekho_used_cars_nagpur_price_fixed.csv"
OUTPUT_XLSX = "cardekho_used_cars_nagpur_price_fixed.xlsx"

HEADLESS = True
MAX_PAGES_OVERRIDE = None
MAX_SCROLLS = 40
SCROLL_PAUSE = 0.7
PAGE_PAUSE = (0.6, 1.4)
VISIT_DETAIL_PAGES = True
DETAIL_PAUSE = (0.6, 1.2)
MAX_DETAIL_RETRIES = 1

BRANDS = [
    "Maruti", "Hyundai", "Tata", "Honda", "Toyota", "Mahindra", "Kia",
    "BMW", "Audi", "Mercedes-Benz", "Mercedes", "Renault", "MG", "Skoda",
    "Volkswagen", "Ford", "Nissan", "Jeep", "Volvo", "Land Rover", "Jaguar",
    "Isuzu", "Datsun", "Chevrolet", "Opel"
]
# --------------------------

def text_of(elem):
    return elem.get_text(" ", strip=True) if elem else ""

def guess_brand_and_model(name):
    if not name:
        return "", ""
    for b in BRANDS:
        if b.lower() in name.lower():
            model = re.sub(re.escape(b), "", name, flags=re.IGNORECASE).strip()
            model = re.sub(r'^[\-\:\–\—\s]+', '', model)
            return b, model or name
    parts = name.split()
    return (parts[0], " ".join(parts[1:]) if len(parts) > 1 else "") if parts else ("","")

def extract_kms(text):
    m = re.search(r'([\d,\.]+)\s*(?:kms|km)\b', text, flags=re.I)
    if m:
        return m.group(1).replace(",", "")
    return ""

def extract_fuel(text):
    for f in ["Petrol", "Diesel", "CNG", "LPG", "Electric", "Hybrid"]:
        if re.search(r'\b' + re.escape(f) + r'\b', text, flags=re.I):
            return f
    return ""

# NEW: robust price extraction from soup and raw text
def extract_price_from_soup(soup):
    # 1. meta tags
    meta_selectors = [
        ('meta', {'property': 'og:price:amount'}),
        ('meta', {'itemprop': 'price'}),
        ('meta', {'name': 'price'}),
    ]
    for tag, attrs in meta_selectors:
        mtag = soup.find(tag, attrs=attrs)
        if mtag:
            val = mtag.get('content') or mtag.get('value') or ""
            if val:
                # normalize: prepend ₹ if numeric and no symbol
                if re.search(r'[\d]', val) and '₹' not in val:
                    return "₹ " + val.strip()
                return val.strip()

    # 2. attributes that often store price
    for attr in ('data-price', 'data-offer-price', 'data-srp', 'data-amount', 'data-price-value'):
        el = soup.find(attrs={attr: True})
        if el:
            val = el.get(attr)
            if val:
                if re.search(r'[\d]', val) and '₹' not in val:
                    return "₹ " + val.strip()
                return val.strip()

    # 3. elements with class or id containing 'price' or 'amount'
    px = soup.find(lambda tag: tag.name in ("div","span","p","strong") and (
        tag.get("class") or tag.get("id")
    ) and re.search(r'price|amount|selling|srp|finalPrice|carPrice|actual-price', " ".join((tag.get("class") or []) + [tag.get("id") or ""]), flags=re.I))
    if px:
        txt = text_of(px)
        pr = find_rupee_in_text(txt)
        if pr:
            return pr
        if txt:
            return txt.strip()

    # 4. any visible text near top with rupee sign
    top_region = ""
    # try header / top sections
    head_candidates = soup.find_all(["header", "section", "div"], limit=6)
    for c in head_candidates:
        t = text_of(c)
        if '₹' in t:
            top_region = t
            break
    if not top_region:
        # fallback full page text (takes last resort)
        top_region = soup.get_text(" ", strip=True)

    pr = find_rupee_in_text(top_region)
    if pr:
        return pr

    # 5. fallback regex on whole page (Lakh/Crore)
    p2 = re.search(r'[\d\.,]+\s*(?:Lakh|lakh|Lakhs|lakhs|Crore|crore|Cr)\b', soup.get_text(" ", strip=True))
    if p2:
        return p2.group(0).strip()

    return ""

def find_rupee_in_text(text):
    if not text:
        return ""
    m = re.search(r'₹\s*[\d\.,\sA-Za-z]+', text)
    if m:
        return m.group(0).strip()
    # sometimes rupee symbol is missing but values use lakh/crore
    m2 = re.search(r'[\d\.,]+\s*(Lakh|lakh|Lakhs|lakhs|Crore|crore|Cr)\b', text)
    if m2:
        return m2.group(0).strip()
    return ""

def extract_price_from_text(text):
    # ensures same fallback when only raw text available
    p = find_rupee_in_text(text)
    return p

def extract_year(text):
    m = re.search(r'\b(19|20)\d{2}\b', text)
    return m.group(0) if m else ""

# main
def main():
    chrome_opts = Options()
    if HEADLESS:
        chrome_opts.add_argument("--headless=new")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--disable-dev-shm-usage")
    chrome_opts.add_argument("--window-size=1920,1080")
    chrome_opts.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_opts)

    try:
        driver.get(START_URL)
        time.sleep(2.0)
        soup0 = BeautifulSoup(driver.page_source, "html.parser")
        text0 = soup0.get_text(" ", strip=True)

        total_listings = None
        m = re.search(r'([\d,]{2,})\s+Second Hand Cars in Nagpur', text0, flags=re.I)
        if not m:
            m = re.search(r'of\s+([\d,]+)\s+results', text0, flags=re.I)
        if m:
            total_listings = int(m.group(1).replace(",", ""))

        per_page_guess = max(1, len(soup0.find_all("h3")))
        estimated_pages = math.ceil(total_listings / per_page_guess) if total_listings else None
        total_pages = int(estimated_pages) if estimated_pages else 200
        if MAX_PAGES_OVERRIDE:
            total_pages = MAX_PAGES_OVERRIDE

        print(f"Detected total_listings={total_listings}, per_page_guess={per_page_guess}, total_pages={total_pages}")

        rows = []
        seen_keys = set()
        detail_links = []

        for p in range(1, total_pages + 1):
            page_url = START_URL.rstrip("/") + "?page=" + str(p)
            try:
                driver.get(page_url)
            except Exception:
                time.sleep(1.0)
                driver.get(page_url)

            # scroll aggressively
            last_h = driver.execute_script("return document.body.scrollHeight")
            sc = 0
            while sc < MAX_SCROLLS:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE)
                new_h = driver.execute_script("return document.body.scrollHeight")
                if new_h == last_h:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight-400);")
                    time.sleep(0.4)
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(0.4)
                    new_h = driver.execute_script("return document.body.scrollHeight")
                    if new_h == last_h:
                        break
                last_h = new_h
                sc += 1

            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            titles = page_soup.find_all("h3")
            for h in titles:
                title = text_of(h)
                if not title:
                    continue

                container = h
                card_text = ""
                link = ""
                for _ in range(6):
                    if container is None:
                        break
                    card_text = text_of(container)
                    if "₹" in card_text or re.search(r'\b\d+\s*(?:kms|km)\b', card_text, flags=re.I):
                        a = container.find("a", href=True)
                        if a:
                            href = a["href"]
                            link = href if "cardekho.com" in href else urljoin(page_url, href)
                            link = link.split("#")[0].split("?utm")[0]
                        break
                    container = container.parent

                if not card_text:
                    parent = h.parent
                    card_text = text_of(parent) if parent else title

                if ("₹" not in card_text) and (re.search(r'\b\d+\s*(?:kms|km)\b', card_text, flags=re.I) is None):
                    continue

                price_card = extract_price_from_text(card_text)
                key = (title + "||" + (price_card or "")).strip()
                if key in seen_keys:
                    continue
                seen_keys.add(key)

                kms = extract_kms(card_text)
                fuel = extract_fuel(card_text)
                year = extract_year(title) or extract_year(card_text)
                brand = ""
                model = ""
                try:
                    brand, model = guess_brand_and_model(title)
                except:
                    pass

                rows.append({
                    "Car_name": title,
                    "brand": brand,
                    "model": model,
                    "kms_driven": kms,
                    "mileage": "",
                    "transmission": "",
                    "fuel_type": fuel,
                    "year_of_manufacture": year,
                    "price": price_card,
                    "detail_page": link
                })

                if link:
                    detail_links.append(link)

            time.sleep(random.uniform(*PAGE_PAUSE))

            if total_listings and len(seen_keys) >= total_listings:
                print("Reached detected total listings; stopping page scan.")
                break

            if p % 10 == 0:
                print(f"Scanned page {p}; rows so far: {len(rows)}")

        print(f"Collected {len(rows)} card-level rows, detail links: {len(set(detail_links))}")

        # Now ensure price is filled: visit detail pages for any row missing price
        if VISIT_DETAIL_PAGES and detail_links:
            unique_detail_links = []
            seen_dl = set()
            for u in detail_links:
                if u and u not in seen_dl:
                    seen_dl.add(u)
                    unique_detail_links.append(u)

            # map link -> price (and optionally mileage/transmission)
            detail_map = {}

            for i, dl in enumerate(unique_detail_links):
                # polite pause
                if i > 0:
                    time.sleep(random.uniform(*DETAIL_PAUSE))
                attempt = 0
                success = False
                while attempt <= MAX_DETAIL_RETRIES and not success:
                    try:
                        driver.get(dl)
                        time.sleep(1.0 + random.random()*0.8)
                        dsoup = BeautifulSoup(driver.page_source, "html.parser")

                        # price from many possible places
                        price_val = extract_price_from_soup(dsoup)
                        # fallback to regex on page text
                        if not price_val:
                            price_val = extract_price_from_text(dsoup.get_text(" ", strip=True))

                        # mileage and transmission extraction as before
                        page_text = dsoup.get_text(" ", strip=True)
                        # try meta-line near title for quick values
                        h_tag = dsoup.find(["h1","h2"])
                        meta_line = ""
                        if h_tag:
                            nxt = h_tag.find_next()
                            checks = 0
                            while nxt and checks < 8:
                                t = text_of(nxt)
                                if t and (("kms" in t.lower()) or ("₹" in t) or re.search(r'\b\d+\s*(?:kmpl|km/kg|km/kwh|km/l|kpl)\b', t, flags=re.I)):
                                    meta_line = t
                                    break
                                nxt = nxt.find_next()
                                checks += 1

                        # mileage
                        mileage_val = ""
                        mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
                        if mnode:
                            try:
                                parent = mnode.parent
                                sib = parent.find_next_sibling()
                                if sib:
                                    mileage_val = text_of(sib)
                            except:
                                mileage_val = ""
                        if not mileage_val:
                            # check meta_line then page_text
                            mileage_val = extract_mileage(meta_line or page_text) or extract_mileage(page_text)

                        # transmission
                        trans_val = ""
                        tnode = dsoup.find(text=re.compile(r'\bTransmission\b|\bGearbox\b', flags=re.I))
                        if tnode:
                            try:
                                parent = tnode.parent
                                sib = parent.find_next_sibling()
                                if sib:
                                    trans_val = text_of(sib)
                            except:
                                trans_val = ""
                        if not trans_val:
                            trans_val = extract_transmission(meta_line or page_text)

                        detail_map[dl] = {
                            "price": price_val or "",
                            "mileage": mileage_val or "",
                            "transmission": trans_val or ""
                        }
                        success = True
                    except Exception as e:
                        attempt += 1
                        if attempt > MAX_DETAIL_RETRIES:
                            detail_map[dl] = {"price": "", "mileage": "", "transmission": ""}
                            success = True
                        else:
                            time.sleep(0.8)

                if (i+1) % 100 == 0:
                    print(f"Processed {i+1} detail pages...")

            # merge into rows
            for r in rows:
                link = r.get("detail_page", "")
                if link and link in detail_map:
                    # prefer detail price if card-level empty
                    if not r.get("price"):
                        r["price"] = detail_map[link]["price"]
                    # always try to fill mileage/trans if available
                    if not r.get("mileage"):
                        r["mileage"] = detail_map[link]["mileage"]
                    if not r.get("transmission"):
                        r["transmission"] = detail_map[link]["transmission"]

        # final normalization
        df = pd.DataFrame(rows, columns=[
            "Car_name","brand","model","kms_driven","mileage","transmission","fuel_type","year_of_manufacture","price","detail_page"
        ])
        df["kms_driven"] = df["kms_driven"].fillna("").astype(str).apply(lambda x: re.sub(r'[^\d\.]', '', x))
        df["mileage"] = df["mileage"].fillna("").astype(str).apply(lambda x: x.strip())
        df["transmission"] = df["transmission"].fillna("").astype(str).apply(lambda x: x.strip().title())
        df["year_of_manufacture"] = df["year_of_manufacture"].fillna("").astype(str).apply(lambda x: (re.search(r'\b(19|20)\d{2}\b', x).group(0) if re.search(r'\b(19|20)\d{2}\b', x) else ""))
        df["price"] = df["price"].fillna("").astype(str).apply(lambda x: x.strip())
        df["fuel_type"] = df["fuel_type"].fillna("").astype(str).apply(lambda x: x.strip().title())

        # dedupe
        if "detail_page" in df.columns and df["detail_page"].str.len().sum() > 0:
            df = df.drop_duplicates(subset=["detail_page"]).reset_index(drop=True)
        else:
            df = df.drop_duplicates(subset=["Car_name","price"]).reset_index(drop=True)

        # save
        df.to_csv(OUTPUT_CSV, index=False)
        df.to_excel(OUTPUT_XLSX, index=False)
        print(f"Saved {len(df)} rows to {OUTPUT_CSV} and {OUTPUT_XLSX}")

    finally:
        try:
            driver.quit()
        except:
            pass

# helpers used in detail extraction (mileage/transmission)
def extract_mileage(text):
    if not text:
        return ""
    # reuse a robust regex set (common patterns)
    m = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|km/kg|km/kwh|km/l|kpl|km)', text, flags=re.I)
    if m:
        return f"{m.group(1)} {m.group(2)}".strip()
    m2 = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(mpg)\b', text, flags=re.I)
    if m2:
        return f"{m2.group(1)} {m2.group(2)}"
    return ""

def extract_transmission(text):
    if not text:
        return ""
    for t in ["Manual","Automatic","CVT","AMT","DCT","AT","MT"]:
        if re.search(r'\b' + re.escape(t) + r'\b', text, flags=re.I):
            if t.upper() in ("AT","AMT","CVT","DCT","MT"):
                return t.upper()
            return t.title()
    return ""

if __name__ == "__main__":
    main()


Detected total_listings=601, per_page_guess=64, total_pages=10
Scanned page 10; rows so far: 533
Collected 533 card-level rows, detail links: 516


  mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
  tnode = dsoup.find(text=re.compile(r'\bTransmission\b|\bGearbox\b', flags=re.I))


Processed 100 detail pages...
Processed 200 detail pages...
Processed 300 detail pages...
Processed 400 detail pages...
Processed 500 detail pages...
Saved 517 rows to cardekho_used_cars_nagpur_price_fixed.csv and cardekho_used_cars_nagpur_price_fixed.xlsx


3rd Stage of Web Scrap

In [6]:
# Jupyter cell: Clean Selenium scraper (improved from your first working script) + mileage extraction
# Produces: Car_name, brand, model, kms_driven, fuel_type, year_of_manufacture, price, mileage, detail_page
# Saves to cardekho_used_cars_hyderabad_clean_fixed.csv and .xlsx
# Requirements: selenium, webdriver-manager, beautifulsoup4, pandas, openpyxl

import re, time, random, math
from urllib.parse import urljoin, urlparse
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# -------------- CONFIG --------------
START_URL = "https://www.cardekho.com/used-cars+in+nagpur"
OUTPUT_CSV = "cardekho_used_cars_nagpur_clean_fixed.csv"
OUTPUT_XLSX = "cardekho_used_cars_nagpur_clean_fixed.xlsx"
HEADLESS = True               # set False to see browser for debugging
MAX_PAGES_OVERRIDE = None     # set an int to force more pages, else auto-detect
MAX_SCROLL_ROUNDS = 60        # number of scroll attempts per page (increase to load more)
SCROLL_PAUSE = 0.6            # seconds between scrolls
PAGE_PAUSE = (0.8, 1.6)       # jitter after loading a page
# Basic brand list to split brand/model (optional)
BRANDS = ["Maruti","Hyundai","Tata","Honda","Toyota","Mahindra","Kia","BMW","Audi","Mercedes-Benz",
          "Renault","MG","Skoda","Volkswagen","Ford","Nissan","Jeep","Volvo","Land Rover","Jaguar",
          "Isuzu","Datsun","Chevrolet","Opel"]
# -------------------------------------

# helpers
def guess_brand_and_model(title):
    if not title: return "", ""
    for b in BRANDS:
        if b.lower() in title.lower():
            brand = b
            model = re.sub(re.escape(b), "", title, flags=re.IGNORECASE).strip()
            model = re.sub(r'^[\-\:\–\—\s]+','', model)
            if not model:
                model = title
            return brand, model
    parts = title.split()
    return (parts[0], " ".join(parts[1:])) if parts else ("","")

def clean_kms(k):
    if not k: return ""
    return re.sub(r'[^\d\.]', '', str(k))

def find_rupee(text):
    m = re.search(r'₹\s*[\d\.,\sA-Za-z]+', text)
    return m.group(0).strip() if m else ""

def find_kms(text):
    m = re.search(r'([\d,\.]+)\s*(?:kms|km)\b', text, flags=re.I)
    return m.group(1).replace(",","") if m else ""

def find_fuel(text):
    for f in ["Petrol","Diesel","CNG","LPG","Electric","Hybrid"]:
        if re.search(r'\b' + re.escape(f) + r'\b', text, flags=re.I):
            return f
    return ""

def find_year(text):
    m = re.search(r'\b(19|20)\d{2}\b', text)
    return m.group(0) if m else ""

# NEW: mileage extraction helper (looks for kmpl / km/kg / km/kWh / etc.)
def extract_mileage(text):
    if not text:
        return ""
    txt = text.replace("\xa0"," ").strip()
    # common patterns like "18.5 kmpl", "22 km/kg", "120 km/kWh"
    m = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|km/kg|km/kwh|km/l|kpl|km)', txt, flags=re.I)
    if m:
        val = m.group(1)
        unit = m.group(2)
        return f"{val} {unit}".strip()
    # sometimes "Mileage: 18.5 kmpl" or "18.5kmpl" or "ARAI mileage 18 kmpl"
    m2 = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|kpl|km/kg|km/kwh|km/l)\b', txt, flags=re.I)
    if m2:
        return f"{m2.group(1)} {m2.group(2)}"
    # also check "Mileage - 18.5" followed by unit nearby
    m3 = re.search(r'Mileage[:\-\s]*([\d]{1,3}(?:\.\d+)?)', txt, flags=re.I)
    if m3:
        # try to find unit near the number
        after = txt[m3.end(): m3.end()+12]
        u = re.search(r'(kmpl|kpl|km/kg|km/kwh|km/l)', after, flags=re.I)
        if u:
            return f"{m3.group(1)} {u.group(1)}"
        return m3.group(1)
    return ""

# Setup Selenium
opts = Options()
if HEADLESS:
    opts.add_argument("--headless=new")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--window-size=1920,1080")
opts.add_argument("--disable-gpu")
# avoid automation flags where possible (helps some sites)
opts.add_argument("--disable-blink-features=AutomationControlled")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=opts)

try:
    driver.get(START_URL)
    time.sleep(2.0)

    # detect total listings/pages (best-effort)
    soup0 = BeautifulSoup(driver.page_source, "html.parser")
    text0 = soup0.get_text(" ", strip=True)
    total_listings = None
    m = re.search(r'([\d,]{2,})\s+Second Hand Cars in Nagpur', text0, flags=re.I)
    if not m:
        m = re.search(r'of\s+([\d,]+)\s+results', text0, flags=re.I)
    if m:
        total_listings = int(m.group(1).replace(",", ""))
    per_page_guess = max(1, len(soup0.find_all("h3")))
    estimated_pages = math.ceil(total_listings / per_page_guess) if total_listings else None
    total_pages = int(estimated_pages) if estimated_pages else 200
    if MAX_PAGES_OVERRIDE:
        total_pages = MAX_PAGES_OVERRIDE
    print(f"Detected total_listings={total_listings}, per_page_guess={per_page_guess}, total_pages_to_try={total_pages}")

    # --- collect all listing card containers (and their detail links where present) ---
    detail_links = []
    cards_collected = []
    seen_links = set()
    seen_keys = set()  # dedupe by title+price

    for p in range(1, total_pages + 1):
        page_url = START_URL.rstrip("/") + "?page=" + str(p)
        try:
            driver.get(page_url)
        except Exception:
            time.sleep(1.0)
            driver.get(page_url)
        # aggressively scroll to trigger lazy-load
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_round = 0
        while scroll_round < MAX_SCROLL_ROUNDS:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE + random.random()*0.3)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # small wiggle to force load
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight-400);")
                time.sleep(0.4)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.4)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
            last_height = new_height
            scroll_round += 1

        # parse page to find listing *cards*
        page_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Heuristic: cards often contain a title (h3), price (₹) and kms text.
        # We'll find all h3/title nodes and then locate the nearest card container around them.
        titles = page_soup.find_all("h3")
        for h in titles:
            title = h.get_text(" ", strip=True)
            if not title:
                continue

            # climb up parents to find a card-like container (max 6 levels)
            parent = h
            container_text = ""
            detail_href = ""
            for _ in range(6):
                if parent is None:
                    break
                # gather text
                txt = parent.get_text(" ", strip=True)
                if "₹" in txt or re.search(r'\b\d+\s*(?:kms|km)\b', txt, flags=re.I):
                    container_text = txt
                    # also try to find detail link inside this parent container
                    a = parent.find("a", href=True)
                    if a:
                        href = a["href"]
                        abs_href = href if "cardekho.com" in href else urljoin(page_url, href)
                        detail_href = abs_href.split("#")[0].split("?utm")[0]
                    break
                parent = parent.parent

            # fallback: if no container_text found, use h.get_text plus parent.get_text
            if not container_text:
                parent = h.parent
                container_text = parent.get_text(" ", strip=True) if parent else h.get_text(" ", strip=True)

            # ensure it's likely a listing: should have price or kms or both
            if ("₹" not in container_text) and (re.search(r'\b\d+\s*(?:kms|km)\b', container_text, flags=re.I) is None):
                continue

            price = find_rupee(container_text)
            kms = find_kms(container_text)
            fuel = find_fuel(container_text)
            year = find_year(title) or find_year(container_text)
            mileage = extract_mileage(container_text)  # <-- extract mileage from card text

            brand, model = guess_brand_and_model(title)

            # dedupe key
            unique_key = (title + "||" + (price or "")).strip()
            if unique_key in seen_keys:
                continue
            seen_keys.add(unique_key)

            if detail_href and detail_href not in seen_links:
                seen_links.add(detail_href)
            # append raw row (text-based) including mileage
            cards_collected.append({
                "Car_name": title,
                "brand": brand,
                "model": model,
                "kms_driven": kms.replace(",",""),
                "fuel_type": fuel,
                "year_of_manufacture": year,
                "price": price,
                "mileage": mileage,
                "detail_page": detail_href,
                "page": p
            })

        # small pause
        time.sleep(random.uniform(*PAGE_PAUSE))

        # early stop if we've collected at least detected total_listings
        if total_listings and len(seen_keys) >= total_listings:
            print("Collected detected total_listings, stopping page scan.")
            break

        # small progress print every 10 pages
        if p % 10 == 0:
            print(f"Scanned page {p}; collected cards so far: {len(cards_collected)}")

    print(f"Initial collection done: {len(cards_collected)} card rows, detail links discovered: {len(seen_links)}")

    # If detail links exist, visit each detail page to extract more reliable fields (optional but recommended)
    # We'll visit only pages that either lack kms/fuel/year/price/mileage to improve data quality.
    # This block is slower; set visit_details=False to skip.
    visit_details = True
    improved_rows = []
    visited = 0

    if visit_details and len(seen_links) > 0:
        for idx, row in enumerate(cards_collected):
            # decide whether to open detail page: if any of main fields missing or no detail link present
            need_detail = False
            if not row["kms_driven"] or not row["fuel_type"] or not row["price"] or not row["year_of_manufacture"] or not row.get("mileage"):
                need_detail = True
            if row["detail_page"]:
                detail_url = row["detail_page"]
            else:
                detail_url = None
            if not need_detail and detail_url:
                # keep as is
                improved_rows.append(row)
                continue

            if detail_url:
                try:
                    # open detail page
                    driver.get(detail_url)
                    # wait short while
                    time.sleep(1.0 + random.random()*0.8)
                    dsoup = BeautifulSoup(driver.page_source, "html.parser")
                    page_text = dsoup.get_text(" ", strip=True)

                    # Title from h1/h2 if present
                    ttag = dsoup.find(["h1","h2"])
                    if ttag:
                        title_det = ttag.get_text(" ", strip=True)
                        if title_det:
                            row["Car_name"] = title_det
                            brand, model = guess_brand_and_model(title_det)
                            row["brand"] = brand
                            row["model"] = model

                    # Try to find labeled values first (reliable)
                    # Kms (look for label 'Kms Driven' or 'Kms')
                    label_kms = dsoup.find(text=re.compile(r'Kms\s*Driven|Kms|Odometer', flags=re.I))
                    if label_kms:
                        try:
                            val = label_kms.parent.find_next_sibling()
                            if val:
                                k = re.sub(r'[^\d]', '', val.get_text(" ", strip=True))
                                if k:
                                    row["kms_driven"] = k
                        except:
                            pass
                    # fallback to page text
                    if not row["kms_driven"]:
                        kf = find_kms(page_text)
                        if kf:
                            row["kms_driven"] = kf

                    # fuel type label
                    label_fuel = dsoup.find(text=re.compile(r'Fuel\s*Type|Fuel', flags=re.I))
                    if label_fuel:
                        try:
                            val = label_fuel.parent.find_next_sibling()
                            if val:
                                row["fuel_type"] = val.get_text(" ", strip=True)
                        except:
                            pass
                    if not row["fuel_type"]:
                        ff = find_fuel(page_text)
                        if ff:
                            row["fuel_type"] = ff

                    # year
                    label_year = dsoup.find(text=re.compile(r'Year\s*of\s*Manufacture|Registration\s*Year|Year', flags=re.I))
                    if label_year:
                        try:
                            val = label_year.parent.find_next_sibling()
                            if val:
                                yy = find_year(val.get_text(" ", strip=True))
                                if yy:
                                    row["year_of_manufacture"] = yy
                        except:
                            pass
                    if not row["year_of_manufacture"]:
                        yy = find_year(row["Car_name"]) or find_year(page_text)
                        if yy:
                            row["year_of_manufacture"] = yy

                    # price
                    pr = find_rupee(page_text)
                    if pr:
                        row["price"] = pr

                    # MILEAGE extraction on detail page: labeled field or meta-line or page-wide fallback
                    mileage_val = ""
                    # 1) labeled field 'Mileage' or 'Avg. Mileage'
                    mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
                    if mnode:
                        try:
                            parent = mnode.parent
                            sib = parent.find_next_sibling()
                            if sib:
                                mileage_val = sib.get_text(" ", strip=True)
                        except:
                            mileage_val = ""
                    # 2) try meta-line near title
                    if not mileage_val and ttag:
                        nxt = ttag.find_next()
                        checks = 0
                        while nxt and checks < 8:
                            txt = nxt.get_text(" ", strip=True)
                            if txt and (("kms" in txt.lower()) or re.search(r'\b\d+\s*(?:kmpl|km/kg|km/kwh|km/l|kpl)\b', txt, flags=re.I)):
                                mileage_val = txt
                                break
                            nxt = nxt.find_next()
                            checks += 1
                    # 3) fallback to page_text extraction
                    if not mileage_val:
                        mileage_val = extract_mileage(page_text)

                    # normalize mileage
                    if mileage_val:
                        row["mileage"] = mileage_val

                except Exception as e:
                    # if detail fetch fails, keep earlier extracted values
                    pass

                # tiny sleep between detail visits
                time.sleep(random.uniform(0.35, 0.9))
            improved_rows.append(row)
            visited += 1

            # checkpoint: save every 200 detail pages processed
            if visited % 200 == 0:
                df_ck = pd.DataFrame(improved_rows)
                df_ck = df_ck.drop_duplicates(subset=["detail_page","Car_name","price"])
                df_ck.to_csv(OUTPUT_CSV, index=False)
                df_ck.to_excel(OUTPUT_XLSX, index=False)
                print(f"Checkpoint saved after {visited} detail visits: {len(df_ck)} rows.")

    else:
        improved_rows = cards_collected

    # Final cleaning + dedupe
    df = pd.DataFrame(improved_rows)
    # normalize strings and numeric kms
    for c in ["Car_name","brand","model","fuel_type","price","detail_page","mileage"]:
        if c in df.columns:
            df[c] = df[c].fillna("").astype(str).str.strip()
    df["kms_driven"] = df["kms_driven"].apply(lambda x: clean_kms(x) if x else "")
    df["year_of_manufacture"] = df["year_of_manufacture"].apply(lambda x: find_year(str(x)) if x else "")

    # dedupe by detail_page if present else by Car_name+price
    if "detail_page" in df.columns and df["detail_page"].str.len().sum() > 0:
        df = df.drop_duplicates(subset=["detail_page"]).reset_index(drop=True)
    else:
        df = df.drop_duplicates(subset=["Car_name","price"]).reset_index(drop=True)

    # Save
    df.to_csv(OUTPUT_CSV, index=False)
    df.to_excel(OUTPUT_XLSX, index=False)
    print(f"Saved cleaned output: {OUTPUT_CSV} and {OUTPUT_XLSX}")
    print("Total rows collected:", len(df))
    display(df.head(40))

finally:
    try:
        driver.quit()
    except:
        pass


Detected total_listings=603, per_page_guess=64, total_pages_to_try=10
Scanned page 10; collected cards so far: 535
Initial collection done: 535 card rows, detail links discovered: 518


  label_kms = dsoup.find(text=re.compile(r'Kms\s*Driven|Kms|Odometer', flags=re.I))
  label_fuel = dsoup.find(text=re.compile(r'Fuel\s*Type|Fuel', flags=re.I))
  label_year = dsoup.find(text=re.compile(r'Year\s*of\s*Manufacture|Registration\s*Year|Year', flags=re.I))
  mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))


Checkpoint saved after 200 detail visits: 196 rows.
Checkpoint saved after 400 detail visits: 396 rows.
Saved cleaned output: cardekho_used_cars_nagpur_clean_fixed.csv and cardekho_used_cars_nagpur_clean_fixed.xlsx
Total rows collected: 519


Unnamed: 0,Car_name,brand,model,kms_driven,fuel_type,year_of_manufacture,price,mileage,detail_page,page
0,2015 Maruti Swift VXI,Maruti,2015 Swift VXI,2015425201520142021470000151324148839646561565,"{""@context"":""https://schema.org"",""@type"":""WebP...",2015,₹4.25 Lakh,"47,000 kms • Petrol • Manual • 2nd Owner",https://www.cardekho.com/used-car-details/used...,1
1,2023 MG Astor Super MT,MG,2023 Astor Super MT,202314987552023147007557916718429285435931,"{""@context"":""https://schema.org"",""@type"":""WebP...",2023,₹7.55 Lakh with Less Driven,"14,725 kms • Petrol • Manual • 1st Owner",https://www.cardekho.com/buy-used-car-details/...,1
2,2018 Hyundai Xcent 1.2 VTVT S,Hyundai,2018 Xcent 1.2 VTVT S,2025119743020251251000430126442497385495362216,"{""@context"":""https://schema.org"",""@type"":""WebP...",2018,₹4.30 Lakh with Almost New,"51,000 kms • Petrol • Manual • 1st Owner",https://www.cardekho.com/used-car-details/used...,1
3,57 Used Cars Under 2 Lakh in Nagpur,57,Used Cars Under 2 Lakh in Nagpur,572202,"{""@context"":""https://schema.org"",""@type"":""WebP...",2012,₹1 Lakh,57 used cars Under 2 Lakh are available for sa...,https://www.cardekho.com/used-cars+0-lakh-to-2...,1
4,81 Used Cars Under 3 Lakh in Nagpur,81,Used Cars Under 3 Lakh in Nagpur,813323,"{""@context"":""https://schema.org"",""@type"":""WebP...",2015,₹2.89 Lakh,81 used cars Under 3 Lakh are available for sa...,https://www.cardekho.com/used-cars+2-lakh-to-3...,1
5,228 Used Cars Under 5 Lakh in Nagpur,228,Used Cars Under 5 Lakh in Nagpur,2285535,"{""@context"":""https://schema.org"",""@type"":""WebP...",2017,₹3.25 Lakh,228 used cars Under 5 Lakh are available for s...,https://www.cardekho.com/used-cars+3-lakh-to-5...,1
6,135 Used Cars Under 8 Lakh in Nagpur,135,Used Cars Under 8 Lakh in Nagpur,1358858,"{""@context"":""https://schema.org"",""@type"":""WebP...",2023,₹6.95 Lakh,135 used cars Under 8 Lakh are available for s...,https://www.cardekho.com/used-cars+5-lakh-to-8...,1
7,34 Used Cars Under 10 Lakh in Nagpur,34,Used Cars Under 10 Lakh in Nagpur,341010810,"{""@context"":""https://schema.org"",""@type"":""WebP...",2021,₹8.50 Lakh,34 used cars Under 10 Lakh are available for s...,https://www.cardekho.com/used-cars+8-lakh-to-1...,1
8,68 Used Cars Above 10 Lakh in Nagpur,68,Used Cars Above 10 Lakh in Nagpur,681010105,"{""@context"":""https://schema.org"",""@type"":""WebP...",2021,₹12 Lakh,68 used cars Above 10 Lakh are available for s...,https://www.cardekho.com/used-cars+10-lakh-to-...,1
9,2014 Ford Ecosport 1.5 DV5 MT Titanium Optional,Ford,2014 Ecosport 1.5 DV5 MT Titanium Optional,2014370201420132015155120000155684875414007812...,"{""@context"":""https://schema.org"",""@type"":""WebP...",2014,₹3.70 Lakh,"1,20,000 kms • Diesel • Manual • 3rd Owner",https://www.cardekho.com/used-car-details/used...,1


4th Stage of Scrap

In [8]:
# Jupyter cell: Repair mileage & transmission for existing Cardekho file
# - Loads /mnt/data/cardekho_used_cars_hyderabad_price_fixed.xlsx
# - Visits detail_page for rows missing/invalid mileage or transmission
# - Writes back cleaned file (CSV + XLSX)
# Requirements: selenium, webdriver-manager, beautifulsoup4, pandas, openpyxl

import re, time, random, os
from urllib.parse import urljoin
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ---------- CONFIG ----------
INPUT_XLSX = "cardekho_used_cars_nagpur_price_fixed.xlsx"
OUTPUT_XLSX = "cardekho_used_cars_nagpur_price_fixed_cleaned.xlsx"
OUTPUT_CSV = "cardekho_used_cars_nagpur_price_fixed_cleaned.csv"

HEADLESS = True                # set False to watch browser
DELAY_BETWEEN = (0.6, 1.2)     # polite per-page pause
CHECKPOINT_EVERY = 50          # save every N updated rows
MAX_RETRIES = 1
# --------------------------------

if not os.path.exists(INPUT_XLSX):
    raise FileNotFoundError(f"Input file not found: {INPUT_XLSX}. Put your file at this path and re-run.")

# --- utility functions ---
def text_of(elem):
    return elem.get_text(" ", strip=True) if elem else ""

def normalize_mileage(raw):
    if not raw:
        return ""
    s = str(raw).strip()
    s = s.replace("\xa0"," ").replace("\n"," ").strip()
    # try to capture number + unit (common)
    m = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|kpl|km/kg|km/kwh|km/l|km|mpg)', s, flags=re.I)
    if m:
        val = m.group(1)
        unit = m.group(2).lower()
        # normalize unit names
        unit = unit.replace("kpl","kmpl").replace("km/l","kmpl")
        return f"{val} {unit}"
    # tries like "18.5" then search for unit nearby
    m2 = re.search(r'([\d]{1,3}(?:\.\d+)?)', s)
    if m2:
        # if no unit, just return number
        return m2.group(1)
    return s

def normalize_transmission(raw):
    if not raw:
        return ""
    s = str(raw)
    for t in ["AMT","CVT","DCT","AT","MT","Manual","Automatic"]:
        if re.search(r'\b' + re.escape(t) + r'\b', s, flags=re.I):
            # canonicalize
            if t.upper() in ("AMT","CVT","DCT","AT","MT"):
                return t.upper()
            return t.title()
    # last resort: find words
    if re.search(r'\bmanual\b', s, flags=re.I):
        return "Manual"
    if re.search(r'\bautomatic\b', s, flags=re.I):
        return "Automatic"
    return s.strip()

def extract_mileage_from_text(text):
    if not text:
        return ""
    # 1) direct patterns e.g. "18.5 kmpl"
    m = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|kpl|km/kg|km/kwh|km/l|km|mpg)\b', text, flags=re.I)
    if m:
        unit = m.group(2).lower().replace("kpl","kmpl").replace("km/l","kmpl")
        return f"{m.group(1)} {unit}"
    # 2) near 'mileage' keyword: capture window +/- 60 chars
    for keyword in ["mileage","avg. mileage","avg mileage","city mileage","claimed mileage","average mileage"]:
        idx = text.lower().find(keyword)
        if idx != -1:
            start = max(0, idx-60)
            end = min(len(text), idx+80)
            ctx = text[start:end]
            m2 = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|kpl|km/kg|km/kwh|km/l|km|mpg)\b', ctx, flags=re.I)
            if m2:
                unit = m2.group(2).lower().replace("kpl","kmpl").replace("km/l","kmpl")
                return f"{m2.group(1)} {unit}"
            # number only
            m3 = re.search(r'([\d]{1,3}(?:\.\d+)?)', ctx)
            if m3:
                return m3.group(1)
    # 3) any number+unit elsewhere
    m4 = re.search(r'([\d]{1,3}(?:\.\d+)?)\s*(kmpl|kpl|km/kg|km/kwh|km/l|km|mpg)\b', text, flags=re.I)
    if m4:
        unit = m4.group(2).lower().replace("kpl","kmpl").replace("km/l","kmpl")
        return f"{m4.group(1)} {unit}"
    return ""

def extract_trans_from_text(text):
    if not text:
        return ""
    # look for label/context words
    for t in ["Manual","Automatic","AMT","CVT","DCT","AT","MT"]:
        if re.search(r'\b' + re.escape(t) + r'\b', text, flags=re.I):
            return normalize_transmission(t)
    # also search near keywords 'transmission' or 'gearbox'
    idx = text.lower().find("transmission")
    if idx == -1:
        idx = text.lower().find("gearbox")
    if idx != -1:
        start = max(0, idx-40)
        end = min(len(text), idx+80)
        ctx = text[start:end]
        for t in ["Manual","Automatic","AMT","CVT","DCT","AT","MT"]:
            if re.search(r'\b' + re.escape(t) + r'\b', ctx, flags=re.I):
                return normalize_transmission(t)
        # fallback to any word in ctx
        m = re.search(r'\b(Manual|Automatic|AMT|CVT|DCT|AT|MT)\b', ctx, flags=re.I)
        if m:
            return normalize_transmission(m.group(1))
    return ""

# ----------------- load dataset -----------------
df = pd.read_excel(INPUT_XLSX)
print(f"Loaded {len(df)} rows from {INPUT_XLSX}")

# identify rows that need fixing:
# Criteria: mileage empty OR transmission empty OR mileage looks like URL/junk (contains 'http' or '/')
def mileage_is_bad(val):
    if not val or str(val).strip() == "":
        return True
    s = str(val).lower()
    if "http" in s or "/" in s and len(s) > 10:   # simplistic junk heuristics
        return True
    # if it's non-numeric and non-unit, mark for check
    if not re.search(r'\d', s):
        return True
    return False

def transmission_is_bad(val):
    if not val or str(val).strip() == "":
        return True
    s = str(val)
    if re.search(r'http|\/', s):
        return True
    # ok if contains known token
    if re.search(r'\b(Manual|Automatic|AMT|CVT|DCT|AT|MT)\b', s, flags=re.I):
        return False
    return False  # conservative: if present assume ok

# build list of indices to fix
to_fix_idx = []
for i, row in df.iterrows():
    mismatch = False
    if mileage_is_bad(row.get("mileage", "")):
        mismatch = True
    if transmission_is_bad(row.get("transmission", "")):
        mismatch = True
    # we will only attempt to fix those that have a valid detail_page URL
    if mismatch and row.get("detail_page"):
        to_fix_idx.append(i)

print(f"Rows flagged for repair: {len(to_fix_idx)} (will visit detail pages)")

if len(to_fix_idx) == 0:
    print("No rows need fixing. Exiting.")
else:
    # Setup Selenium (headful or headless)
    opts = Options()
    if HEADLESS:
        opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1200,900")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

    updated = 0
    try:
        for batch_i, idx in enumerate(to_fix_idx, start=1):
            row = df.loc[idx]
            url = str(row.get("detail_page")).strip()
            if not url:
                continue
            # polite jitter
            time.sleep(random.uniform(*DELAY_BETWEEN))
            # fetch page
            success = False
            for attempt in range(MAX_RETRIES+1):
                try:
                    driver.get(url)
                    # let JS run and content load
                    time.sleep(1.0 + random.random()*0.8)
                    page_soup = BeautifulSoup(driver.page_source, "html.parser")
                    page_text = page_soup.get_text(" ", strip=True)
                    success = True
                    break
                except Exception as e:
                    if attempt < MAX_RETRIES:
                        time.sleep(0.6)
                        continue
                    else:
                        success = False
            if not success:
                print(f"[{batch_i}/{len(to_fix_idx)}] Failed to load: {url}")
                continue

            # MULTIPLE extraction strategies, most-specific -> fallback
            new_mileage = ""
            new_trans = ""

            # Strategy A: labeled dt/dd or table tr (common structure)
            # dt/dd
            try:
                dts = page_soup.find_all("dt")
                if dts:
                    for dt in dts:
                        label = text_of(dt).lower()
                        if "mile" in label:
                            dd = dt.find_next_sibling("dd")
                            if dd:
                                cand = text_of(dd)
                                if cand:
                                    new_mileage = extract_mileage_from_text(cand) or normalize_mileage(cand)
                        if "trans" in label or "gear" in label:
                            dd = dt.find_next_sibling("dd")
                            if dd:
                                new_trans = extract_trans_from_text(text_of(dd)) or normalize_transmission(text_of(dd))
            except Exception:
                pass

            # Strategy B: table rows <tr><th>Label</th><td>Value</td>
            if not new_mileage or not new_trans:
                try:
                    for tr in page_soup.find_all("tr"):
                        th = tr.find(["th","td"])
                        tlabel = text_of(th).lower() if th else ""
                        tvals = [text_of(x) for x in tr.find_all("td")]
                        tval = tvals[0] if tvals else ""
                        if not new_mileage and ("mileage" in tlabel or "avg" in tlabel and "mileage" in tlabel):
                            new_mileage = extract_mileage_from_text(tval) or normalize_mileage(tval)
                        if not new_trans and ("transmission" in tlabel or "gearbox" in tlabel or "gear" in tlabel):
                            new_trans = extract_trans_from_text(tval) or normalize_transmission(tval)
                except Exception:
                    pass

            # Strategy C: look for elements with class/id containing keywords
            if not new_mileage:
                try:
                    mileage_nodes = page_soup.find_all(attrs={"class": re.compile(r"mile|mileage|avg-mile|avgMileage", flags=re.I)})
                    for n in mileage_nodes:
                        cand = text_of(n)
                        cand_val = extract_mileage_from_text(cand) or normalize_mileage(cand)
                        if cand_val:
                            new_mileage = cand_val
                            break
                except Exception:
                    pass
            if not new_trans:
                try:
                    trans_nodes = page_soup.find_all(attrs={"class": re.compile(r"trans|gear|gearbox", flags=re.I)})
                    for n in trans_nodes:
                        cand = text_of(n)
                        cand_val = extract_trans_from_text(cand) or normalize_transmission(cand)
                        if cand_val:
                            new_trans = cand_val
                            break
                except Exception:
                    pass

            # Strategy D: meta / data- attributes (rare)
            if not new_mileage:
                try:
                    mtag = page_soup.find("meta", attrs={"name": re.compile(r"mileage", flags=re.I)})
                    if mtag and mtag.get("content"):
                        cand = mtag.get("content")
                        new_mileage = extract_mileage_from_text(cand) or normalize_mileage(cand)
                except Exception:
                    pass

            # Strategy E: near-title meta-line (many Cardekho detail pages show compact specs after title)
            if not new_mileage or not new_trans:
                try:
                    htag = page_soup.find(["h1","h2"])
                    if htag:
                        nxt = htag.find_next()
                        checks = 0
                        while nxt and checks < 10:
                            txt = text_of(nxt)
                            if txt and (("kms" in txt.lower()) or ("kmpl" in txt.lower()) or ("mileage" in txt.lower()) or ("transmission" in txt.lower()) or ("gear" in txt.lower())):
                                if not new_mileage:
                                    new_mileage = extract_mileage_from_text(txt) or normalize_mileage(txt)
                                if not new_trans:
                                    new_trans = extract_trans_from_text(txt) or normalize_transmission(txt)
                                break
                            nxt = nxt.find_next()
                            checks += 1
                except Exception:
                    pass

            # Strategy F: page-wide regex fallback (last resort)
            if not new_mileage:
                new_mileage = extract_mileage_from_text(page_text)
            if not new_trans:
                new_trans = extract_trans_from_text(page_text)

            # Final normalization
            new_mileage = normalize_mileage(new_mileage)
            new_trans = normalize_transmission(new_trans)

            # If still empty, try card-level existing values as fallback (do not overwrite good existing)
            existing_mileage = df.at[idx, "mileage"] if "mileage" in df.columns else ""
            existing_trans = df.at[idx, "transmission"] if "transmission" in df.columns else ""
            if not new_mileage and existing_mileage and not mileage_is_bad(existing_mileage):
                new_mileage = existing_mileage
            if not new_trans and existing_trans and existing_trans.strip():
                new_trans = existing_trans

            # write back if changed
            changed = False
            if new_mileage and (str(df.at[idx, "mileage"]) != new_mileage):
                df.at[idx, "mileage"] = new_mileage
                changed = True
            if new_trans and (str(df.at[idx, "transmission"]) != new_trans):
                df.at[idx, "transmission"] = new_trans
                changed = True

            if changed:
                updated += 1
                print(f"[{batch_i}/{len(to_fix_idx)}] Updated idx={idx}: mileage='{new_mileage}' transmission='{new_trans}'")
            else:
                print(f"[{batch_i}/{len(to_fix_idx)}] No new data for idx={idx}")

            # checkpointing
            if updated and updated % CHECKPOINT_EVERY == 0:
                df.to_csv(OUTPUT_CSV, index=False)
                df.to_excel(OUTPUT_XLSX, index=False)
                print(f"Checkpoint saved after {updated} updates.")

    finally:
        try:
            driver.quit()
        except:
            pass

    # final save
    df.to_csv(OUTPUT_CSV, index=False)
    df.to_excel(OUTPUT_XLSX, index=False)
    print(f"Done. Updated {updated} rows. Saved cleaned file to:\n - {OUTPUT_XLSX}\n - {OUTPUT_CSV}")


Loaded 517 rows from cardekho_used_cars_nagpur_price_fixed.xlsx
Rows flagged for repair: 41 (will visit detail pages)
[1/41] Updated idx=0: mileage='000 km' transmission='Manual'
[2/41] Updated idx=1: mileage='725 km' transmission='Manual'
[3/41] Updated idx=2: mileage='765 km' transmission='Manual'
[4/41] Updated idx=3: mileage='57' transmission='AT'
[5/41] Updated idx=4: mileage='80' transmission='AT'
[6/41] Updated idx=5: mileage='228' transmission='AT'
[7/41] Updated idx=6: mileage='135' transmission='AT'
[8/41] Updated idx=7: mileage='34' transmission='AT'
[9/41] Updated idx=8: mileage='68' transmission='AT'
[10/41] Updated idx=9: mileage='552 km' transmission='Manual'
[11/41] Updated idx=10: mileage='000 km' transmission='Manual'
[12/41] Updated idx=11: mileage='000 km' transmission='Automatic'
[13/41] Updated idx=12: mileage='000 km' transmission='Manual'
[14/41] Updated idx=13: mileage='800 km' transmission='Manual'
[15/41] Updated idx=14: mileage='000 km' transmission='Manual'

5th Stage of Scrap

In [1]:
#!/usr/bin/env python3
"""
cardekho_with_price_fix.py

Improved scraper for CarDekho used cars (Hyderabad) with robust price extraction.
Only mileage logic improved — rest unchanged.
Requires: selenium, webdriver-manager, beautifulsoup4, pandas, openpyxl
"""

import re
import time
import random
import math
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# ---------------------------
# Path to uploaded file (local). Use this path if you want to read the previously saved file.
# Downstream systems can convert this local path into a downloadable URL if needed.
UPLOADED_FILE_PATH = "cardekho_used_cars_nagpur_price_fixed_cleaned.xlsx"
# ---------------------------

# -------- CONFIG ----------
START_URL = "https://www.cardekho.com/used-cars+in+nagpur"
OUTPUT_CSV = "cardekho_used_cars_nagpur_price_fixed.csv"
OUTPUT_XLSX = "cardekho_used_cars_nagpur_price_fixed.xlsx"

HEADLESS = True
MAX_PAGES_OVERRIDE = None
MAX_SCROLLS = 40
SCROLL_PAUSE = 0.7
PAGE_PAUSE = (0.6, 1.4)
VISIT_DETAIL_PAGES = True
DETAIL_PAUSE = (0.6, 1.2)
MAX_DETAIL_RETRIES = 1

BRANDS = [
    "Maruti", "Hyundai", "Tata", "Honda", "Toyota", "Mahindra", "Kia",
    "BMW", "Audi", "Mercedes-Benz", "Mercedes", "Renault", "MG", "Skoda",
    "Volkswagen", "Ford", "Nissan", "Jeep", "Volvo", "Land Rover", "Jaguar",
    "Isuzu", "Datsun", "Chevrolet", "Opel"
]
# --------------------------

def text_of(elem):
    return elem.get_text(" ", strip=True) if elem else ""

def guess_brand_and_model(name):
    if not name:
        return "", ""
    for b in BRANDS:
        if b.lower() in name.lower():
            model = re.sub(re.escape(b), "", name, flags=re.IGNORECASE).strip()
            model = re.sub(r'^[\-\:\–\—\s]+', '', model)
            return b, model or name
    parts = name.split()
    return (parts[0], " ".join(parts[1:]) if len(parts) > 1 else "") if parts else ("","")

def extract_kms(text):
    m = re.search(r'([\d,\.]+)\s*(?:kms|km)\b', text, flags=re.I)
    if m:
        return m.group(1).replace(",", "")
    return ""

def extract_fuel(text):
    for f in ["Petrol", "Diesel", "CNG", "LPG", "Electric", "Hybrid"]:
        if re.search(r'\b' + re.escape(f) + r'\b', text, flags=re.I):
            return f
    return ""

# ----------------------
# IMPROVED MILEAGE LOGIC
# ----------------------

# convert mpg to kmpl factor
_MPG_TO_KMPL = 0.425144

def _try_parse_number(s):
    """Return float or None for first numeric group found."""
    if not s:
        return None
    s2 = str(s).replace(",", "").replace("\xa0"," ").strip()
    m = re.search(r'([0-9]{1,3}(?:\.[0-9]+)?)', s2)
    if not m:
        return None
    try:
        return float(m.group(1))
    except:
        return None

def extract_mileage(text):
    """
    Robust mileage extraction that:
     - prefers explicit units (kmpl, km/l, kpl)
     - converts mpg -> kmpl
     - ignores pure distance values like '120 km' (treat as invalid)
     - returns standardized string like '18.5 kmpl' or '' if not found
    """
    if not text:
        return ""
    s = str(text).replace("\xa0"," ").strip()
    low = s.lower()

    # 1) explicit kmpl / km/l / kpl patterns
    m = re.search(r'([0-9]{1,3}(?:\.[0-9]+)?)\s*(km\s*/\s*l|kmpl|kpl|kmperlitre|km per litre)\b', low, flags=re.I)
    if m:
        num = float(m.group(1))
        # format number: remove .0 if integer else keep up to 2 decimals
        num_fmt = int(num) if num.is_integer() else round(num, 2)
        return f"{num_fmt} kmpl"

    # 2) mpg -> convert to kmpl
    m_mpg = re.search(r'([0-9]{1,3}(?:\.[0-9]+)?)\s*mpg\b', low, flags=re.I)
    if m_mpg:
        mpg = float(m_mpg.group(1))
        kmpl = round(mpg * _MPG_TO_KMPL, 2)
        kmpl_fmt = int(kmpl) if float(kmpl).is_integer() else kmpl
        return f"{kmpl_fmt} kmpl"

    # 3) patterns like "18.5 kmpl" without spaces or with units in mixed-case
    m2 = re.search(r'([0-9]{1,3}(?:\.[0-9]+)?)\s*(kmpl|kpl|km/l)\b', low, flags=re.I)
    if m2:
        num = float(m2.group(1))
        num_fmt = int(num) if num.is_integer() else round(num, 2)
        return f"{num_fmt} kmpl"

    # 4) look for 'mileage' keyword and numbers near it
    for keyword in ("mileage", "avg. mileage", "avg mileage", "claimed mileage", "claimed fuel economy"):
        idx = low.find(keyword)
        if idx != -1:
            window = low[max(0, idx-50): idx+80]
            m3 = re.search(r'([0-9]{1,3}(?:\.[0-9]+)?)\s*(kmpl|kpl|km/l|mpg)?', window, flags=re.I)
            if m3:
                num = float(m3.group(1))
                unit = m3.group(2)
                if unit and "mpg" in unit:
                    kmpl = round(num * _MPG_TO_KMPL, 2)
                    kmpl_fmt = int(kmpl) if float(kmpl).is_integer() else kmpl
                    return f"{kmpl_fmt} kmpl"
                # if unit absent, only accept if number plausible for kmpl
                if not unit:
                    if num <= 50:  # treat as kmpl
                        num_fmt = int(num) if float(num).is_integer() else round(num, 2)
                        return f"{num_fmt} kmpl"
                else:
                    # if unit is kmpl-like handled above; fallback
                    num_fmt = int(num) if float(num).is_integer() else round(num, 2)
                    return f"{num_fmt} kmpl"

    # 5) generic number+unit elsewhere on page
    m4 = re.search(r'([0-9]{1,3}(?:\.[0-9]+)?)\s*(kmpl|kpl|km/l|mpg)\b', low, flags=re.I)
    if m4:
        val = float(m4.group(1))
        unit = m4.group(2)
        if 'mpg' in unit:
            kmpl = round(val * _MPG_TO_KMPL, 2)
            kmpl_fmt = int(kmpl) if float(kmpl).is_integer() else kmpl
            return f"{kmpl_fmt} kmpl"
        num_fmt = int(val) if val.is_integer() else round(val, 2)
        return f"{num_fmt} kmpl"

    # 6) numeric-only fallback: if page has a single small number (<50) assume kmpl
    num = _try_parse_number(s)
    if num is not None and num <= 50:
        num_fmt = int(num) if float(num).is_integer() else round(num, 2)
        return f"{num_fmt} kmpl"

    # 7) otherwise likely a distance or invalid — return empty
    return ""

# helper used by the fallback detail extraction (kept unchanged)
def extract_mileage_from_text(text):
    # reuse improved extract_mileage
    return extract_mileage(text)

def extract_transmission(text):
    if not text:
        return ""
    for t in ["Manual","Automatic","CVT","AMT","DCT","AT","MT"]:
        if re.search(r'\b' + re.escape(t) + r'\b', text, flags=re.I):
            if t.upper() in ("AT","AMT","CVT","DCT","MT"):
                return t.upper()
            return t.title()
    return ""

# ----------------------
# existing other helpers unchanged
# ----------------------

def extract_price_from_soup(soup):
    # 1. meta tags
    meta_selectors = [
        ('meta', {'property': 'og:price:amount'}),
        ('meta', {'itemprop': 'price'}),
        ('meta', {'name': 'price'}),
    ]
    for tag, attrs in meta_selectors:
        mtag = soup.find(tag, attrs=attrs)
        if mtag:
            val = mtag.get('content') or mtag.get('value') or ""
            if val:
                if re.search(r'[\d]', val) and '₹' not in val:
                    return "₹ " + val.strip()
                return val.strip()

    # 2. attributes that often store price
    for attr in ('data-price', 'data-offer-price', 'data-srp', 'data-amount', 'data-price-value'):
        el = soup.find(attrs={attr: True})
        if el:
            val = el.get(attr)
            if val:
                if re.search(r'[\d]', val) and '₹' not in val:
                    return "₹ " + val.strip()
                return val.strip()

    # 3. elements with class or id containing 'price' or 'amount'
    px = soup.find(lambda tag: tag.name in ("div","span","p","strong") and (
        tag.get("class") or tag.get("id")
    ) and re.search(r'price|amount|selling|srp|finalPrice|carPrice|actual-price', " ".join((tag.get("class") or []) + [tag.get("id") or ""]), flags=re.I))
    if px:
        txt = text_of(px)
        pr = find_rupee_in_text(txt)
        if pr:
            return pr
        if txt:
            return txt.strip()

    # 4. any visible text near top with rupee sign
    top_region = ""
    head_candidates = soup.find_all(["header", "section", "div"], limit=6)
    for c in head_candidates:
        t = text_of(c)
        if '₹' in t:
            top_region = t
            break
    if not top_region:
        top_region = soup.get_text(" ", strip=True)

    pr = find_rupee_in_text(top_region)
    if pr:
        return pr

    # 5. fallback regex on whole page (Lakh/Crore)
    p2 = re.search(r'[\d\.,]+\s*(?:Lakh|lakh|Lakhs|lakhs|Crore|crore|Cr)\b', soup.get_text(" ", strip=True))
    if p2:
        return p2.group(0).strip()

    return ""

def find_rupee_in_text(text):
    if not text:
        return ""
    m = re.search(r'₹\s*[\d\.,\sA-Za-z]+', text)
    if m:
        return m.group(0).strip()
    m2 = re.search(r'[\d\.,]+\s*(Lakh|lakh|Lakhs|lakhs|Crore|crore|Cr)\b', text)
    if m2:
        return m2.group(0).strip()
    return ""

def extract_price_from_text(text):
    p = find_rupee_in_text(text)
    return p

def extract_year(text):
    m = re.search(r'\b(19|20)\d{2}\b', text)
    return m.group(0) if m else ""

# ----------------------
# main scraping flow (unchanged)
# ----------------------

def main():
    chrome_opts = Options()
    if HEADLESS:
        chrome_opts.add_argument("--headless=new")
    chrome_opts.add_argument("--no-sandbox")
    chrome_opts.add_argument("--disable-dev-shm-usage")
    chrome_opts.add_argument("--window-size=1920,1080")
    chrome_opts.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_opts)

    try:
        driver.get(START_URL)
        time.sleep(2.0)
        soup0 = BeautifulSoup(driver.page_source, "html.parser")
        text0 = soup0.get_text(" ", strip=True)

        total_listings = None
        m = re.search(r'([\d,]{2,})\s+Second Hand Cars in Nagpur', text0, flags=re.I)
        if not m:
            m = re.search(r'of\s+([\d,]+)\s+results', text0, flags=re.I)
        if m:
            total_listings = int(m.group(1).replace(",", ""))

        per_page_guess = max(1, len(soup0.find_all("h3")))
        estimated_pages = math.ceil(total_listings / per_page_guess) if total_listings else None
        total_pages = int(estimated_pages) if estimated_pages else 200
        if MAX_PAGES_OVERRIDE:
            total_pages = MAX_PAGES_OVERRIDE

        print(f"Detected total_listings={total_listings}, per_page_guess={per_page_guess}, total_pages={total_pages}")

        rows = []
        seen_keys = set()
        detail_links = []

        for p in range(1, total_pages + 1):
            page_url = START_URL.rstrip("/") + "?page=" + str(p)
            try:
                driver.get(page_url)
            except Exception:
                time.sleep(1.0)
                driver.get(page_url)

            # scroll aggressively
            last_h = driver.execute_script("return document.body.scrollHeight")
            sc = 0
            while sc < MAX_SCROLLS:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE)
                new_h = driver.execute_script("return document.body.scrollHeight")
                if new_h == last_h:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight-400);")
                    time.sleep(0.4)
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(0.4)
                    new_h = driver.execute_script("return document.body.scrollHeight")
                    if new_h == last_h:
                        break
                last_h = new_h
                sc += 1

            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            titles = page_soup.find_all("h3")
            for h in titles:
                title = text_of(h)
                if not title:
                    continue

                container = h
                card_text = ""
                link = ""
                for _ in range(6):
                    if container is None:
                        break
                    card_text = text_of(container)
                    if "₹" in card_text or re.search(r'\b\d+\s*(?:kms|km)\b', card_text, flags=re.I):
                        a = container.find("a", href=True)
                        if a:
                            href = a["href"]
                            link = href if "cardekho.com" in href else urljoin(page_url, href)
                            link = link.split("#")[0].split("?utm")[0]
                        break
                    container = container.parent

                if not card_text:
                    parent = h.parent
                    card_text = text_of(parent) if parent else title

                if ("₹" not in card_text) and (re.search(r'\b\d+\s*(?:kms|km)\b', card_text, flags=re.I) is None):
                    continue

                price_card = extract_price_from_text(card_text)
                key = (title + "||" + (price_card or "")).strip()
                if key in seen_keys:
                    continue
                seen_keys.add(key)

                kms = extract_kms(card_text)
                fuel = extract_fuel(card_text)
                year = extract_year(title) or extract_year(card_text)
                brand = ""
                model = ""
                try:
                    brand, model = guess_brand_and_model(title)
                except:
                    pass

                rows.append({
                    "Car_name": title,
                    "brand": brand,
                    "model": model,
                    "kms_driven": kms,
                    "mileage": "",             # will be filled from detail page if available
                    "transmission": "",
                    "fuel_type": fuel,
                    "year_of_manufacture": year,
                    "price": price_card,
                    "detail_page": link
                })

                if link:
                    detail_links.append(link)

            time.sleep(random.uniform(*PAGE_PAUSE))

            if total_listings and len(seen_keys) >= total_listings:
                print("Reached detected total listings; stopping page scan.")
                break

            if p % 10 == 0:
                print(f"Scanned page {p}; rows so far: {len(rows)}")

        print(f"Collected {len(rows)} card-level rows, detail links: {len(set(detail_links))}")

        # Now ensure price/mileage/transmission are filled: visit detail pages for any row missing price/mileage/trans
        if VISIT_DETAIL_PAGES and detail_links:
            unique_detail_links = []
            seen_dl = set()
            for u in detail_links:
                if u and u not in seen_dl:
                    seen_dl.add(u)
                    unique_detail_links.append(u)

            # map link -> price/mileage/transmission
            detail_map = {}

            for i, dl in enumerate(unique_detail_links):
                # polite pause
                if i > 0:
                    time.sleep(random.uniform(*DETAIL_PAUSE))
                attempt = 0
                success = False
                while attempt <= MAX_DETAIL_RETRIES and not success:
                    try:
                        driver.get(dl)
                        time.sleep(1.0 + random.random()*0.8)
                        dsoup = BeautifulSoup(driver.page_source, "html.parser")

                        # price from many possible places
                        price_val = extract_price_from_soup(dsoup)
                        if not price_val:
                            price_val = extract_price_from_text(dsoup.get_text(" ", strip=True))

                        # mileage and transmission extraction now uses improved functions
                        page_text = dsoup.get_text(" ", strip=True)
                        # try meta-line near title for quick values
                        h_tag = dsoup.find(["h1","h2"])
                        meta_line = ""
                        if h_tag:
                            nxt = h_tag.find_next()
                            checks = 0
                            while nxt and checks < 8:
                                t = text_of(nxt)
                                if t and (("kms" in t.lower()) or ("₹" in t) or re.search(r'\b\d+\s*(?:kmpl|km/kg|km/kwh|km/l|kpl)\b', t, flags=re.I)):
                                    meta_line = t
                                    break
                                nxt = nxt.find_next()
                                checks += 1

                        # mileage: improved extraction
                        mileage_val = ""
                        mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
                        if mnode:
                            try:
                                parent = mnode.parent
                                sib = parent.find_next_sibling()
                                if sib:
                                    mileage_val = text_of(sib)
                            except:
                                mileage_val = ""
                        if not mileage_val:
                            # check meta_line then page_text using improved extract_mileage
                            mileage_val = extract_mileage(meta_line or page_text) or extract_mileage(page_text)

                        # transmission
                        trans_val = ""
                        tnode = dsoup.find(text=re.compile(r'\bTransmission\b|\bGearbox\b', flags=re.I))
                        if tnode:
                            try:
                                parent = tnode.parent
                                sib = parent.find_next_sibling()
                                if sib:
                                    trans_val = text_of(sib)
                            except:
                                trans_val = ""
                        if not trans_val:
                            trans_val = extract_transmission(meta_line or page_text)

                        # normalize mileage into consistent 'X kmpl' format (handled by extract_mileage)
                        mileage_clean = extract_mileage(mileage_val or "")
                        detail_map[dl] = {
                            "price": price_val or "",
                            "mileage": mileage_clean or "",
                            "transmission": trans_val or ""
                        }
                        success = True
                    except Exception as e:
                        attempt += 1
                        if attempt > MAX_DETAIL_RETRIES:
                            detail_map[dl] = {"price": "", "mileage": "", "transmission": ""}
                            success = True
                        else:
                            time.sleep(0.8)

                if (i+1) % 100 == 0:
                    print(f"Processed {i+1} detail pages...")

            # merge into rows
            for r in rows:
                link = r.get("detail_page", "")
                if link and link in detail_map:
                    if not r.get("price"):
                        r["price"] = detail_map[link]["price"]
                    # set mileage/transmission from detail
                    if detail_map[link].get("mileage"):
                        r["mileage"] = detail_map[link]["mileage"]
                    if detail_map[link].get("transmission"):
                        r["transmission"] = detail_map[link]["transmission"]

        # final normalization
        df = pd.DataFrame(rows, columns=[
            "Car_name","brand","model","kms_driven","mileage","transmission","fuel_type","year_of_manufacture","price","detail_page"
        ])
        df["kms_driven"] = df["kms_driven"].fillna("").astype(str).apply(lambda x: re.sub(r'[^\d\.]', '', x))
        df["mileage"] = df["mileage"].fillna("").astype(str).apply(lambda x: x.strip())
        df["transmission"] = df["transmission"].fillna("").astype(str).apply(lambda x: x.strip().title())
        df["year_of_manufacture"] = df["year_of_manufacture"].fillna("").astype(str).apply(lambda x: (re.search(r'\b(19|20)\d{2}\b', x).group(0) if re.search(r'\b(19|20)\d{2}\b', x) else ""))
        df["price"] = df["price"].fillna("").astype(str).apply(lambda x: x.strip())
        df["fuel_type"] = df["fuel_type"].fillna("").astype(str).apply(lambda x: x.strip().title())

        # dedupe
        if "detail_page" in df.columns and df["detail_page"].str.len().sum() > 0:
            df = df.drop_duplicates(subset=["detail_page"]).reset_index(drop=True)
        else:
            df = df.drop_duplicates(subset=["Car_name","price"]).reset_index(drop=True)

        # save
        df.to_csv(OUTPUT_CSV, index=False)
        df.to_excel(OUTPUT_XLSX, index=False)
        print(f"Saved {len(df)} rows to {OUTPUT_CSV} and {OUTPUT_XLSX}")

    finally:
        try:
            driver.quit()
        except:
            pass

# helpers used in detail extraction (note: improved extract_mileage used above)
def extract_mileage_fallback(text):
    # kept for compatibility; call improved extractor
    return extract_mileage(text)

def extract_transmission(text):
    if not text:
        return ""
    for t in ["Manual","Automatic","CVT","AMT","DCT","AT","MT"]:
        if re.search(r'\b' + re.escape(t) + r'\b', text, flags=re.I):
            if t.upper() in ("AT","AMT","CVT","DCT","MT"):
                return t.upper()
            return t.title()
    return ""

if __name__ == "__main__":
    main()


Detected total_listings=602, per_page_guess=64, total_pages=10
Scanned page 10; rows so far: 535
Collected 535 card-level rows, detail links: 518


  mnode = dsoup.find(text=re.compile(r'\bMileage\b|\bAvg\.?\s*Mileage\b', flags=re.I))
  tnode = dsoup.find(text=re.compile(r'\bTransmission\b|\bGearbox\b', flags=re.I))


Processed 100 detail pages...
Processed 200 detail pages...
Processed 300 detail pages...
Processed 400 detail pages...
Processed 500 detail pages...
Saved 519 rows to cardekho_used_cars_nagpur_price_fixed.csv and cardekho_used_cars_nagpur_price_fixed.xlsx
