Converted the text price column to numeric rupees (price_rupees) and lakhs (price_lakh)

In [3]:
import re, math
import pandas as pd
from pathlib import Path
from typing import Union, Iterable

# Parse Indian prices like "11.95 Lakh", "2.75 Crore", "6,50,000", "₹ 4.5 Lakh"
# Unit word (lower-case, dots removed) -> multiplier that converts the number to rupees.
_multiplier = {"lakh":1e5,"lakhs":1e5,"lac":1e5,"lacs":1e5,"crore":1e7,"crores":1e7,"cr":1e7,"k":1e3}

# Characters removed before parsing: the rupee sign (U+20B9), thousands commas,
# and the three characters the rupee sign turns into when UTF-8 text is
# mis-decoded as cp1252 (U+00E2, U+201A, U+00B9). Written as escapes so the
# pattern survives any re-encoding of this file.
_PRICE_NOISE_RE = re.compile("[\u20b9\u00e2\u201a\u00b9,]")
# Optional textual currency marker at the start of the (lower-cased) string.
_PRICE_PREFIX_RE = re.compile(r"^(?:rs\.?|inr)\s*")
# Leading number followed by an optional unit word such as "lakh" or "cr.".
_PRICE_VALUE_RE = re.compile(r"([0-9]*\.?[0-9]+)\s*([a-z\.]+)?")

def parse_price_to_rupees(x: Union[str,float,int]) -> float:
    """Convert a price given as text or number into rupees.

    Parameters
    ----------
    x : str, float, int or None
        Plain numbers are returned unchanged (as float). Strings may carry a
        rupee sign or "Rs."/"INR" prefix, Indian-style thousands commas, and a
        unit word listed in ``_multiplier`` (lakh, lac, crore, cr, k, ...).

    Returns
    -------
    float
        The price in rupees, or ``math.nan`` when ``x`` is None, empty,
        "nan"/"none", or does not start with a number once the currency
        markers are removed. An unrecognised unit word leaves the number
        unscaled (multiplier 1.0).
    """
    if x is None:
        return math.nan
    if isinstance(x, (int, float)):
        return float(x)

    # Strip *after* removing the currency characters: "₹ 5 Lakh" would
    # otherwise keep a leading space and never match the number pattern.
    s = _PRICE_NOISE_RE.sub("", str(x).lower()).strip()
    s = _PRICE_PREFIX_RE.sub("", s)
    if not s or s in {"nan", "none"}:
        return math.nan

    m = _PRICE_VALUE_RE.match(s)
    if not m:
        return math.nan

    val = float(m.group(1))
    unit = (m.group(2) or "").replace(".", "")  # "cr." -> "cr"
    return val * _multiplier.get(unit, 1.0)

def process_excel_only(
    files: Iterable[Union[str, Path]],
    price_col: str = "price",
    sheet: str | int | None = None,   # None=first sheet; or name/index
    suffix: str = "_priced.xlsx",
    out_dir: str | Path | None = None
):
    for f in map(Path, files):
        if f.suffix.lower() not in {".xlsx", ".xls"}:
            print(f"Skip (not Excel): {f}"); continue
        if not f.exists():
            print(f"Skip (missing): {f}"); continue

        df = pd.read_excel(f, sheet_name=sheet, engine="openpyxl")
        if price_col not in df.columns:
            print(f"Skip (missing column '{price_col}'): {f.name}"); continue

        df["price_rupees"] = df[price_col].apply(parse_price_to_rupees)
        df["price_lakh"] = df["price_rupees"] / 1e5

        target_dir = Path(out_dir) if out_dir else f.parent
        out_path = target_dir / f"{f.stem}{suffix}"
        df.to_excel(out_path, index=False, engine="openpyxl")
        print(f"Wrote: {out_path}")

if __name__ == "__main__":
    # One cleaned CarDekho workbook per city; each gets the numeric price columns.
    # add more cities here
    inputs = [
        f"cardekho_used_cars_{city}_clean_final.xlsx"
        for city in (
            "hyderabad",
            "ahmedabad",
            "bangalore",
            "indore",
            "pune",
            "nagpur",
            "kolkata",
        )
    ]
    process_excel_only(inputs, price_col="price", sheet=0)


Wrote: cardekho_used_cars_hyderabad_clean_final_priced.xlsx
Wrote: cardekho_used_cars_ahmedabad_clean_final_priced.xlsx
Wrote: cardekho_used_cars_bangalore_clean_final_priced.xlsx
Wrote: cardekho_used_cars_indore_clean_final_priced.xlsx
Wrote: cardekho_used_cars_pune_clean_final_priced.xlsx
Wrote: cardekho_used_cars_nagpur_clean_final_priced.xlsx
Wrote: cardekho_used_cars_kolkata_clean_final_priced.xlsx


Dropped the price and price_lakh columns and moved detail_page to the last position

In [5]:
import pandas as pd
from pathlib import Path
from typing import Iterable, Union

def tidy_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the raw price columns and with ``detail_page`` last.

    ``price`` and ``price_lakh`` are removed when present. If a ``detail_page``
    column exists it is moved to the end; all other columns keep their order.
    The input frame itself is not modified.
    """
    to_remove = [name for name in ("price", "price_lakh") if name in df.columns]
    result = df.drop(columns=to_remove) if to_remove else df

    if "detail_page" not in result.columns:
        return result

    reordered = [name for name in result.columns if name != "detail_page"]
    reordered.append("detail_page")
    return result[reordered]

def process_many_excel(
    files: Iterable[Union[str, Path]],
    sheet: str | int | None = 0,          # 0 = first sheet; or use a sheet name
    suffix: str = "_last.xlsx",
    out_dir: str | Path | None = None
):
    for f in map(Path, files):
        if f.suffix.lower() not in {".xlsx", ".xls"}:
            print(f"Skip (not Excel): {f}"); continue
        if not f.exists():
            print(f"Skip (missing): {f}"); continue

        df = pd.read_excel(f, sheet_name=sheet, engine="openpyxl")
        df = tidy_columns(df)

        target = (Path(out_dir) if out_dir else f.parent) / f"{f.stem}{suffix}"
        df.to_excel(target, index=False, engine="openpyxl")
        print(f"Wrote: {target}")

if __name__ == "__main__":
    # Option A: name the priced per-city workbooks explicitly
    inputs = [
        f"cardekho_used_cars_{city}_clean_final_priced.xlsx"
        for city in (
            "hyderabad",
            "ahmedabad",
            "bangalore",
            "indore",
            "nagpur",
            "kolkata",
            "pune",
        )
    ]
    # Option B: discover them with a glob pattern instead, e.g.
    # inputs = list(Path(".").glob("cardekho_used_cars_*_clean_final_priced.xlsx"))

    process_many_excel(inputs, sheet=0, suffix="_last.xlsx")


Wrote: cardekho_used_cars_hyderabad_clean_final_priced_last.xlsx
Wrote: cardekho_used_cars_ahmedabad_clean_final_priced_last.xlsx
Wrote: cardekho_used_cars_bangalore_clean_final_priced_last.xlsx
Wrote: cardekho_used_cars_indore_clean_final_priced_last.xlsx
Wrote: cardekho_used_cars_nagpur_clean_final_priced_last.xlsx
Wrote: cardekho_used_cars_kolkata_clean_final_priced_last.xlsx
Wrote: cardekho_used_cars_pune_clean_final_priced_last.xlsx


Merged all per-city files into a single workbook

In [6]:
import pandas as pd
from pathlib import Path
from typing import Union

def merge_excel(
    inputs: list[Union[str, Path]] | None = None,
    folder: Union[str, Path] | None = None,
    pattern: str = "*.xlsx",
    sheet: str | int | None = 0,
    output_file: Union[str, Path] = "all_cars_merged.xlsx"
):
    """
    Merge many Excel files into a single worksheet (rows appended).
    Provide either an explicit inputs list OR a folder+pattern to auto-discover files.
    """
    files: list[Path] = []
    if inputs:
        files = [Path(p) for p in inputs]
    elif folder:
        files = list(Path(folder).glob(pattern))
    else:
        raise ValueError("Provide inputs list or folder path.")

    dfs = []
    for f in files:
        if f.suffix.lower() not in {".xlsx", ".xls"}:
            print(f"Skip (not Excel): {f}"); continue
        if not f.exists():
            print(f"Skip (missing): {f}"); continue

        df = pd.read_excel(f, sheet_name=sheet, engine="openpyxl")
        df["__source_file__"] = f.name  # optional provenance column
        print(f"Loaded: {f.name} ({len(df)} rows)")
        dfs.append(df)

    if not dfs:
        print("No valid Excel files to merge."); return

    merged = pd.concat(dfs, ignore_index=True)
    out = Path(output_file)
    merged.to_excel(out, index=False, engine="openpyxl")
    print(f"\nMerged {len(dfs)} files -> {out} | Total rows: {len(merged)}")

if __name__ == "__main__":
    # Option A: name the tidied per-city workbooks explicitly
    inputs = [
        f"cardekho_used_cars_{city}_clean_final_priced_last.xlsx"
        for city in (
            "hyderabad",
            "ahmedabad",
            "bangalore",
            "indore",
            "nagpur",
            "kolkata",
            "pune",
        )
    ]
    merge_excel(inputs=inputs, sheet=0, output_file="all_cars_merged.xlsx")

   


Loaded: cardekho_used_cars_hyderabad_clean_final_priced_last.xlsx (863 rows)
Loaded: cardekho_used_cars_ahmedabad_clean_final_priced_last.xlsx (735 rows)
Loaded: cardekho_used_cars_bangalore_clean_final_priced_last.xlsx (878 rows)
Loaded: cardekho_used_cars_indore_clean_final_priced_last.xlsx (452 rows)
Loaded: cardekho_used_cars_nagpur_clean_final_priced_last.xlsx (507 rows)
Loaded: cardekho_used_cars_kolkata_clean_final_priced_last.xlsx (842 rows)
Loaded: cardekho_used_cars_pune_clean_final_priced_last.xlsx (806 rows)

Merged 7 files -> all_cars_merged.xlsx | Total rows: 5083
