In [1]:
import os, re, numpy as np, pandas as pd

# ---------------- CONFIG ----------------
# Raw CSV read by the LOAD step below (absolute, machine-specific path).
INPUT_PATH = "D:/GUVI/myenv/Luxury_Housing_Sales_Analysis_Bengaluru/data/Luxury_Housing_Bangalore.csv"
# Directory that receives the cleaned output.
OUTPUT_DIR = "D:/GUVI/myenv/Luxury_Housing_Sales_Analysis_Bengaluru/data/processed"
# Full path of the cleaned CSV written by the SAVE step.
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "luxury_bangalore_cleaned.csv")

# Create the output directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- HELPER FUNCTIONS ----------------
def parse_ticket_price(val):
    """Convert a raw ticket-price value into crores of rupees (float).

    Interpretation rules, applied in order:

    * Missing values (anything ``pd.isna`` accepts) return ``np.nan``.
    * Real numbers (``int``/``float``/numpy scalars) are returned as
      ``float`` unchanged; they are assumed to already be in crores.
    * Strings followed by a crore unit (``'1.2 Cr'``, ``'1.2 crore'``) are
      returned as-is.
    * Strings followed by a lakh unit (``'120 Lakhs'``, ``'120 lac'``) are
      divided by 100.
    * Strings carrying a rupee marker (``₹``, ``Rs``, ``INR``) but no unit
      are treated as plain rupees and divided by 1e7
      (``'₹1,20,00,000'`` -> ``1.2``).
    * Any other numeric string is assumed to already be in crores.

    Thousands separators (commas) are ignored. Strings that contain no
    number return ``np.nan`` instead of raising.
    """
    if pd.isna(val):
        return np.nan

    # Numbers must not round-trip through str(): "1e+16" would be mangled.
    if isinstance(val, (int, float, np.integer, np.floating)):
        return float(val)

    text = str(val).lower()

    # Remember whether a rupee marker was present, then blank it out so that
    # the dot in "Rs.1.2" is not mistaken for a decimal point.
    currency_pattern = r"₹|(?<![a-z])rs\.?|(?<![a-z])inr(?![a-z])"
    has_currency = re.search(currency_pattern, text) is not None
    text = re.sub(currency_pattern, " ", text)

    number = re.search(r"\d[\d,]*(?:\.\d+)?|\.\d+", text)
    if number is None:
        return np.nan
    amount = float(number.group(0).replace(",", ""))

    # The word immediately after the number, if any, is the unit.
    unit = re.match(r"\s*([a-z]+)", text[number.end():])
    unit_word = unit.group(1) if unit else ""

    if unit_word.startswith("cr"):
        return amount
    if unit_word.startswith(("lakh", "lac")) or unit_word == "l":
        return amount / 100  # 100 lakh = 1 crore
    if has_currency:
        return amount / 1e7  # 1 crore = 10,000,000 rupees
    return amount

def normalize_text(s):
    """Return *s* title-cased with whitespace runs collapsed to one space.

    Missing values (per ``pd.isna``) come back as ``np.nan``. Anything else
    is stringified first, so non-string inputs are accepted.
    """
    if pd.isna(s):
        return np.nan
    titled = str(s).strip().title()
    # split() with no argument drops leading/trailing/repeated whitespace.
    words = titled.split()
    return " ".join(words)

def parse_configuration(cfg):
    """Normalize a unit configuration to the compact form ``'<n>BHK'``.

    Matching is case-insensitive and tolerates whitespace or a hyphen
    between the count and ``BHK`` (``'3 bhk'``, ``'3-BHK'`` -> ``'3BHK'``).
    Fractional counts are kept whole (``'3.5 BHK'`` -> ``'3.5BHK'``) rather
    than being truncated to the digits after the decimal point.

    Returns ``np.nan`` for missing input. Values without a recognizable
    BHK count are returned upper-cased and otherwise unchanged.
    """
    if pd.isna(cfg):
        return np.nan
    text = str(cfg).upper()
    # The optional "(?:\.\d+)?" keeps "3.5" intact instead of matching just "5".
    match = re.search(r"(\d+(?:\.\d+)?)[\s\-]*BHK", text)
    return f"{match.group(1)}BHK" if match else text

def derive_quarter_info(q):
    """Return ``(canonical_quarter, quarter_number)`` for a raw quarter value.

    Recognized inputs, tried in this order:

    1. Quarter then year, e.g. ``'Q3 2024'``.
    2. Year then quarter, e.g. ``'2024-Q3'``.
    3. A year followed by a month, e.g. ``'2025-03-31'`` or ``'2025/7'``;
       the quarter is derived from the month.

    The canonical string has the form ``'YYYY-Qn'`` and the number is an
    ``int`` in 1..4. Missing input returns ``(np.nan, np.nan)``. Input that
    matches none of the patterns, or whose month is outside 1..12 (for
    example the fiscal-year notation ``'2024/25'``), returns the upper-cased
    input string and ``np.nan``.
    """
    if pd.isna(q):
        return (np.nan, np.nan)
    s = str(q).upper()

    # Named groups remove the need to work out which pattern matched.
    m = (re.search(r"Q(?P<qnum>[1-4]).*?(?P<year>\d{4})", s)
         or re.search(r"(?P<year>\d{4}).*?Q(?P<qnum>[1-4])", s))
    if m:
        qnum = int(m.group("qnum"))
        return (f"{m.group('year')}-Q{qnum}", qnum)

    # Fall back to a year-month prefix; accept one- or two-digit months.
    m = re.search(r"(\d{4})[-/](\d{1,2})", s)
    if m:
        year, month = int(m.group(1)), int(m.group(2))
        # Reject impossible months instead of inventing a quarter above 4.
        if 1 <= month <= 12:
            qnum = (month - 1) // 3 + 1
            return (f"{year}-Q{qnum}", qnum)

    return (s, np.nan)

_POSITIVE_KEYWORDS = ("good", "great", "excellent", "happy", "satisfied",
                      "liked", "love", "positive")
_NEGATIVE_KEYWORDS = ("bad", "poor", "delay", "problem", "issue", "negative",
                      "complaint", "expensive")


def _count_keyword_hits(text, keywords):
    """Count how many *keywords* begin a word somewhere in *text*."""
    return sum(
        re.search(rf"\b{re.escape(word)}", text) is not None
        for word in keywords
    )


def sentiment_score(text):
    """Basic keyword-based sentiment score for a free-text comment.

    The score is (distinct positive keywords found - distinct negative
    keywords found) divided by the number of whitespace-separated words,
    rounded to 3 decimals. Each keyword counts at most once.

    Keywords are matched at the start of a word, so inflections still count
    ('loved', 'delayed', 'problems') while words that merely contain a
    keyword do not ('unhappy', 'dissatisfied', 'inexpensive').

    Missing input returns 0.
    """
    if pd.isna(text):
        return 0
    t = str(text).lower()
    sc = (_count_keyword_hits(t, _POSITIVE_KEYWORDS)
          - _count_keyword_hits(t, _NEGATIVE_KEYWORDS))
    # max(1, ...) avoids division by zero for empty/whitespace-only text.
    return round(sc / max(1, len(t.split())), 3)

# ---------------- LOAD ----------------
print("Loading data...")
df = pd.read_csv(INPUT_PATH, low_memory=False)
print("Rows:", len(df))

# ---------------- CLEAN ----------------
# Make header names identifier-like: trimmed, spaces replaced by underscores.
df.columns = [c.strip().replace(" ", "_") for c in df.columns]

# Fail fast with one message naming every missing column, instead of a bare
# KeyError part-way through the cleaning steps below.
_required_columns = [
    "Ticket_Price_Cr", "Configuration", "Unit_Size_Sqft",
    "Amenity_Score", "Connectivity_Score", "Locality_Infra_Score",
    "Avg_Traffic_Time_Min", "NRI_Buyer", "Purchase_Quarter", "Buyer_Comments",
]
_missing_columns = [c for c in _required_columns if c not in df.columns]
if _missing_columns:
    raise KeyError(f"Input is missing required columns: {_missing_columns}")

df["Ticket_Price_Cr"] = df["Ticket_Price_Cr"].apply(parse_ticket_price)
df["Ticket_Price_INR"] = df["Ticket_Price_Cr"] * 1e7  # 1 Cr = 10 million INR

# Normalize text (these columns are optional; absent ones are skipped)
for c in ["Micro_Market","Project_Name","Developer_Name","Configuration",
          "Possession_Status","Sales_Channel","Buyer_Type","Transaction_Type"]:
    if c in df.columns: df[c] = df[c].apply(normalize_text)

df["Configuration"] = df["Configuration"].apply(parse_configuration)

# Numeric columns: unparseable entries become NaN rather than raising.
# Unit_Size_Sqft is included so the "> 0" comparison below cannot hit strings.
for c in ["Amenity_Score","Connectivity_Score","Locality_Infra_Score",
          "Avg_Traffic_Time_Min","Unit_Size_Sqft"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Handle NRI_Buyer → binary flag. Values are trimmed before comparison, and
# "1.0" is accepted because a 0/1 column containing NaN is read as float.
df["NRI_Buyer_Flag"] = (
    df["NRI_Buyer"].astype(str).str.strip().str.lower()
    .isin(["yes", "y", "true", "1", "1.0"])
    .astype(int)
)

# Derive quarter info. The (canonical, number) tuples are collected once and
# split into two columns; building a pd.Series per row is far slower.
_quarter_pairs = [derive_quarter_info(q) for q in df["Purchase_Quarter"]]
df["Purchase_Quarter_Canonical"] = [canonical for canonical, _ in _quarter_pairs]
df["Quarter_Number"] = [number for _, number in _quarter_pairs]

# Price per sqft: NaN wherever the unit size is missing, zero or negative.
_unit_size = df["Unit_Size_Sqft"]
df["Price_per_Sqft_INR"] = df["Ticket_Price_INR"] / _unit_size.where(_unit_size > 0)

# Buyer Comment sentiment
df["Buyer_Comments_Sentiment"] = df["Buyer_Comments"].apply(sentiment_score)

# Luxury flag (example rule: ticket price of at least 3 Cr; NaN prices get 0)
df["Luxury_Flag"] = (df["Ticket_Price_Cr"] >= 3).astype(int)

# Booking potential heuristic score:
# mean(amenity, connectivity, infra) − traffic minutes / 20
df["Booking_Potential_Score"] = (
    df[["Amenity_Score","Connectivity_Score","Locality_Infra_Score"]].mean(axis=1)
    - (df["Avg_Traffic_Time_Min"]/20)
).round(2)

# ---------------- VALIDATE ----------------
print("After cleaning:", df.shape)
print(df.head(3))

# ---------------- SAVE ----------------
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Cleaned file saved at {OUTPUT_PATH}")


Loading data...
Rows: 101000
After cleaning: (101000, 26)
  Property_ID       Micro_Market Project_Name Developer_Name  Unit_Size_Sqft  \
0  PROP000001      Sarjapur Road    Project_0            Rmz          4025.0   
1  PROP000002        Indiranagar    Project_1    Puravankara          5760.0   
2  PROP000003  Bannerghatta Road    Project_2   Tata Housing          7707.0   

  Configuration  Ticket_Price_Cr Transaction_Type Buyer_Type Purchase_Quarter  \
0          4BHK        12.750846          Primary        Nri       2025-03-31   
1          3BHK        16.292152          Primary      Other       2024-06-30   
2          4BHK        10.517724          Primary        Hni       2023-12-31   

   ...  Avg_Traffic_Time_Min             Buyer_Comments Ticket_Price_INR  \
0  ...                    18       Loved the amenities!     1.275085e+08   
1  ...                   106                        NaN     1.629215e+08   
2  ...                   113  Agent was not responsive.     1.051772