In [5]:
# Import required libraries for data manipulation
import pandas as pd
import numpy as np

In [6]:
# Read the raw luxury-housing CSV (path is relative to the notebook's
# working directory) into the working DataFrame `df`.
csv_path = "../01_raw_data/Luxury_Housing_Bangalore.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

In [7]:
# Check the initial shape of the raw dataset.
# Displays the (row count, column count) tuple of `df` as the cell output.
df.shape

(101000, 18)

In [8]:
# Names of the free-text categorical columns whose values get
# whitespace-trimmed and lower-cased by the normalization step.
text_cols = [
    "Micro_Market", "Developer_Name", "Configuration", "Transaction_Type",
    "Buyer_Type", "Sales_Channel", "Possession_Status",
]

In [9]:
# Convert text columns to lowercase and remove leading/trailing spaces.
#
# `.astype(str)` is kept so non-string cells can still go through the `.str`
# accessor, but on pandas releases before 3.0 it also turns missing values
# into the literal string "nan". That would hide real nulls from later
# `isnull()` checks and create a bogus "nan" category, so the original
# null positions are recorded first and restored afterwards.
for col in text_cols:
    was_present = df[col].notna()
    normalized = df[col].astype(str).str.strip().str.lower()
    # `where` keeps normalized text where a value existed and puts a null back
    # everywhere the source cell was missing.
    df[col] = normalized.where(was_present)


In [10]:
# Remove currency symbols and the "Cr" unit text from Ticket_Price_Cr so the
# remaining text can be parsed as a number by a later numeric conversion.
#
# The rupee sign is stripped in two spellings: the real character (U+20B9) and
# the mojibake sequence it turns into when UTF-8 bytes are decoded as
# Windows-1252. Handling only one of them would leave the other in place and
# the value would fail to parse.
price_text = df["Ticket_Price_Cr"].astype(str)
for rupee_marker in ("\u20b9", "â‚¹"):
    price_text = price_text.str.replace(rupee_marker, "", regex=False)

df["Ticket_Price_Cr"] = (
    price_text
    # Character classes cover every casing of the unit ("cr", "Cr", "CR", "cR").
    .str.replace(r"[Cc][Rr]", "", regex=True)
    .str.strip()
)


In [11]:
# Parse the cleaned price text as numbers; anything that still cannot be
# parsed becomes NaN (errors="coerce") instead of raising.
price_as_number = pd.to_numeric(df["Ticket_Price_Cr"], errors="coerce")
df["Ticket_Price_Cr"] = price_as_number

In [12]:
# Count how many Ticket_Price_Cr entries are null after the numeric conversion.
df["Ticket_Price_Cr"].isna().sum()

np.int64(10019)

In [13]:
# Treat zero or negative unit sizes as missing by overwriting them with NaN.
non_positive_size = df["Unit_Size_Sqft"].le(0)
df.loc[non_positive_size, "Unit_Size_Sqft"] = np.nan

In [14]:
# Impute missing unit sizes using the median size of the same configuration.
#
# The per-row group median is computed with the built-in "median" transform
# (no Python lambda per group) and is only used to fill gaps in the original
# column. Filling the original column, rather than replacing it with the
# transform result, keeps valid sizes on rows whose Configuration is null:
# groupby drops null keys by default, so the transform yields NaN for them.
config_median_size = (
    df.groupby("Configuration")["Unit_Size_Sqft"].transform("median")
)
df["Unit_Size_Sqft"] = df["Unit_Size_Sqft"].fillna(config_median_size)


In [15]:
# Fill gaps in Amenity_Score with the column-wide median.
overall_amenity_median = df["Amenity_Score"].median()
df["Amenity_Score"] = df["Amenity_Score"].fillna(overall_amenity_median)


In [16]:
# Use the placeholder text "not_provided" wherever Buyer_Comments is missing.
comments_filled = df["Buyer_Comments"].fillna(value="not_provided")
df["Buyer_Comments"] = comments_filled


In [17]:
# Keep only the first row for each Property_ID and discard later repeats.
# NOTE(review): this runs after the median-based imputations in the earlier
# cells, so duplicate rows contributed to those medians - confirm that
# ordering is intended.
df = df.drop_duplicates(subset=["Property_ID"], keep="first")


In [18]:
# Count rows whose Property_ID repeats an earlier row; expected to be zero
# once duplicates have been dropped.
df.duplicated(subset=["Property_ID"]).sum()


np.int64(0)

In [19]:
# Show the number of null entries per column to confirm what is still missing.
df.isna().sum()


Property_ID                0
Micro_Market               0
Project_Name               0
Developer_Name             0
Unit_Size_Sqft             0
Configuration              0
Ticket_Price_Cr         9913
Transaction_Type           0
Buyer_Type                 0
Purchase_Quarter           0
Connectivity_Score         0
Amenity_Score              0
Possession_Status          0
Sales_Channel              0
NRI_Buyer                  0
Locality_Infra_Score       0
Avg_Traffic_Time_Min       0
Buyer_Comments             0
dtype: int64

In [20]:
# Review statistical summary of cleaned numeric columns
# NOTE(review): the recorded output of this cell shows Ticket_Price_Cr with a
# minimum of -1.42 and a maximum of 100.0 against a 75th percentile of about
# 14.1. None of the cleaning cells visible above constrains the price range,
# so these values may need explicit handling - confirm with the data owner.
df.describe()


Unnamed: 0,Unit_Size_Sqft,Ticket_Price_Cr,Connectivity_Score,Amenity_Score,Locality_Infra_Score,Avg_Traffic_Time_Min
count,100000.0,90087.0,100000.0,100000.0,100000.0,100000.0
mean,6005.34268,12.719069,6.993001,7.503678,7.499378,67.18802
std,1638.258325,7.670383,1.731699,1.366642,1.443286,30.267763
min,3000.0,-1.42,4.000031,5.000224,5.000013,15.0
25%,4683.0,10.010969,5.495535,6.395876,6.249147,41.0
50%,6008.0,12.038456,6.986316,7.499123,7.497347,67.0
75%,7332.0,14.097009,8.490617,8.615671,8.751793,93.0
max,8999.0,100.0,9.99997,9.999865,9.999956,119.0


In [21]:
# Inspect final data types and non-null counts
# `info()` prints its report directly (index range, per-column non-null count
# and dtype) rather than returning a value to display.
df.info()


<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Property_ID           100000 non-null  str    
 1   Micro_Market          100000 non-null  str    
 2   Project_Name          100000 non-null  str    
 3   Developer_Name        100000 non-null  str    
 4   Unit_Size_Sqft        100000 non-null  float64
 5   Configuration         100000 non-null  str    
 6   Ticket_Price_Cr       90087 non-null   float64
 7   Transaction_Type      100000 non-null  str    
 8   Buyer_Type            100000 non-null  str    
 9   Purchase_Quarter      100000 non-null  str    
 10  Connectivity_Score    100000 non-null  float64
 11  Amenity_Score         100000 non-null  float64
 12  Possession_Status     100000 non-null  str    
 13  Sales_Channel         100000 non-null  str    
 14  NRI_Buyer             100000 non-null  str    
 15  Locality_Inf

In [22]:
# Write the cleaned frame to CSV (without the index column) for the
# feature-engineering and SQL-loading stages.
# NOTE(review): the target sits inside the raw-data directory next to the
# source file - confirm that is the intended location for cleaned output.
clean_path = "../01_raw_data/Luxury_Housing_Bangalore_CLEAN.csv"
df.to_csv(path_or_buf=clean_path, index=False)
