In [44]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [45]:
df = pd.read_csv('dataset.csv')

In [46]:
df = df.drop(columns=["AMENITIES", "BUILDUP AREA"], errors="ignore")
# Remove"House for Rent" from title
df = df[~df["TITLE"].str.contains("House for Rent", case=False, na=False)]

In [47]:

def convert_price(price):
    price = str(price).replace("Rs.", "").replace(",", "").strip()
    if "Cr" in price:
        return float(price.replace("Cr", "").strip()) * 10_000_000
    elif "Lakh" in price:
        return float(price.replace("Lakh", "").strip()) * 100_000
    else:
        return float(price) if price.isdigit() else np.nan

def clean_land_area(area):
    area = str(area).strip().lower()
    match = re.search(r"(\d+\.\d+|\d+)", area)
    number = float(match.group(1)) if match else np.nan
    
    if "aana" in area:
        return number * 342.25  #aana to square feet
    elif "kattha" in area:
        return number * 3388.98  # kattha to square feet
    elif "sq. mtr" in area:
        return number * 10.7639  # square meters to square feet
    elif "sq. ft" in area:
        return number
    return np.nan

def convert_road_access(road):
    match = re.search(r"([\d.]+)\s*Feet", str(road))
    return float(match.group(1)) if match else np.nan

def extract_parking(parking):
    cars = re.search(r"(\d+)\s*CaRs?", str(parking))
    bikes = re.search(r"(\d+)\s*Bikes?", str(parking))
    return int(cars.group(1)) if cars else 0, int(bikes.group(1)) if bikes else 0


In [48]:
# Apply transformations
df["PRICE"] = df["PRICE"].apply(convert_price)
df["LAND AREA"] = df["LAND AREA"].apply(clean_land_area)
df["ROAD ACCESS"] = df["ROAD ACCESS"].apply(convert_road_access)
df["CARS"], df["BIKES"] = zip(*df["PARKING"].apply(extract_parking))

# Drop unnecessary columns
df = df.drop(columns=["TITLE", "PARKING", "BUILT YEAR"], errors="ignore")

# one-hot encoding
df = pd.get_dummies(df, columns=["LOCATION", "FACING"], drop_first=True)

df = df.fillna(df.median())

In [49]:
X = df.drop(columns=["PRICE"])
y = df["PRICE"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [50]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

In [51]:
# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Mean Absolute Error (MAE): 11501175.11771854
Root Mean Squared Error (RMSE): 21589350.069620233
R² Score: 0.28689721756941855


In [None]:
predictions_df = pd.DataFrame({"Actual Price": y_test, "Predicted Price": y_pred})
# Display the DataFrame
print(predictions_df.head())

      Actual Price  Predicted Price
412     45000000.0       43097000.0
2352    21500000.0       26489000.0
3292    18500000.0       22572000.0
2754    32000000.0       39664500.0
339     46500000.0       38760000.0
