This jupyter notebook demonstrates the data pre-processing and model training methodology. However, this notebook is not updated to the current version of the product. Consider this as the base for the product

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [48]:
# loading the dataset
df = pd.read_csv("Real Estate Data V21.csv")

In [49]:
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [50]:
def convert_price_to_number(price_str):
    try:
        if isinstance(price_str, str):
            price_str = price_str.replace("₹", "").strip().lower()
            
            if "cr" in price_str:
                return float(price_str.replace("cr", "").strip()) * 1e7
            elif "l" in price_str:
                return float(price_str.replace("l", "").strip()) * 1e5
            else:
                return None  # unknown unit like "acs", "per sqft", etc.
    except:
        return None


In [51]:
df['Price'] = df['Price'].apply(convert_price_to_number)

In [52]:
df['City'] = df['Location'].apply(lambda x: x.split(',')[-1].strip())

In [53]:
def extract_bhk(text) :
    match = re.search(r'(\d+)\s*BHK', str(text), re.IGNORECASE)
    if match:
        return int(match.group(1))
    return None

In [54]:
df['BHK'] = df['Property Title'].apply(extract_bhk)

In [55]:
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,City,BHK
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",19900000.0,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,Chennai,4.0
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,22500000.0,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,Chennai,10.0
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",10000000.0,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,Chennai,3.0
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,33300000.0,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes,Chennai,7.0
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",4800000.0,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,Chennai,2.0


In [56]:
df['City'].unique()

array(['Chennai', 'Bangalore', 'Hyderabad', 'Mumbai', 'Thane', 'Kolkata',
       'Pune', 'New Delhi'], dtype=object)

In [57]:
encoder = LabelEncoder()
df['City'] = encoder.fit_transform(df['City'])

In [58]:
df['City'].unique()

array([1, 0, 2, 4, 7, 3, 6, 5])

In [59]:
df['Price'] = df['Price'].fillna(df['Price'].mean())

In [60]:
df['BHK'] = df['BHK'].fillna(-1)

In [61]:
df['Balcony'] = encoder.fit_transform(df['Balcony'])
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,City,BHK
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",19900000.0,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,1,1,4.0
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,22500000.0,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,1,1,10.0
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",10000000.0,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,0,1,3.0
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,33300000.0,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,1,1,7.0
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",4800000.0,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,1,1,2.0


In [None]:
X = df[['Total_Area', 'Price_per_SQFT', 'Baths', 'Balcony', 'City', 'BHK']]
y = df['Price']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1,1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1,1))

In [65]:
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

In [70]:
y_pred = model.predict(X_test_scaled)

In [76]:
print('MAE : ', mean_absolute_error(y_test_scaled, y_pred))

MAE :  0.29564373237053254


In [78]:
df['Price'].mean()

np.float64(10675451.490738826)

In [90]:
rf_model = RandomForestRegressor(max_depth=2, random_state=42)
rf_model.fit(X_train_scaled, y_train_scaled)

  return fit_method(estimator, *args, **kwargs)


In [91]:
rf_pred = rf_model.predict(X_test_scaled)

In [92]:
print('MAE for RandomForestRegressor : ', mean_absolute_error(y_test_scaled, rf_pred))

MAE for RandomForestRegressor :  0.22928815916861897


In [93]:
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15, 20, 25, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}


In [None]:
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=50,  # number of parameter settings sampled
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1  # use all available cores
)

random_search.fit(X_train_scaled, y_train_scaled)


In [97]:
print("Best Parameters:", random_search.best_params_)

best_model = random_search.best_estimator_

Best Parameters: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 25}


In [98]:
rfv_pred = best_model.predict(X_test_scaled)
mae = mean_absolute_error(y_test_scaled, rfv_pred)
print("Tuned MAE:", mae)

Tuned MAE: 0.05987488642986815
