<a href="https://colab.research.google.com/github/pris25123/Real-Estate-AI/blob/main/FindYourSpace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Find Your Space - Real Estate ChatBot 🏠


Dataset: https://www.kaggle.com/datasets/amitabhajoy/bengaluru-house-price-data

# 1. Dataset Preprocessing

In [1]:
!unzip archive.zip

Archive:  archive.zip
  inflating: Bengaluru_House_Data.csv  


In [2]:
import pandas as pd

# Load the extracted CSV into `df` and preview the first rows.
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Drop the `society` column; nothing later in this notebook reads it.
df = df.drop("society", axis=1)
df.shape

(13320, 8)

In [4]:
# Count missing values in the three columns the next cell requires to be present.
df[['location','size','bath']].isnull().sum()

Unnamed: 0,0
location,1
size,16
bath,73


In [5]:
# Discard rows where location, size or bath is missing, then show the new shape.
df = df.dropna(subset=["location", "size", "bath"])
df.shape

(13246, 8)

In [6]:
# Number of rows still missing a balcony count after the row drop above.
df['balcony'].isnull().sum()


np.int64(536)

In [7]:
df['balcony'].median()

2.0

In [8]:
# Impute missing balcony counts with the column median. The median is computed
# here instead of being hard-coded so the cell stays correct if the upstream
# filtering (and therefore the median) ever changes.
df['balcony'] = df['balcony'].fillna(df['balcony'].median())
df.isnull().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,0
total_sqft,0
bath,0
balcony,0
price,0


In [9]:
# Pull the leading number out of `size` ("2 BHK", "4 Bedroom", ...) as an integer
# bedroom count; expand=False yields a Series that is assigned directly.
bedroom_digits = df['size'].str.extract(r'(\d+)', expand=False)
df['bhk'] = bedroom_digits.astype(int)
df[['size', 'bhk']].head()

Unnamed: 0,size,bhk
0,2 BHK,2
1,4 Bedroom,4
2,3 BHK,3
3,3 BHK,3
4,2 BHK,2


In [10]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the conversion errors raised by ``float`` (``TypeError`` for
    unsupported types such as ``None``, ``ValueError`` for unparsable text)
    are treated as "not a float"; anything else, e.g. ``KeyboardInterrupt``,
    is allowed to propagate instead of being swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError):
        return False
    return True

# Show up to 20 distinct total_sqft entries that float() cannot parse as-is.
df[~df['total_sqft'].apply(is_float)]['total_sqft'].unique()[:20]

array(['2100 - 2850', '3067 - 8156', '1042 - 1105', '1145 - 1340',
       '1015 - 1540', '34.46Sq. Meter', '1195 - 1440', '4125Perch',
       '1120 - 1145', '3090 - 5002', '1160 - 1195', '1000Sq. Meter',
       '1115 - 1130', '1100Sq. Yards', '520 - 645', '1000 - 1285',
       '650 - 665', '633 - 666', '5.31Acres', '30Acres'], dtype=object)

In [11]:
def convert_sqft_safe(x):
    """Convert a raw ``total_sqft`` entry to a float number of square feet.

    Supported inputs:
      * plain numbers, either as text (``"1056"``) or already numeric;
      * ranges written ``"low - high"``, which become the midpoint;
      * values suffixed with ``Sq. Meter``, ``Sq. Yards``, ``Acres`` or
        ``Perch``, which are multiplied by the matching factor.

    Returns:
        The area as a float, or ``None`` when the value cannot be parsed
        (unknown unit, empty text, ``None``, NaN, ...).
    """
    # Suffix -> multiplier to square feet, checked in this order.
    unit_to_sqft = {
        'Sq. Meter': 10.7639,
        'Sq. Yards': 9,
        'Acres': 43560,
        'Perch': 272.25,
    }

    try:
        # Already-numeric cells have no text to inspect; pass them through,
        # mapping NaN (the only float that differs from itself) to None.
        if not isinstance(x, str):
            number = float(x)
            return None if number != number else number

        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2

        for suffix, factor in unit_to_sqft.items():
            if suffix in x:
                return float(x.replace(suffix, '')) * factor

        return float(x)

    except (TypeError, ValueError, IndexError):
        # Unparsable entry: signal it with None so the caller can filter it out.
        return None

# Parse every total_sqft entry; entries convert_sqft_safe cannot parse become null.
df['total_sqft_clean'] = df['total_sqft'].apply(convert_sqft_safe)

In [12]:
# How many entries could not be converted to square feet.
df['total_sqft_clean'].isnull().sum()


np.int64(5)

In [13]:
# Keep only rows whose area was converted successfully, then show the new shape.
df = df[df['total_sqft_clean'].notnull()]
df.shape

(13241, 10)

In [14]:
# Swap the raw text column for its numeric counterpart under the original name.
df = (
    df
    .drop(columns="total_sqft")
    .rename(columns={"total_sqft_clean": "total_sqft"})
)
df.head()

Unnamed: 0,area_type,availability,location,size,bath,balcony,price,bhk,total_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,2.0,1.0,39.07,2,1056.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,5.0,3.0,120.0,4,2600.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,2.0,3.0,62.0,3,1440.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,3.0,1.0,95.0,3,1521.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,2.0,1.0,51.0,2,1200.0


In [15]:
# Strip leading/trailing whitespace from location names, then count distinct values.
df['location'] = df['location'].str.strip()
df['location'].nunique()

1292

In [16]:
# Listings per location; the next cell uses this to bucket rare locations.
location_counts = df['location'].value_counts()
# Not displayed: only the last expression of a cell is rendered.
location_counts.head()
# Number of locations with 10 or fewer listings.
len(location_counts[location_counts <= 10])

1052

In [17]:
# Collapse every location with 10 or fewer listings into a single "Other" bucket.
# map() looks up each row's listing count in one vectorised pass; a label that
# is missing from location_counts (e.g. "Other" itself when this cell is re-run)
# maps to NaN, fails the comparison and stays "Other" instead of raising KeyError.
listings_per_row = df['location'].map(location_counts)
df['location'] = df['location'].where(listings_per_row > 10, "Other")
df['location'].nunique()

241

In [18]:
# `price` is expressed in Lakhs; multiplying by 100000 converts it to rupees
# before dividing by the area.
df['price_per_sqft'] = (df['price'] * 100000) / df['total_sqft']
df[['price', 'total_sqft', 'price_per_sqft']].head()

Unnamed: 0,price,total_sqft,price_per_sqft
0,39.07,1056.0,3699.810606
1,120.0,2600.0,4615.384615
2,62.0,1440.0,4305.555556
3,95.0,1521.0,6245.890861
4,51.0,1200.0,4250.0


# 2. Building the chatbot core

In [19]:
# Example test: hand-written filter for 3 BHK listings in Whitefield priced at
# 150 Lakhs or less, as a reference for what the chatbot search should return.
filtered = df[
    (df['location'] == "Whitefield") &
    (df['bhk'] == 3) &
    (df['price'] <= 150)
]

filtered.head()

Unnamed: 0,area_type,availability,location,size,bath,balcony,price,bhk,total_sqft,price_per_sqft
10,Super built-up Area,18-Feb,Whitefield,3 BHK,2.0,2.0,70.0,3,1800.0,3888.888889
27,Built-up Area,20-Dec,Whitefield,3 BHK,3.0,2.0,81.0,3,1610.0,5031.055901
52,Built-up Area,Ready To Move,Whitefield,3 BHK,3.0,2.0,91.0,3,2010.0,4527.363184
541,Built-up Area,20-Dec,Whitefield,3 BHK,3.0,2.0,101.0,3,1768.0,5712.669683
544,Plot Area,18-Mar,Whitefield,3 Bedroom,3.0,1.0,61.95,3,1500.0,4130.0


In [20]:
import re

def extract_filters(query, locations=None):
    """Parse a free-text query into structured search filters.

    Args:
        query: The user's message, e.g. "Show me 3 BHK in Whitefield under 100".
        locations: Optional iterable of known location names to match against.
            Defaults to the distinct values of ``df['location']``.

    Returns:
        A dict with any of the keys ``'bhk'`` (int, from "<n> bhk"),
        ``'max_price'`` (int, from "under <n>") and ``'location'`` (str).
        Keys are only present when the query mentions them.
    """
    text = query.lower()
    filters = {}

    # Extract BHK
    bhk_match = re.search(r'(\d+)\s*bhk', text)
    if bhk_match:
        filters['bhk'] = int(bhk_match.group(1))

    # Extract price (under X)
    price_match = re.search(r'under\s*(\d+)', text)
    if price_match:
        filters['max_price'] = int(price_match.group(1))

    # Extract location
    if locations is None:
        locations = df['location'].unique()

    # "Other" is the synthetic bucket for rare localities, not a place a user
    # can ask for; without this guard any query containing the word "other"
    # would be filtered to that bucket.
    mentioned = [
        loc for loc in locations
        if loc != "Other" and loc.lower() in text
    ]
    if mentioned:
        # Prefer the most specific name when one location's name contains
        # another (the longer match wins; ties keep the first one seen).
        filters['location'] = max(mentioned, key=len)

    return filters

In [21]:
extract_filters("Show me 3 BHK in Whitefield under 100")

{'bhk': 3, 'max_price': 100, 'location': 'Whitefield'}

In [22]:
def search_properties(query):
    """Return up to five listings from ``df`` matching the filters found in ``query``.

    Filters come from ``extract_filters``; each one that is present narrows the
    result set in turn (exact location, exact BHK, price at or below the cap).
    """
    criteria = extract_filters(query)
    matches = df.copy()

    # (filter key, row predicate) pairs, applied in this order.
    predicates = (
        ('location', lambda frame, wanted: frame['location'] == wanted),
        ('bhk', lambda frame, wanted: frame['bhk'] == wanted),
        ('max_price', lambda frame, wanted: frame['price'] <= wanted),
    )
    for key, keep_row in predicates:
        if key in criteria:
            matches = matches[keep_row(matches, criteria[key])]

    return matches.head(5)

In [23]:
search_properties("Show me 3 BHK in Whitefield under 100")

Unnamed: 0,area_type,availability,location,size,bath,balcony,price,bhk,total_sqft,price_per_sqft
10,Super built-up Area,18-Feb,Whitefield,3 BHK,2.0,2.0,70.0,3,1800.0,3888.888889
27,Built-up Area,20-Dec,Whitefield,3 BHK,3.0,2.0,81.0,3,1610.0,5031.055901
52,Built-up Area,Ready To Move,Whitefield,3 BHK,3.0,2.0,91.0,3,2010.0,4527.363184
544,Plot Area,18-Mar,Whitefield,3 Bedroom,3.0,1.0,61.95,3,1500.0,4130.0
619,Super built-up Area,Ready To Move,Whitefield,3 BHK,3.0,2.0,100.0,3,1790.0,5586.592179


In [24]:
# Persist the cleaned frame; a later cell reloads it from this file.
df.to_csv("clean_bangalore_real_estate.csv", index=False)

# 3. LLM integration

In [25]:
!pip install -U transformers accelerate bitsandbytes pandas

Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pandas
  Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch

# Hugging Face Hub id of the instruction-tuned checkpoint to load.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantization settings passed to from_pretrained below: 4-bit weights using
# the "nf4" quant type with double quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the quantized model; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Text-generation pipeline with default generation settings; the
# generate_response functions later in the notebook pass their own
# max_new_tokens / temperature / do_sample per call.
# NOTE(review): do_sample is not set here, so `temperature` may have no effect
# for calls that do not pass do_sample=True themselves — confirm.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=250,
    temperature=0.3
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]



special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Passing `generation_config` together with generation-related arguments=({'max_new_tokens', 'temperature'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


In [2]:
import pandas as pd

# Reload the cleaned dataset written earlier in the notebook by df.to_csv(...).
df = pd.read_csv("clean_bangalore_real_estate.csv")
df.shape

(13241, 10)

In [3]:
# Smoke test: run the pipeline once with a plain prompt and show the generated text.
pipe("Hello, introduce yourself as a real estate agent.")[0]["generated_text"]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=250) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


"Hello, introduce yourself as a real estate agent.\n\nHi there, I'm a real estate agent. I specialize in helping people find their dream homes and guiding them through the buying and selling process. How can I assist you today?"

In [4]:
# Filters remembered between queries; extract_filters below copies this dict,
# updates the copy from the new query and stores it back here.
conversation_memory = {}

In [5]:
import re

def extract_filters(query, locations=None):
    """Parse ``query`` into search filters, carrying over earlier ones.

    Starts from a copy of the module-level ``conversation_memory`` so a
    follow-up such as "Under 90 lakhs" keeps the location/BHK of the previous
    query, overrides whatever the new query mentions, and stores the merged
    result back into ``conversation_memory``.

    Args:
        query: The user's message.
        locations: Optional iterable of known location names to match against.
            Defaults to the distinct values of ``df['location']``.

    Returns:
        The merged filter dict with any of ``'bhk'``, ``'max_price'``,
        ``'location'``.
    """
    global conversation_memory

    text = query.lower()
    filters = conversation_memory.copy()

    bhk_match = re.search(r'(\d+)\s*bhk', text)
    if bhk_match:
        filters['bhk'] = int(bhk_match.group(1))

    price_match = re.search(r'under\s*(\d+)', text)
    if price_match:
        filters['max_price'] = int(price_match.group(1))

    if locations is None:
        locations = df['location'].unique()

    # "Other" is the synthetic bucket for rare localities; skipping it stops
    # queries that merely contain the word "other" from being filtered to it.
    mentioned = [
        loc for loc in locations
        if loc != "Other" and loc.lower() in text
    ]
    if mentioned:
        # The longest matching name wins so a specific locality is preferred
        # over a shorter name contained in it.
        filters['location'] = max(mentioned, key=len)

    conversation_memory = filters
    return filters

def apply_sorting_logic(results, query):
    """Apply the ranking or availability hint found in ``query`` to ``results``.

    Checked in order, first match wins:
      * "cheapest" / "lowest price" -> three lowest-priced rows;
      * "largest" / "biggest"       -> three rows with the largest total_sqft;
      * "ready"                     -> rows whose availability contains "Ready".
    With no hint the input frame is returned unchanged.
    """
    text = query.lower()

    if any(phrase in text for phrase in ("cheapest", "lowest price")):
        return results.sort_values("price").head(3)

    if any(phrase in text for phrase in ("largest", "biggest")):
        return results.sort_values("total_sqft", ascending=False).head(3)

    if "ready" not in text:
        return results

    is_ready = results["availability"].str.contains("Ready", case=False, na=False)
    return results[is_ready]


def search_properties(query):
    """Return up to five listings matching ``query``.

    Filters from ``extract_filters`` narrow ``df`` in turn (exact location,
    exact BHK, price at or below the cap); ``apply_sorting_logic`` then applies
    any ranking/availability hint before the result is capped at five rows.
    """
    criteria = extract_filters(query)
    matches = df.copy()

    # (filter key, row predicate) pairs, applied in this order.
    predicates = (
        ('location', lambda frame, wanted: frame['location'] == wanted),
        ('bhk', lambda frame, wanted: frame['bhk'] == wanted),
        ('max_price', lambda frame, wanted: frame['price'] <= wanted),
    )
    for key, keep_row in predicates:
        if key in criteria:
            matches = matches[keep_row(matches, criteria[key])]

    return apply_sorting_logic(matches, query).head(5)


In [9]:
def format_properties(properties_df):
    """Render each listing as one "- Location: ..., Price: ... Lakhs" bullet line.

    BHK, bath and balcony are shown as integers; area and price with two
    decimals. Lines are joined with newlines; an empty frame gives "".
    """
    bullets = [
        (
            f"- Location: {listing['location']}, BHK: {int(listing['bhk'])}, "
            f"Total Sqft: {listing['total_sqft']:.2f}, Bath: {int(listing['bath'])}, "
            f"Balcony: {int(listing['balcony'])}, Price: {listing['price']:.2f} Lakhs"
        )
        for listing in properties_df.to_dict("records")
    ]
    return "\n".join(bullets)

def generate_response(query):
    """Answer a property-search query with an LLM-formatted listing summary.

    Runs ``search_properties`` on ``query``; when nothing matches, returns a
    fixed apology string without calling the model. Otherwise the matching
    rows are formatted with ``format_properties``, embedded in an instruction
    prompt, and sent through ``pipe``.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    results = search_properties(query)

    if results.empty:
        return "Sorry, I couldn't find matching properties. Would you like to adjust your budget or location?"

    property_text = format_properties(results)

    instruction = f"""
You are a professional Bangalore real estate consultant.

User Request:
{query}

Here are the matching properties:
{property_text}

Strict Rules:
- Use ONLY the values exactly as written.
- Do NOT modify units.
- Balcony is a count, not an area.
- Area is in sqft.
- Price is in Lakhs.
- Present in bullet format.
- Do NOT add extra commentary.
- Do NOT add disclaimers.
- Keep response strictly factual and concise.

"""


    # Proper Mistral Instruct format
    # NOTE(review): the tokenizer may already prepend a BOS token, which would
    # make the literal "<s>" redundant — confirm against the tokenizer config.
    prompt = f"<s>[INST] {instruction.strip()} [/INST]"

    output = pipe(
        prompt,
        max_new_tokens=200,
        temperature=0.3,
        do_sample=True,
        return_full_text=False,   # prevents prompt echo
        # Set explicitly so generate() does not have to fall back to (and warn
        # about) using eos_token_id as the pad token on every call.
        pad_token_id=tokenizer.eos_token_id
    )[0]["generated_text"]

    return output.strip()

In [10]:
# Guard: derive `bhk` from `size` only if the loaded frame does not already have it.
if 'bhk' not in df.columns:
    df['bhk'] = df['size'].str.extract(r'(\d+)').astype(int)

generate_response("Show me 3 BHK in Whitefield under 100 lakhs")

Passing `generation_config` together with generation-related arguments=({'max_new_tokens', 'temperature', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'- Location: Whitefield, BHK: 3, Total Sqft: 1800.00, Bath: 2, Balcony: 2, Price: 70.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1610.00, Bath: 3, Balcony: 2, Price: 81.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 2010.00, Bath: 3, Balcony: 2, Price: 91.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1500.00, Bath: 3, Balcony: 1, Price: 61.95'

In [11]:
# Same guard as in the previous cell: adds `bhk` only when the column is missing.
if 'bhk' not in df.columns:
    df['bhk'] = df['size'].str.extract(r'(\d+)').astype(int)

In [12]:
# Two-turn example. The first call's return value is not displayed (only the
# last expression of a cell is rendered); it is run so that extract_filters
# stores the location and BHK in conversation_memory for the follow-up.
generate_response("Show me 3 BHK in Whitefield")
generate_response("Under 90 lakhs")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'Here are the matching properties under 90 lakhs in Bangalore:\n\n* Location: Whitefield, BHK: 3, Total Sqft: 1800.00, Bath: 2, Balcony: 2, Price: 70.00 Lakhs\n* Location: Whitefield, BHK: 3, Total Sqft: 1610.00, Bath: 3, Balcony: 2, Price: 81.00 Lakhs\n* Location: Whitefield, BHK: 3, Total Sqft: 1500.00, Bath: 3, Balcony: 1, Price: 61.95 Lakhs\n* Location: Whitefield, BHK: 3, Total Sqft: 1650.00, Bath'

In [13]:
generate_response("Show me 3 BHK in Whitefield")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'- Location: Whitefield, BHK: 3, Total Sqft: 1800.00, Bath: 2, Balcony: 2, Price: 70.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1610.00, Bath: 3, Balcony: 2, Price: 81.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1500.00, Bath: 3, Balcony: 1, Price: 61.95 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1650.00, Bath: 3, Balcony: 2, Price: 60.00'

In [14]:
# Follow-up without location or BHK: the filters come from conversation_memory,
# and "cheapest" triggers the price sort in apply_sorting_logic.
generate_response("Which one is the cheapest?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'- Location: Whitefield, BHK: 3, Total Sqft: 1150.00, Bath: 2, Balcony: 3, Price: 44.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1322.50, Bath: 3, Balcony: 0, Price: 40.98 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1410.00, Bath: 3, Balcony: 2, Price: 43.71 Lakhs'

# 4. Price Prediction

In [16]:
# Remove very low sqft per bhk: keep rows with more than 300 sqft per bedroom.
df = df[df['total_sqft']/df['bhk'] > 300]

# Remove extreme luxury price outliers: keep rows priced below the 99th
# percentile, computed on the frame left by the filter above.
df = df[df['price'] < df['price'].quantile(0.99)]

In [17]:
import numpy as np

# Model target: natural log of price; predictions are mapped back with np.exp later.
df['log_price'] = np.log(df['price'])

In [18]:
# One-hot encode `location` (numeric columns pass through unchanged).
# drop_first=True omits the dummy for the first location category, so that
# location is represented by all location_* columns being 0.
df_encoded = pd.get_dummies(df[['total_sqft','bath','balcony','bhk','location']], drop_first=True)

# Feature matrix and log-price target used for training below.
X = df_encoded
y = df['log_price']

In [19]:
# @title
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 150 trees fitted on the log-price target.
model_rf = RandomForestRegressor(n_estimators=150, random_state=42)
model_rf.fit(X_train, y_train)

# Test-set predictions, still on the log scale.
preds_log = model_rf.predict(X_test)

print("R2 Score:", r2_score(y_test, preds_log))
print("MAE (log scale):", mean_absolute_error(y_test, preds_log))

R2 Score: 0.7744842599126134
MAE (log scale): 0.21558088277962767


In [20]:
def predict_price(location, bhk, sqft, bath=2, balcony=1):
    """Predict a listing's price with ``model_rf`` and return it rounded to 2 decimals.

    Builds a single-row frame laid out like the training matrix ``X``: the
    four numeric features plus every ``location_*`` dummy, with the dummy
    matching ``location`` set to 1 and all others to 0. The model predicts
    log-price, which is exponentiated before rounding.
    """
    active_dummy = f"location_{location}"

    # Numeric features first, then a 0/1 flag for each location dummy in X.
    row = {'total_sqft': sqft, 'bath': bath, 'balcony': balcony, 'bhk': bhk}
    row.update(
        (name, int(name == active_dummy))
        for name in X.columns
        if name.startswith("location_")
    )

    # reindex orders the columns like X and zero-fills any column not set above.
    features = pd.DataFrame([row]).reindex(columns=X.columns, fill_value=0)

    estimate = np.exp(model_rf.predict(features)[0])
    return round(estimate, 2)

In [21]:
predict_price("Whitefield", 3, 1500)

np.float64(85.18)

In [31]:
# Undo the log transform on targets and predictions so the mean absolute
# error is reported in price units (Lakhs) rather than on the log scale.
actual_price = np.exp(y_test)
preds_price = np.exp(preds_log)
mae_lakhs = mean_absolute_error(actual_price, preds_price)
print("MAE (Lakhs):", mae_lakhs)

MAE (Lakhs): 23.87889666601722


In [37]:
# NOTE: in file order, generate_response at this point is still the search-only
# version defined earlier; the price-estimation branch is added by the
# redefinition further down, so the recorded output needs that later cell to
# have been run first.
generate_response("How much would a 3 BHK 1500 sqft in Whitefield cost?")

'Estimated price range for a 3 BHK property of 1500 sqft in Whitefield is approximately 73.24 – 97.12 Lakhs '

In [38]:
generate_response("Estimate price for 2 BHK in Whitefield")

'Estimated price range for a 2 BHK property of 1500 sqft in Whitefield is approximately 69.46 – 93.34 Lakhs '

In [39]:
generate_response("Show me 3 BHK in Whitefield under 100")

Both `max_new_tokens` (=200) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'- Location: Whitefield, BHK: 3, Total Sqft: 1800.00, Bath: 2, Balcony: 2, Price: 70.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1610.00, Bath: 3, Balcony: 2, Price: 81.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 2010.00, Bath: 3, Balcony: 2, Price: 91.00 Lakhs\n- Location: Whitefield, BHK: 3, Total Sqft: 1500.00, Bath: 3, Balcony: 1, Price: 61.95'

# 5. Explainable AI (XAI)

In [40]:
!pip install shap



In [48]:
import shap

# Build a SHAP TreeExplainer for the fitted random forest (model_rf).
# feature_perturbation='interventional' was added because an error message
# raised without it suggested this setting.
explainer = shap.TreeExplainer(model_rf, feature_perturbation='interventional')



In [54]:
def explain_prediction(location, bhk, sqft, bath=2, balcony=1):
    """Describe the three strongest SHAP drivers of a price prediction.

    Builds the same single-row input as the price model expects (columns of
    ``X``), asks ``explainer`` for SHAP values, keeps only the numeric
    features and the dummy of the requested location, and turns the three
    with the largest absolute contribution into short sentences.

    Returns:
        A list of sentences, one per selected feature, each stating whether
        the feature increased or slightly reduced the estimated price.
    """
    # Business-friendly wording for the numeric features.
    numeric_labels = {
        'total_sqft': "Larger built-up area",
        'bhk': "Number of bedrooms",
        'bath': "Number of bathrooms",
        'balcony': "Balcony count",
    }
    active_dummy = f"location_{location}"

    # Numeric features first, then a 0/1 flag for each location dummy in X.
    row = {'total_sqft': sqft, 'bath': bath, 'balcony': balcony, 'bhk': bhk}
    row.update(
        (name, int(name == active_dummy))
        for name in X.columns
        if name.startswith("location_")
    )
    # reindex orders the columns like X and zero-fills any column not set above.
    features = pd.DataFrame([row]).reindex(columns=X.columns, fill_value=0)

    raw_values = explainer.shap_values(features, check_additivity=False)
    contributions = pd.Series(raw_values[0], index=X.columns)

    # Keep only meaningful features: the numeric ones and the active location
    # dummy; the inactive location dummies are ignored.
    relevant = [
        name for name in contributions.index
        if (name.startswith("location_") and features[name].iloc[0] == 1)
        or name in numeric_labels
    ]
    contributions = contributions[relevant]

    strongest = contributions.abs().sort_values(ascending=False).head(3).index

    sentences = []
    for name in strongest:
        if name in numeric_labels:
            label = numeric_labels[name]
        else:
            place = name.replace("location_", "")
            label = f"{place} location"

        if contributions[name] > 0:
            sentences.append(f"{label} increased the estimated price")
        else:
            sentences.append(f"{label} slightly reduced the estimated price")

    return sentences


In [55]:
explain_prediction("Whitefield", 3, 1500)


['Number of bathrooms slightly reduced the estimated price',
 'Larger built-up area increased the estimated price',
 'Whitefield location increased the estimated price']

In [56]:
generate_response("How much would a 3 BHK 1500 sqft in Whitefield cost?")

'Estimated price range for a 3 BHK property of 1500 sqft in Whitefield is approximately 73.24 – 97.12 Lakhs '

In [57]:
def generate_response(query):
    """Answer a user query with either a price estimate or a property search.

    Price-estimation intent (query contains "estimate", "how much" or
    "price of"): filters are parsed with ``extract_filters``; when both a
    location and a BHK are known, the price is predicted with
    ``predict_price``, widened into a range of +/- half of ``mae_lakhs``, and
    returned together with the explanations from ``explain_prediction``.
    Without location and BHK a prompt asking for them is returned.

    Any other query is treated as a property search: matching rows are
    formatted, embedded in an instruction prompt and passed through ``pipe``.

    Returns:
        The response text.
    """
    lower_query = query.lower()

    # ---------- PRICE ESTIMATION INTENT ----------
    if "estimate" in lower_query or "how much" in lower_query or "price of" in lower_query:

        filters = extract_filters(query)

        # Extract sqft if mentioned
        sqft_match = re.search(r'(\d+)\s*sqft', lower_query)
        sqft = int(sqft_match.group(1)) if sqft_match else 1500  # default if not provided

        if 'location' not in filters or 'bhk' not in filters:
            return "Please specify location and BHK to estimate the price."

        predicted_price = predict_price(filters['location'], filters['bhk'], sqft)

        # Data-driven uncertainty using model MAE
        error_margin = mae_lakhs * 0.5

        # A price cannot be negative, so the lower bound is clamped at zero
        # for cheap predictions that are smaller than the margin.
        lower_bound = max(round(predicted_price - error_margin, 2), 0.0)
        upper_bound = round(predicted_price + error_margin, 2)

        # Get SHAP-based explanation; joined generically so the reply does not
        # fail with IndexError if fewer than three factors are returned.
        explanations = explain_prediction(filters['location'], filters['bhk'], sqft)
        factor_lines = "\n".join(f"- {reason}" for reason in explanations)

        return (
            f"Estimated price range for a {filters['bhk']} BHK property of "
            f"{sqft} sqft in {filters['location']} is approximately "
            f"{lower_bound} – {upper_bound} Lakhs.\n\n"
            f"Key factors influencing this estimate:\n"
            f"{factor_lines}"
        )

    # ---------- PROPERTY SEARCH ----------
    results = search_properties(query)

    if results.empty:
        return "Sorry, I couldn't find matching properties. Would you like to adjust your budget or location?"

    property_text = format_properties(results)

    instruction = f"""
You are a professional Bangalore real estate consultant.

User Request:
{query}

Here are the matching properties:
{property_text}

Strict Rules:
- Use ONLY the values exactly as written.
- Do NOT modify units.
- Balcony is a count.
- Area is in sqft.
- Price is in Lakhs.
- Present in bullet format.
- Do NOT add commentary.
- Keep response concise.
"""

    prompt = f"<s>[INST] {instruction.strip()} [/INST]"

    output = pipe(
        prompt,
        max_new_tokens=200,
        temperature=0.3,
        do_sample=True,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id
    )[0]["generated_text"]

    return output.strip()


In [58]:
generate_response("How much would a 3 BHK 1500 sqft in Whitefield cost?")

'Estimated price range for a 3 BHK property of 1500 sqft in Whitefield is approximately 73.24 – 97.12 Lakhs.\n\nKey factors influencing this estimate:\n- Number of bathrooms slightly reduced the estimated price\n- Larger built-up area increased the estimated price\n- Whitefield location increased the estimated price'

In [59]:
import joblib

# Persist the fitted model, the SHAP explainer built from it, and the training
# column order (needed to rebuild inputs in the same layout as X).
joblib.dump(model_rf, "price_model.pkl")
joblib.dump(explainer, "shap_explainer.pkl")
joblib.dump(X.columns, "feature_columns.pkl")

['feature_columns.pkl']

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Retrain a smaller forest (60 trees, depth capped at 15) on the same split;
# this rebinds `model_rf`, which predict_price reads.
model_rf = RandomForestRegressor(
    n_estimators=60,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

model_rf.fit(X_train, y_train)

# Rebuild the SHAP explainer for the new forest. Without this,
# explain_prediction would keep explaining the previous model while
# predict_price already uses this one.
explainer = shap.TreeExplainer(model_rf, feature_perturbation='interventional')

# Test-set predictions on the log scale.
preds_log = model_rf.predict(X_test)

print("R2 Score:", r2_score(y_test, preds_log))
print("MAE (log scale):", mean_absolute_error(y_test, preds_log))


R2 Score: 0.7657003210386494
MAE (log scale): 0.23489819477464174


In [61]:
# Recompute the test MAE in Lakhs for the retrained model. This rebinds
# `mae_lakhs`, which generate_response uses for the width of its price range.
preds_price = np.exp(preds_log)
actual_price = np.exp(y_test)

mae_lakhs = mean_absolute_error(actual_price, preds_price)

print("MAE in Lakhs:", mae_lakhs)

MAE in Lakhs: 25.5763765997413


In [62]:
import joblib

# Overwrite the saved model with the retrained forest, compressed (level 3),
# and re-save the feature column order.
# NOTE(review): shap_explainer.pkl is not re-saved here, so the file on disk
# was produced from the earlier model — confirm whether it is still used.
joblib.dump(model_rf, "price_model.pkl", compress=3)
joblib.dump(X.columns, "feature_columns.pkl")

['feature_columns.pkl']

In [63]:
from google.colab import files

# Download the saved model and feature-column files via the Colab helper.
files.download("price_model.pkl")
files.download("feature_columns.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [64]:
# Spread of the training-set errors in Lakhs (log predictions mapped back with
# exp). The training predictions get their own name so the test-set
# predictions held in `preds_log` are not overwritten; re-running the MAE cell
# afterwards would otherwise compare training predictions against y_test.
train_preds_log = model_rf.predict(X_train)
train_preds = np.exp(train_preds_log)
train_actual = np.exp(y_train)

residuals = train_actual - train_preds
residual_std = np.std(residuals)

print("Residual STD:", residual_std)

Residual STD: 38.60454993205138
