In [4]:
import pandas as pd
from dotenv import load_dotenv
import os
import sqlite3
import re

from langchain_community.document_loaders import CSVLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS



In [5]:
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

if not api_key:
    raise ValueError("‚ùå GOOGLE_API_KEY not found in .env file")
else:
    print("‚úÖ API key loaded successfully")


‚úÖ API key loaded successfully


In [3]:
csv_path = "data/10final_merged_realestate_data.csv"  
loader = CSVLoader(file_path=csv_path)
documents = loader.load()

print(f"‚úÖ Loaded {len(documents)} documents from CSV")


‚úÖ Loaded 83 documents from CSV


In [None]:
CSV_PATH = "data/10final_merged_realestate_data.csv"
DB_PATH = "properties_sql.db"

# Load CSV
df = pd.read_csv(CSV_PATH)

# Create SQLite DB
conn = sqlite3.connect(DB_PATH)
df.to_sql("properties", conn, if_exists="replace", index=False)

conn.close()
print("‚úÖ CSV loaded into SQLite")

‚úÖ CSV loaded into SQLite


In [None]:
CSV_PATH = "data/10final_merged_realestate_data.csv"
DB_PATH = "properties_sql.db"

# Load CSV
df = pd.read_csv(CSV_PATH)

In [7]:
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [5]:
print(documents[0])

page_content='unique_property_id: cmff8vfoq0013vxp7h3onmg46
id_x: cmf53kkzy000fvcu8tx8jwjmr
projectType: RESIDENTIAL
projectName: Ashwini
projectCategory: STANDALONE
slug: luxury-ashwini-ashoknagar-chembur-mumbai-675058
projectAge: 
projectSummary: _
possessionDate: 2025-09-28 00:00:00
id_y: cmf53kl01000nvcu8ibut7fka
landmark: Babys school
fullAddress: Mumbai chembur
pincode: 411017
propertyCategory: RESIDENTIAL
type: 1BHK
configurationId: cmf53kkzz000ivcu89r5399s4
bathrooms: 1
balcony: 1.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757584023815-67012c27580e3e23.jpg
carpetArea: 123.0
price: 11111111
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1756971672464-1e5179453b5df91d.jpg"]
lift: 0
ready_to_move: 0
context: Project Name: Ashwini and type :1BHK. Located at : Mumbai chembur , near luxury-ashwini-ashoknagar-chembur-mumbai-675058 , having landmark :Babys school Property Type: . Price: Rs1

In [8]:
import time
from tqdm import tqdm

# Process documents in smaller batches with rate limiting
batch_size = 10  # Adjust this based on your quota
delay_between_batches = 15  # seconds

print(f"Processing {len(documents)} documents in batches of {batch_size}...")

# Split documents into batches
all_embeddings = []
for i in tqdm(range(0, len(documents), batch_size)):
    batch = documents[i:i + batch_size]
    
    try:
        # Create FAISS index from batch
        if i == 0:
            # First batch - create new index
            vectordb = FAISS.from_documents(batch, embeddings)
        else:
            # Subsequent batches - merge with existing index
            batch_db = FAISS.from_documents(batch, embeddings)
            vectordb.merge_from(batch_db)
        
        print(f"‚úÖ Processed batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
        
        # Rate limiting - wait between batches
        if i + batch_size < len(documents):
            time.sleep(delay_between_batches)
            
    except Exception as e:
        print(f"Error processing batch at index {i}: {e}")
        print(f"Try reducing batch_size or increasing delay_between_batches")
        raise

# Save the vector store
vectordb.save_local("faiss_realestate_index")
print("‚úÖ FAISS vector store created and saved as 'faiss_realestate_index'")


Processing 83 documents in batches of 10...


  0%|          | 0/9 [00:00<?, ?it/s]

‚úÖ Processed batch 1/9


 11%|‚ñà         | 1/9 [00:16<02:12, 16.53s/it]

‚úÖ Processed batch 2/9


 22%|‚ñà‚ñà‚ñè       | 2/9 [00:33<01:56, 16.63s/it]

‚úÖ Processed batch 3/9


 33%|‚ñà‚ñà‚ñà‚ñé      | 3/9 [00:49<01:40, 16.67s/it]

‚úÖ Processed batch 4/9


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 4/9 [01:06<01:23, 16.61s/it]

‚úÖ Processed batch 5/9


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 5/9 [01:23<01:06, 16.63s/it]

‚úÖ Processed batch 6/9


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 6/9 [01:39<00:49, 16.65s/it]

‚úÖ Processed batch 7/9


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 7/9 [01:56<00:33, 16.65s/it]

‚úÖ Processed batch 8/9


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [02:29<00:00, 16.65s/it]

‚úÖ Processed batch 9/9
‚úÖ FAISS vector store created and saved as 'faiss_realestate_index'





In [8]:
# Load FAISS index
db = FAISS.load_local("faiss_realestate_index", embeddings, allow_dangerous_deserialization=True)

# Example query
query = "3BHK flats with lift in Yashvant Seth Jadhav Marg"
results = db.similarity_search(query, k=3)

for i, res in enumerate(results, 1):
    print(f"\nüîπ Result {i}:")
    print(res)



üîπ Result 1:
page_content='unique_property_id: cmfdmdqmt000hvc90cqvdwrq7
id_x: cmfdmdqmq0008vc90svu2sfto
projectType: RESIDENTIAL
projectName: Antriksh
projectCategory: STANDALONE
slug: luxury-antriksh-somwarpeth-camp-pune-997560
projectAge: 
projectSummary: _
possessionDate: 
id_y: cmfdmdqmu000jvc90b6auuc6b
landmark: Station Road Saraswat Colony
fullAddress: CTS NO 391, Station Rd, opp. Zilla Parishad, Mangalwar Peth, Somwar Peth, Pune, Maharashtra 411011
pincode: 411011
propertyCategory: RESIDENTIAL
type: 3BHK
configurationId: cmfdmdqmt000gvc90h6gcwpga
bathrooms: 3
balcony: 2.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757486991499-74fa17a4c97985e2.jpg
carpetArea: 1095.0
price: 22900000
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757486991505-2373b8e162de669f.jpg"]
lift: 0
ready_to_move: 0
context: Project Name: Antriksh and type :3BHK. Located at : CTS NO 391, Station Rd, opp. Zill

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI

query = "Which projects have  apartments near Babys school?"

results = db.similarity_search(query, k=3)

for i, res in enumerate(results, 1):
    print(f"\nüîπ Result {i}:")
    print(res)


prompt = f"""
Use the following property data to answer the user's query.
Return factual, concise answers.
and identify each property by its unique_property_ID i.e the first column in the database.
results:
{results}

Question:
{query}
and if no projects are found, respond with "No projects found."or can list only the relevant projects.
"""

model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",  
    google_api_key=api_key
)
response = model.invoke(prompt)
print(response)



üîπ Result 1:
page_content='unique_property_id: cmff8vfoq0013vxp7h3onmg46
id_x: cmf53kkzy000fvcu8tx8jwjmr
projectType: RESIDENTIAL
projectName: Ashwini
projectCategory: STANDALONE
slug: luxury-ashwini-ashoknagar-chembur-mumbai-675058
projectAge: 
projectSummary: _
possessionDate: 2025-09-28 00:00:00
id_y: cmf53kl01000nvcu8ibut7fka
landmark: Babys school
fullAddress: Mumbai chembur
pincode: 411017
propertyCategory: RESIDENTIAL
type: 1BHK
configurationId: cmf53kkzz000ivcu89r5399s4
bathrooms: 1
balcony: 1.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757584023815-67012c27580e3e23.jpg
carpetArea: 123.0
price: 11111111
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1756971672464-1e5179453b5df91d.jpg"]
lift: 0
ready_to_move: 0
context: Project Name: Ashwini and type :1BHK. Located at : Mumbai chembur , near luxury-ashwini-ashoknagar-chembur-mumbai-675058 , having landmark :Babys school Property Ty

In [7]:
db = FAISS.load_local("faiss_realestate_index", embeddings, allow_dangerous_deserialization=True)


In [9]:
from langchain_core.prompts import ChatPromptTemplate

query = "Which projects have  apartments near Yashvant Seth Jadhav Marg ?"
results = db.similarity_search(query, k=5)

print(f"üîç Retrieved {len(results)} most relevant property entries.\n")

for i, res in enumerate(results, 1):
    print(f"üîπ Result {i} Preview:")
    print(res.page_content[:], "...\n")


prompt_template = ChatPromptTemplate.from_template("""
You are a Real Estate Expert Assistant helping the user find matching properties.

You will be given:
1. Retrieved property data (from a structured CSV embedding)
2. A user query

Your task:
- Identify which properties match **all** conditions in the query.
- Mention their `unique_property_id` and key details (projectName, location, price,area,pincode,type amenities if available).
- If any part of the user query is not satisfied (e.g., "near Babys school" missing or unclear), explicitly say so under **Unmatched Points**.
- Never hallucinate or assume data not present in the retrieved content.
- If nothing matches, say **"No projects found."**

---
üßæ Retrieved Property Data:
{context}

üí¨ User Query:
{question}

Now provide a structured answer:
1. ‚úÖ Matching Projects (ID + key info)
2. ‚ö†Ô∏è Unmatched Points (if any)
3. üí° Explanation (why missing, limitations, etc.)
""")

context = "\n\n".join([doc.page_content for doc in results])

final_prompt = prompt_template.format(context=context, question=query)

model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",  # fast, cost-efficient
    google_api_key=api_key,
    temperature=0.2
)
response = model.invoke(final_prompt)


print(" FINAL ANSWER:")
print(response.content)


print("\n CONTEXT USED:")
for i, doc in enumerate(results, 1):
    print(f"\nDocument {i}:\n", doc.page_content[:])

üîç Retrieved 5 most relevant property entries.

üîπ Result 1 Preview:
unique_property_id: cmfaycwy9003bvc189xf97r7w
id_x: cmfaycwy70036vc18ppmb8mwh
projectType: RESIDENTIAL
projectName: Om makarand heights
projectCategory: STANDALONE
slug: om-makarand-heights-ashoknagar-chembur-mumbai-716337
projectAge: 
projectSummary: _
possessionDate: 2025-09-21 00:00:00
id_y: cmfaycwya003hvc18yc5gtwtc
landmark: Hind high school
fullAddress: 104, Yashvant Seth Jadhav Marg, Gauri Shankar Wadi No. 2, Savitribai Phule Nagar, Pant Nagar, Ghatkopar East, Mumbai, Maharashtra 400075
pincode: 400075
propertyCategory: RESIDENTIAL
type: 1BHK
configurationId: cmfaycwy80039vc18b1epdodx
bathrooms: 1
balcony: 1.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757325712593-25cd2d9a5765be22.jpg
carpetArea: 354.0
price: 790000
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757325712594-cd33c56e7db9cd37.jpg"]
lift: 0
ready_

In [9]:
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import Optional, List

class PropertyMatch(BaseModel):
    id: str = Field(..., description="Unique property ID")
    projectName: Optional[str] = Field(None, description="Name of the real estate project")
    location: Optional[str] = Field(None, description="Project location or address")
    price: Optional[str] = Field(None, description="Price in INR")
    area: Optional[str] = Field(None, description="Total or built-up area")
    pincode: Optional[str] = Field(None, description="Project pincode")
    type: Optional[str] = Field(None, description="Property type: apartment, villa, plot, etc.")
    landmark: Optional[str] = Field(None, description="Nearby landmark")
    amenities: Optional[str] = Field(None, description="Mentioned amenities")

class RAGAnswer(BaseModel):
    matching_projects: List[PropertyMatch] = Field(default_factory=list)
    unmatched_points: List[str] = Field(default_factory=list)
    explanation: str = Field(..., description="Reasoning")
    min_price: Optional[int] = Field(None, description="Minimum price constraint from query in INR")
    max_price: Optional[int] = Field(None, description="Maximum price constraint from query in INR")
    sort_by: Optional[str] = Field(None, description="Sorting preference: 'price_asc', 'price_desc', or None")


In [10]:
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

parser = PydanticOutputParser(pydantic_object=RAGAnswer)

prompt = ChatPromptTemplate.from_template("""
You are a Real Estate Expert Assistant helping a user find matching properties.

You will be given retrieved property data (from embeddings) and a user query.

Your job:
1. Identify properties that match **all** conditions in the query.
2. Return your answer strictly as JSON according to the provided format instructions.
3. If some query conditions are not met, list them under `unmatched_points`.
4. Never assume data not present in the retrieved context.
5. If nothing matches, leave `matching_projects` empty and explain why.
6. If there is a limit for price range, extract it and convert to INR:
   - "under 50 lakh" ‚Üí max_price: 5000000
   - "30-50 crore" ‚Üí min_price: 300000000, max_price: 500000000
7. Extract sort preference:
   - Look for "cheapest", "affordable", "budget", "lowest" ‚Üí sort_by: "price_asc"
   - Look for "premium", "luxury", "expensive", "highest" ‚Üí sort_by: "price_desc"
   - Otherwise leave sort_by as null

---
Retrieved Property Data:
{context}

User Query:
{question}

{format_instructions}
""")


In [11]:
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.2
)

# Build the LangChain pipeline with the UPDATED RAGAnswer class
rag_chain = (
    prompt
    | model
    | parser
)

# SQL Filtering Function
def sql_filter_with_ids(
    property_ids: list[str],
    min_price: int | None = None,
    max_price: int | None = None,
    sort_by: str | None = None
):
    """Filter properties by ID list and price constraints, then sort."""
    if not property_ids:
        return []

    placeholders = ",".join(["?"] * len(property_ids))
    query = f"""
        SELECT *
        FROM properties
        WHERE unique_property_id IN ({placeholders})
    """

    params = list(property_ids)
    conditions = []

    if min_price is not None:
        conditions.append("price >= ?")
        params.append(min_price)

    if max_price is not None:
        conditions.append("price <= ?")
        params.append(max_price)

    if conditions:
        query += " AND " + " AND ".join(conditions)

    if sort_by == "price_asc":
        query += " ORDER BY price ASC"
    elif sort_by == "price_desc":
        query += " ORDER BY price DESC"

    conn = sqlite3.connect("properties_sql.db")
    rows = conn.execute(query, params).fetchall()
    conn.close()

    return rows


In [22]:
query = "List projects with apartments near Sindhi Society under 3 crore in a low to high manner"
results = db.similarity_search(query, k=10)

print(f"üîç Retrieved {len(results)} most relevant property entries.\n")

context = "\n\n---\n\n".join([
    f"Property ID: {doc.metadata.get('unique_property_ID', 'Unknown')}\n{doc.page_content}"
    for doc in results
])

input_data = {
    "context": context,
    "question": query,
    "format_instructions": parser.get_format_instructions()
}


üîç Retrieved 10 most relevant property entries.



In [23]:
response = rag_chain.invoke(input_data)


In [24]:
print(response)

matching_projects=[PropertyMatch(id='cmfaxq2or0024vc18pjebowp8', projectName='Hari om', location='Cts 300, plot no 37, opposite midtown 71, Sindhi Society, Chembur, Mumbai, Maharashtra 400071', price='13000000', area='443.37', pincode='400069', type='1BHK', landmark='Sindhi Society', amenities=None), PropertyMatch(id='cmfawdrnq000bvc188680qjyx', projectName='Gurukripa', location='Sindhi Society, Near Swami Vivekanand Jr College, Chembur, Mumbai Harbour, Mumbai', price='13000000', area='426.57', pincode='400071', type='1BHK', landmark='Sindhi Society', amenities=None), PropertyMatch(id='cmfawdrnr000cvc1897rpsu1b', projectName='Gurukripa', location='Sindhi Society, Near Swami Vivekanand Jr College, Chembur, Mumbai Harbour, Mumbai', price='15000000', area='460.8', pincode='400071', type='1BHK', landmark='Sindhi Society', amenities=None), PropertyMatch(id='cmfaxq2os0026vc18v92cqw3r', projectName='Hari om', location='Cts 300, plot no 37, opposite midtown 71, Sindhi Society, Chembur, Mumbai,

matching_projects=[PropertyMatch(id='cmff8vfoq0013vxp7h3onmg46', projectName='Ashwini', location='Mumbai chembur', price='Rs11111111', area='123.0 sq meter', pincode='411017', type='1BHK', landmark='Babys school', amenities=None)] unmatched_points=[] explanation='One property matched all criteria: located near Babys school and priced under 2 crore. Other properties were excluded due to not meeting the landmark or price conditions.' min_price=None max_price=20000000 sort_by=None


In [30]:
import rich 
from rich.panel import Panel
from rich.table import Table
rich.print(response)
print("\n‚úÖ Matching Projects")
if response.matching_projects:
    table = Table("show_header=True")
    table.add_column("ID")
    table.add_column("Project Name")
    table.add_column("Location")
    table.add_column("Price")
    table.add_column("Area")
    table.add_column("Pincode")
    table.add_column("Type")
    table.add_column("Landmark")
    table.add_column("Amenities")

    for prop in response.matching_projects:
        table.add_row(
            prop.id or "-",
            prop.projectName or "-",
            prop.location or "-",
            prop.price or "-",
            prop.area or "-",
            prop.pincode or "-",
            prop.type or "-",
            prop.landmark or "-",
            prop.amenities or "-"
        )
    print(table)
else:
    print("No matching projects found.")

print("\n‚ö†Ô∏è Unmatched Points:", response.unmatched_points)
print("\nüí° Explanation:", response.explanation)

# Reference cards (all docs used)
print("\nüìö Referenced Property Data Cards:")
for i, doc in enumerate(results, 1):
    print(Panel(f"{doc.page_content}", title=f"Property {i}", expand=False))



‚úÖ Matching Projects
<rich.table.Table object at 0x000001A5FF42F230>

‚ö†Ô∏è Unmatched Points: []

üí° Explanation: Successfully identified 8 properties near Sindhi Society with prices under 3 crore (30,000,000 INR). The results are sorted by price in ascending order as requested.

üìö Referenced Property Data Cards:
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>
<rich.panel.Panel object at 0x000001A5FF42EE40>


In [32]:
from builtins import print
print(response)

matching_projects=[PropertyMatch(id='cmfaxq2or0024vc18pjebowp8', projectName='Hari om', location='Cts 300, plot no 37, opposite midtown 71, Sindhi Society, Chembur, Mumbai, Maharashtra 400071', price='13000000', area='443.37', pincode='400069', type='1BHK', landmark='Sindhi Society', amenities=None), PropertyMatch(id='cmfawdrnq000bvc188680qjyx', projectName='Gurukripa', location='Sindhi Society, Near Swami Vivekanand Jr College, Chembur, Mumbai Harbour, Mumbai', price='13000000', area='426.57', pincode='400071', type='1BHK', landmark='Sindhi Society', amenities=None), PropertyMatch(id='cmfawdrnr000cvc1897rpsu1b', projectName='Gurukripa', location='Sindhi Society, Near Swami Vivekanand Jr College, Chembur, Mumbai Harbour, Mumbai', price='15000000', area='460.8', pincode='400071', type='1BHK', landmark='Sindhi Society', amenities=None), PropertyMatch(id='cmfaxq2os0026vc18v92cqw3r', projectName='Hari om', location='Cts 300, plot no 37, opposite midtown 71, Sindhi Society, Chembur, Mumbai,

In [31]:
# Stage 1: RAG Filtering - Extract matched IDs
matched_ids = [prop.id for prop in response.matching_projects if getattr(prop, "id", None)]
print("‚úÖ Stage 1 - RAG Filtered IDs:", matched_ids)
print(f"   Found {len(matched_ids)} matching properties from RAG\n")


‚úÖ Stage 1 - RAG Filtered IDs: ['cmfaxq2or0024vc18pjebowp8', 'cmfawdrnq000bvc188680qjyx', 'cmfawdrnr000cvc1897rpsu1b', 'cmfaxq2os0026vc18v92cqw3r', 'cmfaxq2os0028vc18z4otsg17', 'cmfawdrns000hvc18t3u6jukg', 'cmfawdrnr000evc18jwvlery4', 'cmfawdrnr000fvc18af74elnu']
   Found 8 matching properties from RAG



In [26]:
# Stage 2: SQL Filtering - Apply price and sorting constraints
if not matched_ids:
    print("‚ùå No properties matched the RAG filter.")
    print("\nüìã Explanation:", response.explanation)
    final_results = []
    final_matching_ids = []
else:
    # Extract price and sorting preferences from the RAG response
    print(f"üîç Stage 2 - SQL Filtering with:")
    print(f"   Min Price: {response.min_price}")
    print(f"   Max Price: {response.max_price}")
    print(f"   Sort By: {response.sort_by}\n")
    
    # Apply SQL filtering using values extracted by LLM
    final_results = sql_filter_with_ids(
        property_ids=matched_ids,
        min_price=response.min_price,
        max_price=response.max_price,
        sort_by=response.sort_by
    )
    
    final_matching_ids = [row[0] for row in final_results]  # Assuming first column is unique_property_id
    
    print(f"‚úÖ Stage 2 Complete - SQL Filtered Results:")
    print(f"   Remaining properties after price filtering: {len(final_matching_ids)}")
    print(f"   Final Matching IDs: {final_matching_ids}\n")


üîç Stage 2 - SQL Filtering with:
   Min Price: None
   Max Price: 30000000
   Sort By: price_asc

‚úÖ Stage 2 Complete - SQL Filtered Results:
   Remaining properties after price filtering: 8
   Final Matching IDs: ['cmfawdrnq000bvc188680qjyx', 'cmfaxq2or0024vc18pjebowp8', 'cmfawdrnr000cvc1897rpsu1b', 'cmfaxq2os0026vc18v92cqw3r', 'cmfaxq2os0028vc18z4otsg17', 'cmfawdrns000hvc18t3u6jukg', 'cmfawdrnr000evc18jwvlery4', 'cmfawdrnr000fvc18af74elnu']

