In [1]:
import pandas as pd
from dotenv import load_dotenv
import os

from langchain_community.document_loaders import CSVLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS


In [2]:
# Load variables from a local .env file (if present) and read the Google API
# key from the process environment.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

# Stop immediately when the key is missing or empty: every later cell that
# talks to the Gemini API depends on it.
if api_key:
    print("✅ API key loaded successfully")
else:
    raise ValueError("❌ GOOGLE_API_KEY not found in .env file")


✅ API key loaded successfully


In [3]:
# Path of the merged real-estate CSV, relative to the notebook's working directory.
csv_path = "data/10final_merged_realestate_data.csv"

# Load the CSV into LangChain documents; `documents` is what every indexing
# cell below embeds.
loader = CSVLoader(file_path=csv_path)
documents = loader.load()

doc_count = len(documents)
print(f"✅ Loaded {doc_count} documents from CSV")


✅ Loaded 83 documents from CSV


In [4]:
# Embedding client shared by the index-building and index-loading cells.
# NOTE(review): no API key is passed here, unlike the chat-model cells that pass
# google_api_key=api_key; presumably the client picks up GOOGLE_API_KEY from the
# environment populated by load_dotenv() — confirm.
embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [5]:
# Sanity check: show the first loaded document to see how a CSV row is rendered
# (the recorded output lists the row as `column: value` lines in page_content).
print(documents[0])

page_content='unique_property_id: cmff8vfoq0013vxp7h3onmg46
id_x: cmf53kkzy000fvcu8tx8jwjmr
projectType: RESIDENTIAL
projectName: Ashwini
projectCategory: STANDALONE
slug: luxury-ashwini-ashoknagar-chembur-mumbai-675058
projectAge: 
projectSummary: _
possessionDate: 2025-09-28 00:00:00
id_y: cmf53kl01000nvcu8ibut7fka
landmark: Babys school
fullAddress: Mumbai chembur
pincode: 411017
propertyCategory: RESIDENTIAL
type: 1BHK
configurationId: cmf53kkzz000ivcu89r5399s4
bathrooms: 1
balcony: 1.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757584023815-67012c27580e3e23.jpg
carpetArea: 123.0
price: 11111111
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1756971672464-1e5179453b5df91d.jpg"]
lift: 0
ready_to_move: 0
context: Project Name: Ashwini and type :1BHK. Located at : Mumbai chembur , near luxury-ashwini-ashoknagar-chembur-mumbai-675058 , having landmark :Babys school Property Type: . Price: Rs1

In [8]:
import time
from tqdm import tqdm

# Build the FAISS index in small batches, sleeping between batches so the
# embedding API's rate limit is not exceeded.
batch_size = 10  # documents embedded per request; adjust this based on your quota
delay_between_batches = 15  # seconds to wait between consecutive batches

# Without this guard an empty `documents` list would skip the loop entirely and
# fail later with an unhelpful NameError/AttributeError at save time.
if not documents:
    raise ValueError("No documents to index; load the CSV before building the FAISS index")

total_batches = (len(documents) - 1) // batch_size + 1
print(f"Processing {len(documents)} documents in batches of {batch_size}...")

vectordb = None
for i in tqdm(range(0, len(documents), batch_size)):
    batch = documents[i:i + batch_size]

    try:
        if vectordb is None:
            # First batch - create the index
            vectordb = FAISS.from_documents(batch, embeddings)
        else:
            # Subsequent batches - embed and append to the existing index
            # (no throwaway per-batch index to build and merge).
            vectordb.add_documents(batch)

        print(f"✅ Processed batch {i//batch_size + 1}/{total_batches}")

        # Rate limiting - wait between batches, but not after the last one
        if i + batch_size < len(documents):
            time.sleep(delay_between_batches)

    except Exception as e:
        # Report which batch failed, then re-raise so the cell stops instead of
        # saving a partial index.
        print(f"Error processing batch at index {i}: {e}")
        print("Try reducing batch_size or increasing delay_between_batches")
        raise

# Save the vector store
vectordb.save_local("faiss_realestate_index")
print("✅ FAISS vector store created and saved as 'faiss_realestate_index'")


Processing 83 documents in batches of 10...


  0%|          | 0/9 [00:00<?, ?it/s]

✅ Processed batch 1/9


 11%|█         | 1/9 [00:16<02:12, 16.53s/it]

✅ Processed batch 2/9


 22%|██▏       | 2/9 [00:33<01:56, 16.63s/it]

✅ Processed batch 3/9


 33%|███▎      | 3/9 [00:49<01:40, 16.67s/it]

✅ Processed batch 4/9


 44%|████▍     | 4/9 [01:06<01:23, 16.61s/it]

✅ Processed batch 5/9


 56%|█████▌    | 5/9 [01:23<01:06, 16.63s/it]

✅ Processed batch 6/9


 67%|██████▋   | 6/9 [01:39<00:49, 16.65s/it]

✅ Processed batch 7/9


 78%|███████▊  | 7/9 [01:56<00:33, 16.65s/it]

✅ Processed batch 8/9


100%|██████████| 9/9 [02:29<00:00, 16.65s/it]

✅ Processed batch 9/9
✅ FAISS vector store created and saved as 'faiss_realestate_index'





In [None]:
# One-shot alternative to the batched build: embed every document in a single
# call and persist the result. It writes to the same directory as the batched
# cell, so running it replaces that index.
index_dir = "faiss_realestate_index"

vectordb = FAISS.from_documents(documents, embeddings)
vectordb.save_local(index_dir)

print(f"✅ FAISS vector store created and saved as '{index_dir}'")

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

# Split each document into overlapping chunks of at most 800 characters
# (100-character overlap) before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    length_function=len
)

chunks = text_splitter.split_documents(documents)
print(f"Split the documents into {len(chunks)} chunks.")

SOURCE_DIR   = Path("docs")             # put your files here (not read by this cell)
INDEX_DIR    = Path("chroma_db_1")        # directory the chunk-level index is saved to
COLLECTION   = "kb_collection"          # kept for reference; FAISS has no collection concept
EMBED_MODEL  = "gemini-embedding-001"

embeddings = GoogleGenerativeAIEmbeddings(model=EMBED_MODEL)

# `persist_directory` / `collection_name` are Chroma arguments. FAISS.from_documents
# forwards unknown keyword arguments to the FAISS constructor, which rejects them
# with a TypeError *after* all chunks have been embedded. Build the index with the
# supported arguments only and persist it explicitly.
# NOTE(review): the recorded run of this cell hit a 429 quota error while embedding
# all chunks in one call; a batched build with pauses may be needed on the free tier.
vectordb = FAISS.from_documents(
    documents = chunks,
    embedding = embeddings,
)
vectordb.save_local(str(INDEX_DIR))

Split the documents into 175 chunks.


GoogleGenerativeAIError: Error embedding content: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0 [violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerUserPerProjectPerModel-FreeTier"
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
]

In [14]:
# Load FAISS index
db = FAISS.load_local("faiss_realestate_index", embeddings, allow_dangerous_deserialization=True)

# Example query
query = "3BHK flats with lift in Yashvant Seth Jadhav Marg"
results = db.similarity_search(query, k=3)

for i, res in enumerate(results, 1):
    print(f"\n🔹 Result {i}:")
    print(res)



🔹 Result 1:
page_content='unique_property_id: cmfdmdqmt000hvc90cqvdwrq7
id_x: cmfdmdqmq0008vc90svu2sfto
projectType: RESIDENTIAL
projectName: Antriksh
projectCategory: STANDALONE
slug: luxury-antriksh-somwarpeth-camp-pune-997560
projectAge: 
projectSummary: _
possessionDate: 
id_y: cmfdmdqmu000jvc90b6auuc6b
landmark: Station Road Saraswat Colony
fullAddress: CTS NO 391, Station Rd, opp. Zilla Parishad, Mangalwar Peth, Somwar Peth, Pune, Maharashtra 411011
pincode: 411011
propertyCategory: RESIDENTIAL
type: 3BHK
configurationId: cmfdmdqmt000gvc90h6gcwpga
bathrooms: 3
balcony: 2.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757486991499-74fa17a4c97985e2.jpg
carpetArea: 1095.0
price: 22900000
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757486991505-2373b8e162de669f.jpg"]
lift: 0
ready_to_move: 0
context: Project Name: Antriksh and type :3BHK. Located at : CTS NO 391, Station Rd, opp. Zilla P

In [28]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Minimal retrieve-then-ask flow: fetch the three nearest property documents and
# hand their text to the chat model together with the question.
query = "Which projects have  apartments near Babys school?"

results = db.similarity_search(query, k=3)

for i, res in enumerate(results, 1):
    print(f"\n🔹 Result {i}:")
    print(res)

# Give the model the documents' text, not the Python repr of the Document list
# (which escapes newlines and mixes loader metadata into the prompt).
retrieved_text = "\n\n".join(doc.page_content for doc in results)

# The ID field is spelled `unique_property_id` in the data, so the prompt uses
# that exact spelling.
prompt = f"""
Use the following property data to answer the user's query.
Return factual, concise answers.
and identify each property by its unique_property_id i.e the first column in the database.
results:
{retrieved_text}

Question:
{query}
and if no projects are found, respond with "No projects found." or can list only the relevant projects.
"""

model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    google_api_key=api_key
)
response = model.invoke(prompt)
print(response)



🔹 Result 1:
page_content='unique_property_id: cmff8vfoq0013vxp7h3onmg46
id_x: cmf53kkzy000fvcu8tx8jwjmr
projectType: RESIDENTIAL
projectName: Ashwini
projectCategory: STANDALONE
slug: luxury-ashwini-ashoknagar-chembur-mumbai-675058
projectAge: 
projectSummary: _
possessionDate: 2025-09-28 00:00:00
id_y: cmf53kl01000nvcu8ibut7fka
landmark: Babys school
fullAddress: Mumbai chembur
pincode: 411017
propertyCategory: RESIDENTIAL
type: 1BHK
configurationId: cmf53kkzz000ivcu89r5399s4
bathrooms: 1
balcony: 1.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757584023815-67012c27580e3e23.jpg
carpetArea: 123.0
price: 11111111
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1756971672464-1e5179453b5df91d.jpg"]
lift: 0
ready_to_move: 0
context: Project Name: Ashwini and type :1BHK. Located at : Mumbai chembur , near luxury-ashwini-ashoknagar-chembur-mumbai-675058 , having landmark :Babys school Property Type:

In [None]:
# Reload the FAISS index saved under "faiss_realestate_index"; `embeddings` is
# passed so queries can be embedded.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored index data — only load index directories you created yourself.
db = FAISS.load_local("faiss_realestate_index", embeddings, allow_dangerous_deserialization=True)


In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Retrieve the five nearest property documents for the question.
query = "Which projects have  apartments near Yashvant Seth Jadhav Marg ?"
results = db.similarity_search(query, k=5)

print(f"🔍 Retrieved {len(results)} most relevant property entries.\n")

# Show the full text of every hit (no truncation is applied).
for rank, hit in enumerate(results, start=1):
    print(f"🔹 Result {rank} Preview:")
    print(hit.page_content, "...\n")


# Prompt asking for a three-part answer: matches, unmatched conditions, explanation.
prompt_template = ChatPromptTemplate.from_template("""
You are a Real Estate Expert Assistant helping the user find matching properties.

You will be given:
1. Retrieved property data (from a structured CSV embedding)
2. A user query

Your task:
- Identify which properties match **all** conditions in the query.
- Mention their `unique_property_id` and key details (projectName, location, price,area,pincode,type amenities if available).
- If any part of the user query is not satisfied (e.g., "near Babys school" missing or unclear), explicitly say so under **Unmatched Points**.
- Never hallucinate or assume data not present in the retrieved content.
- If nothing matches, say **"No projects found."**

---
🧾 Retrieved Property Data:
{context}

💬 User Query:
{question}

Now provide a structured answer:
1. ✅ Matching Projects (ID + key info)
2. ⚠️ Unmatched Points (if any)
3. 💡 Explanation (why missing, limitations, etc.)
""")

# Concatenate the retrieved documents' text, separated by blank lines, and fill
# the template.
context = "\n\n".join(hit.page_content for hit in results)
final_prompt = prompt_template.format(context=context, question=query)

model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",  # fast, cost-efficient
    google_api_key=api_key,
    temperature=0.2,
)
response = model.invoke(final_prompt)


print(" FINAL ANSWER:")
print(response.content)


# Echo the documents the answer was based on, so it can be checked by hand.
print("\n CONTEXT USED:")
for rank, doc in enumerate(results, start=1):
    print(f"\nDocument {rank}:\n", doc.page_content)

🔍 Retrieved 5 most relevant property entries.

🔹 Result 1 Preview:
unique_property_id: cmfaycwy9003bvc189xf97r7w
id_x: cmfaycwy70036vc18ppmb8mwh
projectType: RESIDENTIAL
projectName: Om makarand heights
projectCategory: STANDALONE
slug: om-makarand-heights-ashoknagar-chembur-mumbai-716337
projectAge: 
projectSummary: _
possessionDate: 2025-09-21 00:00:00
id_y: cmfaycwya003hvc18yc5gtwtc
landmark: Hind high school
fullAddress: 104, Yashvant Seth Jadhav Marg, Gauri Shankar Wadi No. 2, Savitribai Phule Nagar, Pant Nagar, Ghatkopar East, Mumbai, Maharashtra 400075
pincode: 400075
propertyCategory: RESIDENTIAL
type: 1BHK
configurationId: cmfaycwy80039vc18b1epdodx
bathrooms: 1
balcony: 1.0
furnishedType: UNFURNISHED
furnishingType: []
floorPlanImage: https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757325712593-25cd2d9a5765be22.jpg
carpetArea: 354.0
price: 790000
propertyImages: ["https://pub-d28896f69c604ec5aa743cb0397740d9.r2.dev/1757325712594-cd33c56e7db9cd37.jpg"]
lift: 0
ready_to_mov

In [80]:
from pydantic import BaseModel, Field, field_validator
from typing import List, Optional

class PropertyMatch(BaseModel):
    # One property returned by the LLM as a match for the user's query.
    # Documented with `#` comments rather than a class docstring on purpose:
    # pydantic copies a class docstring into the JSON schema, and that schema is
    # sent to the model as part of the format instructions.
    id: str = Field(..., description="Unique property ID , with column name unique_property_id")
    projectName: Optional[str] = Field(None, description="Name of the real estate project")
    location: Optional[str] = Field(None, description="Project location or address")
    price: Optional[str] = Field(None, description="Price or price range, where 1 crore or 1cr = 10000000")
    area: Optional[str] = Field(None, description="Total or built-up area details")
    pincode: Optional[str] = Field(None, description="Project pincode")
    type: Optional[str] = Field(None, description="Property type, e.g. apartment, villa, plot, etc.")
    landmark: Optional[str] = Field(None, description="Nearby landmark if available")
    amenities: Optional[str] = Field(None, description="Mentioned amenities if available")

    @field_validator("price", "area", "pincode", mode="before")
    @classmethod
    def coerce_numeric_to_str(cls, value):
        # The source data stores price, carpet area and pincode as numbers, so the
        # model may answer with JSON numbers. Pydantic v2 does not coerce numbers
        # to `str`, which would make the whole parse fail; convert them up front.
        # bool is excluded because it is a subclass of int.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return str(value)
        return value

class RAGAnswer(BaseModel):
    # Top-level structure the LLM answer is parsed into (used as the
    # pydantic_object of the PydanticOutputParser).
    # matching_projects: properties that satisfy the query; empty by default.
    matching_projects: List[PropertyMatch] = Field(default_factory=list)
    # unmatched_points: query conditions that could not be satisfied; empty by default.
    unmatched_points: List[str] = Field(default_factory=list)
    # explanation: required free-text reasoning.
    explanation: str = Field(..., description="Reasoning or context explanation")


In [86]:
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Parser that validates the model's JSON reply against RAGAnswer and also
# supplies the format instructions injected into the prompt.
parser = PydanticOutputParser(pydantic_object=RAGAnswer)

# Template text kept in its own variable so it can be inspected without
# unpacking the prompt object. Placeholders: context, question, format_instructions.
rag_template_text = """
You are a Real Estate Expert Assistant helping a user find matching properties.

You will be given retrieved property data (from embeddings) and a user query.

Your job:
1. Identify properties that match **all** conditions in the query.
2. Return your answer strictly as JSON according to the provided format instructions.
3. If some query conditions are not met, list them under `unmatched_points`.
4. Never assume data not present in the retrieved context.
5. If nothing matches, leave `matching_projects` empty and explain why.
6. if there is a limit for price range, make sure to follow it.

---
Retrieved Property Data:
{context}

User Query:
{question}

{format_instructions}
"""

prompt = ChatPromptTemplate.from_template(rag_template_text)


In [87]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used for structured answers; a low temperature keeps output stable.
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite", google_api_key=api_key, temperature=0.2
)

# Build the LangChain pipeline: fill the prompt, call the model, then parse the
# reply into a RAGAnswer.
rag_chain = prompt | model | parser


In [None]:
def _property_id(doc, default="Unknown"):
    """Return the property's unique ID for a retrieved document.

    The loader in this notebook is created without ``metadata_columns``, so the
    ID is not in ``doc.metadata``; it appears in ``page_content`` as a
    ``unique_property_id: <value>`` line. Metadata is still checked first, under
    either spelling, so a loader configured with metadata columns keeps working.
    Returns ``default`` when no ID can be found.
    """
    for key in ("unique_property_id", "unique_property_ID"):
        value = doc.metadata.get(key)
        if value:
            return str(value)
    for line in doc.page_content.splitlines():
        field, sep, value = line.partition(":")
        if sep and field.strip() == "unique_property_id":
            return value.strip() or default
    return default


query = "List projects with apartments near subhash nagar with lift"
results = db.similarity_search(query, k=5)

print(f"🔍 Retrieved {len(results)} most relevant property entries.\n")

# Label each retrieved document with its real ID; the previous metadata lookup
# always fell back to "Unknown".
context = "\n\n---\n\n".join([
    f"Property ID: {_property_id(doc)}\n{doc.page_content}"
    for doc in results
])

# Variables expected by the RAG prompt template.
input_data = {
    "context": context,
    "question": query,
    "format_instructions": parser.get_format_instructions()
}


In [92]:
# Run the prompt -> model -> parser pipeline on the prepared inputs. The rendering
# cell reads .matching_projects, .unmatched_points and .explanation from `response`.
response = rag_chain.invoke(input_data)


In [93]:
from rich import print
from rich.panel import Panel
from rich.table import Table
# Raw parsed answer first, then a formatted view of the same data.
print(response)
print("\n✅ [bold cyan]Matching Projects[/bold cyan]")

# (column header, PropertyMatch attribute) pairs, in display order.
table_columns = (
    ("ID", "id"),
    ("Project Name", "projectName"),
    ("Location", "location"),
    ("Price", "price"),
    ("Area", "area"),
    ("Pincode", "pincode"),
    ("Type", "type"),
    ("Landmark", "landmark"),
    ("Amenities", "amenities"),
)

if not response.matching_projects:
    print("[yellow]No matching projects found.[/yellow]")
else:
    table = Table(show_header=True, header_style="bold magenta")
    for header, _attr in table_columns:
        table.add_column(header)

    # Missing or empty values are shown as "-".
    for prop in response.matching_projects:
        table.add_row(*[getattr(prop, attr) or "-" for _header, attr in table_columns])
    print(table)

print("\n⚠️ [bold yellow]Unmatched Points:[/bold yellow]", response.unmatched_points)
print("\n💡 [bold green]Explanation:[/bold green]", response.explanation)

# Reference cards (all docs used)
print("\n📚 [bold underline cyan]Referenced Property Data Cards:[/bold underline cyan]")
for card_no, doc in enumerate(results, start=1):
    print(Panel(f"{doc.page_content}", title=f"Property {card_no}", expand=False))
