In [None]:
## Installation required: 
! pip install langchain==0.1.17
! pip install langchain_openai
! pip install openai==1.25.1
! pip install pydantic==2.7.1
! pip install pytest
! pip install chromadb==0.5.0
! pip install jupyter==1.0.0
! pip install pandas==2.2.2

### QA Chatbot on Real Estate Data
Required libraries

In [46]:
## General Libraries 
import os 
from typing import List
import pandas as pd
from io import StringIO
import csv
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import StrOutputParser ## Process llm response to string

In [4]:
# Rag
from langchain_chroma import Chroma
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

In [49]:
## Chains
from langchain.chains import create_retrieval_chain
## Main Components
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate

In [33]:
## Chat
from langchain.chains.combine_documents import create_stuff_documents_chain # RAG Chat

In [120]:
# Prefer a key that is already exported in the environment. The literal is only a
# placeholder used when no key is set; never commit a real key in this cell.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or 'your key'

# Keep the environment in sync so that clients constructed without an explicit
# key further down resolve the same value.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

### Data generation
Generating 10 property listings with an LLM

In [56]:
# Chat model used for generating the synthetic property listings.
datagen_llm = ChatOpenAI(model="gpt-3.5-turbo-16k", api_key=OPENAI_API_KEY)

In [41]:
# Prompt text for the data-generation model. `{count}` is the number of listings
# to request; `{format_instructions}` is supplied as a partial variable where the
# PromptTemplate is created.
DATAGEN_TEMPLATE = """You are a datagenerator for real estate who acts as a real estate agent to create property listing for users.  
You should create list of {count} listing based on made up values. Include the following fields:
```
Neighborhood, area, price, type(Condo, apartment etc.), house size, bedrooms, bathrooms, description,contact
```
{format_instructions}
"""

In [42]:
## Data validator
# NOTE: these models are documented with comments rather than class docstrings on
# purpose -- pydantic copies a model's docstring into its JSON schema, which would
# change the format instructions sent to the LLM.

# One generated listing. All fields are required strings.
class HomeData(BaseModel):
    neighborhood: str = Field(description="The name of the neighborhood")
    area: str = Field(description="The size of the home in square feet")
    price: str = Field(description="The listed price of the home in USD")
    # Populated from the "Type" key (the field's alias).
    type_: str = Field(alias='Type', description="The type of property")
    house_size: str = Field(description="Size of the house in square feet")
    bedrooms: str = Field(description="Number of bedrooms")
    bathrooms: str = Field(description="Number of bathrooms")
    description: str = Field(description="Description of the property")
    contact: str = Field(description="Contact information for the property")


# Top-level object the output parser targets: a wrapper around all listings.
class HomeDataList(BaseModel):
    listings: List[HomeData] = Field(description="A list of real estate listings")

In [43]:
## Data parser
# Used twice below: it supplies the format instructions embedded in the
# generation prompt and parses the model's reply into a HomeDataList.
parser = PydanticOutputParser(pydantic_object=HomeDataList)

In [62]:
# Prompt for the data-generation call. `format_instructions` is bound to the
# parser method itself (not to its result), so it is evaluated when the prompt
# is formatted.
PROMPT = PromptTemplate(
    template=DATAGEN_TEMPLATE,
    input_variables=["count"],
    partial_variables={"format_instructions": parser.get_format_instructions},
)

# Render the final instruction text for 10 listings and show it for inspection.
datagen_instruction = PROMPT.format(count=10)
print(datagen_instruction)

You are a datagenerator for real estate who acts as a real estate agent to create property listing for users.  
You should create list of 10 listing based on made up values. Include the following fields:
```
Neighborhood, area, price, type(Condo, apartment etc.), house size, bedrooms, bathrooms, description,contact
```
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"HomeData": {"properties": {"neighborhood": {"description": "The name of the neighborhood", "title": "Neighborhood", "type": "string"}, "area": {"description": "The size of the home in square feet", "title": "Area", "

### Alternative way to call llm using chain and parser in a pipeline: 
```
output_parser = DatetimeOutputParser(format="%a %d/%m/%Y")
prompt = PromptTemplate(
    template="{question}\n{format_instructions}",
    input_variables=["question"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

model = VertexAI()
chain = prompt | model | output_parser
ans = chain.invoke({"question":"When is Lunar New Year of 2024?"})
```

In [69]:
# Send the rendered instruction to the model and keep only the `.content` of its reply.
datagen_response = datagen_llm.invoke(datagen_instruction).content

In [70]:
# Inspect the raw reply before parsing it.
datagen_response

'{\n  "listings": [\n    {\n      "neighborhood": "Westwood",\n      "area": "1500 sqft",\n      "price": "$500,000",\n      "Type": "Condo",\n      "house_size": "1200 sqft",\n      "bedrooms": "2",\n      "bathrooms": "2",\n      "description": "Beautiful condo in a prime location with stunning views",\n      "contact": "John Doe - 123-456-7890"\n    },\n    {\n      "neighborhood": "Downtown",\n      "area": "2000 sqft",\n      "price": "$800,000",\n      "Type": "Apartment",\n      "house_size": "1800 sqft",\n      "bedrooms": "3",\n      "bathrooms": "2",\n      "description": "Spacious apartment with modern amenities",\n      "contact": "Jane Smith - 987-654-3210"\n    },\n    {\n      "neighborhood": "Greenwood",\n      "area": "1800 sqft",\n      "price": "$550,000",\n      "Type": "House",\n      "house_size": "1500 sqft",\n      "bedrooms": "4",\n      "bathrooms": "3",\n      "description": "Charming house with a large backyard",\n      "contact": "Mike Johnson - 555-123-456

In [71]:
# Parse the raw reply with the HomeDataList parser; the result exposes `.listings`.
data = parser.parse(datagen_response)

In [72]:
# Convert each Pydantic model to a plain dictionary for pandas.
# `model_dump()` is the pydantic v2 replacement for the deprecated `.dict()`.
# Like `.dict()`, it keys by field name by default, so the column stays `type_`.
listings_dicts = [listing.model_dump() for listing in data.listings]

In [73]:
# Build a table from the listing dictionaries and preview the first rows.
df = pd.DataFrame(listings_dicts)
df.head()

Unnamed: 0,neighborhood,area,price,type_,house_size,bedrooms,bathrooms,description,contact
0,Westwood,1500 sqft,"$500,000",Condo,1200 sqft,2,2,Beautiful condo in a prime location with stunn...,John Doe - 123-456-7890
1,Downtown,2000 sqft,"$800,000",Apartment,1800 sqft,3,2,Spacious apartment with modern amenities,Jane Smith - 987-654-3210
2,Greenwood,1800 sqft,"$550,000",House,1500 sqft,4,3,Charming house with a large backyard,Mike Johnson - 555-123-4567
3,Northridge,1600 sqft,"$450,000",Condo,1400 sqft,2,2,Cozy condo in a quiet neighborhood,Emily Wilson - 789-456-1230
4,Brentwood,2200 sqft,"$1,200,000",House,2000 sqft,5,4,Luxurious house with high-end finishes,David Thompson - 222-333-4444


In [101]:
# Save the listings to disk; the DataFrame index is written out under the column label `id`.
df.to_csv('real_estate_listings.csv', index_label = 'id')

## Semantic Search from Vector DB

Loading the data from document

In [15]:
# Read the generated CSV back in as LangChain documents.
loader = CSVLoader(file_path='./real_estate_listings.csv')
docs = loader.load()

Preparing to load the data in vector DB

In [18]:
# Split the loaded documents with chunk_size=1000 and no overlap between chunks.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
split_docs = splitter.split_documents(docs)

# Embedding model passed to the vector store below; constructed without arguments.
embeddings = OpenAIEmbeddings()

In [19]:
# Show the text of the first chunk to check how a CSV row was rendered.
split_docs[0].page_content

'id: 0\nneighborhood: Westwood\narea: 1,200 sqft\nprice: $500,000\ntype_: Condo\nhouse_size: 1,000 sqft\nbedrooms: 2\nbathrooms: 2\ndescription: Beautiful condo in the heart of Westwood with stunning views.\ncontact: John Smith - 555-555-5555'

In [17]:
# Check whether the Chroma persist directory already exists on disk.
# The name matches `persist_directory` in the next cell; the previous literal
# ("chroma_db") pointed at a different directory.
os.path.exists("chromadb")

True

In [21]:
# Directory passed to Chroma as the on-disk location for the vector store.
persist_directory = "chromadb"

In [22]:
# Embed the chunks and store them in a Chroma collection under `persist_directory`.
# NOTE(review): re-running this cell against an existing directory presumably adds
# the same documents again -- confirm, and clear the directory or reload it if so.
db = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_directory)

In [22]:

# load the document and split it into chunks
# NOTE(review): this repeats the load/split done earlier with the same file and
# splitter settings, and rebinds `loader` and `docs` (`docs` now holds the split chunks).
loader = CSVLoader(file_path='./real_estate_listings.csv')
documents = loader.load()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

id: 5
neighborhood: Marina del Rey
area: 1,700 sqft
price: $750,000
type_: Apartment
house_size: 1,400 sqft
bedrooms: 2
bathrooms: 2
description: Luxurious apartment with stunning views of the marina in Marina del Rey.
contact: Emily Jones - 555-555-5555


In [None]:

# load it into Chroma
# `embeddings` is the OpenAIEmbeddings instance created earlier; the name used here
# before (`embedding_function`) is not defined in the visible notebook and raised
# a NameError on a fresh run.
db = Chroma.from_documents(docs, embeddings)

# query it
# NOTE(review): this question looks like a leftover tutorial query and is unrelated
# to the listings data; the real-estate search is in the next section.
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)

# print results
print(docs[0].page_content)

### Similarity Search

In [25]:
import time

In [27]:
# Free-text search request to run against the listings vector store.
query = "Apartment near westwood neighborhood"

In [37]:
# Time a single similarity search against the vector store.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
result = db.similarity_search(query)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Report the hit count once (the earlier bare print of the same number was redundant).
print(f"Number of results: {len(result)}")
if result:  # only index into the list when there is at least one hit
    print(f"Content of the first result: {result[0].page_content}")
print(f"Time taken for the search: {elapsed_time} seconds")

4
Number of results: 4
Content of the first result: id: 0
neighborhood: Westwood
area: 1,200 sqft
price: $500,000
type_: Condo
house_size: 1,000 sqft
bedrooms: 2
bathrooms: 2
description: Beautiful condo in the heart of Westwood with stunning views.
contact: John Smith - 555-555-5555
Time taken for the search: 0.449080228805542 seconds


## QA ChatBot For Real-Estate 
Retrieval-based chatbot

In [74]:
# Chat model for the question-answering chains: temperature 0 and replies capped
# at 1000 tokens.
home_llm = ChatOpenAI(model="gpt-3.5-turbo-16k", max_tokens=1000, temperature=0)

In [81]:
# Single-turn prompt with two variables: `{context}` (the listings to choose from)
# and `{input}` (the user's request).
# NOTE(review): the template text contains typos ("summury", "an real estate agent");
# they are left untouched because editing them changes what is sent to the model.
prompt_template = """
Act as an real estate agent. You will receive simple summury about an users' real-estate need
You will suggest the user the best option for him based on the following real estate listings: 
<Listings>
{context}
</Listings>

Summury: {input}
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

I will use the `create_stuff_documents_chain` function. This chain takes a list of documents, inserts them all into the prompt's `{context}` variable, and passes the prompt together with the original question to the LLM. Looking up the relevant documents is done separately, by the retrieval chain created below.

In [82]:
# Combine `home_llm` with the listings prompt into a documents chain.
home_search_chain = create_stuff_documents_chain(home_llm, prompt)

#### Creating the retriever to get relevant info from the vector db:
The retriever will provide the `context` variable to the home_search_chain

In [83]:
# Expose the Chroma store as a retriever and pair it with the documents chain.
retriever = db.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, home_search_chain)

**Invoking the chain**

In [84]:
# Run the retrieval chain on a single request; the user's text goes in under the "input" key.
response = retrieval_chain.invoke({"input": "We need a home near Westwood neighborhood with atleast 1000 sqft area"})

In [86]:
# The generated reply is stored under the "answer" key of the result.
print(response["answer"])

Based on your needs, I would recommend considering the condo in Westwood. It has a beautiful view and meets your minimum requirement of 1000 sqft area. The condo has 2 bedrooms and 2 bathrooms, making it suitable for a small family or individuals looking for extra space. The price is $500,000, which is within your budget. If you are interested, you can contact John Smith at 555-555-5555 for more information or to schedule a viewing.


### Building a chat bot with support for conversation (with history)
We will use: `create_history_aware_retriever`. 
- Create a chain that takes conversation history and returns documents.

**Parameters:**
- llm – Language model to use for generating a search term given chat history

- retriever (Runnable[str, List[Document]]) – RetrieverLike object that takes a string as input and outputs a list of Documents. (In this case it is the chroma db retriever) 

- prompt (BasePromptTemplate) – The prompt used to generate the search query for the retriever.

In [92]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage

In [106]:
### Prompt with history:
# The optional `chat_history` placeholder sits between the system message and the
# latest user input; the final user message asks the model for a requirement summary.
summurygen_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a real estate assistant to provide requirement summury based on users need"),
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    ("user", "{input}"),
    ("user", "Given the above conversation, generate a summury of the real estate need"),
])

# Builds a search query from the previous chat turns and returns the retrieved documents.
retriever_chain = create_history_aware_retriever(home_llm, retriever, summurygen_prompt)

Example of using the history prompt:
```
prompt = summurygen_prompt.format_messages(
    chat_history=[],
    input="...",  # plus any other values the prompt expects, e.g. context
)
res = home_llm.invoke(prompt)
```

In [107]:
# Sample conversation so far, alternating user and assistant turns.
chat_history = [
    HumanMessage(content="I am looking for a house in Westwood"),
    AIMessage(content="Yeah sure. Can you tell me about the area, number of rooms and other details"),
    HumanMessage(content="Sure. Number of room should be minimum 2 with proper bathrooms but the area must be above 1000 sq ft"),
    AIMessage(content="Noted. Is there anything else you need?"),
]

In [112]:
# Run only the history-aware retriever: it receives the conversation so far plus
# the latest user message and returns the documents it retrieved.
retreived_doc = retriever_chain.invoke({
    "chat_history": chat_history,
    "input": "No"
})

In [113]:
# Inspect the documents returned for the conversation above.
retreived_doc

[Document(page_content='id: 0\nneighborhood: Westwood\narea: 1,200 sqft\nprice: $500,000\ntype_: Condo\nhouse_size: 1,000 sqft\nbedrooms: 2\nbathrooms: 2\ndescription: Beautiful condo in the heart of Westwood with stunning views.\ncontact: John Smith - 555-555-5555', metadata={'row': 0, 'source': './real_estate_listings.csv'}),
 Document(page_content='id: 1\nneighborhood: Brentwood\narea: 2,500 sqft\nprice: $1,200,000\ntype_: House\nhouse_size: 2,000 sqft\nbedrooms: 4\nbathrooms: 3\ndescription: Spacious and modern house in the desirable Brentwood neighborhood.\ncontact: Jane Doe - 555-555-5555', metadata={'row': 1, 'source': './real_estate_listings.csv'}),
 Document(page_content='id: 6\nneighborhood: Playa Vista\narea: 2,200 sqft\nprice: $1,100,000\ntype_: House\nhouse_size: 2,000 sqft\nbedrooms: 4\nbathrooms: 3\ndescription: Modern and spacious house in the up-and-coming Playa Vista neighborhood.\ncontact: David Lee - 555-555-5555', metadata={'row': 6, 'source': './real_estate_listin

#### Creating the bot:
Now we can create a new chain to continue the conversation based on the retrieved docs above

In [115]:
# System message for the conversational bot: `{context}` holds the listings.
# NOTE(review): `{input}` appears both here (labelled "Summury") and as the user
# message below, so the latest user text is sent twice -- confirm this is intended.
system_template = """
Act as an real estate agent. Answer users question and give suggestion based on the following real estate listings: 
<Listings>
{context}
</Listings>

Summury: {input}
"""

# Message order: system instructions, then the prior chat turns, then the latest user input.
conversation_prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
])

 > `home_data_chain` gets invoked inside the retrieval_chain to give the AI response based on the documents from the retriever chain

In [None]:
# Documents chain that answers using the conversation prompt.
home_data_chain = create_stuff_documents_chain(home_llm, conversation_prompt)

# Full conversational pipeline: history-aware retrieval feeding the documents chain.
# This rebinds `retrieval_chain`, replacing the single-turn chain defined earlier.
retrieval_chain = create_retrieval_chain(retriever_chain, home_data_chain)

In [116]:
# Run the conversational chain and display the full result, which includes the
# chat history, the input and the retrieved `context` documents.
retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "Give me the best offer for my need"
})

{'chat_history': [HumanMessage(content='I am looking for a house in Westwood'),
  AIMessage(content='Yeah sure. Can you tell me about the area, number of rooms and other details'),
  HumanMessage(content='Sure. Number of room should be minimum 2 with proper bathrooms but the area must be above 1000 sq ft'),
  AIMessage(content='Noted. Is there anything else you need?')],
 'input': 'Give me the best offer for my need',
 'context': [Document(page_content='id: 9\nneighborhood: Beverly Hills\narea: 4,000 sqft\nprice: $2,500,000\ntype_: House\nhouse_size: 3,500 sqft\nbedrooms: 4\nbathrooms: 4\ndescription: Elegant and luxurious house in the prestigious Beverly Hills neighborhood.\ncontact: Jennifer Smith - 555-555-5555', metadata={'row': 9, 'source': './real_estate_listings.csv'}),
  Document(page_content='id: 3\nneighborhood: Venice\narea: 1,500 sqft\nprice: $700,000\ntype_: Condo\nhouse_size: 1,200 sqft\nbedrooms: 2\nbathrooms: 2\ndescription: Modern and stylish condo in the trendy Venice

In [121]:
# Same request as the previous cell, keeping only the generated answer text.
# NOTE(review): this invokes the chain a second time instead of reusing a stored result.
retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "Give me the best offer for my need"
})['answer']

'Based on your needs, I would recommend the following listing:\n\nListing ID: 1\nNeighborhood: Brentwood\nArea: 2,500 sqft\nPrice: $1,200,000\nType: House\nHouse Size: 2,000 sqft\nBedrooms: 4\nBathrooms: 3\nDescription: Spacious and modern house in the desirable Brentwood neighborhood.\n\nThis house offers ample space with 4 bedrooms and 3 bathrooms, perfect for a family or someone who values extra room. The Brentwood neighborhood is highly sought after and known for its desirable location. The price of $1,200,000 is competitive for the area and the house itself is modern and well-maintained.\n\nIf you are interested in this listing or would like to schedule a viewing, please feel free to contact Jane Doe at 555-555-5555.'

Future Work: To integrate all of these pieces, we can create AI agents that let the model decide which step to take