This is a starter notebook for the project. You'll have to import the libraries you need; a list of the ones available in this workspace is in the requirements.txt file.


Step 1: Setting Up the Python Application


In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.docstore.document import Document
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
import csv
import json
import os
import pathlib
import random


# Load variables from a local .env file into the process environment.
load_dotenv()

# NOTE(review): these two values are read here but are not referenced again in
# the visible notebook; presumably the OpenAI clients pick their credentials up
# from the environment directly -- confirm the .env variable names.
OPENAI_API_KEY = os.environ.get("API_KEY")
OPENAI_API_BASE = os.environ.get("API_BASE")

# Absolute locations of the generated listings CSV and the Chroma directory.
LISTING_PATH: os.PathLike = pathlib.Path("listings", "listings.csv").resolve()
DB_DIR: os.PathLike = pathlib.Path("chroma_db").resolve()

# Names of the vector-store collection and of the models used in the notebook.
DB_COLLECTION: str = "real_estate_listings"
LLM_MODEL: str = "gpt-4o-mini"
EMBEDDINGS: OpenAIEmbeddings = OpenAIEmbeddings(model="text-embedding-ada-002")


Step 2: Generating Real Estate Listings


In [None]:
def prepare_listings(temperature: float = 0.7) -> None:
    """
    Generate synthetic real estate listings with the LLM and save them as CSV.

    Nothing happens when the file at LISTING_PATH already exists. Otherwise,
    for each hard-coded neighborhood a random number (2 to 5) of listings is
    requested from the chat model, parsed with a StructuredOutputParser and
    written to LISTING_PATH as UTF-8 CSV with one column per response schema.

    Args:
        temperature (float): Sampling temperature passed to the chat model.
    """

    if LISTING_PATH.exists():
        return

    # 1. Set up the LLM
    llm = ChatOpenAI(model_name=LLM_MODEL, temperature=temperature)

    # 2. Define the schemas for the output
    response_schemas = [
        ResponseSchema(name="Neighborhood", description="The name of the neighborhood"),
        ResponseSchema(name="Price", description="The listing price, like '$800,000'"),
        ResponseSchema(name="Bedrooms", description="Number of bedrooms"),
        ResponseSchema(name="Bathrooms", description="Number of bathrooms"),
        ResponseSchema(
            name="House Size", description="Size of the house, like '2,500 sqft'"
        ),
        ResponseSchema(name="Year Built", description="Year the house was built"),
        ResponseSchema(
            name="Lot Size", description="Size of the land lot, like '5,000 sqft'"
        ),
        ResponseSchema(
            name="Description", description="Detailed description of the house"
        ),
        ResponseSchema(
            name="Neighborhood Description",
            description="Detailed description of the neighborhood",
        ),
    ]

    # 3. Create the parser
    parser = StructuredOutputParser.from_response_schemas(response_schemas)
    # The instructions depend only on the schemas, so build them once instead
    # of once per generated listing.
    format_instructions = parser.get_format_instructions()

    # 4. Create the prompt
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a real estate agent assistant."),
            (
                "human",
                "Generate a real estate listing for a house located in {neighborhood}.\n{format_instructions}",
            ),
        ]
    )

    # 5. Neighborhoods to generate
    neighborhoods = ["Chelsea", "Midtown", "Greenpoint", "Williamsburg", "SoHo"]
    listings = []

    # Generate listings for each neighborhood
    for neighborhood in neighborhoods:
        for _ in range(random.randint(2, 5)):
            messages = prompt.format_messages(
                neighborhood=neighborhood,
                format_instructions=format_instructions,
            )

            # Generate the response (invoke() replaces the deprecated llm(...) call)
            response = llm.invoke(messages)
            print(f"Response: {response}")
            listings.append(parser.parse(response.content))

    # 6. Save to CSV
    # The columns are exactly the schema names, so derive them instead of
    # maintaining a second hand-written copy of the list.
    fieldnames = [schema.name for schema in response_schemas]

    # Create the directory (and any missing parents) if it doesn't exist
    LISTING_PATH.parent.mkdir(parents=True, exist_ok=True)

    with open(LISTING_PATH, mode="w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore": a stray extra key returned by the LLM must not
        # abort the write after all listings have already been generated.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(listings)


In [None]:
# Generate the listings CSV; prepare_listings() returns immediately when the
# file at LISTING_PATH already exists, so re-running this cell is safe.
prepare_listings(temperature=0.8)

Step 3: Storing Listings in a Vector Database


In [None]:
def load_listings_from_csv(file_path: str | os.PathLike) -> list[Document]:
    """
    Load listings from a CSV file and return a list of Document objects.

    The structured columns (neighborhood, price, room counts, sizes, year) are
    stored as document metadata, and the "Neighborhood" column is also used as
    the document source. The remaining columns are left to the loader to place
    in the page content.

    Args:
        file_path (str | os.PathLike): Path to the CSV file.
    Returns:
        list: A list of Document objects containing the listings data.
    """
    loader = CSVLoader(
        file_path=file_path,
        source_column="Neighborhood",
        metadata_columns=[
            "Neighborhood",
            "Price",
            "Bedrooms",
            "Bathrooms",
            "House Size",
            "Year Built",
            "Lot Size",
        ],
        # The listings file is written as UTF-8; read it back the same way
        # instead of relying on the platform's default encoding.
        encoding="utf-8",
    )

    # The loader already yields Document objects, so no re-wrapping is needed.
    return loader.load()

In [None]:
def add_documents_to_chromadb(documents: list[Document]) -> Chroma:
    """
    Build a Chroma vector store from the given documents.

    The store uses the module-level EMBEDDINGS, the DB_COLLECTION collection
    name and DB_DIR as its persist directory.

    Args:
        documents (list[Document]): Documents to embed and store.
    Returns:
        Chroma: The vector store created from the documents.
    """
    return Chroma.from_documents(
        collection_name=DB_COLLECTION,
        persist_directory=str(DB_DIR),
        embedding=EMBEDDINGS,
        documents=documents,
    )


# Open the persisted vector store when its directory exists; otherwise build it
# from the listings CSV. Later cells use `chrome_client` for all searches.
if DB_DIR.exists():
    print(f"load the existing ChromaDB in {DB_DIR}")
    # load the existing ChromaDB
    chrome_client = Chroma(
        persist_directory=str(DB_DIR),
        embedding_function=EMBEDDINGS,
        collection_name=DB_COLLECTION,
    )

    # Check if the collection is empty
    # NOTE: `_collection` is a private attribute of the Chroma wrapper.
    if chrome_client._collection.count() == 0:
        print("Collection is empty, adding documents...")
        # Keep the instance returned by the helper so that the following cells
        # query the store that actually received the documents.
        chrome_client = add_documents_to_chromadb(
            load_listings_from_csv(LISTING_PATH)
        )
else:
    print(f"Creating a new ChromaDB in {DB_DIR}")
    # If the persist directory doesn't exist, create and initialize
    chrome_client = add_documents_to_chromadb(load_listings_from_csv(LISTING_PATH))

print(chrome_client._collection.count())


# To start over with an empty store, uncomment:
# chrome_client.delete_collection()


In [None]:
# Sanity check: plain similarity search against the vector store.
# Alternative query to try: "eco-friendly house with solar panels"
query = "modern house with a gourmet kitchen in a lively area"

# Retrieve the top 2 most similar documents
results = chrome_client.similarity_search(query, k=2)

print("Search results:")
for doc in results:
    # One print call per hit; sep="\n" puts each part on its own line.
    print(
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        "-" * 20,
        sep="\n",
    )

print(results)

In [None]:
# Same query through the retriever interface, using the "mmr" search type and
# returning 2 documents.
retriever = chrome_client.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2},
)
results_retriever = retriever.invoke(query)

print("Search results from retriever:")
for doc in results_retriever:
    print(
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        "-" * 20,
        sep="\n",
    )

Step 4: Building the User Preference Interface


Collect buyer preferences, such as the number of bedrooms and bathrooms, the location, and other specific requirements, either through a set of questions or by asking the buyer to describe their preferences in natural language. You can hard-code the buyer preferences as questions and answers, or collect them interactively however you'd like. For example:

questions = [
"How big do you want your house to be?",
"What are 3 most important things for you in choosing this property?",
"Which amenities would you like?",
"Which transportation options are important to you?",
"How urban do you want your neighborhood to be?",
]

answers = [
"A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
"A quiet neighborhood, good local schools, and convenient shopping options.",
"A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
"Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
"A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]


In [None]:
# Interview questions put to the buyer, in the order they are asked.
questions = [
    "How many bedrooms and bathrooms would you like?",
    "What size home are you comfortable with?",
    "Which features are must-haves in your home?",
    "What kind of neighborhood do you prefer?",
    "How do you usually commute, and what transportation matters to you?",
]

# Canned replies, one per question, standing in for interactive input.
possible_answers = [
    "I'd like at least 3 bedrooms and 2 bathrooms to accommodate my family.",
    "Around 2,000 square feet would be perfect — not too big but still roomy enough.",
    "A modern kitchen, energy-efficient appliances, a backyard for the kids, and a two-car garage are must-haves.",
    "I prefer a quiet, suburban neighborhood with tree-lined streets and good schools nearby.",
    "I usually drive to work, but I'd love to have nearby public transportation and bike-friendly roads as backup.",
]

# Pair every question with its reply as "<question> <answer>".
# For interactive use, replace `reply` with input("You: ").
answers = [
    f"{question} {reply}" for question, reply in zip(questions, possible_answers)
]


Step 5: Searching Based on Preferences


- Semantic Search Implementation: Use the structured buyer preferences to perform a semantic search on the vector database, retrieving listings that most closely match the user's requirements.
- Listing Retrieval Logic: Fine-tune the retrieval algorithm to ensure that the most relevant listings are selected based on the semantic closeness to the buyer’s preferences.


In [None]:
def preferences_to_query_string(preferences: dict) -> str:
    """Turn structured preferences into a single semantic query string.

    Recognized keys, in output order: bedrooms, bathrooms, house_size,
    features, neighborhood_type, transport, amenities. Keys that are missing
    or hold a falsy value (None, 0, "", []) are skipped. The remaining parts
    are joined with ". ".

    Args:
        preferences (dict): Extracted buyer preferences. ``None`` or an empty
            dict yields an empty string.

    Returns:
        str: The query text, or "" when no preference is usable.
    """
    if not preferences:
        return ""

    def _as_text(value) -> str:
        # List-like values are comma-joined. A bare string or number (the LLM
        # does not always return a list) is used as-is instead of being
        # joined character by character or raising TypeError.
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return str(value)

    # (preference key, phrase template) in the order the parts appear.
    templates = (
        ("bedrooms", "{} bedrooms"),
        ("bathrooms", "{} bathrooms"),
        ("house_size", "house size: {}"),
        ("features", "features: {}"),
        ("neighborhood_type", "{} neighborhood"),
        ("transport", "transport: {}"),
        ("amenities", "amenities: {}"),
    )

    parts = []
    for key, template in templates:
        value = preferences.get(key)
        if value:
            parts.append(template.format(_as_text(value)))

    return ". ".join(parts)


def search_listings(preferences: dict, top_k: int = 5):
    """Return the `top_k` stored listings most similar to the preferences.

    The preferences are flattened into one query string and passed to the
    vector store's similarity search.
    """
    return chrome_client.similarity_search(
        query=preferences_to_query_string(preferences),
        k=top_k,
    )


def jsonify_output(text: str) -> dict:
    """Convert the structured output string to a JSON object."""
    try:
        # Attempt to parse the structured output as JSON
        cleaned_output = text.strip().removeprefix("```json").removesuffix("```")
        return json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Problematic string: '{cleaned_output}'")

In [None]:
# Define the prompt for summarizing the answers
# The literal braces of the JSON skeleton are doubled ({{ and }}) so that
# {answers} is the only placeholder filled in when the template is formatted.
summary_prompt = PromptTemplate(
    input_variables=["answers"],
    template="""
You are a helpful real estate assistant. Based on the user's answers below, extract a JSON structure representing their real estate preferences.

Answers:
{answers}

Respond in this format:
{{
  "bedrooms": int | null,
  "bathrooms": int | null,
  "house_size": "range or description",
  "features": [list of important features],
  "neighborhood_type": "urban | suburban | rural | mixed",
  "transport": [list of transport options or keywords],
  "amenities": [optional list like parks, gyms, schools]
}}
""",
)


# Chat model with temperature 0.0 for the extraction step; `llm` is reused by
# the augmentation chain in a later cell.
llm = ChatOpenAI(model_name=LLM_MODEL, temperature=0.0)
chain = LLMChain(llm=llm, prompt=summary_prompt)

# One "<question> <answer>" entry per line is sent as the {answers} value.
combined_answers = "\n".join(answers)
# Raw text reply of the model; it is parsed with jsonify_output() later on.
structured_output = chain.run(answers=combined_answers)

print("Extracted Preferences:")
print(structured_output)


In [None]:
# Parse the extracted preferences and fetch the 2 best-matching listings
results = search_listings(jsonify_output(structured_output), top_k=2)

print("Search results based on user preferences:")
for doc in results:
    print(
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        "-" * 20,
        sep="\n",
    )

Step 6: Personalizing Listing Descriptions


- LLM Augmentation: For each retrieved listing, use the LLM to augment the description, tailoring it to resonate with the buyer’s specific preferences. This involves subtly emphasizing aspects of the property that align with what the buyer is looking for.
- Maintaining Factual Integrity: Ensure that the augmentation process enhances the appeal of the listing without altering factual information.


In [None]:
# Prompt used to personalize a listing: it receives the buyer's preferences and
# the original listing text, and instructs the model to re-emphasize existing
# facts without inventing new features.
prompt_template = PromptTemplate(
    input_variables=["preferences", "listing"],
    template="""
You are assisting a home buyer by rewriting a real estate listing to better match their preferences.

Here are the buyer's preferences:
{preferences}

Here is the original listing:
{listing}

Rewrite the description to highlight the aspects that most align with the buyer's preferences.
Do not add or invent new features — only rephrase or emphasize existing facts.
""",
)

# Chain that runs the prompt above on the `llm` created in the earlier cell.
augment_chain = LLMChain(llm=llm, prompt=prompt_template)


def augment_listing(listing: dict, preferences: dict):
    """
    Augment a real estate listing based on user preferences using LLM.

    Listing keys are matched case-insensitively with spaces treated as
    underscores, so both the CSV column names stored in the vector-store
    metadata ("Neighborhood", "House Size") and snake_case keys
    ("neighborhood", "house_size") are recognized.

    Args:
        listing (dict): Listing fields. The fields used are neighborhood,
            price, bedrooms, bathrooms, house_size and description; a missing
            field is rendered as "N/A".
        preferences (dict): Buyer preferences; every key/value pair becomes
            one "Key: value" line of the prompt. ``None`` is treated as empty.

    Returns:
        The text returned by ``augment_chain.run``.
    """
    # Turn preferences into a readable string
    preferences_text = "\n".join(
        f"{k.capitalize()}: {v}" for k, v in (preferences or {}).items()
    )

    # Normalize the keys: looking up lowercase names directly in metadata keyed
    # by CSV column names would render every field as "N/A".
    fields = {
        str(key).strip().lower().replace(" ", "_"): value
        for key, value in listing.items()
    }

    # Format the listing content
    listing_text = f"""
Neighborhood: {fields.get("neighborhood", "N/A")}
Price: {fields.get("price", "N/A")}
Bedrooms: {fields.get("bedrooms", "N/A")}
Bathrooms: {fields.get("bathrooms", "N/A")}
House Size: {fields.get("house_size", "N/A")}
Description: {fields.get("description", "N/A")}
"""

    # Run the LLM chain
    response = augment_chain.run(preferences=preferences_text, listing=listing_text)
    return response


# Function to run the augmentation for every retrieved listing
def augment_listings(listings: list, preferences: dict):
    """
    Augment multiple listings based on user preferences using LLM.

    Args:
        listings (list): Retrieved listings, either Document-like objects
            (anything with a ``metadata`` attribute) or plain dicts.
        preferences (dict): User preferences.

    Returns:
        List of dicts with two keys: "original" (the listing data handed to
        the LLM) and "augmented_description" (the rewritten text).
    """
    augmented_results = []

    for listing in listings:
        # Get the metadata for the listing if it's wrapped in a Chroma Document
        if hasattr(listing, "metadata"):
            # Copy so the stored document's metadata is not mutated below.
            listing_data = dict(listing.metadata)
            # The description columns are not among the loader's metadata
            # columns, so the descriptive text is only in page_content. Carry
            # it along under "description" so the LLM (and anyone reading
            # "original") sees it.
            # NOTE(review): page_content is passed through verbatim; its exact
            # layout depends on the CSV loader -- confirm it reads well.
            page_content = getattr(listing, "page_content", None)
            if page_content and "description" not in listing_data:
                listing_data["description"] = page_content
        else:
            listing_data = listing  # fallback for plain dicts

        # Run augmentation
        new_description = augment_listing(listing_data, preferences)

        # Store the result
        augmented_results.append(
            {"original": listing_data, "augmented_description": new_description}
        )

    return augmented_results


# Personalize the listings currently held in `results`, parsing the model's
# structured output into the preferences dict.
augmented = augment_listings(results, jsonify_output(structured_output))

In [None]:
# Show each listing's original description next to its personalized rewrite
for i, entry in enumerate(augmented, start=1):
    print(f"\n🏡 Listing {i}")
    original_description = entry["original"].get("description", "N/A")
    print("Original Description:\n", original_description)
    rewritten_description = entry["augmented_description"]
    print("\n🔁 Augmented Description:\n", rewritten_description)

Step 7: Deliverables and Testing
