In [None]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install --upgrade transformers
!pip install diffusers
!pip install --upgrade langchain
!pip install langchain-community
!pip install langchain-openai
!pip install --upgrade pydantic
!pip install pydantic-settings
!pip install --upgrade chromadb
!pip install -q git+https://github.com/huggingface/transformers.git 

## Table of Contents
1. [Setup and Installation](#SETUP)
2. [LLM Prompt and Listing Generation](#LLM)
3. [Saving Listings](#SAVING)
4. [SD Prompt and Image Generation](#SD)
5. [Generate Image Embeddings](#IMAGE-EMBEDDINGS)
6. [Generate Text Embeddings](#TEXT-EMBEDDINGS)
7. [Combining Text+Image Embeddings](#COMBINED-EMBEDDINGS)
8. [Writing Listings to Database](#CHROMA-DB)
9. [Getting Buyer Profile](#PROFILE)
10. [Generate Personalized Listings](#P-LISTINGS)
11. [Saving Personalized Listings](#SP-LISTINGS)
12. [Execution Blocks](#BLOCKS)
13. [Main Execution Block](#MAIN)

## Setup and Installation <a id='SETUP'></a>

In [4]:
import torch
import numpy as np
import os
import gc
import re
import json
import shutil
import chromadb
from chromadb.config import Settings
from langchain_openai import ChatOpenAI
#from diffusers import StableDiffusionPipeline
from transformers import ColQwen2ForRetrieval, ColQwen2Processor
from PIL import Image

  warn(


In [None]:
# Credentials and endpoint for the OpenAI-compatible API, exported as environment
# variables (presumably read by ChatOpenAI when `llm` is created - TODO confirm).
# "YOUR API KEY" is a placeholder: substitute a real key locally and never commit it.
os.environ["OPENAI_API_KEY"] = "YOUR API KEY"
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

In [5]:
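#Run this cell once, on a machine where CUDA is available, to load the Stable Diffusion pipeline as `sd_pipe` (uncomment it together with the StableDiffusionPipeline import above).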
#sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
#                                                  torch_dtype=torch.float16,
#                                                  variant="fp16").to("cuda")

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

safety_checker/model.fp16.safetensors:   0%|          | 0.00/608M [00:00<?, ?B/s]

safety_checker/config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

(…)diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

(…)diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [6]:
#Initializing the required models
# Chat model used to generate the listings and, later, to rewrite them for a buyer.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.8)
# ColQwen2 checkpoint shared by the image-embedding and text-embedding steps.
model_name = "vidore/colqwen2-v1.0-hf"
# Weights are loaded in bfloat16 and moved to the GPU; the hard-coded "cuda"
# means this cell needs a CUDA device to run.
Qwen2_model = ColQwen2ForRetrieval.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
).to("cuda")
# Matching processor that prepares images and text for the model above.
Qwen2_processor = ColQwen2Processor.from_pretrained(model_name)

## LLM Prompt and Listing Generation <a id='LLM'></a>

In [7]:
# Prompt sent verbatim to the LLM for every listing. The "Key: value" layout it
# requests is what parse_listing_strings() relies on when it splits each line on
# the first ':'.
prompt_template = """
You are a helpful real estate assistant.

Please generate a realistic real estate listing in the following format:
Neighborhood: <neighborhood name>
Price: <$xxx,xxx>
Bedrooms: <int>
Bathrooms: <int>
House Size: <sqft>

Description: <A paragraph with vivid and enticing details about the house>

Neighborhood Description: <A paragraph that describes the neighborhood and nearby amenities>
"""

In [8]:
def generate_listings(llm, prompt_template, n=10):
    """Ask the language model for ``n`` listings, one request per listing.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` string.
        prompt_template (str): Prompt sent unchanged on every request.
        n (int): Number of listings to generate.

    Returns:
        list[str]: The ``n`` responses, each stripped of surrounding whitespace.
    """
    generated = []
    for _ in range(n):
        reply = llm.invoke(prompt_template)
        generated.append(reply.content.strip())
    return generated

## Saving Listings <a id='SAVING'></a>

In [9]:
def save_listings_to_json(listings, filename="listings.json"):
    """Write the listings to ``filename`` as indented JSON.

    Args:
        listings: JSON-serialisable collection of listings.
        filename (str): Destination path; an existing file is overwritten.

    Returns:
        The same ``listings`` object that was passed in, so calls can be chained.
    """
    with open(filename, "w") as json_file:
        json.dump(listings, json_file, indent=2)

    return listings

def load_listings_from_json(filename="listings.json"):
    """Read back listings previously stored as JSON.

    Args:
        filename (str): Path of the JSON file to read.

    Returns:
        The decoded JSON content of the file.
    """
    with open(filename, "r") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

## SD Prompt and Image Generation <a id='SD'></a>

In [10]:
def parse_listing_strings(listing_strings):
    """Turn raw "Key: value" listing texts into dictionaries.

    Every line that contains a ':' is split on the first ':' only; the trimmed
    left part becomes the key and the trimmed right part the value. Lines
    without a ':' are ignored, and a repeated key keeps its last value.

    Args:
        listing_strings: Iterable of listing texts.

    Returns:
        list[dict]: One dictionary per input text, in the same order.
    """
    def _fields_of(raw_text):
        fields = {}
        for raw_line in raw_text.strip().split('\n'):
            if ':' not in raw_line:
                continue
            label, _, content = raw_line.strip().partition(':')
            fields[label.strip()] = content.strip()
        return fields

    return [_fields_of(raw_text) for raw_text in listing_strings]


def generate_prompts(listing_strings):
    """Build one image-generation prompt per listing.

    Each listing text is parsed with ``parse_listing_strings`` and its fields
    are slotted into a fixed sentence template. Missing fields fall back to
    defaults ("a quiet neighborhood", "3", "2", "1800 sqft", empty text).

    NOTE(review): the output recorded in this notebook shows CLIP truncating
    these prompts at 77 tokens, so the trailing part of each prompt does not
    reach the image model.

    Args:
        listing_strings: Iterable of raw listing texts.

    Returns:
        list[str]: Prompts in the same order as the input listings.
    """
    def _flatten(text):
        # Collapse a multi-line value onto one line.
        return text.strip().replace("\n", " ")

    def _prompt_for(fields):
        area = fields.get("Neighborhood", "a quiet neighborhood")
        bed_count = fields.get("Bedrooms", "3")
        bath_count = fields.get("Bathrooms", "2")
        floor_area = fields.get("House Size", "1800 sqft").replace("sqft", "square feet")
        house_text = _flatten(fields.get("Description", ""))
        area_text = _flatten(fields.get("Neighborhood Description", ""))
        return (
            f"Realistic, high-resolution exterior photo of a {bed_count}-bedroom, {bath_count}-bathroom home "
            f"in {area}. The house is approximately {floor_area}, with features such as {house_text}. "
            f"Set in a picturesque neighborhood known for {area_text}. "
            f"Clear blue skies, natural lighting, professional real estate photography."
        )

    return [_prompt_for(fields) for fields in parse_listing_strings(listing_strings)]

In [11]:
#Generate and save images
def generate_images(pipe, prompts, image_dir="listing_images"):
    """Render one image per prompt and save it as ``listing_<index>.png``.

    A failure on one prompt is reported on stdout and does not stop the
    remaining prompts from being processed.

    Args:
        pipe: Callable pipeline; ``pipe(prompt).images[0]`` must offer ``save(path)``.
        prompts: Iterable of prompt strings; the position gives the file index.
        image_dir (str): Output folder, created if it does not exist.
    """
    os.makedirs(image_dir, exist_ok=True)

    def _render_and_store(index, text_prompt):
        picture = pipe(text_prompt).images[0]
        picture.save(os.path.join(image_dir, f"listing_{index}.png"))

        # Drop the image and release cached GPU memory before the next prompt.
        del picture
        torch.cuda.empty_cache()
        gc.collect()

    with torch.no_grad():
        for index, text_prompt in enumerate(prompts):
            try:
                _render_and_store(index, text_prompt)
            except Exception as err:
                print(f"Error processing listing {index}: {err}")

## Generate Image Embeddings<a id='IMAGE-EMBEDDINGS'></a>

In [12]:
def generate_image_embeddings(processor, model, image_folder="listing_images"):
    """Embed every ``.png`` file in ``image_folder``.

    Args:
        processor: Callable as ``processor(images=img)``; the result must offer
            ``.to(device)`` and be unpackable with ``**`` into the model call.
        model: Embedding model exposing ``.device``; its output must carry an
            ``embeddings`` tensor.
        image_folder (str): Folder scanned (non-recursively) for ``.png`` files.

    Returns:
        dict[str, list[float]]: Maps the file name up to its first '.'
        (e.g. ``"listing_0"``) to the first vector of the model output.

    NOTE(review): only the first vector of ``embeddings`` is kept; confirm that
    a single-vector summary is what the retrieval step is meant to use.
    """
    image_embeddings = {}
    # Sorted so the processing order does not depend on os.listdir's ordering.
    png_names = sorted(fname for fname in os.listdir(image_folder) if fname.endswith(".png"))

    for fname in png_names:
        img_path = os.path.join(image_folder, fname)
        # Derive the key from the bare file name instead of splitting the path
        # on '/', which breaks with Windows-style separators.
        listing_key = fname.split('.')[0]

        # The context manager closes the underlying file handle once the
        # processor has read the pixels.
        with Image.open(img_path) as image:
            input_image = processor(images=image).to(model.device)

        with torch.no_grad():
            # Use the model passed in, not the notebook-level global.
            image_emb = model(**input_image).embeddings

        image_embeddings[listing_key] = image_emb.squeeze()[0].cpu().tolist()  # tensor -> plain list

    return image_embeddings

## Generate Text Embeddings<a id='TEXT-EMBEDDINGS'></a>

In [13]:
def generate_text_embeddings(listings, processor, model):
    """Embed each text with the supplied processor and model.

    Args:
        listings: A list of strings, or a single string. A bare string is
            treated as ONE text; without this guard it would be iterated
            character by character and each character embedded separately.
        processor: Callable as ``processor(text=...)``; the result must offer
            ``.to(device)`` and be unpackable with ``**`` into the model call.
        model: Embedding model exposing ``.device``; its output must carry an
            ``embeddings`` tensor.

    Returns:
        list[list[float]]: One vector per input text, in input order.

    NOTE(review): only the first vector of ``embeddings`` is kept; confirm that
    a single-vector summary is what the retrieval step is meant to use.
    """
    if isinstance(listings, str):
        listings = [listings]

    text_embeddings = []

    for text in listings:
        input_text = processor(text=text).to(model.device)
        with torch.no_grad():
            text_emb = model(**input_text).embeddings.squeeze()[0].cpu().tolist()
        text_embeddings.append(text_emb)

    return text_embeddings

## Combining Text+Image Embeddings<a id='COMBINED-EMBEDDINGS'></a>

In [14]:
def fused_embeddings_weighted(image_embeddings, text_embeddings, image_weight=0.45, text_weight=0.55):
    """Blend each listing's text and image vectors into one weighted sum.

    The i-th text vector is paired with ``image_embeddings["listing_<i>"]``.

    Args:
        image_embeddings (dict): ``"listing_<i>"`` -> image vector.
        text_embeddings: Sequence of text vectors, indexed by listing position.
        image_weight (float): Multiplier applied to the image vector.
        text_weight (float): Multiplier applied to the text vector.

    Returns:
        dict: ``"listing_<i>"`` -> fused vector as a plain list.
    """
    def _blend(position, text_vector):
        key = f"listing_{position}"
        image_part = np.array(image_embeddings[key])
        text_part = np.array(text_vector)
        return key, (text_weight*text_part + image_weight*image_part).tolist()

    return dict(_blend(position, text_vector) for position, text_vector in enumerate(text_embeddings))

## Writing Listings to Database <a id='CHROMA-DB'></a>

In [15]:
def write_to_chromadb(listings, text_embeddings, image_embeddings, fused_embeddings, persist_directory="./chroma_db", collection_name="real_estate"):
    """Rebuild the persistent Chroma collection from scratch and fill it.

    Any existing ``persist_directory`` is deleted first. Each listing is stored
    with its fused vector as the embedding; the image vector is kept alongside
    it in the metadata as a JSON string.

    Args:
        listings: Listing texts, stored as the documents.
        text_embeddings: Only its length is used, to enumerate ``listing_<i>`` keys.
        image_embeddings (dict): ``"listing_<i>"`` -> image vector.
        fused_embeddings (dict): ``"listing_<i>"`` -> fused vector.
        persist_directory (str): Folder holding the Chroma store.
        collection_name (str): Name of the collection to create.

    Returns:
        The newly created and populated collection.
    """
    # Wipe any previous store so the collection can be created fresh.
    if os.path.exists(persist_directory):
        shutil.rmtree(persist_directory)
    os.makedirs(persist_directory, exist_ok=True)

    collection = chromadb.PersistentClient(persist_directory).create_collection(name=collection_name)

    ids, metadatas, vectors = [], [], []
    for position, _ in enumerate(text_embeddings):
        key = f"listing_{position}"
        image_vector = image_embeddings[key]
        fused_vector = fused_embeddings[key]

        ids.append(key)
        metadatas.append({
            "listing_id": key,
            # Serialised to JSON so the metadata value is a plain string.
            "image_embedding_json": json.dumps(image_vector),
        })
        vectors.append(fused_vector)

    collection.add(
        documents=listings,
        embeddings=vectors,
        metadatas=metadatas,
        ids=ids,
    )
    return collection

## Getting Buyer Profile <a id='PROFILE'></a>

In [22]:
def get_buyer_profile():
    """Interactively ask the buyer a fixed set of questions.

    Each question is shown with a trailing space as the ``input`` prompt.

    Returns:
        str: The answers, in question order, joined by single spaces.
    """
    buyer_questions = (
        "How big do you want your house to be?",
        "What are 3 most important things for you in choosing this property?",
        "Which amenities would you like?",
        "Which transportation options are important to you?",
        "How urban do you want your neighborhood to be?",
    )

    replies = []
    for question in buyer_questions:
        replies.append(input(question + " "))

    return " ".join(replies)

## Generate Personalized Listings <a id='P-LISTINGS'></a>

In [56]:
def personalize_listings(llm, db, buyer_profile, k=3):
    """Retrieve the listings closest to the buyer profile and rewrite them.

    The profile is embedded with the notebook-level ``Qwen2_processor`` and
    ``Qwen2_model``, the ``k`` nearest listings are fetched from ``db``, and the
    LLM is asked to rewrite each one with the buyer's preferences in mind.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` string.
        db: Collection exposing ``query(query_embeddings=..., n_results=...)``.
        buyer_profile (str): Free-text description of the buyer's preferences.
        k (int): Number of listings to retrieve.

    Returns:
        dict: Listing id -> personalised listing text.
    """
    # Wrap the profile in a list so the whole text is embedded as one item;
    # passing the bare string makes generate_text_embeddings loop over its
    # characters, so the query vector would come from the first character only.
    buyer_embeddings = generate_text_embeddings([buyer_profile], Qwen2_processor, Qwen2_model)
    results = db.query(query_embeddings=buyer_embeddings[0], n_results=k)
    personalized = {}

    # Index [0] selects the hits for the single query that was sent.
    for idx, res in zip(results['ids'][0], results['documents'][0]):
        prompt = f""" 
You are a helpful real estate assistant.

Buyer's Preferences:
{buyer_profile}

Original Listing:
{res}

Please rewrite the original description and neighborhood description to subtly emphasize aspects aligned with the buyer's preferences, without changing facts.
"""
        personalized[idx] = llm.invoke(prompt).content.strip()

    return personalized

## Saving Personalized Listings <a id='SP-LISTINGS'></a>

In [64]:
def save_personalized_listings(personalized, output_file='personalized_listings.json'):
    """Dump the personalised listings to disk as compact JSON.

    Args:
        personalized (dict): Listing id -> personalised listing text.
        output_file (str): Destination path; an existing file is overwritten.

    Returns:
        None
    """
    with open(output_file, "w") as json_file:
        json.dump(personalized, json_file)

## Execution Blocks <a id='BLOCKS'></a>
### Blocks that only need to run once; their results are saved and can be reused later

In [13]:
# One-off step: every call to generate_listings() issues fresh LLM requests.
print("🔧 Step 1: Generating real estate listings using the language model...")
listings = generate_listings(llm, prompt_template)

# The message below is printed before the save runs. save_listings_to_json()
# returns the list it was given, so `listings` is unchanged by the reassignment.
print("💾 Step 2: Saved listings to a JSON file...")
listings = save_listings_to_json(listings)

🔧 Step 1: Generating real estate listings using the language model...
💾 Step 2: Saved listings to a JSON file...


In [10]:
# Build one Stable Diffusion prompt per generated listing.
print("🖼️ Step 3: Generating image prompts...")
prompts = generate_prompts(listings)

🖼️ Step 3: Generating image prompts...


In [16]:
# NOTE(review): `sd_pipe` is only defined by the commented-out loader cell in the
# setup section, so on a fresh kernel this cell raises NameError until that
# loader (and its import) is re-enabled.
print("🧩 Step 4: Generating images using Stable Diffusion pipeline...")
generate_images(sd_pipe, prompts)
print("🧩 Step 4 Complete: Images have been saved!")

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["cozy living room, updated kitchen with granite countertops and stainless steel appliances, and a bright dining area overlooking the backyard. the master bedroom offers a private retreat with a walk - in closet and en - suite bathroom. the backyard is perfect for entertaining with a patio, lush lawn, and mature trees for added privacy.. set in a picturesque neighborhood known for oakwood estates is a family - friendly neighborhood known for its peaceful atmosphere and close - knit community. residents enjoy easy access to nearby parks, schools, and shopping centers. the neighborhood is conveniently located near major highways, making it a great choice for commuters. don 't miss out on the opportunity to make this lovely home yours in oakwood estates.. clear blue skies, natural lighting, professional real estate photography."]


🧩 Step 4: Generating images using Stable Diffusion pipeline...


  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["with stainless steel appliances, and a separate dining area perfect for entertaining guests. the master bedroom features an ensuite bathroom with a luxurious soaking tub, perfect for unwinding after a long day. step outside to the beautifully landscaped backyard with a patio area, ideal for enjoying the california sunshine. with hardwood floors throughout, ample storage space, and a 2 - car garage, this home is perfect for comfortable and convenient living.. set in a picturesque neighborhood known for westwood hills is known for its tree - lined streets, family - friendly atmosphere, and proximity to top - rated schools. residents enjoy easy access to hiking trails, parks, and shopping centers, making it a sought - after location for families and professionals alike. with a short drive to downtown and close proximity to major freeways, this neighborhood offers the perfect balance of

  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['steel appliances, and a dining area perfect for entertaining. the master bedroom boasts an en - suite bathroom and ample closet space. the backyard is a private oasis with a patio and lush landscaping, ideal for enjoying the outdoors. with hardwood floors throughout and plenty of natural light, this home is move - in ready.. set in a picturesque neighborhood known for forest hills is known for its tree - lined streets, family - friendly atmosphere, and proximity to parks and recreational activities. residents enjoy easy access to hiking trails, shopping centers, and top - rated schools. the neighborhood also features a vibrant dining scene with trendy restaurants and cafes. with its convenient location and welcoming community, forest hills is a sought - after place to call home.. clear blue skies, natural lighting, professional real estate photography.']


  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['- line appliances, and a spacious living area with a cozy fireplace. the master suite boasts a spa - like bathroom and a private balcony overlooking the hollywood sign. the outdoor space is perfect for entertaining with a large deck, infinity pool, and built - in barbecue area.. set in a picturesque neighborhood known for hollywood hills is known for its exclusive homes, celebrity residents, and iconic views of the city. this property is just minutes away from the trendy shops and restaurants of sunset boulevard and the excitement of hollywood. residents can enjoy hiking in nearby runyon canyon or taking a short drive to the beaches of santa monica. with its prime location and luxurious amenities, this home offers the ultimate hollywood lifestyle.. clear blue skies, natural lighting, professional real estate photography.']


  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["and a dining area perfect for entertaining guests. upstairs, you 'll find three well - appointed bedrooms with ample closet space and two full bathrooms. the lower level offers a cozy family room and a laundry room. the private backyard is a serene oasis, ideal for relaxing or hosting summer bbqs.. set in a picturesque neighborhood known for park slope is known for its tree - lined streets, trendy boutiques, and top - rated restaurants. prospect park is just a short walk away, offering plenty of outdoor recreational activities. the neighborhood also boasts excellent schools, convenient access to public transportation, and a vibrant community atmosphere. don 't miss this opportunity to own a piece of brooklyn's most coveted neighborhood.. clear blue skies, natural lighting, professional real estate photography."]


  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['that fill the space with natural light. the updated kitchen boasts stainless steel appliances, granite countertops, and a breakfast bar. the master suite includes a private balcony overlooking the beautifully landscaped backyard, perfect for enjoying morning coffee or evening sunsets. additional features include hardwood floors throughout, a formal dining room, and a detached garage.. set in a picturesque neighborhood known for pacific heights is known for its upscale and elegant feel, with tree - lined streets, stunning views of the san francisco bay, and proximity to top - rated schools. residents enjoy easy access to boutique shops, upscale restaurants, and iconic landmarks like the presidio and golden gate park. with its combination of luxury homes and vibrant community, pacific heights offers a truly unmatched living experience in san francisco.. clear blue skies, natural light

  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['stainless steel appliances, and a spacious living room with a cozy fireplace. the master suite boasts a luxurious en - suite bathroom with a soaking tub and a walk - in closet. step outside to the private backyard oasis complete with a deck, perfect for entertaining guests or enjoying a quiet evening under the stars.. set in a picturesque neighborhood known for west hollywood is known for its vibrant nightlife, trendy shops, and delicious restaurants. this home is conveniently located near sunset strip, melrose avenue, and santa monica boulevard, offering a variety of entertainment options and dining experiences. enjoy easy access to parks, fitness studios, and grocery stores, making it a desirable location for those seeking a lively and dynamic lifestyle.. clear blue skies, natural lighting, professional real estate photography.']


  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['tops and stainless steel appliances, and a master suite with a walk - in closet and ensuite bathroom. the backyard is perfect for entertaining with a covered patio and lush greenery. with fresh paint and new flooring throughout, this home is move - in ready and waiting for its new owners to make it their own.. set in a picturesque neighborhood known for willow creek is a family - friendly neighborhood with tree - lined streets and well - maintained homes. the community offers a park with playgrounds and walking trails, perfect for outdoor activities. nearby amenities include shopping centers, restaurants, and top - rated schools, making it an ideal place to call home for families and professionals alike. with easy access to major highways, commuting to downtown and other parts of the city is a breeze.. clear blue skies, natural lighting, professional real estate photography.']


  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['large windows that flood the space with natural light. the gourmet kitchen is equipped with stainless steel appliances, quartz countertops, and a spacious island. the master bedroom boasts a walk - in closet and a luxurious en - suite bathroom with a soaking tub and separate shower. the backyard oasis includes a deck, perfect for outdoor dining, and a lush lawn, ideal for relaxing or entertaining.. set in a picturesque neighborhood known for larchmont village is a highly sought - after neighborhood known for its tree - lined streets, historic homes, and vibrant community. residents enjoy strolling to the nearby shops, cafes, and restaurants on larchmont boulevard. this prime location also offers easy access to hancock park, hollywood, and downtown la, making it a convenient and desirable place to call home.. clear blue skies, natural lighting, professional real estate photography.']

  0%|          | 0/50 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["stainless steel appliances, and a large backyard perfect for entertaining. the master bedroom boasts an en - suite bathroom and walk - in closet, while the other bedrooms offer ample space and natural light. with hardwood floors throughout, a two - car garage, and a convenient location near shops and restaurants, this house is a perfect place to call home.. set in a picturesque neighborhood known for sunset heights is a vibrant and family - friendly neighborhood known for its tree - lined streets and close - knit community. residents enjoy easy access to parks, schools, and public transportation, making it a convenient location for families and commuters alike. with trendy cafes, boutiques, and local markets just a short walk away, sunset heights offers the perfect blend of urban convenience and suburban charm. don 't miss out on this opportunity to live in one of the most sought - 

  0%|          | 0/50 [00:00<?, ?it/s]

🧩 Step 4 Complete: Images have been saved!


In [17]:
# Image embeddings are read from the default "listing_images" folder and keyed
# by file name ("listing_<i>").
print("🧠 Step 5: Generating image embeddings using Qwen2 model...")
image_embeddings = generate_image_embeddings(Qwen2_processor, Qwen2_model)

# Text embeddings come back as a list in the same order as `listings`.
print("📝 Step 6: Generating text embeddings for listings...")
text_embeddings = generate_text_embeddings(listings, Qwen2_processor, Qwen2_model)

🧠 Step 5: Generating image embeddings using Qwen2 model...
📝 Step 6: Generating text embeddings for listings...


In [18]:
# Weighted sum per listing: 0.55 * text vector + 0.45 * image vector.
print("📝 Step 7: Generating fused embeddings using text and image embeddings...")
fused_embeddings = fused_embeddings_weighted(image_embeddings, text_embeddings, image_weight=0.45, text_weight=0.55)

📝 Step 7: Generating fused embeddings using text and image embeddings...


In [19]:
# write_to_chromadb() deletes any existing "./chroma_db" folder before writing.
# The returned collection is kept in `db`, which main() uses for retrieval.
print("💾 Step 8: Saving listings and embeddings to ChromaDB...")
db = write_to_chromadb(listings, text_embeddings, image_embeddings, fused_embeddings)
print("🎉 Listings successfully saved to ChromaDB.")

💾 Step 8: Saving listings and embeddings to ChromaDB...
🎉 Listings successfully saved to ChromaDB.


## Main Execution Block <a id='MAIN'></a>
### Blocks that can be run multiple times

In [66]:
def main():
    """Run the repeatable part of the pipeline: profile -> retrieval -> save.

    Uses the collection left in the notebook-level ``db`` variable when it
    exists. Otherwise it reopens the persisted store, so this cell also works
    on a fresh kernel once the one-off blocks have been run at least once.
    """
    print("🔍 Step 9: Retrieving information from ChromaDB...")
    collection = globals().get("db")
    if collection is None:
        # Path and collection name match write_to_chromadb's defaults. No
        # embedding function is needed because queries pass explicit vectors.
        client = chromadb.PersistentClient(path="./chroma_db")
        collection = client.get_collection("real_estate", embedding_function=None)

    print("👤 Step 10: Getting buyer profile input...")
    buyer_profile = get_buyer_profile()

    print("✨ Step 11: Personalizing listings based on buyer profile...")
    personalized = personalize_listings(llm, collection, buyer_profile)

    print("💾 Step 12: Saving personalized listings to output file...")
    save_personalized_listings(personalized)

    print("🎉 Personalized listings saved successfully.")

In [67]:
# Entry point: runs main() when this code executes as the top-level script or
# notebook cell. main() prompts for input, so this cell is interactive.
if __name__ == "__main__":
    main()

🔍 Step 9: Retrieving information from ChromaDB...
👤 Step 10: Getting buyer profile input...
How big do you want your house to be? comfortable three-bedroom house with a spacious kitchen and a cozy living room.
What are 3 most important things for you in choosing this property? quiet neighborhood, good local schools, and convenient shopping options.
Which amenities would you like? backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.
Which transportation options are important to you? Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.
How urban do you want your neighborhood to be? A balance between suburban tranquility and access to urban amenities like restaurants and theaters.
✨ Step 11: Personalizing listings based on buyer profile...
💾 Step 12: Saving personalized listings to output file...
🎉 Personalized listings saved successfully.
