# HomeMatch - Real Estate Property Matching System

This notebook implements an AI-powered real estate matching system that uses Large Language Models (LLMs) and vector databases to transform standard real estate listings into personalized narratives.

## Project Overview
- Generate synthetic real estate listings using OpenAI GPT
- Create vector embeddings and store in ChromaDB
- Implement semantic search based on buyer preferences
- Generate personalized listing descriptions

## Step 1: Environment Setup and Imports

In [None]:
# Environment setup
import os

# Use setdefault so that a real key/base URL already exported in the shell or
# kernel environment is never overwritten by the placeholder values below.
# Replace the placeholder (or export OPENAI_API_KEY) before running, and never
# commit a real key to the notebook.
os.environ.setdefault("OPENAI_API_KEY", "YOUR API KEY")
os.environ.setdefault("OPENAI_API_BASE", "https://openai.vocareum.com/v1")

from langchain.llms import OpenAI

In [None]:
# Import required libraries
import json
import pandas as pd
from typing import List, Dict, Any
from pydantic import BaseModel, Field

# LangChain imports
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser

# Vector database and embeddings
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import numpy as np

# For progress tracking
from tqdm import tqdm

## Step 2: Initialize Models and Components

In [None]:
# Initialize OpenAI LLM.
# "text-davinci-003" has been retired by OpenAI (shut down in January 2024),
# so completions requested from it fail. "gpt-3.5-turbo-instruct" is the
# replacement OpenAI recommends for the legacy completions endpoint, which is
# the endpoint the langchain.llms.OpenAI wrapper talks to.
llm = OpenAI(
    temperature=0.7,
    max_tokens=1000,
    model_name="gpt-3.5-turbo-instruct"
)

# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize ChromaDB client
chroma_client = chromadb.Client()

print("Models initialized successfully!")

## Step 3: Define Data Models

In [None]:
# Pydantic models for structured output
class PropertyListing(BaseModel):
    """Schema for one real estate listing.

    Numeric attributes are declared as ``int``; the free-text attributes are
    ``str``. Every field carries a human-readable ``description``.
    """

    # Where the property is located
    neighborhood: str = Field(
        description="Name of the neighborhood",
    )
    # Asking price and basic dimensions of the home
    price: int = Field(
        description="Property price in USD",
    )
    bedrooms: int = Field(
        description="Number of bedrooms",
    )
    bathrooms: int = Field(
        description="Number of bathrooms",
    )
    house_size: int = Field(
        description="House size in square feet",
    )
    # Narrative text about the home and its surroundings
    description: str = Field(
        description="Detailed property description",
    )
    neighborhood_description: str = Field(
        description="Description of the neighborhood",
    )

class BuyerPreferences(BaseModel):
    """Schema for the free-text answers a buyer gives about their ideal home.

    All five fields are plain strings, one per preference question.
    """

    house_size: str = Field(
        description="Preferred house size description",
    )
    important_features: str = Field(
        description="Three most important features",
    )
    amenities: str = Field(
        description="Desired amenities",
    )
    transportation: str = Field(
        description="Transportation preferences",
    )
    neighborhood_type: str = Field(
        description="Preferred neighborhood type",
    )


print("Data models defined successfully!")

## Step 4: Generate Real Estate Listings

In [None]:
# Property listing generation prompt.
# The response is parsed with json.loads and the numeric fields are later
# formatted with ":,", so the prompt explicitly asks for a bare JSON object
# whose numeric fields are plain integers (no "$", thousands separators or
# units). The only template variable is {listing_number}; no other braces are
# used so nothing needs escaping.
listing_prompt = PromptTemplate(
    input_variables=["listing_number"],
    template="""
    Generate a detailed real estate listing for property #{listing_number}.
    Create diverse properties including urban apartments, suburban homes, rural properties,
    eco-friendly homes, luxury estates, and starter homes.

    Include the following details:
    - Neighborhood: Choose from diverse areas (urban, suburban, rural, waterfront, etc.)
    - Price: Realistic price between 200000 and 2000000 USD
    - Bedrooms: 1-5 bedrooms
    - Bathrooms: 1-4 bathrooms (whole number)
    - House Size: 800-4000 square feet
    - Description: Compelling 100-150 word description highlighting unique features
    - Neighborhood Description: 50-75 word description of the local area

    Respond with ONLY a single valid JSON object (no markdown fences, no extra commentary)
    with exactly these keys: neighborhood, price, bedrooms, bathrooms, house_size, description, neighborhood_description

    The values of price, bedrooms, bathrooms and house_size must be plain JSON integers
    (for example 450000, never "$450,000" or "1,800 sq ft"). All other values must be JSON strings.
    """
)

# Generate listings
def generate_listings(num_listings: int = 12) -> List[Dict]:
    """Generate synthetic real estate listings with the LLM.

    One completion is requested per listing. The span from the first ``{`` to
    the last ``}`` of the response is parsed as JSON and validated against
    ``PropertyListing``, so every returned listing has all expected fields
    with the declared types. Fields not declared on the model are dropped by
    the validation step. Listings that cannot be generated, parsed or
    validated are reported on stdout and skipped (best effort), so the result
    may contain fewer than ``num_listings`` entries.

    Args:
        num_listings: Number of listings to request; ids run from 1 to this
            value inclusive.

    Returns:
        A list of listing dicts holding the ``PropertyListing`` fields plus an
        integer ``id`` key equal to the listing number.
    """
    listings = []
    chain = LLMChain(llm=llm, prompt=listing_prompt)

    print(f"Generating {num_listings} property listings...")

    for i in tqdm(range(1, num_listings + 1)):
        # Step 1: call the model. Any failure here (network, auth, rate
        # limit, ...) only costs this one listing.
        try:
            result = chain.run(listing_number=i)
        except Exception as e:
            print(f"Error generating listing {i}: {e}")
            continue

        # Step 2: cut the JSON object out of any surrounding chatter.
        json_start = result.find('{')
        json_end = result.rfind('}') + 1
        if json_start == -1 or json_end <= json_start:
            print(f"Failed to parse listing {i}")
            continue

        # Step 3: parse and validate. json.JSONDecodeError and pydantic's
        # ValidationError are both ValueError subclasses; TypeError covers a
        # payload that cannot be expanded into keyword arguments.
        try:
            validated = PropertyListing(**json.loads(result[json_start:json_end]))
        except (ValueError, TypeError) as e:
            print(f"Failed to parse listing {i}: {e}")
            continue

        # pydantic v2 exposes model_dump(); v1 only has dict().
        if hasattr(validated, "model_dump"):
            listing_data = validated.model_dump()
        else:
            listing_data = validated.dict()
        listing_data['id'] = i
        listings.append(listing_data)

    return listings

# Build the synthetic dataset used by every later cell
property_listings = generate_listings(12)
print(f"Successfully generated {len(property_listings)} listings")

# Show the first listing (if any) so the record layout is visible
if len(property_listings) > 0:
    print(
        "\nExample listing:",
        json.dumps(property_listings[0], indent=2),
        sep="\n",
    )

## Step 5: Save Listings to File

In [None]:
# Save listings to JSON file (explicit UTF-8 so the file does not depend on
# the platform's default encoding)
with open('listings.json', 'w', encoding='utf-8') as f:
    json.dump(property_listings, f, indent=2)

print("Listings saved to 'listings.json'")

# Create a summary DataFrame
listings_df = pd.DataFrame(property_listings)
print("\nListings Summary:")
print(f"Total Properties: {len(listings_df)}")
if listings_df.empty:
    # With no listings the frame has no 'price'/'bedrooms'/'house_size'
    # columns, so the statistics below would raise KeyError.
    print("No listings were generated, so there are no statistics to report.")
else:
    print(f"Price Range: ${listings_df['price'].min():,} - ${listings_df['price'].max():,}")
    print(f"Bedrooms Range: {listings_df['bedrooms'].min()} - {listings_df['bedrooms'].max()}")
    print(f"Average House Size: {listings_df['house_size'].mean():.0f} sq ft")

## Step 6: Create Vector Database and Store Embeddings

In [None]:
# Create ChromaDB collection
def create_vector_database(listings: List[Dict]):
    """Create a fresh ChromaDB collection holding one embedding per listing.

    Any existing collection named ``property_listings`` is deleted first so
    repeated runs start from a clean state. Each listing is flattened into a
    single text document, embedded with ``embedding_model`` and stored
    together with its numeric attributes as metadata. The collection is
    configured for cosine distance.

    Args:
        listings: Listing dicts containing ``id``, ``neighborhood``,
            ``price``, ``bedrooms``, ``bathrooms``, ``house_size``,
            ``description`` and ``neighborhood_description``.

    Returns:
        The newly created collection. It is empty when ``listings`` is empty.
    """
    collection_name = "property_listings"
    try:
        # Delete existing collection if it exists
        chroma_client.delete_collection(name=collection_name)
    except Exception:
        # Best effort: the collection usually just does not exist yet (the
        # exception type raised for that depends on the chromadb version).
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        pass

    collection = chroma_client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}  # Use cosine similarity
    )

    if not listings:
        # Nothing to embed; skip the encode/add calls on empty input.
        print("No listings supplied; vector database left empty")
        return collection

    # Prepare documents for embedding
    documents = []
    metadatas = []
    ids = []

    print("Preparing documents for embedding...")

    for listing in listings:
        # Combine all text fields for embedding
        combined_text = f"""
        Neighborhood: {listing['neighborhood']}
        Price: ${listing['price']:,}
        Bedrooms: {listing['bedrooms']}
        Bathrooms: {listing['bathrooms']}
        Size: {listing['house_size']} sq ft
        Description: {listing['description']}
        Area: {listing['neighborhood_description']}
        """.strip()

        documents.append(combined_text)
        metadatas.append({
            "id": listing['id'],
            "neighborhood": listing['neighborhood'],
            "price": listing['price'],
            "bedrooms": listing['bedrooms'],
            "bathrooms": listing['bathrooms'],
            "house_size": listing['house_size']
        })
        # Collection ids are strings; search code converts them back to int.
        ids.append(str(listing['id']))

    # Generate embeddings
    print("Generating embeddings...")
    embeddings = embedding_model.encode(documents, show_progress_bar=True)

    # Add to collection
    print("Adding to vector database...")
    collection.add(
        documents=documents,
        embeddings=embeddings.tolist(),
        metadatas=metadatas,
        ids=ids
    )

    print(f"Successfully added {len(documents)} listings to vector database")
    return collection

# Build the vector database from the generated listings; the resulting
# collection is what the search step queries.
property_collection = create_vector_database(property_listings)

## Step 7: Define Buyer Preferences

In [None]:
# Questions a buyer is asked, in the order the preference keys are filled in
sample_questions = [
    "How big do you want your house to be?",
    "What are 3 most important things for you in choosing this property?",
    "Which amenities would you like?",
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?",
]

# Canned answer sets, one per buyer persona, keyed by persona name
buyer_scenarios = dict(
    young_professional=dict(
        house_size="A modern 2-3 bedroom apartment or condo, around 1200-1500 sq ft",
        important_features="Modern amenities, good internet connectivity, and proximity to work/entertainment",
        amenities="Fitness center, rooftop terrace, modern kitchen with updated appliances",
        transportation="Easy access to public transit, bike-friendly roads, proximity to major highways",
        neighborhood_type="Urban or trendy suburban area with restaurants, cafes, and nightlife",
    ),
    growing_family=dict(
        house_size="A comfortable 3-4 bedroom house with a spacious kitchen and living room, 2000-2500 sq ft",
        important_features="Good schools nearby, safe neighborhood, and family-friendly amenities",
        amenities="Large backyard, two-car garage, modern appliances, and good storage space",
        transportation="Family-friendly roads, school bus access, safe walking paths",
        neighborhood_type="Quiet suburban neighborhood with parks, schools, and family amenities",
    ),
    eco_conscious=dict(
        house_size="An energy-efficient 2-3 bedroom home with sustainable features, around 1800-2200 sq ft",
        important_features="Eco-friendly materials, energy efficiency, and sustainable living options",
        amenities="Solar panels, energy-efficient appliances, composting area, vegetable garden space",
        transportation="Bike paths, walkable neighborhood, access to public transportation",
        neighborhood_type="Environmentally conscious community with green spaces and sustainability focus",
    ),
)

print("Buyer preference scenarios defined:")
for scenario, prefs in buyer_scenarios.items():
    print("- " + scenario.replace('_', ' ').title())

## Step 8: Implement Semantic Search

In [None]:
def search_properties(buyer_preferences: Dict[str, str], top_k: int = 3) -> List[Dict]:
    """Return the listings most similar to a buyer's stated preferences.

    The five preference answers are joined into one query text, embedded with
    ``embedding_model`` and matched against ``property_collection``. Each hit
    is looked up in ``property_listings`` by id.

    Args:
        buyer_preferences: Mapping with the keys ``house_size``,
            ``important_features``, ``amenities``, ``transportation`` and
            ``neighborhood_type``.
        top_k: Maximum number of matches to return. It is clamped to the
            number of entries in the collection.

    Returns:
        A list of new dicts, one per match, each a copy of the listing with
        an added ``similarity_score`` (``1 - distance``). The dicts in
        ``property_listings`` are not modified, so scores from one search
        never leak into another. Empty when the collection is empty or
        ``top_k`` is not positive.
    """
    # Create search query from preferences
    search_query = f"""
    Looking for: {buyer_preferences['house_size']}
    Important features: {buyer_preferences['important_features']}
    Desired amenities: {buyer_preferences['amenities']}
    Transportation needs: {buyer_preferences['transportation']}
    Neighborhood preference: {buyer_preferences['neighborhood_type']}
    """.strip()

    print(f"Searching for properties matching: {search_query[:100]}...")

    # Never ask the index for more neighbours than it holds.
    n_results = min(top_k, property_collection.count())
    if n_results <= 0:
        return []

    # Generate query embedding
    query_embedding = embedding_model.encode([search_query])

    # Search in vector database
    results = property_collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )

    # Index listings by id once instead of scanning the list for every hit.
    # setdefault keeps the first listing should an id ever appear twice.
    listings_by_id = {}
    for listing in property_listings:
        listings_by_id.setdefault(listing['id'], listing)

    # Results are nested per query; a single query was sent, hence [0].
    matched_properties = []
    for raw_id, distance in zip(results['ids'][0], results['distances'][0]):
        full_property = listings_by_id.get(int(raw_id))
        if not full_property:
            continue
        # Convert distance to similarity and attach it to a copy.
        matched_properties.append({**full_property, 'similarity_score': 1 - distance})

    return matched_properties

# Smoke-test the search with the young professional persona
print("Testing property search...")
test_preferences = buyer_scenarios["young_professional"]
matched_properties = search_properties(test_preferences, top_k=3)

print(f"\nFound {len(matched_properties)} matching properties:")
for i, prop in enumerate(matched_properties, 1):
    # One four-line summary per match
    print(
        f"\n{i}. {prop['neighborhood']} - ${prop['price']:,}",
        f"   {prop['bedrooms']}BR/{prop['bathrooms']}BA, {prop['house_size']} sq ft",
        f"   Similarity: {prop['similarity_score']:.3f}",
        f"   {prop['description'][:100]}...",
        sep="\n",
    )

## Step 9: Generate Personalized Descriptions

In [None]:
# Personalization prompt.
# Every placeholder is a flat variable name. The previous template indexed
# into a dict inside the placeholder ("{buyer_preferences[house_size]}"),
# which is not a plain template variable: it relies on str.format item lookup
# and does not match the declared input variable names, so prompt validation
# can reject it (NOTE(review): exact behaviour depends on the installed
# LangChain version). The five preferences are therefore passed as separate
# variables.
_PREFERENCE_KEYS = (
    "house_size",
    "important_features",
    "amenities",
    "transportation",
    "neighborhood_type",
)

personalization_prompt = PromptTemplate(
    input_variables=["original_description", "property_details", *_PREFERENCE_KEYS],
    template="""
    You are a skilled real estate agent. Rewrite the following property description to appeal specifically to a buyer with these preferences:

    BUYER PREFERENCES:
    - House size preference: {house_size}
    - Important features: {important_features}
    - Desired amenities: {amenities}
    - Transportation needs: {transportation}
    - Neighborhood preference: {neighborhood_type}

    PROPERTY DETAILS:
    {property_details}

    ORIGINAL DESCRIPTION:
    {original_description}

    INSTRUCTIONS:
    - Rewrite the description to highlight aspects that match the buyer's preferences
    - Keep all factual information accurate (price, size, location, etc.)
    - Use persuasive language that connects the property features to the buyer's needs
    - Maintain a professional, enthusiastic tone
    - Keep the description between 150-200 words

    PERSONALIZED DESCRIPTION:
    """
)

def personalize_description(property_data: Dict, buyer_preferences: Dict[str, str]) -> str:
    """Rewrite a listing's description so it speaks to one buyer's preferences.

    Args:
        property_data: Listing dict with ``neighborhood``, ``price``,
            ``bedrooms``, ``bathrooms``, ``house_size``, ``description`` and
            ``neighborhood_description``.
        buyer_preferences: Mapping of preference answers. A key that is
            missing is sent to the model as ``"Not specified"`` instead of
            raising.

    Returns:
        The model's rewritten description with surrounding whitespace removed.
    """
    property_details = f"""
    Location: {property_data['neighborhood']}
    Price: ${property_data['price']:,}
    Size: {property_data['bedrooms']} bedrooms, {property_data['bathrooms']} bathrooms
    Square Footage: {property_data['house_size']} sq ft
    Area Description: {property_data['neighborhood_description']}
    """

    chain = LLMChain(llm=llm, prompt=personalization_prompt)

    # Flatten the preference dict into the individual template variables.
    preference_values = {
        key: buyer_preferences.get(key, "Not specified") for key in _PREFERENCE_KEYS
    }

    result = chain.run(
        original_description=property_data['description'],
        property_details=property_details,
        **preference_values
    )

    return result.strip()

# Test personalization
def demonstrate_personalization(scenario_name: str):
    """Print original vs. personalized descriptions for one buyer scenario.

    Looks up ``scenario_name`` in ``buyer_scenarios``, fetches the two best
    matching properties and, for each, prints its key facts, its original
    description and the personalized rewrite.
    """
    heading_rule = '=' * 60
    print("\n" + heading_rule)
    print("BUYER SCENARIO: " + scenario_name.replace('_', ' ').upper())
    print(heading_rule)

    preferences = buyer_scenarios[scenario_name]

    for position, listing in enumerate(search_properties(preferences, top_k=2), start=1):
        price = listing['price']
        beds, baths, size = listing['bedrooms'], listing['bathrooms'], listing['house_size']

        print(f"\n--- PROPERTY {position}: {listing['neighborhood']} ---")
        print(f"${price:,} | {beds}BR/{baths}BA | {size} sq ft")
        print(f"Similarity Score: {listing['similarity_score']:.3f}")

        print("\nORIGINAL DESCRIPTION:")
        print(listing['description'])

        # The header is printed before the (slow) LLM call, as a progress cue
        print("\nPERSONALIZED DESCRIPTION:")
        print(personalize_description(listing, preferences))
        print("-" * 50)

# Demonstrate personalization for the young professional scenario; any other
# key of buyer_scenarios can be passed the same way.
demonstrate_personalization("young_professional")

## Step 10: Complete HomeMatch Application

In [None]:
class HomeMatchSystem:
    """Facade that ties preference collection, search, personalization and display together.

    NOTE(review): the listings, collection, LLM and embedding model passed to
    the constructor are stored on the instance, but the methods below delegate
    to the module-level ``search_properties`` / ``personalize_description``
    functions and ``buyer_scenarios`` dict, which use the notebook globals.
    """

    def __init__(self, property_listings, property_collection, llm, embedding_model):
        """Store the collaborators on the instance.

        Args:
            property_listings: Listing dicts generated earlier in the notebook.
            property_collection: Vector collection holding the listing embeddings.
            llm: Language model used for text generation.
            embedding_model: Model used to embed text.
        """
        self.property_listings = property_listings
        self.property_collection = property_collection
        self.llm = llm
        self.embedding_model = embedding_model

    def collect_preferences_interactive(self):
        """Ask the five preference questions on stdin.

        Returns:
            Dict mapping ``house_size``, ``important_features``,
            ``amenities``, ``transportation`` and ``neighborhood_type`` to the
            text the user typed.
        """
        print("\n" + "="*50)
        print("WELCOME TO HOMEMATCH - PROPERTY PREFERENCE COLLECTION")
        print("="*50)

        preferences = {}
        questions = [
            ("house_size", "How big do you want your house to be? (Describe your ideal size and layout)"),
            ("important_features", "What are the 3 most important things for you in choosing this property?"),
            ("amenities", "Which amenities would you like? (e.g., pool, garage, garden)"),
            ("transportation", "Which transportation options are important to you?"),
            ("neighborhood_type", "How would you describe your ideal neighborhood?")
        ]

        for key, question in questions:
            print(f"\n{question}")
            preferences[key] = input("> ")

        return preferences

    def find_and_personalize_properties(self, preferences, num_results=3):
        """Find matching properties and attach a personalized description to each.

        Args:
            preferences: Buyer preference mapping (same keys as returned by
                ``collect_preferences_interactive``).
            num_results: Maximum number of properties to return.

        Returns:
            A list of new dicts, each a copy of a match with an added
            ``personalized_description`` key. The matches returned by the
            search are not mutated, so descriptions written for one buyer
            never end up on records reused for another.
        """
        # Search for matching properties
        matches = search_properties(preferences, top_k=num_results)

        # Generate personalized descriptions on copies of the matches
        return [
            {**match, 'personalized_description': personalize_description(match, preferences)}
            for match in matches
        ]

    def display_recommendations(self, personalized_matches):
        """Print a formatted card for every recommended property.

        Args:
            personalized_matches: Dicts containing ``neighborhood``,
                ``price``, ``bedrooms``, ``bathrooms``, ``house_size``,
                ``similarity_score``, ``personalized_description`` and
                ``neighborhood_description``.
        """
        print("\n" + "="*60)
        print("YOUR PERSONALIZED PROPERTY RECOMMENDATIONS")
        print("="*60)

        for i, prop in enumerate(personalized_matches, 1):
            print(f"\n{'='*40}")
            print(f"PROPERTY {i}: {prop['neighborhood'].upper()}")
            print(f"{'='*40}")
            print(f"💰 Price: ${prop['price']:,}")
            print(f"🏠 Size: {prop['bedrooms']} bedrooms, {prop['bathrooms']} bathrooms")
            print(f"📐 Square Footage: {prop['house_size']} sq ft")
            print(f"📊 Match Score: {prop['similarity_score']:.1%}")

            print("\n📝 PERSONALIZED DESCRIPTION:")
            print(prop['personalized_description'])

            print("\n🏘️ NEIGHBORHOOD INFO:")
            print(prop['neighborhood_description'])

            # Separator between cards, but not after the last one
            if i < len(personalized_matches):
                print("\n" + "-"*60)

    def run_demo_mode(self):
        """Run the full pipeline for the three predefined buyer scenarios."""
        print("\nRunning HomeMatch Demo Mode...")

        scenarios = ["young_professional", "growing_family", "eco_conscious"]

        for scenario in scenarios:
            print(f"\n\n{'*'*70}")
            print(f"DEMO SCENARIO: {scenario.replace('_', ' ').title()}")
            print(f"{'*'*70}")

            preferences = buyer_scenarios[scenario]

            print("\nBuyer Preferences:")
            for key, value in preferences.items():
                print(f"- {key.replace('_', ' ').title()}: {value}")

            matches = self.find_and_personalize_properties(preferences, num_results=2)
            self.display_recommendations(matches)

# Wire the notebook-level components into a single application object
homematch = HomeMatchSystem(
    property_listings=property_listings,
    property_collection=property_collection,
    llm=llm,
    embedding_model=embedding_model,
)

# Status line followed by a short menu of entry points
print(
    "\nHomeMatch System initialized successfully!",
    "\nAvailable operations:",
    "1. homematch.run_demo_mode() - Run demonstration with predefined scenarios",
    "2. homematch.collect_preferences_interactive() - Interactive preference collection",
    "3. homematch.find_and_personalize_properties(preferences) - Find matching properties",
    sep="\n",
)

## Step 11: Run HomeMatch Demo

In [None]:
# Run the complete HomeMatch demonstration: each predefined buyer scenario is
# searched, personalized and displayed in turn.
homematch.run_demo_mode()

## Step 12: Interactive Testing

In [None]:
# Interactive testing (uncomment to run)
# preferences = homematch.collect_preferences_interactive()
# matches = homematch.find_and_personalize_properties(preferences)
# homematch.display_recommendations(matches)

## Step 13: System Testing and Validation

In [None]:
# Extra buyer personas used only by the system test below
test_cases = dict(
    luxury_buyer=dict(
        house_size="Large luxury home, 3000+ sq ft with premium finishes",
        important_features="High-end finishes, privacy, prestigious location",
        amenities="Pool, wine cellar, chef's kitchen, multiple car garage",
        transportation="Private driveway, proximity to airport, valet services",
        neighborhood_type="Exclusive, upscale neighborhood with luxury amenities",
    ),
    first_time_buyer=dict(
        house_size="Modest starter home, 1000-1500 sq ft, 2-3 bedrooms",
        important_features="Affordability, move-in ready condition, good value",
        amenities="Basic appliances included, small yard, single garage",
        transportation="Access to public transit, reasonable commute to work",
        neighborhood_type="Safe, affordable neighborhood with potential for growth",
    ),
)

def test_homematch_system():
    """Run every persona in ``test_cases`` through ``homematch`` and print a brief report.

    For each persona the two best matches are fetched and personalized, then
    summarized with price, match score and the first 100 characters of the
    personalized description.
    """
    hash_rule = "#" * 70
    print("\n" + hash_rule)
    print("HOMEMATCH SYSTEM TESTING")
    print(hash_rule)

    case_rule = '=' * 50
    for test_name in test_cases:
        preferences = test_cases[test_name]

        print("\n" + case_rule)
        print("TEST CASE: " + test_name.replace('_', ' ').upper())
        print(case_rule)

        matches = homematch.find_and_personalize_properties(preferences, num_results=2)

        print(f"\nFound {len(matches)} matching properties:")
        for rank, match in enumerate(matches, start=1):
            teaser = match['personalized_description'][:100]
            print(f"\n{rank}. {match['neighborhood']} - ${match['price']:,}")
            print(f"   Match Score: {match['similarity_score']:.1%}")
            print(f"   Personalized: {teaser}...")

# Run system tests against the extra personas defined in test_cases
test_homematch_system()

## Step 14: Export Results and Summary

In [None]:
# Generate project summary
def generate_project_summary():
    """Collect headline facts about the run into a JSON-serializable dict.

    Reads the notebook-level ``property_listings``, ``property_collection``,
    ``buyer_scenarios``, ``test_cases`` and ``llm`` objects.

    Returns:
        Dict with the keys ``project_name``, ``total_listings_generated``,
        ``vector_database_entries``, ``buyer_scenarios_tested``,
        ``key_features``, ``technologies_used``, ``example_neighborhoods``
        (sorted, de-duplicated) and ``price_range`` (``min``, ``max``,
        ``average``). When no listings exist the price figures are 0 rather
        than raising, so the numeric formatting applied when the summary is
        printed still works.
    """
    prices = [listing['price'] for listing in property_listings]

    # Report the model actually configured instead of a hard-coded name; fall
    # back to the historical label if the LLM object exposes no model_name.
    model_name = getattr(llm, "model_name", "text-davinci-003")

    summary = {
        "project_name": "HomeMatch - Real Estate Property Matching System",
        "total_listings_generated": len(property_listings),
        "vector_database_entries": property_collection.count(),
        "buyer_scenarios_tested": len(buyer_scenarios) + len(test_cases),
        "key_features": [
            "Synthetic real estate listing generation using OpenAI GPT",
            "Vector embeddings with sentence-transformers",
            "ChromaDB for similarity search",
            "Personalized description generation",
            "Interactive buyer preference collection",
            "Semantic search and ranking"
        ],
        "technologies_used": [
            f"OpenAI GPT ({model_name})",
            "LangChain for LLM orchestration",
            "ChromaDB for vector storage",
            "Sentence Transformers (all-MiniLM-L6-v2)",
            "Pydantic for data validation",
            "Python ecosystem (pandas, numpy, tqdm)"
        ],
        # sorted() makes the order (and the saved JSON) reproducible; plain
        # set iteration order varies between runs.
        "example_neighborhoods": sorted({listing['neighborhood'] for listing in property_listings}),
        "price_range": {
            # min()/max() raise on an empty sequence and the average would
            # divide by zero, so fall back to 0 when there are no listings.
            "min": min(prices) if prices else 0,
            "max": max(prices) if prices else 0,
            "average": sum(prices) / len(prices) if prices else 0.0
        }
    }

    return summary

# Build the summary once, then print it section by section
project_summary = generate_project_summary()

print(f"\n{'=' * 70}")
print("HOMEMATCH PROJECT SUMMARY")
print("=" * 70)

# Headline counts
print("\n📊 STATISTICS:")
print(
    f"   • Total Listings Generated: {project_summary['total_listings_generated']}",
    f"   • Vector Database Entries: {project_summary['vector_database_entries']}",
    f"   • Buyer Scenarios Tested: {project_summary['buyer_scenarios_tested']}",
    sep="\n",
)

# Price figures
print("\n💰 PRICE ANALYSIS:")
print(
    f"   • Price Range: ${project_summary['price_range']['min']:,} - ${project_summary['price_range']['max']:,}",
    f"   • Average Price: ${project_summary['price_range']['average']:,.0f}",
    sep="\n",
)

# At most eight neighborhoods are listed to keep the output short
print("\n🏘️ NEIGHBORHOODS INCLUDED:")
for neighborhood in project_summary['example_neighborhoods'][:8]:
    print("   • " + str(neighborhood))

print("\n🔧 KEY TECHNOLOGIES:")
for tech in project_summary['technologies_used']:
    print("   • " + str(tech))

print("\n✅ PROJECT DELIVERABLES COMPLETED:")
deliverables = [
    "✓ Synthetic real estate data generation (10+ listings)",
    "✓ Vector database setup and population",
    "✓ Buyer preference interface and collection",
    "✓ Semantic search implementation",
    "✓ Personalized description generation",
    "✓ Complete HomeMatch application",
    "✓ Multiple buyer scenario testing",
    "✓ Interactive demonstration mode",
]

for deliverable in deliverables:
    print("   " + deliverable)

print("\n🎯 SYSTEM CAPABILITIES:")
capabilities = [
    "Generate diverse, realistic property listings",
    "Convert natural language preferences to search queries",
    "Perform semantic similarity matching",
    "Rank properties by relevance to buyer preferences",
    "Generate personalized, compelling property descriptions",
    "Handle multiple buyer personas and use cases",
]

for capability in capabilities:
    print("   • " + capability)

# Persist the summary next to the notebook
with open('project_summary.json', 'w') as f:
    json.dump(project_summary, f, indent=2)

print("\n💾 Project summary saved to 'project_summary.json'")
print("\n🎉 HomeMatch project completed successfully!")
print("=" * 70)