In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle before any data source is fetched.
# NOTE(review): login() presumably prompts for credentials interactively — confirm
# this is acceptable for unattended "Run All" executions.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the rental listings dataset through kagglehub and keep the returned value.
# NOTE(review): going by the variable name this is the local download path — confirm.
chinthamsreeraj_rental_10k_dataset_path = kagglehub.dataset_download('chinthamsreeraj/rental-10k-dataset')

print('Data source import complete.')


# Initial Setup: Installing Required Packages

This cell installs the necessary Python packages for the project using `pip`. The packages include:

- **chromadb**: A vector database for storing and querying embeddings, used to store property listings.
- **google-generativeai**: Google's Generative AI SDK for embedding generation and content creation.
- **google-api-core**: Core utilities for interacting with Google APIs.
- **langchain**: A framework for building applications with LLMs, used for chaining LLM interactions.
- **langchain-google-genai**: LangChain integration for Google's Generative AI models.
- **langgraph**: A library for building stateful, graph-based workflows with LangChain.
- **pydantic**: For data validation and serialization, used to define structured property data models.
- **scikit-learn**: For machine learning tasks, specifically KMeans clustering for grouping properties.
- **typing-extensions**: For enhanced type hints, supporting advanced typing features.

In [None]:
!pip install chromadb google-generativeai google-api-core langchain langchain-google-genai langgraph pydantic scikit-learn typing-extensions

# Importing Libraries and Defining Constants

This cell imports the required libraries and modules for the project and sets up key constants:

- **Libraries**:
  - `os`, `json`, `warnings`: For file handling, JSON processing, and warning suppression.
  - `pandas`, `numpy`: For data manipulation and numerical computations.
  - `google.genai`: For interacting with Google's Generative AI models (e.g., embeddings).
  - `chromadb`: For vector database operations.
  - `langchain_core`, `langchain_google_genai`, `langgraph`: For building conversational workflows with LLMs.
  - `pydantic`: For defining structured data models with validation.
  - `sklearn`: For KMeans clustering and data scaling.
  - `typing`: For type hints to ensure code robustness.

- **Constants**:
  - `GOOGLE_API_KEY`: The API key for accessing Google's Generative AI services, read at runtime from Kaggle's secrets store (`UserSecretsClient`) rather than hardcoded in the notebook.
  - `DATA_URL`: Path to the dataset (`apartments_for_rent_classified_10K.csv`), containing rental listings.
  - `COLLECTION_NAME`: Name of the ChromaDB collection (`Rental_Listings`) for storing property embeddings.
  - `EMBEDDING_MODEL`: Specifies the Google embedding model (`text-embedding-004`) for generating text embeddings.

These imports and constants set the foundation for data processing, embedding generation, vector storage, and conversational workflows.

In [None]:
import os
import json
import warnings
from typing import List, Dict, Annotated, Literal, Optional
import pandas as pd
import numpy as np
from google.genai import types
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry
from google import genai
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, AIMessage, ToolMessage
from langchain_core.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode

from pydantic import BaseModel, Field, ValidationError
from typing_extensions import TypedDict

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Resolve the Gemini API key.
# On Kaggle the key comes from the notebook's attached secrets. Outside Kaggle the
# `kaggle_secrets` module does not exist, so fall back to the GOOGLE_API_KEY
# environment variable instead of failing on the import.
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    UserSecretsClient = None

if UserSecretsClient is not None:
    user_secrets = UserSecretsClient()
    GOOGLE_API_KEY = user_secrets.get_secret("GOOGLE_API_KEY")
else:
    user_secrets = None
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
    if not GOOGLE_API_KEY:
        raise RuntimeError(
            "GOOGLE_API_KEY is not available: attach it as a Kaggle secret or "
            "export it as an environment variable before running this notebook."
        )

# Location of the listings CSV. The Kaggle input mount is the default; when it is
# absent and the kagglehub download cell has been run, use the downloaded copy.
# NOTE(review): assumes the CSV sits at the root of the downloaded dataset
# directory — confirm against the dataset layout.
DATA_URL = '/kaggle/input/rental-10k-dataset/apartments_for_rent_classified_10K.csv'
_downloaded_dataset_dir = globals().get('chinthamsreeraj_rental_10k_dataset_path')
if not os.path.exists(DATA_URL) and _downloaded_dataset_dir:
    DATA_URL = os.path.join(_downloaded_dataset_dir, 'apartments_for_rent_classified_10K.csv')

COLLECTION_NAME = "Rental_Listings"
EMBEDDING_MODEL = "models/text-embedding-004"

# Loading and Cleaning the Rental Dataset

This cell loads the rental dataset from a CSV file and performs initial data cleaning:

- **Loading Data**:
  - The dataset (`apartments_for_rent_classified_10K.csv`) is read into a Pandas DataFrame using `pd.read_csv`. The file uses a semicolon (`;`) as the separator and `cp1252` encoding.
  - The dataset contains 10,000 rows and 22 columns, including property details like `id`, `title`, `price`, `bedrooms`, `cityname`, and `state`.

- **Cleaning Steps**:
  - Strips whitespace from column names to ensure consistency.
  - Drops rows with missing values using `dropna()`, reducing the dataset to 2,570 rows where all essential fields are present.
  - Drops unnecessary columns (`category`, `has_photo`, `time`) to focus on relevant features, resulting in 19 columns.
  - Prints the DataFrame's shape, columns, and a preview of the data at each step for verification.

- **Output**:
  - Initial shape: (10,000, 22)
  - After dropping missing values: (2,570, 22)
  - After dropping columns: (2,570, 19)
  - The cleaned DataFrame includes key fields like `id`, `title`, `price`, `bedrooms`, `bathrooms`, `cityname`, and `state`, ready for embedding and querying.

This step ensures the dataset is clean and structured for downstream tasks like embedding generation and vector storage.

In [None]:
# --- Load: the file is ';'-separated and cp1252-encoded. ---
print(f"Loading data from: {DATA_URL}")
df = pd.read_csv(DATA_URL, sep=";", encoding='cp1252')
print(f"Loaded {df.shape[0]} rows of data.")
print("Initial DataFrame shape:", df.shape)
raw_column_names = df.columns.tolist()
print("Initial DataFrame columns (before cleaning):", raw_column_names)

# --- Normalise headers so later lookups do not trip over stray whitespace. ---
df.columns = df.columns.str.strip()
stripped_column_names = df.columns.tolist()
print("Initial DataFrame columns (after stripping whitespace):", stripped_column_names)
print("Initial DataFrame head:\n", df.head())

# --- Clean: keep only fully populated rows, then drop columns not needed later. ---
rows_without_gaps = df.dropna()
print("\nShape after dropping rows with missing essential data:", rows_without_gaps.shape)

columns_to_drop = ['category', 'has_photo', 'time']
df_clean = rows_without_gaps.drop(columns=columns_to_drop, errors='ignore')
print("Shape after dropping other columns:", df_clean.shape)
print("Cleaned DataFrame head:\n", df_clean.head())

# Setting Up ChromaDB and Embedding Property Listings

This cell initializes the Google Generative AI client, defines a custom embedding function, and stores the cleaned property listings in a ChromaDB vector database:

- **Google Client Initialization**:
  - A `genai.Client` is created using the `GOOGLE_API_KEY`.
  - Lists available models that support the `embedContent` action, confirming that `text-embedding-004` (used for embeddings) is available.

- **Custom Embedding Function**:
  - Defines `GeminiEmbeddingFunction`, a custom `EmbeddingFunction` for ChromaDB that uses Google's `text-embedding-004` model.
  - Supports retry logic for transient API errors (e.g., rate limits or server issues) using `google.api_core.retry`.
  - Configures the embedding task as `retrieval_document` for document embeddings (can switch to `retrieval_query` for queries).
  - Returns embeddings as a list of float vectors.

- **ChromaDB Setup**:
  - Initializes a ChromaDB client and creates/gets a collection named `Rental_Listings`.
  - Converts each row of the cleaned DataFrame to JSON strings (`documents`), uses DataFrame indices as `ids`, and stores metadata as dictionaries.
  - Processes the data in batches (size=100) to avoid overwhelming the API, adding documents, IDs, and metadata to the collection.
  - Embeds the JSON strings using the `GeminiEmbeddingFunction` and stores them in the vector database.

- **Output**:
  - Successfully creates the `Rental_Listings` collection with 2,570 documents.
  - A deprecation warning is noted for `GeminiEmbeddingFunction` due to missing `__init__`, but it does not affect functionality.

This step prepares the property listings for semantic search by embedding them into a vector space and storing them in ChromaDB.

In [None]:
# Shared Gemini client used by the embedding function defined in this cell.
client = genai.Client(api_key=GOOGLE_API_KEY)

# Print the name of every model that advertises the embedContent action.
embedding_capable_models = (
    model for model in client.models.list()
    if "embedContent" in model.supported_actions
)
for model in embedding_capable_models:
    print(model.name)


def is_retriable(e):
    """Return True for Gemini API errors whose code is 429 or 503."""
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}


class GeminiEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function backed by the Gemini embedding model.

    The instance embeds text either as stored documents or as search queries,
    depending on ``document_mode``:

    * ``document_mode=True``  -> task type ``"retrieval_document"``
    * ``document_mode=False`` -> task type ``"retrieval_query"``

    Args:
        document_mode: Initial mode; defaults to True (document embeddings).
            The attribute may also be reassigned after construction.
    """

    # Class-level default, kept so code reading the attribute on the class
    # itself continues to work.
    document_mode = True

    def __init__(self, document_mode: bool = True) -> None:
        # An explicit __init__ is what ChromaDB expects from custom embedding
        # functions; without it the library emits a deprecation warning.
        self.document_mode = document_mode

    # Retry only on the error codes accepted by `is_retriable`.
    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return one vector per entry, in input order."""
        embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"

        response = client.models.embed_content(
            # Use the notebook-wide constant so the model is configured in one place.
            model=EMBEDDING_MODEL,
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]

def batch_list(lst, batch_size):
    """Lazily yield consecutive slices of ``lst`` with at most ``batch_size`` items each."""
    batch_starts = range(0, len(lst), batch_size)
    yield from (lst[start:start + batch_size] for start in batch_starts)

# Embedder used by the collection when documents are added (document task type).
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()

# One JSON document, one string id (the DataFrame index) and one metadata dict per row.
documents = df_clean.apply(lambda row: row.to_json(), axis=1).tolist()
ids = df_clean.index.astype(str).tolist()
metadatas = df_clean.to_dict(orient='records')

# Use the notebook-wide constant instead of repeating the literal name.
collection_name = COLLECTION_NAME
collection = chroma_client.get_or_create_collection(name=collection_name, embedding_function=embed_fn)

# Add in batches so each embedding request stays small.
batch_size = 100
total_docs = len(documents)
for batch_docs, batch_ids, batch_metas in zip(
    batch_list(documents, batch_size),
    batch_list(ids, batch_size),
    batch_list(metadatas, batch_size)):
    collection.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)

# Report what the collection actually holds rather than the size of the local list.
print(f"Created collection '{collection_name}' with {collection.count()} documents.")

# Testing ChromaDB Query Functionality

This cell tests the ChromaDB setup by performing a sample query to retrieve properties:

- **Query Setup**:
  - Queries for "I want 5 properties from Illinois" using the `GeminiEmbeddingFunction` to generate an embedding for the query text.
  - Searches the `Rental_Listings` collection for the top 5 matching documents based on embedding similarity.

- **Query Execution**:
  - Uses `collection.query` with the query embedding, requesting 5 results and including `documents` and `metadatas`.
  - Iterates through the results, printing each property's ID, a preview of the document text (first 150 characters), and key metadata (`cityname`, `state`, `price`).

- **Output**:
  - Retrieves 5 properties, all from Chicago, IL, with prices ranging from $1,755 to $3,675.
  - Each result includes the document ID, a snippet of the JSON document, and metadata, confirming that the vector database is functioning correctly.

This test validates that the embeddings and ChromaDB collection are set up correctly, allowing semantic searches based on natural language queries.

In [None]:
print("\n--- Running initial ChromaDB query test ---")
query_text = "I want 5 properties from Illinois"

# Search text must be embedded with the "retrieval_query" task type. A separate
# query-mode instance is used so the collection's own embedder (`embed_fn`) stays
# in document mode.
query_embed_fn = GeminiEmbeddingFunction()
query_embed_fn.document_mode = False
query_embedding = query_embed_fn([query_text])[0]
print(f"Generated query embedding for: '{query_text}'")

test_results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    include=['documents', 'metadatas']
)

print("\nTest Query Results:")
if test_results and test_results.get("ids") and test_results["ids"][0]:
    # Results are nested one level per query; index 0 is the single query sent above.
    for i, doc_id in enumerate(test_results["ids"][0]):
        metadata = test_results["metadatas"][0][i]
        document_text = test_results["documents"][0][i]
        print(f"Result {i+1} (ID: {doc_id}):")
        doc_snippet = str(document_text)
        print(f"  Document Text Preview: {doc_snippet[:150]}...")
        print(f"  Metadata (City, State, Price): {metadata.get('cityname')}, {metadata.get('state')}, ${metadata.get('price')}")
        print("-" * 10)
else:
    print("No results found for the test query.")

# Defining Data Models and Graph State

This cell defines the data structures and state for the LangGraph workflow:

- **PropertyJson (Pydantic Model)**:
  - A `BaseModel` defining the structure of a property with fields like `id`, `title`, `price`, `bedrooms`, `bathrooms`, `city`, `state`, etc.
  - Uses `Optional` for fields that may be missing and includes descriptions for clarity.
  - Ensures data validation when properties are added to the state.

- **TripDict (TypedDict)**:
  - Defines a trip as a dictionary with:
    - `properties`: A list of `PropertyJson` objects.
    - `map_url`: A string for the Google Maps URL (to be populated later).
    - `summary_card`: A dictionary summarizing the trip's properties.

- **Pstate (TypedDict)**:
  - The state for the LangGraph workflow, containing:
    - `messages`: A list of `BaseMessage` objects (e.g., `HumanMessage`, `AIMessage`) for conversation history, annotated with `add_messages` to append new messages.
    - `property_set`: A list of `PropertyJson` objects representing the user's selected properties.
    - `trips`: A list of `TripDict` objects for visit planning.

- **Global Variables**:
  - `GLOBAL_COLLECTION`: Stores the ChromaDB collection for querying.
  - `GLOBAL_QUERY_EMBED_FN`: Stores the embedding function for queries.
  - `GLOBAL_PROPERTY_LIST_SIZE` and `GLOBAL_TRIP_SIZE`: Initialized to 0 (though not used in the provided code).

These structures ensure that property data, conversation history, and trip plans are managed consistently throughout the workflow.

In [None]:
# Validated schema for a single rental listing held in the graph state's
# `property_set`. Only `id` is required; every other field defaults to None.
# NOTE(review): the ChromaDB metadata printed by the test query uses the key
# `cityname`, while this model names the field `city` — confirm that whatever
# builds these dicts maps the key, otherwise `city` presumably stays None.
class PropertyJson(BaseModel):
    # Required identifier; always a string.
    id: str = Field(description="Unique identifier for the property (from dataset)")
    # Listing text.
    title: Optional[str] = Field(None, description="Title of the listing")
    description: Optional[str] = Field(None, description="Full description of the property")
    # Core numeric attributes (integers).
    price: Optional[int] = Field(None, description="Rental price (integer)")
    bedrooms: Optional[int] = Field(None, description="Number of bedrooms")
    bathrooms: Optional[int] = Field(None, description="Number of bathrooms")
    property_type: Optional[str] = Field(None, description="Type of property (e.g., apartment, house)")
    amenities: Optional[str] = Field(None, description="List of amenities (comma-separated string or 'None')")
    # Location; latitude/longitude are what the visit-grouping tool clusters on.
    address: Optional[str] = Field(None, description="Street address")
    city: Optional[str] = Field(None, description="City")
    state: Optional[str] = Field(None, description="State")
    latitude: Optional[float] = Field(None, description="Latitude coordinate")
    longitude: Optional[float] = Field(None, description="Longitude coordinate")
    pets_allowed: Optional[str] = Field(None, description="Pet policy")
    sqft: Optional[float] = Field(None, description="Square footage")

# One visit group: the listings to see together plus the artefacts describing it.
class TripDict(TypedDict):
    # Listings assigned to this trip.
    properties: List[PropertyJson]
    # Map link for the trip; the grouping tool initialises it to an empty string.
    map_url: str
    # Free-form summary of the trip; the grouping tool initialises it to an empty dict.
    summary_card: Dict

# State passed through the LangGraph workflow and to the tools in this notebook.
class Pstate(TypedDict):
    # Conversation history; `add_messages` is the reducer annotated on this field.
    messages: Annotated[List[BaseMessage], add_messages]
    # The user's current working set of listings.
    property_set: List[PropertyJson]
    # Visit groups produced from `property_set`.
    trips: List[TripDict]

# Collection the tools search.
GLOBAL_COLLECTION = collection

# Embedder the tools use for search text. It is a dedicated instance switched to
# query mode ("retrieval_query"); reusing `embed_fn` would embed queries with the
# document task type, since that instance stays in document mode for the collection.
GLOBAL_QUERY_EMBED_FN = GeminiEmbeddingFunction()
GLOBAL_QUERY_EMBED_FN.document_mode = False

# Size counters, initialised to zero.
GLOBAL_PROPERTY_LIST_SIZE = 0
GLOBAL_TRIP_SIZE = 0

# Defining the `query_properties` Tool

This cell defines a LangChain tool (`query_properties`) for searching the ChromaDB vector database:

- **Purpose**:
  - Queries the `Rental_Listings` collection for properties matching a natural language description (e.g., "2 bedroom apartment in Chicago under $2000 with parking").
  - Returns up to `n_results` matching properties as a list of dictionaries.

- **Implementation**:
  - Takes a `requirements` string and `n_results` (default=5) as inputs.
  - Uses `GLOBAL_QUERY_EMBED_FN` to generate an embedding for the query text.
  - Queries the ChromaDB collection with the embedding, retrieving `metadatas` and `documents`.
  - Processes results, extracting metadata and adding the document ID to each property dictionary.
  - Includes error handling for:
    - Missing global collection or embedding function.
    - Invalid or zero query embeddings.
    - Missing or non-dictionary metadata.
    - General exceptions during querying.

- **Output**:
  - Returns a list of property dictionaries (with metadata and ID) if successful, or an empty list if no results or errors occur.
  - Logs detailed debugging information (e.g., number of properties found, errors).

This tool enables semantic search, allowing users to find properties based on natural language descriptions.

In [None]:
@tool
def query_properties(requirements: str, n_results: int = 5) -> List[dict]:
    """
    Query the vector database for properties matching the given requirements description.
    Args:
        requirements: A natural language string describing the desired property features (e.g., '2 bedroom apartment in Chicago under $2000 with parking').
        n_results: The maximum number of properties to return.
    Returns:
        A list of dictionaries, where each dictionary represents a property matching the query. Includes metadata and the document ID. Returns an empty list if no matches found or an error occurs.
    """
    # Maintainer notes (the docstring above is left untouched because the @tool
    # decorator presumably exposes it to the LLM as the tool description):
    # - Embeds `requirements` with GLOBAL_QUERY_EMBED_FN and searches GLOBAL_COLLECTION.
    # - Each returned dict is a copy of the stored metadata with 'id' overwritten
    #   by the ChromaDB document id.
    # - Best-effort by design: every failure is logged and yields [].
    print("\n--- Tool: query_properties ---")
    print(f"Requirements: '{requirements}', n_results: {n_results}")
    if GLOBAL_COLLECTION is None or GLOBAL_QUERY_EMBED_FN is None:
        print("--- Tool Error: GLOBAL_COLLECTION or GLOBAL_QUERY_EMBED_FN not available. ---")
        return []

    try:
        query_embedding = GLOBAL_QUERY_EMBED_FN([requirements])[0]

        # The embedding may be a plain list or a numpy array depending on the
        # ChromaDB version. `not array` raises for multi-element arrays, so the
        # emptiness / all-zero check goes through numpy, which handles both.
        is_unusable = query_embedding is None
        if not is_unusable:
            query_vector = np.asarray(query_embedding, dtype=float)
            is_unusable = query_vector.size == 0 or not query_vector.any()
        if is_unusable:
            print("--- Tool Error: Failed to generate query embedding (might be empty or zero vector). ---")
            return []

        # Only metadata is consumed below, so document bodies are not requested.
        results = GLOBAL_COLLECTION.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=['metadatas']
        )

        # Results are nested one level per query; index 0 is the single query sent.
        ids_list = (results.get('ids') or [[]])[0] if results else []
        if not ids_list:
            print("--- Tool: query_properties found no results or unexpected result format. ---")
            return []

        metadatas_list = (results.get('metadatas') or [None])[0]

        properties = []
        for i, prop_id in enumerate(ids_list):
            # A missing metadata list degrades to an empty dict (id-only result);
            # a present but non-dict entry is skipped.
            meta = metadatas_list[i] if metadatas_list and i < len(metadatas_list) else {}
            if not isinstance(meta, dict):
                print(f"--- Tool Warning: Skipping result ID {prop_id} due to missing or non-dictionary metadata. ---")
                continue

            prop_dict = meta.copy()
            prop_dict['id'] = prop_id
            properties.append(prop_dict)

        print(f"--- Tool: query_properties found {len(properties)} properties. ---")
        return properties
    except Exception as e:
        print(f"--- Tool Error in query_properties: {e} ---")
        return []

# Defining the `add_to_property_set` Tool

This cell defines a LangChain tool (`add_to_property_set`) for adding properties to the user's working set:

- **Purpose**:
  - Adds properties (as dictionaries) to the `property_set` in the graph state, ensuring no duplicates based on `id`.
  - Validates properties using the `PropertyJson` Pydantic model.

- **Implementation**:
  - Takes a list of property dictionaries (`properties_to_add`) and the current state (`state`) as inputs.
  - Checks if `property_set` exists in the state; initializes it as an empty list if not.
  - Tracks existing property IDs to prevent duplicates.
  - Iterates through input properties, validating each with `PropertyJson`:
    - Skips non-dictionary items or those without an `id`.
    - Converts valid dictionaries to `PropertyJson` objects and adds them to the set.
    - Logs warnings for validation errors or missing IDs.
  - Updates the state with the new properties and logs the number added.

- **Output**:
  - Returns the updated state with the expanded `property_set`.
  - Provides detailed logging (e.g., number of properties added, validation errors).

This tool allows users to save properties they are interested in, ensuring data integrity through validation.

In [None]:
@tool
def add_to_property_set(properties_to_add: List[Dict], state: Pstate) -> Pstate:
    """
    Adds one or more properties (provided as dictionaries) to the
    user's working property set.
    Filters out properties already present based on their 'id'.
    Converts valid dicts to PropertyJson objects.
    Args:
        properties_to_add: A list of dictionaries, each representing a
            property. Must contain necessary fields for PropertyJson.
        state: The current graph state.
    Returns:
        The updated graph state with properties added to the
        property_set.
    """
    # Maintainer notes (the docstring above is left untouched because the @tool
    # decorator presumably exposes it to the LLM as the tool description):
    # - `state` is mutated in place and also returned.
    # - Entries that are not dicts, lack an id, duplicate an id already in the
    #   set, or fail PropertyJson validation are skipped with a logged warning.
    print("\n--- Tool: add_to_property_set ---")

    if 'property_set' not in state or not isinstance(state['property_set'], list):
        print("--- Tool Warning: 'property_set' not found or not a list in state. Initializing as empty list. ---")
        state['property_set'] = []

    # Validate the input type before calling len() on it, so a non-list value is
    # reported as a warning instead of raising.
    if not isinstance(properties_to_add, list):
        print(f"--- Tool Warning: Input 'properties_to_add' is not a list ({type(properties_to_add)}). Skipping. ---")
        return state

    print(f"Attempting to add {len(properties_to_add)} properties.")

    current_ids = {p.id for p in state['property_set']}
    added_count = 0
    new_properties_validated: List[PropertyJson] = []

    for prop_dict in properties_to_add:
        if not isinstance(prop_dict, dict):
            print(f"--- Tool Warning: Skipping item that is not a dictionary: {prop_dict} ---")
            continue

        prop_id = prop_dict.get('id')
        # PropertyJson.id is a string and stored ids are strings. Normalise
        # integer ids (including 0) so they validate and de-duplicate correctly.
        if isinstance(prop_id, int) and not isinstance(prop_id, bool):
            prop_id = str(prop_id)
        if not prop_id:
            print(f"--- Tool Warning: Skipping property due to missing or empty 'id': {prop_dict.get('title', 'N/A')} ---")
            continue

        if prop_id in current_ids:
            continue

        try:
            # Validate a copy carrying the normalised id; the caller's dict is not modified.
            validated_prop = PropertyJson(**{**prop_dict, 'id': prop_id})
        except ValidationError as e:
            print(f"--- Tool Warning: Skipping property ID {prop_id} ('{prop_dict.get('title', 'N/A')}') due to Pydantic validation error: {e} ---")
            continue
        except Exception as e:
            print(f"--- Tool Warning: Skipping property ID {prop_id} ('{prop_dict.get('title', 'N/A')}') due to unexpected error during validation: {e} ---")
            continue

        new_properties_validated.append(validated_prop)
        current_ids.add(prop_id)
        added_count += 1

    state['property_set'].extend(new_properties_validated)
    print(f"--- Tool: Added {added_count} new properties. Property set size: {len(state['property_set'])} ---")

    return state

# Defining the `remove_from_property_set` Tool

This cell defines a LangChain tool (`remove_from_property_set`) for removing properties from the user's working set:

- **Purpose**:
  - Removes properties from the `property_set` based on their 0-based indices.
  - Ensures only valid indices are processed to prevent errors.

- **Implementation**:
  - Takes a list of indices (`indices_to_remove`) and the current state (`state`) as inputs.
  - Checks if `property_set` exists and is non-empty; returns the state unchanged if not.
  - Filters and sorts valid indices (integers within the range of `property_set` length) in reverse order to avoid index shifting during removal.
  - Removes properties at the specified indices, tracking removed properties and their IDs.
  - Updates the state and logs the number of properties removed and their IDs.

- **Output**:
  - Returns the updated state with the reduced `property_set`.
  - Provides detailed logging (e.g., indices removed, property IDs, warnings for invalid indices).

This tool allows users to refine their property set by removing unwanted properties.

In [None]:
@tool
def remove_from_property_set(indices_to_remove: List[int], state: Pstate) -> Pstate:
    """
    Removes properties from the working set based on their list index (0-based).
    Args:
        indices_to_remove: A list of integer indices of the properties
            to remove from the current property_set.
        state: The current graph state.
    Returns:
        The updated graph state with specified properties removed.
    """
    # Maintainer notes (the docstring above is left untouched because the @tool
    # decorator presumably exposes it to the LLM as the tool description):
    # - `state['property_set']` is replaced with a new list; `state` is returned.
    # - Out-of-range and non-integer indices are ignored; repeated indices are
    #   applied once.
    print("\n--- Tool: remove_from_property_set ---")
    print(f"Attempting to remove properties at indices: {indices_to_remove}")

    if 'property_set' not in state or not isinstance(state['property_set'], list) or not state['property_set']:
        print("--- Tool Warning: Cannot remove properties, property set is empty or invalid. ---")
        return state

    initial_count = len(state['property_set'])

    # De-duplicate before removing. With a repeated index (e.g. [1, 1]) the second
    # pop would otherwise hit whichever element shifted into that slot and delete
    # a property the caller never asked to remove.
    # A missing index list is treated as "nothing to remove".
    valid_indices = sorted(
        {
            idx for idx in (indices_to_remove or [])
            if isinstance(idx, int) and 0 <= idx < initial_count
        },
        reverse=True,
    )

    if not valid_indices:
        print(f"--- Tool Warning: No valid indices provided to remove. Indices provided: {indices_to_remove}. Set size: {initial_count}. ---")
        return state

    # Work on a copy and pop from the highest index down so earlier removals do
    # not shift the positions still to be removed.
    properties_after_removal = list(state['property_set'])
    removed_props = []
    for idx in valid_indices:
        removed_props.append(properties_after_removal.pop(idx))
    removed_count = len(removed_props)

    state['property_set'] = properties_after_removal
    print(f"--- Tool: Removed {removed_count} properties. Property set size: {len(state['property_set'])} ---")

    if removed_props:
        removed_ids = [p.id for p in removed_props if p.id]
        if removed_ids:
            print(f"--- Tool: Removed property IDs: {', '.join(removed_ids)} ---")
        else:
            print("--- Tool: Removed properties but they had no IDs. ---")

    return state

# Defining the `generate_groups_for_visits` Tool

This cell defines a LangChain tool (`generate_groups_for_visits`) for clustering properties into visit groups:

- **Purpose**:
  - Clusters properties in the `property_set` into `n_trips` groups based on geographic coordinates (latitude, longitude) using KMeans clustering.
  - Prepares properties for visit planning by grouping them logically by location.

- **Implementation**:
  - Takes the desired number of trips (`n_trips`) and the current state (`state`) as inputs.
  - Validates the `property_set` and filters properties with valid coordinates.
  - Adjusts `n_trips` to be within valid bounds (1 to the number of valid properties).
  - Uses `StandardScaler` to normalize coordinates and `KMeans` to cluster properties into `n_trips` groups.
  - Sorts properties within each group by latitude and title for consistent ordering.
  - Creates `TripDict` entries with clustered properties, empty `map_url`, and empty `summary_card`.
  - Updates the state’s `trips` field with the new groups, clearing previous trips.

- **Output**:
  - Returns the updated state with the `trips` field populated.
  - Includes extensive logging for debugging (e.g., number of clusters, properties processed, errors).

This tool enables geographic organization of properties, making visit planning more efficient.

In [None]:
@tool
def generate_groups_for_visits(n_trips: int, state: Pstate) -> Pstate:
    """
    Clusters properties currently in the property_set into a specified
    number of groups based on geographic coordinates (latitude, longitude)
    for visit planning.
    Args:
        n_trips: The desired number of visit groups (clusters).
        state: The current graph state containing the property_set.
    Returns:
        The updated graph state with the 'trips' field populated with
        initial trip groups (properties only). Clears previous trips.
    """
    # Maintainer notes (the docstring above is left untouched because the @tool
    # decorator presumably exposes it to the LLM as the tool description):
    # - Only properties with both latitude and longitude are clustered.
    # - `n_trips` is clamped to the range 1..len(valid properties).
    # - `state['trips']` is always overwritten: with the new groups on success,
    #   with [] on any failure. `state` is mutated in place and returned.
    print("\n--- Tool: generate_groups_for_visits ---")
    properties = state.get('property_set', [])

    # Check the type before calling len() so a non-list value is reported as a
    # warning instead of raising.
    if not isinstance(properties, list):
        print("--- Tool Warning: 'property_set' in state is not a list. Clearing trips and returning. ---")
        state['trips'] = []
        return state

    print(f"Attempting to generate {n_trips} visit groups from {len(properties)} properties.")

    valid_properties = [p for p in properties if p.latitude is not None and p.longitude is not None]

    if not valid_properties:
        print("--- Tool Warning: Cannot generate groups, no properties with valid coordinates in the set. Clearing trips. ---")
        state['trips'] = []
        return state

    if not isinstance(n_trips, int) or n_trips <= 0:
        print(f"--- Tool Warning: Invalid number of trips ({n_trips}). Must be a positive integer. Setting to 1. ---")
        n_trips = 1

    if n_trips > len(valid_properties):
        print(f"--- Tool Warning: Number of trips ({n_trips}) > number of valid properties ({len(valid_properties)}). Setting n_trips = {len(valid_properties)}. ---")
        n_trips = len(valid_properties)

    # From here on 1 <= n_trips <= len(valid_properties), so no zero-cluster
    # case needs handling.
    try:
        coords = np.array([[p.latitude, p.longitude] for p in valid_properties])
        coords_scaled = StandardScaler().fit_transform(coords)

        print(f"--- Tool: Running KMeans with {n_trips} clusters ---")
        # The "Number of distinct clusters" warning is raised while fitting, so
        # the fit call must sit inside the catch_warnings block for the filter
        # to have any effect.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Number of distinct clusters")
            kmeans = KMeans(n_clusters=n_trips, random_state=42, n_init='auto')
            labels = kmeans.fit_predict(coords_scaled)

        # Bucket properties by cluster label in one pass.
        properties_by_label: Dict[int, List[PropertyJson]] = {}
        for prop, label in zip(valid_properties, labels):
            properties_by_label.setdefault(int(label), []).append(prop)

        # Emit trips in ascending label order; labels that received no
        # properties produce no trip.
        trips_temp: List[TripDict] = []
        for label in sorted(properties_by_label):
            group_properties_sorted = sorted(
                properties_by_label[label],
                key=lambda p: (p.latitude, p.title if p.title is not None else ''),
            )
            trips_temp.append({
                "properties": group_properties_sorted,
                "map_url": "",
                "summary_card": {}
            })

        state['trips'] = trips_temp

        print(f"--- Tool: Successfully generated {len(state['trips'])} trip groups. ---")

    except Exception as e:
        print(f"--- Tool Error in generate_groups_for_visits: {e} ---")
        state['trips'] = []

    return state

# Defining the `generate_visit_plan` Tool

This cell defines a LangChain tool (`generate_visit_plan`) for creating visit plans with Google Maps URLs and summary cards:

- **Purpose**:
  - Enhances the `trips` created by `generate_groups_for_visits` by generating Google Maps URLs and detailed summary cards for each trip.
  - Must be called after `generate_groups_for_visits`.

- **Implementation**:
  - Takes the current state (`state`) as input and checks for valid `trips`.
  - Initializes a `genai.Client` for calling the `gemini-2.0-flash` model to generate map URLs.
  - For each trip:
    - Constructs a prompt with property details (title, address, city, state, bedrooms, bathrooms, sqft, price, pets, amenities).
    - Calls `gemini-2.0-flash` to generate a Google Maps URL showing property locations.
    - Validates the URL and sets `map_url` (or sets to "N/A" with an error message if invalid).
    - Creates a `summary_card` with:
      - Trip index, number of properties, cities covered, average/total price.
      - A preview of property details (up to 3) and full details in `_full_property_details`.
  - Handles errors (e.g., missing API key, invalid properties, LLM failures) with appropriate logging.

- **Output**:
  - Returns the updated state with `map_url` and `summary_card` populated for each trip.
  - Provides detailed logging for each step (e.g., LLM calls, URL validation, summary card generation).

This tool completes the visit planning process by providing actionable map links and comprehensive trip summaries.

In [None]:
def _describe_property_for_prompt(p) -> str:
    """Formats one property as the single-line description embedded in the map prompt."""
    return (
        f"{getattr(p, 'title', 'Property')} at {getattr(p, 'address', 'N/A')}, {getattr(p, 'city', 'N/A')}, {getattr(p, 'state', 'N/A')} "
        # 'sqft' is the attribute name used by PropertyJson.
        f"({getattr(p, 'bedrooms', 'N/A')} bed, {getattr(p, 'bathrooms', 'N/A')} bath, {getattr(p, 'sqft', 'N/A')} sq ft, "
        f"Price: {getattr(p, 'price_display', 'N/A')} {getattr(p, 'price_type', 'N/A')}, Pets Allowed: {getattr(p, 'pets_allowed', 'N/A')}, "
        f"Fee: {getattr(p, 'fee', 'N/A')}, Amenities: {getattr(p, 'amenities', 'N/A')})"
    )


def _request_map_url(client, property_list_for_prompt: str, trip_number: int) -> str:
    """Asks Gemini for a Google Maps URL; returns the URL or an 'N/A - ...' explanation."""
    # The prompt text, including its leading whitespace, is kept exactly as it
    # was when it lived inside the main loop so the model sees the same input.
    llm_prompt = f"""
            Please generate a Google Maps URL that displays the locations of the following properties.
            Ensure the URL directly shows markers for these locations. Include key details about each property
            in the information displayed on the map if possible, or structure the URL to highlight these properties.
            Properties:
            {property_list_for_prompt}
            Provide only the Google Maps URL in your response, starting with 'https://'.
            Do not include any other text, explanation, or formatting.
            """
    try:
        print(f"--- Tool: Calling Gemini 2.0 Flash for Map URL for Trip {trip_number} ---")
        answer = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=llm_prompt
        )
        # The response's `text` may be None when no text was produced
        # (NOTE(review): e.g. a blocked/empty candidate - confirm against the SDK);
        # normalise to "" so that case is reported as an empty response rather
        # than surfacing as an AttributeError from `.strip()`.
        generated_map_url = (answer.text or "").strip()
    except Exception as e:
        print(f"--- Tool Error: Failed to call Gemini for Trip {trip_number}: {e} ---")
        return f"N/A - Error calling LLM: {e}"

    if not generated_map_url:
        print(f"--- Tool Warning: Gemini returned an empty response for Trip {trip_number}. ---")
        return "N/A - LLM returned an empty response"

    if generated_map_url.startswith("http") and "google" in generated_map_url and "maps" in generated_map_url:
        print(f"--- Tool: Gemini generated URL for Trip {trip_number}: {generated_map_url} ---")
        return generated_map_url

    print(f"--- Tool Warning: Gemini did not return a valid URL format for Trip {trip_number}. Response snippet: {generated_map_url[:200]} ---")
    return f"N/A - LLM did not return a valid URL format: {generated_map_url[:200]}..."


def _build_summary_card(trip_number: int, properties: list) -> dict:
    """Builds the summary card (counts, cities, price stats, details) for one trip."""
    prop_details_summary = [
        {
            "id": getattr(p, 'id', 'N/A'),
            "title": getattr(p, 'title', 'N/A'),
            "city": getattr(p, 'city', 'N/A'),
            "state": getattr(p, 'state', 'N/A'),
            "price": getattr(p, 'price', None),
            "price_display": getattr(p, 'price_display', 'N/A'),
            "price_type": getattr(p, 'price_type', 'N/A'),
            "bedrooms": getattr(p, 'bedrooms', 'N/A'),
            "bathrooms": getattr(p, 'bathrooms', 'N/A'),
            "square_feet": getattr(p, 'sqft', 'N/A'),
            "pets_allowed": getattr(p, 'pets_allowed', 'N/A'),
            "fee": getattr(p, 'fee', 'N/A'),
            "amenities": getattr(p, 'amenities', 'N/A'),
            "address": getattr(p, 'address', 'N/A')
        }
        for p in properties
    ]

    # Unique "City, State" labels; properties missing either part are left out.
    cities_covered = sorted({
        f"{getattr(p, 'city', '')}, {getattr(p, 'state', '')}"
        for p in properties
        if getattr(p, 'city', None) and getattr(p, 'state', None)
    })

    # Only numeric prices take part in the statistics.
    valid_prices = [
        d['price'] for d in prop_details_summary
        if d['price'] is not None and isinstance(d['price'], (int, float))
    ]
    total_price_sum = sum(valid_prices) if valid_prices else 0
    avg_price = total_price_sum / len(valid_prices) if valid_prices else 0

    # The preview shows at most three properties, followed by an ellipsis marker.
    preview = [
        {
            "title": d['title'],
            "city": d['city'],
            "bedrooms": d['bedrooms'],
            "bathrooms": d['bathrooms'],
            "price_display": d['price_display']
        }
        for d in prop_details_summary[:3]
    ]
    if len(prop_details_summary) > 3:
        preview.append({"...": "..."})

    return {
        "trip_index": trip_number,
        "number_of_properties": len(properties),
        "cities_covered": cities_covered if cities_covered else ["N/A"],
        "average_price": round(avg_price) if avg_price else "N/A",
        "total_price_sum": total_price_sum if total_price_sum else "N/A",
        "property_details_preview": preview,
        "_full_property_details": prop_details_summary
    }


# NOTE: the docstring below is left untouched on purpose - the `@tool` decorator
# exposes it to the LLM as the tool description, so it is part of runtime behaviour.
@tool
def generate_visit_plan(state: Pstate) -> Pstate:
    """
    Generates Google Maps search URLs (using gemini-2.0-flash) showing
    property locations and comprehensive summary cards for each trip group
    previously created by 'generate_groups_for_visits'.
    Includes details like bedrooms, bathrooms, price, square footage,
    pets, and amenities in the prompt and summary.
    Must be called AFTER 'generate_groups_for_visits'.
    Args:
        state: The current graph state containing the 'trips' list.
    Returns:
        The updated graph state with 'map_url' and 'summary_card'
        populated for each trip.
    """
    print(f"\n--- Tool: generate_visit_plan ---")
    if not state.get('trips') or not isinstance(state['trips'], list):
        print("--- Tool Warning: No trips found or trips format incorrect. Run 'generate_groups_for_visits' first. ---")
        state['trips'] = []
        return state

    print(f"Attempting to generate visit plans for {len(state['trips'])} trips.")

    # `client` stays None whenever the Gemini client cannot be created; the
    # per-trip logic below then records that instead of calling the model.
    client = None
    try:
        if 'GOOGLE_API_KEY' in globals():
            client = genai.Client(api_key=GOOGLE_API_KEY)
            print("--- Tool: Gemini client initialized. ---")
        else:
            print("--- Tool Error: GOOGLE_API_KEY not found. Cannot initialize Gemini client. ---")
    except Exception as e:
        print(f"--- Tool Error: Failed to initialize Gemini client: {e} ---")
        client = None

    for i, trip in enumerate(state['trips']):
        trip_number = i + 1
        properties = trip.get('properties', [])
        if not properties or not isinstance(properties, list):
            print(f"--- Tool Warning: Trip {trip_number} has no properties or properties format incorrect, skipping plan generation. ---")
            trip['map_url'] = "N/A - No properties listed"
            trip['summary_card'] = {"error": "No properties in this trip"}
            continue

        # Only properties with a complete address/city/state are described to the LLM.
        property_list_for_prompt = "; ".join(
            _describe_property_for_prompt(p)
            for p in properties
            if all([getattr(p, 'address', None), getattr(p, 'city', None), getattr(p, 'state', None)])
        )

        if not property_list_for_prompt:
            trip['map_url'] = "N/A - No valid property details for LLM prompt"
            print(f"--- Tool Warning: No valid property details for LLM prompt for Trip {trip_number}. Map URL set to N/A. ---")
        elif client is None:
            trip['map_url'] = "N/A - LLM client not initialized"
            print(f"--- Tool Warning: LLM client not initialized. Map URL set to N/A for Trip {trip_number}. ---")
        else:
            trip['map_url'] = _request_map_url(client, property_list_for_prompt, trip_number)

        # The summary card is built for every trip that has properties,
        # regardless of whether a map URL could be produced.
        trip['summary_card'] = _build_summary_card(trip_number, properties)

        print(f"--- Tool: Generated Summary Card for Trip {trip_number} ---")
    print(f"--- Tool: Finished generating plans for {len(state.get('trips', []))} trips. ---")

    return state

# Initializing LLM and Defining System Instructions

This cell sets up the Language Model (LLM) and defines the system instructions for the real estate agent:

- **LLM Initialization**:
  - Creates a `ChatGoogleGenerativeAI` instance using the `gemini-2.0-flash` model with a temperature of 0.7 (for balanced creativity) and the `GOOGLE_API_KEY`.
  - Logs successful initialization or raises an error if the API key or model is invalid.

- **System Instructions (`REAL_ESTATE_AGENT_SYSINT`)**:
  - Defines the persona of a "helpful and knowledgeable real estate AI assistant" for finding rental properties.
  - Outlines the workflow:
    - Use `query_properties` to search for properties based on user descriptions.
    - Use `add_to_property_set` to save properties to the user’s set, ensuring full metadata is provided.
    - Use `remove_from_property_set` to remove properties by index.
    - Use `generate_groups_for_visits` to cluster properties into trips, followed by `generate_visit_plan` for maps and summaries.
  - Emphasizes conversational guidance, clarity in presenting results, and error handling (e.g., checking for empty sets, avoiding hallucination).
  - Lists available tools for reference.

- **Tool Setup**:
  - Defines a list of tools: `query_properties`, `add_to_property_set`, `remove_from_property_set`, `generate_groups_for_visits`, `generate_visit_plan`.
  - Creates a `ToolNode` for executing these tools within the LangGraph workflow.

This step prepares the conversational AI component, ensuring it can interact with users and execute tools as needed.

In [None]:
print("\nDefining LangGraph graph...")

# Chat model used by the agent node. Initialisation failures are logged and
# then re-raised so the notebook stops here instead of continuing without an LLM.
try:
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0.7,
        google_api_key=GOOGLE_API_KEY,
    )
    print("ChatGoogleGenerativeAI LLM initialized.")
except Exception as e:
    print(f"Error initializing LLM. Ensure API key is correct and model is available. Error: {e}")
    llm = None
    raise


# System prompt for the agent. Adjacent string literals are concatenated with no
# separator, so every fragment except the last ends with a space; without it
# sentences run together (e.g. "...property listings.You can search...").
REAL_ESTATE_AGENT_SYSINT = (
    "You are a helpful and knowledgeable real estate AI assistant. Your goal is to assist users in finding rental properties. "
    "You have access to a database of property listings. "
    "You can search for properties using the `query_properties` tool based on user descriptions (e.g., location, number of bedrooms/bathrooms, price range, amenities, pet policy). When presenting results, list them clearly, maybe with indices like [0], [1], etc., mentioning key details like title, price, bedrooms, bathrooms, city, and state. "
    "When the user finds properties they like from the search results, they might ask you to save them to a 'property set'. Use the `add_to_property_set` tool to add properties. You MUST provide the full property details (as a dictionary, typically obtained from the metadata of the search results) for each property you want to add. You can refer back to the details you presented from `query_properties` results to construct these dictionaries. "
    "The user can ask you to remove properties from their current 'property set' by referring to their 0-based index within the set. Use the `remove_from_property_set` tool for this, providing the list of indices to remove. "
    "Once the user has a set of properties, they might want to plan visits. Use the `generate_groups_for_visits` tool FIRST to cluster the properties in the set into logical groups (trips) based on location. You need to specify the desired number of trips (e.g., if the user says 'organize them into 3 trips', call the tool with n_trips=3). "
    "AFTER successfully grouping the properties, use the `generate_visit_plan` tool SECOND to generate summary cards and map links for these trip groups. "
    "When the property set or trips are modified, clearly inform the user about the changes and the current state (e.g., 'Added 3 properties, your set now has 5.', 'Organized your 10 properties into 3 visit groups.'). "
    "Be conversational, friendly, and guide the user. Ask clarifying questions if the request is unclear (e.g., 'What city or area are you interested in?'). "
    "Always check if the property set is empty before attempting to remove or group items. Remember that indices for removing items refer to the CURRENT property set list. "
    "Do not hallucinate property details or make up map URLs; only use information from tool outputs. "
    "Available tools: `query_properties`, `add_to_property_set`, `remove_from_property_set`, `generate_groups_for_visits`, `generate_visit_plan`. "
    "You are in an interactive chat. Respond to the user based on the conversation history and tool results."
)


# Tools handed to the ToolNode that executes the model's tool calls in the graph.
tools = [query_properties, add_to_property_set, remove_from_property_set, generate_groups_for_visits, generate_visit_plan]
tool_node = ToolNode(tools)

# Defining Agent Node and Routing Logic

This cell defines the core logic for the LangGraph agent and its decision-making process:

- **Agent Node (`agent_node`)**:
  - Invokes the LLM (`ChatGoogleGenerativeAI`) to process user input and decide the next action.
  - Constructs a message list with the system instructions (`REAL_ESTATE_AGENT_SYSINT`) and the conversation history (`state['messages']`).
  - Handles LLM invocation errors by returning an error message as an `AIMessage`.
  - Returns the updated state with the LLM’s response added to `messages`.

- **Routing Logic (`should_continue`)**:
  - Determines the next step based on the last message in the state:
    - If a `ToolMessage`, routes back to the `agent` (tool execution complete).
    - If an `AIMessage` with `tool_calls`, routes to `tools` for execution.
    - If an `AIMessage` without `tool_calls`, ends the turn (`__end__`).
    - For other message types, routes to `agent` for further processing.
  - Logs routing decisions for debugging (e.g., number of tool calls, message types).

This logic ensures the agent can handle user inputs, execute tools, and manage the conversation flow appropriately.

In [None]:
def agent_node(state: Pstate):
    """Run one LLM turn over the conversation and return the reply as a state update.

    The system instructions are placed in front of the messages held in
    ``state['messages']`` before the model is invoked.

    Args:
        state: Current graph state; only its 'messages' entry is read here.

    Returns:
        A dict of the form ``{"messages": [message]}`` where ``message`` is the
        model's response, or an ``AIMessage`` describing the problem when the
        LLM is unavailable or the invocation raises.
    """
    print("--- Agent Node: Invoking LLM ---")

    # Guard: without a model there is nothing to invoke.
    if llm is None:
        return {"messages": [AIMessage(content="Error: LLM not initialized. Cannot process request.")]}

    system_prompt = SystemMessage(content=REAL_ESTATE_AGENT_SYSINT)
    conversation = [system_prompt] + state['messages']

    try:
        reply = llm.invoke(conversation)
        print(f"--- Agent Node: LLM Response received (type: {type(reply).__name__}) ---")
        return {"messages": [reply]}
    except Exception as exc:
        # Report the failure to the user as a normal assistant message.
        print(f"--- Agent Node Error during LLM invocation: {exc} ---")
        failure_notice = AIMessage(content=f"An error occurred while processing your request: {exc}")
        return {"messages": [failure_notice]}



def should_continue(state: Pstate) -> Literal["tools", "__end__", "agent"]:
    """Pick the next graph step from the most recent message in the state.

    Routing rules:
      * ToolMessage                      -> "agent"
      * AIMessage with a non-empty list
        of tool calls                    -> "tools"
      * AIMessage with no tool calls, or
        with tool calls in another shape -> END
      * any other message type           -> "agent"
    """
    latest = state['messages'][-1]

    if isinstance(latest, ToolMessage):
        print("--- Conditional Edge: Last message is ToolMessage. Routing back to agent. ---")
        return "agent"

    if isinstance(latest, AIMessage):
        requested_calls = latest.tool_calls

        # Falsy tool_calls (None, empty list, ...) means the model produced a final answer.
        if not requested_calls:
            print("--- Conditional Edge: AI message has no tool calls. Ending the turn. ---")
            return END

        if isinstance(requested_calls, list) and len(requested_calls) > 0:
            print(f"--- Conditional Edge: AI message has tool_calls. Routing to tools ({len(requested_calls)} calls) ---")
            return "tools"

        # Truthy but not a non-empty list: treat as malformed and stop the turn.
        print("--- Conditional Edge: AI message has invalid tool_calls format or is empty. Ending the turn. ---")
        return END

    print(f"--- Conditional Edge: Last message is {type(latest).__name__}. Routing to agent. ---")
    return "agent"

# Building and Compiling the LangGraph Workflow

This cell constructs and compiles the LangGraph workflow for the real estate assistant:

- **Graph Construction**:
  - Creates a `StateGraph` with the `Pstate` type to manage the workflow state.
  - Adds two nodes:
    - `agent`: Executes the `agent_node` function to invoke the LLM.
    - `tools`: Executes the `tool_node` to run tools like `query_properties`.
  - Sets the entry point to the `agent` node.
  - Adds conditional edges from `agent` using `should_continue` to route to `tools`, `agent`, or `END`.
  - Adds a direct edge from `tools` back to `agent` to process tool outputs.

- **Compilation**:
  - Compiles the graph into an executable workflow using `graph_builder.compile()`.
  - Logs successful compilation or exits on error.

This step finalizes the workflow, enabling the assistant to handle user inputs, execute tools, and manage state transitions.

In [None]:
# Assemble the workflow: an "agent" node that calls the LLM and a "tools" node
# that executes whatever tool calls the LLM requested.
graph_builder = StateGraph(Pstate)

graph_builder.add_node("agent", agent_node)
graph_builder.add_node("tools", tool_node)
graph_builder.set_entry_point("agent")

# After each agent step, `should_continue` returns one of these keys to pick
# the next node (or END to finish the turn).
graph_builder.add_conditional_edges(
    "agent",
    should_continue,
    {
        "tools": "tools",
        END: END,
        "agent": "agent"
    }
)

# Tool results always flow back to the agent so it can respond to them.
graph_builder.add_edge("tools", "agent")


print("Compiling graph...")
try:
    compiled_graph = graph_builder.compile()
    print("Graph compiled successfully.")
except Exception as e:
    print(f"Error compiling graph: {e}")
    # Re-raise instead of calling exit(): this keeps the traceback, stops the
    # run at this cell without terminating the interpreter/kernel, and matches
    # how the LLM initialisation cell reports failures.
    raise

# Running the Interactive Chat Session

This cell implements an interactive chat loop for the real estate assistant:

- **Initialization**:
  - Initializes the state (`current_state`) with empty `messages`, `property_set`, and `trips`.
  - Displays a welcome message and instructions to type 'quit' to exit.

- **Chat Loop**:
  - Prompts the user for input.
  - Exits if the input is 'quit', 'q', 'bye', or 'goodbye' (case-insensitive).
  - Appends the user’s input as a `HumanMessage` to the state’s `messages`.
  - Invokes the compiled LangGraph workflow (`compiled_graph.invoke`) to process the input and update the state.
  - Prints the last message from the updated state (typically an `AIMessage` with the assistant’s response).
  - Handles errors by appending an error message to the state and continuing the loop.
  - Logs each step for debugging (e.g., graph invocation, message types).

- **Conversation Example** (from output):
  - User searches for 2-bedroom apartments in Chicago ($1000-$2000, with gym).
  - Assistant presents three properties, adds them to the property set, removes one, organizes the remaining two into one trip, and generates a summary card.
  - The assistant uses pseudo-tool calls (e.g., `tool_code` in responses) but does not execute them due to a potential issue with tool integration (see below).

- **Issues Noted**:
  - The assistant outputs tool calls as code blocks (e.g., `query_properties(...)`) but does not execute them, indicating a possible misconfiguration in the LangGraph setup or LLM tool-calling behavior.
  - Properties in the conversation (e.g., "Luxury 2 bed 2 bath in Streeterville!") appear to be fabricated, as they don’t match the dataset’s actual listings (e.g., from the ChromaDB test query).
  - Validation errors likely occur when adding properties due to missing `id` fields in the fabricated data.

This cell demonstrates the assistant’s conversational capabilities but highlights areas for improvement in tool execution and data consistency.

In [None]:
print("\n--- Starting the interactive LangGraph session ---")
print("\n---WELCOME TO RENT ASSISTANT - YOUR RENTAL BUDDY")
print("Type 'quit' to end the conversation.")
print("-" * 30)

# Inputs (compared case-insensitively, after trimming whitespace) that end the session.
EXIT_COMMANDS = {"quit", "q", "bye", "goodbye"}

# Conversation state carried from one turn to the next.
current_state: Pstate = {
    "messages": [],
    "property_set": [],
    "trips": []
}


while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt at the prompt ends the session cleanly
        # instead of surfacing a traceback.
        print("\n--- Ending conversation ---")
        break

    if user_input.lower() in EXIT_COMMANDS:
        print("--- Ending conversation ---")
        break

    if not user_input:
        # A blank line carries no request; prompt again rather than spending an LLM call on it.
        continue

    current_state['messages'].append(HumanMessage(content=user_input))
    print("--- Invoking graph for one turn ---")
    try:
        # One invocation runs the graph until `should_continue` returns END.
        current_state = compiled_graph.invoke(current_state)

        if current_state and 'messages' in current_state and current_state['messages']:
            last_message = current_state['messages'][-1]

            if isinstance(last_message, AIMessage):
                if last_message.content:
                    print(f"AI: {last_message.content}")

            elif isinstance(last_message, HumanMessage):
                print("AI (Unexpected - received HumanMessage as final output)")
            elif isinstance(last_message, ToolMessage):
                print("AI (Tool Response received directly)")

            else:
                print(f"AI (Unexpected message type: {type(last_message).__name__})")

    except Exception as e:
        # Keep the session alive: record the failure in the history and let the user retry.
        print(f"\n--- An unexpected error occurred during graph execution turn: {e} ---")
        error_msg = AIMessage(content=f"An internal error occurred: {e}. Please try again or rephrase your request.")
        if 'messages' in current_state and isinstance(current_state['messages'], list):
            current_state['messages'].append(error_msg)
        else:
            print("--- Error: State messages list is missing or corrupted. Cannot add error message. ---")

    print("-" * 30)

print("\n--- Interactive session finished ---")