In [1]:
import os
import warnings
from datasets import load_dataset
import pandas as pd
from typing import List, Optional
from pydantic import BaseModel, ValidationError
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()
mongo_uri = os.getenv('MONGO_URI')
OPENAI_API_KEY = os.getenv('OPENAI')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pymongo
print(pymongo.version)

4.10.1


In [3]:
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split = "train")
dataset = dataset.take(100)
dataset_df = pd.DataFrame(dataset)
dataset_df.head(5)

print(dataset_df.columns)

Index(['_id', 'listing_url', 'name', 'summary', 'space', 'description',
       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
       'house_rules', 'property_type', 'room_type', 'bed_type',
       'minimum_nights', 'maximum_nights', 'cancellation_policy',
       'last_scraped', 'calendar_last_scraped', 'first_review', 'last_review',
       'accommodates', 'bedrooms', 'beds', 'number_of_reviews', 'bathrooms',
       'amenities', 'price', 'security_deposit', 'cleaning_fee',
       'extra_people', 'guests_included', 'images', 'host', 'address',
       'availability', 'review_scores', 'reviews', 'weekly_price',
       'monthly_price', 'text_embeddings', 'image_embeddings'],
      dtype='object')


In [4]:
class Host(BaseModel):
  host_id : str
  host_url: str
  host_name: str
  host_location: str
  host_about: str
  host_response_time : Optional[str] = None
  host_thumbnail_url : str
  host_picture_url : str
  host_response_rate : Optional[int] = None
  host_is_superhost: bool
  host_has_profile_pic: bool
  host_identity_verified: bool

class Location(BaseModel):
  type : str
  coordinates : List[float]
  is_location_exact: bool

class Address(BaseModel):
  street: str
  government_area: str
  market: str
  country: str
  country_code: str
  location: Location

class Review(BaseModel):
  _id : str
  date: Optional[datetime] = None
  listing_id: str
  reviewer_id: str
  reviewer_name: Optional[str] = None
  comments: Optional[str] = None

class Listing(BaseModel):
  _id: int
  listing_url: str
  name: str
  summary: str
  space: str
  description: str
  neighborhood_overview: Optional[str] = None
  notes: Optional[str] = None
  transit: Optional[str] = None
  access: str
  interaction: Optional[str] = None
  house_rules: str
  property_type: str
  room_type: str
  bed_type: str
  minimum_nights: int
  maximum_nights: int
  cancellation_policy: str
  last_scraped: Optional[datetime] = None
  calendar_last_scraped: Optional[datetime] = None
  first_review: Optional[datetime] = None
  last_review: Optional[datetime] = None
  accommodates: int
  bedrooms: Optional[float] = 0
  beds: Optional[float] = 0
  number_of_reviews: int
  bathrooms: Optional[float] = 0
  amenities: List[str]
  price: int
  security_deposit: Optional[float] = None
  cleaning_fee: Optional[float] = None
  extra_people: int
  guests_included: int
  images: dict
  host: Host
  address: Address
  availability: dict
  review_scores: dict
  reviews: List[Review]
  text_embeddings: List[float]


In [5]:
records = dataset_df.to_dict(orient = 'records')
for record in records:
  for key, value in record.items():
    if isinstance(value, list):
      processed_list = [None if pd.isnull(v) else v for v in value]
      record[key] = processed_list
    else:
      if pd.isnull(value):
        record[key] = None

try:
  listings = [Listing(**record).dict() for record in records]
  print(listings[0].keys())
except ValidationError as e:
  print(e)


dict_keys(['listing_url', 'name', 'summary', 'space', 'description', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'property_type', 'room_type', 'bed_type', 'minimum_nights', 'maximum_nights', 'cancellation_policy', 'last_scraped', 'calendar_last_scraped', 'first_review', 'last_review', 'accommodates', 'bedrooms', 'beds', 'number_of_reviews', 'bathrooms', 'amenities', 'price', 'security_deposit', 'cleaning_fee', 'extra_people', 'guests_included', 'images', 'host', 'address', 'availability', 'review_scores', 'reviews', 'text_embeddings'])


/var/folders/bb/_7c_tp6x5yz_39tq724bm5pc0000gn/T/ipykernel_76642/3462360810.py:12: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  listings = [Listing(**record).dict() for record in records]


In [6]:
#use if ur cluster has data already and u r running this again
# collection.delete_many({})

In [7]:
import ssl
from pymongo import MongoClient

# Connection string
uri = "mongodb+srv://prxshetty:asdf@cluster0.kypjl.mongodb.net/airbnb_dataset?retryWrites=true&w=majority"

# Create SSL context enforcing TLS 1.2
ssl_context = ssl.create_default_context()
ssl_context.options |= ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1  # Disable TLS < 1.2

try:
    # Pass the SSL context with the `tls` parameter
    client = MongoClient(uri, tls=True, tlsAllowInvalidCertificates=True, tlsContext=ssl_context, serverSelectionTimeoutMS=5000)
    print(client.server_info())  # Forces connection
except Exception as e:
    print(f"Connection error: {e}")


Connection error: Unknown option: tlsContext. Did you mean one of (connect, tlscafile, tlscrlfile) or maybe a camelCase version of one? Refer to docstring.


  ssl_context.options |= ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1  # Disable TLS < 1.2


In [8]:
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
database_name = "airbnb_dataset"
collection_name = "listings_reviews"

def get_mongo_client(mongo_uri):
  client = MongoClient(mongo_uri, appname ="devrel")
  print("Connection to MongoDB Successful")
  return client

mongo_client = get_mongo_client(mongo_uri)
db = mongo_client.get_database(database_name)
collection = db.get_collection(collection_name)

collection.delete_many({})

collection.insert_many(listings)
print("Data Ingestion into Mongo Complete")



Connection to MongoDB Successful


ServerSelectionTimeoutError: SSL handshake failed: cluster0-shard-00-00.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: cluster0-shard-00-02.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: cluster0-shard-00-01.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 673f4a6c7f605199305b8d34, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.kypjl.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-00.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('cluster0-shard-00-01.kypjl.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-01.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('cluster0-shard-00-02.kypjl.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-02.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [9]:
#Vector Search done here
text_embedding_field_name = "text_embeddings"
vector_search_index_name_text = "vector_index_text"
vector_search_index_model = SearchIndexModel ( 
    definition = {
        "mappings" : {
            "dynamic" : True,
            "fields" : {
                text_embedding_field_name : {
                    "dimensions" : 1536,
                    "similarity" : "cosine",
                    "type" : "knnVector",
                }
            },
        }
    },
    name = vector_search_index_name_text ,
)

In [10]:
index_exists = False
for index in collection.list_indexes():
    print(index)
    if index['name'] == vector_search_index_name_text:
        index_exists = True
        print(index_exists)
        break

ServerSelectionTimeoutError: SSL handshake failed: cluster0-shard-00-00.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: cluster0-shard-00-02.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: cluster0-shard-00-01.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 673f4a6c7f605199305b8d34, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.kypjl.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-00.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('cluster0-shard-00-01.kypjl.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-01.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('cluster0-shard-00-02.kypjl.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-02.kypjl.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [None]:
import time
if not index_exists:
    try:
        result = collection.create_search_index(model = vector_search_index_model)
        print("Creating index ...")
        time.sleep(20)
        print("Index created successfully", result)
        print("Wait for a few minutes before conducting search")
    except Exception as e:
        print(f"Error creating a vector search index: {str(e)}")
else:
    print(f"Index '{vector_search_index_name_text}' already exists.")

        

In [None]:
import openai
openai.api_key = OPENAI_API_KEY
def get_embedding(text):
    if not text or not isinstance(text, str):
        return None
    try:
        response = openai.embeddings.create(
        input = text, 
        model = "text-embedding-3-small", dimensions = 1536)
        embedding = response.data[0].embedding
        return embedding
    except Exception as e:
        print(f"Error in get_embedding: {e}")
        return None
        

In [None]:
def vector_search(user_query, db, collection, vector_index="vector_index_text"):
    query_embedding = get_embedding(user_query)
    if query_embedding is None:
        return "Invalid query or embedding generation failed."
    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embedding,
            "path": text_embedding_field_name,
            "numCandidates": 150,
            "limit": 20
        }
    }

    try:
        # Perform the vector search
        results = collection.aggregate([vector_search_stage])
        explain_query_execution = db.command(
            'explain', {
                'aggregate': collection.name,
                'pipeline': [vector_search_stage],
                'cursor': {}
            },
            verbosity='executionStats'
        )
        # print("Explain Query Execution Structure:", explain_query_execution)        
        # Access the time statistics safely
        stages = explain_query_execution.get('stages', [])
        if stages and '$vectorSearch' in stages[0]:
            vector_search_explain = stages[0]['$vectorSearch']
            millis_elapsed = vector_search_explain.get('explain', {}).get('collectStats', {}).get('millisElapsed', "N/A")
            print(f"Total time taken: {millis_elapsed} milliseconds")
        else:
            print("Could not find 'millisElapsed' in the explain output.")
        
        return list(results)
    except Exception as e:
        print(f"Error during vector search: {str(e)}")
        return []


In [None]:
class SearchResultItem(BaseModel):
    name : str
    accommodates : Optional[int] = None
    address : Address
    summary : Optional[str] = None
    description : Optional[str] = None
    neighborhood_overview : Optional[str] = None
    notes : Optional[str] = None


In [None]:
from IPython.display import display, HTML
def handle_user_query(query, db, collection):
    get_knowledge = vector_search(query, db, collection)
    if not get_knowledge:
        return "No results found.", "No source information available."
    search_results_models = [
    SearchResultItem(**result)
    for result in get_knowledge
    ]
    search_results_df = pd.DataFrame([item.dict() for item in search_results_models])
    completion = openai.chat.completions.create(
        model = "gpt-4o-mini",
    messages = [
    {
    "role" : "system",
    "content" : "You are a airbnb listing recommendation system. "},
    {
    "role" : "user",
    "content" : f"Answer this user query: {query} with the following context:\n {search_results_df}"
    }
    ]
    )
    api_response = completion.choices[0].message.content
    print(f" - User Question:\n{query}\n")
    print(f" - System Response: \n{api_response}\n")
    display(HTML(search_results_df.to_html()))
    return api_response

In [None]:
query = """
I want to stay in a place that's warm and friendly, 
and not too far from resturants, can you recommend a place? 
Include a reason as to why you've chosen your selection.
"""
handle_user_query(query, db, collection)