# Airbnb RAG System

- Source: https://builtin.com/articles/how-to-build-a-rag-system

In [1]:
from datasets import load_dataset
import pandas as pd
import os

# Dataset card: https://huggingface.co/datasets/MongoDB/airbnb_embeddings
#
# API credentials are read from local files and exported as environment
# variables *before* the dataset is requested, so the Hugging Face token
# (HF_TOKEN) is already in place when `load_dataset` runs.
# `.strip()` drops the trailing newline that `readline()` keeps; without it
# the newline becomes part of the token value.
with open('hf_key', 'r') as f:
    key = f.readline().strip()

with open('openai_key.txt', 'r') as f:
    openai_key = f.readline().strip()

os.environ['HF_TOKEN'] = key
os.environ['OPENAI_API_KEY'] = openai_key

dataset = load_dataset("MongoDB/airbnb_embeddings")

# Convert the train split to a pandas DataFrame
dataset_df = pd.DataFrame(dataset['train'])

# Preview the first rows
dataset_df.head(5)

Unnamed: 0,_id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,...,images,host,address,availability,review_scores,reviews,weekly_price,monthly_price,text_embeddings,image_embeddings
0,10006546,https://www.airbnb.com/rooms/10006546,Ribeira Charming Duplex,Fantastic duplex apartment with three bedrooms...,Privileged views of the Douro River and Ribeir...,Fantastic duplex apartment with three bedrooms...,"In the neighborhood of the river, you can find...",Lose yourself in the narrow streets and stairc...,Transport: • Metro station and S. Bento railwa...,We are always available to help guests. The ho...,...,"{'thumbnail_url': '', 'medium_url': '', 'pictu...","{'host_id': '51399391', 'host_url': 'https://w...","{'street': 'Porto, Porto, Portugal', 'suburb':...","{'availability_30': 28, 'availability_60': 47,...","{'review_scores_accuracy': 9, 'review_scores_c...","[{'_id': '58663741', 'date': 2016-01-03 05:00:...",,,"[0.0123710884, -0.0180913936, -0.016843712, -0...","[-0.1302358955, 0.1534578055, 0.0199299306, -0..."
1,10021707,https://www.airbnb.com/rooms/10021707,Private Room in Bushwick,Here exists a very cozy room for rent in a sha...,,Here exists a very cozy room for rent in a sha...,,,,,...,"{'thumbnail_url': '', 'medium_url': '', 'pictu...","{'host_id': '11275734', 'host_url': 'https://w...","{'street': 'Brooklyn, NY, United States', 'sub...","{'availability_30': 0, 'availability_60': 0, '...","{'review_scores_accuracy': 10, 'review_scores_...","[{'_id': '61050713', 'date': 2016-01-31 05:00:...",,,"[0.0153845912, -0.0348115042, -0.0093448907, 0...","[0.0340401195, 0.1742489338, -0.1572628617, 0...."
2,1001265,https://www.airbnb.com/rooms/1001265,Ocean View Waikiki Marina w/prkg,A short distance from Honolulu's billion dolla...,Great studio located on Ala Moana across the s...,A short distance from Honolulu's billion dolla...,You can breath ocean as well as aloha.,,Honolulu does have a very good air conditioned...,"Pool, hot tub and tennis",...,"{'thumbnail_url': '', 'medium_url': '', 'pictu...","{'host_id': '5448114', 'host_url': 'https://ww...","{'street': 'Honolulu, HI, United States', 'sub...","{'availability_30': 16, 'availability_60': 46,...","{'review_scores_accuracy': 9, 'review_scores_c...","[{'_id': '4765259', 'date': 2013-05-24 04:00:0...",650.0,2150.0,"[-0.0400562622, -0.0405789167, 0.000644172, 0....","[-0.1640156209, 0.1256971657, 0.6594450474, -0..."
3,10009999,https://www.airbnb.com/rooms/10009999,Horto flat with small garden,One bedroom + sofa-bed in quiet and bucolic ne...,Lovely one bedroom + sofa-bed in the living ro...,One bedroom + sofa-bed in quiet and bucolic ne...,This charming ground floor flat is located in ...,"There´s a table in the living room now, that d...","Easy access to transport (bus, taxi, car) and ...",,...,"{'thumbnail_url': '', 'medium_url': '', 'pictu...","{'host_id': '1282196', 'host_url': 'https://ww...","{'street': 'Rio de Janeiro, Rio de Janeiro, Br...","{'availability_30': 0, 'availability_60': 0, '...","{'review_scores_accuracy': None, 'review_score...",[],1492.0,4849.0,"[-0.063234821, 0.0017937823, -0.0243996996, -0...","[-0.1292964518, 0.037789464, 0.2443587631, 0.0..."
4,10047964,https://www.airbnb.com/rooms/10047964,Charming Flat in Downtown Moda,Fully furnished 3+1 flat decorated with vintag...,The apartment is composed of 1 big bedroom wit...,Fully furnished 3+1 flat decorated with vintag...,With its diversity Moda- Kadikoy is one of the...,,,,...,"{'thumbnail_url': '', 'medium_url': '', 'pictu...","{'host_id': '1241644', 'host_url': 'https://ww...","{'street': 'Kadıköy, İstanbul, Turkey', 'subur...","{'availability_30': 27, 'availability_60': 57,...","{'review_scores_accuracy': 10, 'review_scores_...","[{'_id': '68162172', 'date': 2016-04-02 04:00:...",,,"[0.023723349, 0.0064210771, -0.0339970738, -0....","[-0.1006749049, 0.4022984803, -0.1821258366, 0..."


In [2]:
import json

# Load the connection settings stored in the local `mongodb.json` file.
# In the visible part of this notebook the result is only referenced by
# commented-out MongoDB code.
with open('mongodb.json', 'r') as config_file:
    mongo_connection = json.loads(config_file.read())

In [3]:
# Remove the `text_embeddings` column shipped with the dataset.
# `errors='ignore'` keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain `drop` would raise KeyError.
dataset_df = dataset_df.drop(columns=['text_embeddings'], errors='ignore')

In [4]:
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Chat model, constructed with the library defaults
llm = OpenAI()

# Embedding model; the vectors are requested with 1024 dimensions
embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=1024,
)

# Register both models on the LlamaIndex `Settings` object
Settings.embed_model = embed_model
Settings.llm = llm

In [5]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Round-trip the DataFrame through JSON so each row becomes a plain Python
# dictionary (one dictionary per listing).
documents_json = dataset_df.to_json(orient='records')
documents_list = json.loads(documents_json)

# Fields that are re-encoded as JSON strings, because a metadata value must
# be one of (str, int, float, None).
NESTED_FIELDS = (
    "amenities",
    "images",
    "host",
    "address",
    "availability",
    "review_scores",
    "reviews",
    "image_embeddings",
)

# Metadata keys hidden from both the LLM and the embedding model.
EXCLUDED_METADATA_KEYS = (
    "_id",
    "transit",
    "minimum_nights",
    "maximum_nights",
    "cancellation_policy",
    "last_scraped",
    "calendar_last_scraped",
    "first_review",
    "last_review",
    "security_deposit",
    "cleaning_fee",
    "guests_included",
    "host",
    "availability",
    "reviews",
    "image_embeddings",
)

llama_documents = []

for document in documents_list:
    # Serialise the nested fields in place
    for field in NESTED_FIELDS:
        document[field] = json.dumps(document[field])

    # The listing description is the document text; the whole record is
    # attached as metadata. Each Document receives its own copy of the
    # exclusion list.
    llama_document = Document(
        text=document["description"],
        metadata=document,
        excluded_llm_metadata_keys=list(EXCLUDED_METADATA_KEYS),
        excluded_embed_metadata_keys=list(EXCLUDED_METADATA_KEYS),
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    llama_documents.append(llama_document)

# Show what the LLM and the embedding model each receive for the first document
first_document = llama_documents[0]
print(
    "\nThe LLM sees this: \n",
    first_document.get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    first_document.get_content(metadata_mode=MetadataMode.EMBED),
)


The LLM sees this: 
 Metadata: listing_url=>https://www.airbnb.com/rooms/10006546
name=>Ribeira Charming Duplex
summary=>Fantastic duplex apartment with three bedrooms, located in the historic area of Porto, Ribeira (Cube) - UNESCO World Heritage Site. Centenary building fully rehabilitated, without losing their original character.
space=>Privileged views of the Douro River and Ribeira square, our apartment offers the perfect conditions to discover the history and the charm of Porto. Apartment comfortable, charming, romantic and cozy in the heart of Ribeira. Within walking distance of all the most emblematic places of the city of Porto. The apartment is fully equipped to host 8 people, with cooker, oven, washing machine, dishwasher, microwave, coffee machine (Nespresso) and kettle. The apartment is located in a very typical area of the city that allows to cross with the most picturesque population of the city, welcoming, genuine and happy people that fills the streets with his outspok

In [6]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

# Split the documents into nodes
parser = SentenceSplitter(chunk_size=5000)
nodes = parser.get_nodes_from_documents(llama_documents)

# Embed the nodes in batches instead of issuing one embedding request per
# node. The slice size is kept small on purpose so that a single request
# stays modest even when every node is close to the chunk size.
EMBED_BATCH_SIZE = 32

for batch_start in range(0, len(nodes), EMBED_BATCH_SIZE):
    node_batch = nodes[batch_start:batch_start + EMBED_BATCH_SIZE]

    # Text exactly as the embedding model should see it (EMBED metadata mode)
    batch_embeddings = embed_model.get_text_embedding_batch(
        [n.get_content(metadata_mode=MetadataMode.EMBED) for n in node_batch]
    )

    # Embeddings come back in input order; attach each one to its node
    for node, node_embedding in zip(node_batch, batch_embeddings):
        node.embedding = node_embedding

In [19]:
# Inspect the exact text sent to the embedding model for the first node.
# `nodes` is indexed explicitly instead of reusing a `node` loop variable
# left over from another cell, so the output does not depend on which cell
# happened to run last.
node_content = nodes[0].get_content(metadata_mode=MetadataMode.EMBED)
node_content

'Metadata: listing_url=>https://www.airbnb.com/rooms/10006546\nname=>Ribeira Charming Duplex\nsummary=>Fantastic duplex apartment with three bedrooms, located in the historic area of Porto, Ribeira (Cube) - UNESCO World Heritage Site. Centenary building fully rehabilitated, without losing their original character.\nspace=>Privileged views of the Douro River and Ribeira square, our apartment offers the perfect conditions to discover the history and the charm of Porto. Apartment comfortable, charming, romantic and cozy in the heart of Ribeira. Within walking distance of all the most emblematic places of the city of Porto. The apartment is fully equipped to host 8 people, with cooker, oven, washing machine, dishwasher, microwave, coffee machine (Nespresso) and kettle. The apartment is located in a very typical area of the city that allows to cross with the most picturesque population of the city, welcoming, genuine and happy people that fills the streets with his outspoken speech and cont

In [8]:
# import pymongo
# import certifi

# def get_mongo_client(mongo_uri):
#   """Establish connection to the MongoDB."""
#   try:
#     client = pymongo.MongoClient(mongo_uri)
#     print("Connection to MongoDB successful")
#     return client
#   except pymongo.errors.ConnectionFailure as e:
#     print(f"Connection failed: {e}")
#     return None

# mongo_uri = mongo_connection['connection_uri']
# if not mongo_uri:
#   print("MONGO_URI not set in environment variables")

# mongo_client = get_mongo_client(mongo_uri)

import chromadb
# Imported under an alias: this notebook already binds the name `Settings`
# to the LlamaIndex settings object, and importing ChromaDB's class under the
# same name would silently replace it for every cell that runs afterwards.
from chromadb.config import Settings as ChromaSettings

# Initialize ChromaDB with default settings
settings = ChromaSettings()
client = chromadb.Client(settings)

DB_NAME="airbnb"
COLLECTION_NAME="listings_reviews"

# NOTE(review): `create_collection` is expected to fail when a collection with
# this name already exists in the client, so re-running this cell in the same
# kernel likely needs a kernel restart first - confirm against the installed
# chromadb version.
collection = client.create_collection(COLLECTION_NAME)
# db = mongo_client[DB_NAME]
# collection = db[COLLECTION_NAME]

In [23]:
import numpy as np

# Number of records sent to the collection per `add` call
ADD_BATCH_SIZE = 2000

# Parallel lists: position i in each list describes the same node
embeddings = []
documents = []
ids = []

for node in nodes:
    # Store the same text that was embedded (EMBED metadata mode)
    node_content = node.get_content(metadata_mode=MetadataMode.EMBED)
    embeddings.append(node.embedding)
    documents.append(node_content)
    ids.append(node.id_)

# Insert embeddings, document text and ids in fixed-size batches.
# No separate metadata is stored: the listing metadata is only present as
# part of each document's text.
for start in range(0, len(embeddings), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    collection.add(
        embeddings=embeddings[start:stop],
        documents=documents[start:stop],
        ids=ids[start:stop],
    )


In [51]:
# Convert a query into a vector.
# The query is embedded with `embed_model`, the same embedding model object
# used for the node vectors, so query and stored vectors are comparable.
query_text = "Give a good and friendly host and a friendly neighborhood"  # replace with your actual query
query_vector = embed_model.get_text_embedding(query_text)

In [52]:
# Ask the collection for the k stored entries closest to the query vector.
# A single query embedding is passed, wrapped in a list.
k = 5  # Number of nearest neighbors to search for
results = collection.query(
    query_embeddings=[query_vector], 
    n_results=k
)


In [53]:
# Pull the individual fields out of the query response
result_ids = results['ids']
result_distances = results['distances']
result_metadatas = results['metadatas']

# Report the ids of the matches and their distances to the query
print("Indices of nearest neighbors:", result_ids)
print("Distances to nearest neighbors:", result_distances)

Indices of nearest neighbors: [['4617dab2-e287-4050-aa8d-9419ad35c481', 'e28c6697-0343-4f36-9b54-d7ab3ec9c201', '1cfbfb8e-14c9-4804-9f28-e5d35c33c57a', '86e3f41c-5b14-4d07-8dd9-9767870b0c80', 'b2c6f5d3-9a5c-4d1f-9010-215371d24001']]
Distances to nearest neighbors: [[1.0906058549880981, 1.1123956441879272, 1.1154425144195557, 1.119917869567871, 1.1333705186843872]]


In [54]:
# Show the matched documents for the (single) query, best match first.
# The query response already carries the document text in the same order as
# the ids and distances; looking the ids up again with `collection.get` costs
# an extra call and is not guaranteed to return them in ranked order.
results['documents'][0]

['Metadata: listing_url=>https://www.airbnb.com/rooms/1370405\nname=>Fresh, Simple Sleep in Brooklyn!\nsummary=>A budget traveler\'s DREAM! Queen bed, shared bathroom, subway access, and SEVEN YEAR/FIVE STAR Superhosts, all for a very reasonable price! Plus, 2 dogs to call your best friends for the duration of your stay :)\nspace=>:) FIRST THINGS FIRST! :) Airbnb has a COMMUNITY VIBE. We invite strangers into our home and welcome you as family, while giving you the SPACE AND RESPECT you need on your vacation. As such, we have a few requests when inquiring so that you get a QUICK AND POSITIVE acceptance! 1. PROFILE: Have a PHOTO of YOU, along with a BIO to let us know a little bit about you! (Where you\'re from, what you do, etc.) Please, VERIFY YOUR ID! 2. TRAVEL: Tell us about your TRIP (travel plans TO NYC, duration, etc.) and any special requirements you may have. 3. AGREEMENT: Confirm you have READ THE TERMS of stay, POLICY on check-in and check-out, and the very few house rules. 4