In [1]:
# # Install dependencies.
# !pip install asyncio==3.4.3 asyncpg==0.27.0 cloud-sql-python-connector["asyncpg"]==1.2.3
# !pip install numpy==1.22.4 pandas==1.5.3
# !pip install pgvector==0.1.8
# !pip install langchain==0.0.196 transformers==4.30.1
# !pip install google-cloud-aiplatform==1.26.0
# !pip install faker
# !pip install --user  psycopg2-binary
#!pip install langchain_community
#!pip install langchain_google_vertexai
#!pip install langchain_astradb 



In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.vectorstores.pgvector import PGVector
#from langchain.embeddings import VertexAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_vertexai import ChatVertexAI
from langchain_google_vertexai import VertexAIEmbeddings
import psycopg2
from google.cloud import aiplatform
from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document

In [None]:
embedding = VertexAIEmbeddings()
vstore = AstraDBVectorStore(
    embedding=embedding,
    collection_name="hotel_embeddings",
    token="AstraCS:uyGjAfTBKJGEkzDRcnQAuved:8f96e3aaa89b517c44931fb815cc8a6a9606ac3a86a10eadf13b223d2f192216",
    api_endpoint="https://d76dcb8a-2c6c-41ad-8000-0849aed20095-us-east1.apps.astra.datastax.com",
)

In [None]:
from faker import Faker
import random
import pandas as pd

# Initialize Faker to generate fake data
faker = Faker()

# Define cities
cities = ["Miami", "Seattle", "Chicago", "Edison", "Portland", "Houston", "Phoenix", "Dallas", "Austin", "Atlanta"]

# Number of rows in the dataset
num_rows = 1000

# Generate synthetic data for each field
hotel_id = [i + 1 for i in range(num_rows)]
hotel_name = [faker.company() for _ in range(num_rows)]
brand = [faker.company_suffix() for _ in range(num_rows)]
hotel_address = [faker.street_address() for _ in range(num_rows)]
hotel_area_name = [faker.city_suffix() for _ in range(num_rows)]
city = random.choices(cities, k=num_rows)  # Randomly select cities from the list
country = ["United States" for _ in range(num_rows)]  # All hotels are in the United States
amenities = [random.sample(['WiFi', 'Restaurant', 'Pool', 'Gym', 'Spa', 'Parking'], random.randint(1, 3)) for _ in range(num_rows)]

# Generate hotel descriptions
hotel_descriptions = []
for i in range(num_rows):
    location_type = random.choice(['urban', 'suburban', 'rural'])
    room_types = random.sample(['single', 'double', 'suite'], random.randint(1, 3))
    facilities = random.sample(['gym', 'spa', 'pool', 'restaurant', 'bar', 'conference room'], random.randint(1, 4))
    description = f"{hotel_name[i]} offers a {location_type} retreat with cozy accommodations. "
    description += f"Located in {city[i]}, {country[i]}, our hotel provides easy access to local attractions. "
    description += f"Whether you're traveling for business or leisure, {hotel_name[i]} is the perfect choice. "
    description += f"Indulge in our {', '.join(amenities[i])} and experience unmatched hospitality. "
    description += f"Book your stay today and discover the ultimate comfort and convenience."

    hotel_descriptions.append(description)

# Create DataFrame from the generated data
data = {
    'hotel_id': hotel_id,
    'hotel_name': hotel_name,
    'brand': brand,
    'hotel_address': hotel_address,
    'hotel_area_name': hotel_area_name,
    'city': city,
    'country': country,
    'amenities': amenities,
    'description': hotel_descriptions
}

df_hotels = pd.DataFrame(data)

# Print the first few rows of the DataFrame
print(df_hotels.head())


In [4]:
print(df_hotels.columns)

Index(['hotel_id', 'hotel_name', 'brand', 'hotel_address', 'hotel_area_name',
       'city', 'country', 'amenities', 'description'],
      dtype='object')


In [5]:
#Replace column names that contain a dot with an underscore as the metadata key value pairs do not support periods.
df_hotels.rename(columns={col: col.replace('.', '_') for col in df_hotels.columns}, inplace=True)
print(df_hotels.columns)


Index(['hotel_id', 'hotel_name', 'brand', 'hotel_address', 'hotel_area_name',
       'city', 'country', 'amenities', 'description'],
      dtype='object')


In [6]:
#Create a function to make human readable text from column names
import re

def convert_to_readable_string(input_string):
    # Split the string to words and handle camel case words
    segments = re.split('[._ ]', input_string)
    words = []
    #print(segments)
    for segment in segments:
        segment_words = re.findall('[A-Za-z][^A-Z]*', segment)
        # Capitalize each word and add to the list
        words.extend(segment_words)
        #print(words)
    # Capitalize each word and join with a space
    readable_string = ' '.join([word.capitalize() for word in words])
    return readable_string

# # Example usage
# input_string = "hotelFeatures.highlightedAmenities phone_number"
# output_string = convert_to_readable_string(input_string)
# print(output_string)  # Output: Hotel Features Hotel Amenities

In [7]:

#created Hotel template for the structure of the documents.
import numpy as np

# Creating the template for hotels based on column names and values
for index, row in df_hotels.iterrows():
    content = f"<h1>Hotel name: {row['hotel_name']}</h1><ul>"
    for column, value in row.items():
        # Skip content, just in case we re-run this cell
        # Skip internal ID
        if column == "content" or column == "rid":
            continue
        # Skip empty string
        if isinstance(value, str) and not value:
            continue
        # Skip empty numbers
        if isinstance(value, (int, float)) and np.isnan(value):
            continue
        content += f"<li>{convert_to_readable_string(column)}: {value}"
        # content += f"<li>{column}: {value}"
    content += "</ul>"
    df_hotels.loc[index, 'type'] = "hotel"
    df_hotels.loc[index, 'content'] = content


In [8]:
# Print the template for each hotel
#for index, row in df_hotels.iterrows():
    #print(row['content'])
print(df_hotels.iloc[0]['content'])



<h1>Hotel name: Villa, Roman and Klein</h1><ul><li>Hotel Id: 1<li>Hotel Name: Villa, Roman and Klein<li>Brand: LLC<li>Hotel Address: 4649 Joe Villages Suite 746<li>Hotel Area Name: ville<li>City: Austin<li>Country: United States<li>Amenities: ['Restaurant', 'WiFi']<li>Description: Villa, Roman and Klein offers a urban retreat with cozy accommodations. Located in Austin, United States, our hotel provides easy access to local attractions. Whether you're traveling for business or leisure, Villa, Roman and Klein is the perfect choice. Indulge in our Restaurant, WiFi and experience unmatched hospitality. Book your stay today and discover the ultimate comfort and convenience.</ul>


In [10]:
import pandas as pd

# Assuming that 'df' is your DataFrame and it has a column named 'amenities'

# Define a mapping of amenities abbreviations to their full names
amenities_mapping = {
    "G": "Gym",
    "S": "Spa",
    "P": "Pool",
    "R": "Restaurant",
    "B": "Bar",
    "C": "Conference Room",
}

TEMPLATE = "{hotel_id}-{hotel_name}-{brand}-{hotel_address}-{hotel_area_name}-{city}-{country}-{amenities}-{description}"

chunked = []
for index, row in df_hotels.iterrows():
    # Use the amenities list directly
    amenities_list = row['amenities']

    # Map the amenities abbreviations to their full names
    amenities_mapped = [amenities_mapping.get(amenity, amenity) for amenity in amenities_list]

    # Remove duplicates and join the amenities with commas
    amenities_formatted = ', '.join(list(set(amenities_mapped)))
    amenities = f"{{{amenities_formatted}}}"

    r = {
        "hotel_id": int(row['hotel_id']),
        "content": TEMPLATE.format(
            hotel_id=row['hotel_id'],
            hotel_name=row['hotel_name'],
            brand=row['brand'],
            hotel_address=row['hotel_address'],
            hotel_area_name=row['hotel_area_name'],
            city=row['city'],
            country=row['country'],
            amenities=amenities,
            description=row['description']
        )
    }
    chunked.append(r)

print(chunked[90])


{'hotel_id': 91, 'content': "91-Warren, James and Cox-Inc-9016 Holmes Square Apt. 314-ton-Austin-United States-{Pool}-Warren, James and Cox offers a urban retreat with cozy accommodations. Located in Austin, United States, our hotel provides easy access to local attractions. Whether you're traveling for business or leisure, Warren, James and Cox is the perfect choice. Indulge in our Pool and experience unmatched hospitality. Book your stay today and discover the ultimate comfort and convenience."}


In [11]:
#created document types instead of structured row with content and metadata.
#Adv: this allows us to do filtering on metadata and similarity search and ability to add different document types like for flights/restaurants and avoid the table column sparsing
from langchain_community.document_loaders.dataframe import DataFrameLoader
loader = DataFrameLoader(df_hotels, page_content_column="content")
docs = loader.load()
docs[1]

Document(page_content="<h1>Hotel name: Barr and Sons</h1><ul><li>Hotel Id: 2<li>Hotel Name: Barr and Sons<li>Brand: and Sons<li>Hotel Address: 50719 Anthony Field<li>Hotel Area Name: burgh<li>City: Chicago<li>Country: United States<li>Amenities: ['Spa']<li>Description: Barr and Sons offers a rural retreat with cozy accommodations. Located in Chicago, United States, our hotel provides easy access to local attractions. Whether you're traveling for business or leisure, Barr and Sons is the perfect choice. Indulge in our Spa and experience unmatched hospitality. Book your stay today and discover the ultimate comfort and convenience.</ul>", metadata={'hotel_id': 2, 'hotel_name': 'Barr and Sons', 'brand': 'and Sons', 'hotel_address': '50719 Anthony Field', 'hotel_area_name': 'burgh', 'city': 'Chicago', 'country': 'United States', 'amenities': ['Spa'], 'description': "Barr and Sons offers a rural retreat with cozy accommodations. Located in Chicago, United States, our hotel provides easy acce

In [12]:
# Clear and re-create and populate the vector store, skip this cell if you are simply connecting to the vector DB
import os
from google.cloud import aiplatform
from langchain_astradb import AstraDBVectorStore
from langchain_google_vertexai import VertexAIEmbeddings

embeddings_service = VertexAIEmbeddings(model_name="textembedding-gecko")
# #create connection with name vector_store
vector_store = AstraDBVectorStore(token="AstraCS:uyGjAfTBKJGEkzDRcnQAuved:8f96e3aaa89b517c44931fb815cc8a6a9606ac3a86a10eadf13b223d2f192216", api_endpoint="https://d76dcb8a-2c6c-41ad-8000-0849aed20095-us-east1.apps.astra.datastax.com", collection_name="hotel_embeddings", embedding=embeddings_service)

vector_store.clear()
# #Create embedddings in docs and insert to astra vector_setote which is hotels table

for i in range(0, len(docs), 20):
    vector_store.add_documents(docs[i:i + 20])

In [13]:
# This code snippet may run for longer to generate embeddings for 1000 records.

from langchain.embeddings import VertexAIEmbeddings
from google.cloud import aiplatform
import time
import pandas as pd

aiplatform.init(project=f"hackathon-420400", location=f"us-central1")
embeddings_service = VertexAIEmbeddings(model_name="textembedding-gecko")


# Helper function to retry failed API requests with exponential backoff.
def retry_with_backoff(func, *args, retry_delay=5, backoff_factor=2, **kwargs):
    max_attempts = 10
    retries = 0
    for i in range(max_attempts):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(f"error: {e}")
            retries += 1
            wait = retry_delay * (backoff_factor**retries)
            print(f"Retry after waiting for {wait} seconds...")
            time.sleep(wait)


batch_size = 5
for i in range(0, len(chunked), batch_size):
    request = [x["content"] for x in chunked[i : i + batch_size]]
    response = retry_with_backoff(embeddings_service.embed_documents, request)
    # Store the retrieved vector embeddings for each chunk back.
    for x, e in zip(chunked[i : i + batch_size], response):
        x["embedding"] = e

# Store the generated embeddings in a pandas dataframe.
hotel_embeddings = pd.DataFrame(chunked)
hotel_embeddings.head()

  warn_deprecated(


Unnamed: 0,hotel_id,content,embedding
0,1,"1-Villa, Roman and Klein-LLC-4649 Joe Villages...","[0.11127083748579025, 0.013308064080774784, -0..."
1,2,2-Barr and Sons-and Sons-50719 Anthony Field-b...,"[0.08539631217718124, -0.020732130855321884, -..."
2,3,3-Obrien Group-Group-95489 Ochoa Place-bury-Au...,"[0.09397877752780914, -0.008087624795734882, -..."
3,4,"4-Schneider, Perez and Vaughan-LLC-510 Padilla...","[0.044070564210414886, -0.0019353424431756139,..."
4,5,"5-Yates, Mann and Newman-Ltd-79789 Bradley Via...","[0.08065102249383926, -0.028701307252049446, -..."


In [14]:
# Dependencies for constructing a conversation Chain
from IPython.display import Markdown
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
)
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
from vertexai.generative_models import Content, GenerativeModel, Part

In [15]:
import re
from langchain_google_vertexai import ChatVertexAI, VertexAI
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationSummaryMemory, ConversationBufferMemory
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.vectorstores.cassandra import Cassandra
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.chains.llm import LLMChain

# Using the VertexAi method
#llm = ChatVertexAI(model_name="chat-bison", temperature=0.1, max_output_tokens=2048)
#llm_text = VertexAI(model_name="text-bison")
#llm = ChatVertexAI(model_name="gemini-pro", temperature=0.1, max_output_tokens=500)
llm = ChatVertexAI(model_name="gemini-1.0-pro", temperature=0.1, max_output_tokens=300)


# Switch to the open ai llm to accurately simulate Penny
#llm = ChatOpenAI(model_name="gpt-3.5-turbo")
verbose = False

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
The standalone question has to be made in first-person from the user point of view.


Beginning of the example 1.
Input:
=========
Chat history:
[{{"human": "Can you recommend a restaurant in Boston?"}}, {{"assistant": "Sure, the restaurant <EXAMPLE> is a good choice."}}]

Follow up question: is there a japanese cuisine option?
=========
Example output:
Can you tell me if the restaurant <EXAMPLE> has a japanese cuisine option?
=========
End of the example 1.

Beginning of the example 2.
Input:
=========
Chat history:
[]

Follow up question: is there a japanese cuisine option?
=========
Example output:
is there a japanese cuisine option?
=========
End of the example 2.



Chat History:
{chat_history}

Follow up question: {question}""")



template = """
You're an assistant called Priceline Penny. You're a chatbot in the Priceline website.
Priceline website offers solution for booking hotels and restaurant world-wide.

Your goal is to help the user to answer questions about the website, hotels, restaurants and reservations.
Don't make politicals, religious or any other assumptions and don't even talk about this, just focus on the Priceline offer.
Use the same language of the input question.


If there's no answer for the user, say "I don't know, please contact the support".
If the user is asking to policies, only use information provided in this prompt. Do not make up rules.

When suggesting an hotel or restaurant, include the name, the area, the address and explain why it's a good choice.
Do not make up any data about them, only use details provided in this instruction.

When the user asks about hotels, provide a list of available hotels with their names, locations, description in a bulleted format. Omit additional information like "Other factors to consider" and avoid asking clarifying questions or making suggestions.


The following list are hotels/restaurant in HTML format. Use these items only if you need to search an hotel or restaurant. This list is not exhaustive but only contains relevant content.
<context>
{context}
</context>

This is the chat history:
<chat-history>
{chat_history}
</chat-history>

This is the question that you need to answer (considering the chat history too):
Question:
{question}
"""
prompt = ChatPromptTemplate.from_template(template)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
memory.chat_memory.clear()
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=verbose
)
chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context"

)
condense_question_chain = LLMChain(
    llm=llm,
    prompt=CONDENSE_QUESTION_PROMPT,
    verbose=verbose
)

def format_history(chat_history):
  all = []


  for dialogue_turn in chat_history:
    if dialogue_turn.type == "human":
      prefix = "Human"
    else:
      prefix = "Assistant (you)"
    all.append({"role": prefix, "content": dialogue_turn.content})

  import json
  return json.dumps(all)



conversation = ConversationalRetrievalChain(
    combine_docs_chain=chain,
    retriever=vstore.as_retriever(search_kwargs={"k": 3}),
    question_generator=condense_question_chain,
    memory=memory,
    get_chat_history=format_history
)

In [None]:
#questions = ["I need to do an hotel reservation in atlantic city", "ok, any restaurant nearby?", "ok thanks. Can you tell me how to cancel a reservation ? "]
# for q in questions:
#     print("\nYou: ", q)
#     answer = conversation.invoke({"question": q})



while True:
  q = input("\nYou: ")
  answer = conversation.invoke({"question": q})
  print("\nhotel_ragbot: ", answer["answer"])



You:  hotels in miami near beach



hotel_ragbot:  Yes, there are several hotels in Miami near the beach. Here are a few options:

* **The Betsy Hotel:** This luxury hotel is located on Ocean Drive, just steps from the beach. It offers stunning ocean views, a private beach club, and a variety of amenities.
* **The Ritz-Carlton, South Beach:** This iconic hotel is located on Collins Avenue, just a short walk from the beach. It offers luxurious accommodations, a world-class spa, and several restaurants.
* **The Setai Miami Beach:** This stylish hotel is located on Collins Avenue, just a few blocks from the beach. It offers a rooftop pool, a private beach club, and a variety of dining options.
* **The W South Beach:** This trendy hotel is located on Collins Avenue, just steps from the beach. It offers a vibrant atmosphere, a rooftop pool, and a variety of restaurants and bars.
* **The Loews Miami Beach Hotel:** This family-friendly hotel is located on Collins Avenue, just a short walk from the beach. It offers a variety of