In [None]:
# # Install dependencies.
# !pip install asyncio==3.4.3 asyncpg==0.27.0 cloud-sql-python-connector["asyncpg"]==1.2.3
# !pip install numpy==1.22.4 pandas==1.5.3
# !pip install pgvector==0.1.8
# !pip install langchain==0.0.196 transformers==4.30.1
# !pip install google-cloud-aiplatform==1.26.0
# !pip install faker
# !pip install --user  psycopg2-binary
#!pip install langchain_community
#!pip install langchain_google_vertexai

In [19]:
# Synthetic data generation for hotels: names, brands, addresses, amenities and descriptions.
from faker import Faker
import random
import pandas as pd

# Seed both random sources so that Restart & Run All reproduces the same dataset
# (and therefore the same embeddings and answers further down).
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker to generate fake data
faker = Faker()
cities = ["Miami", "Seattle", "Chicago", "Edison", "Portland", "Houston", "Phoenix", "Dallas", "Austin", "Atlanta"]
num_rows = 3000

# Generate synthetic data for each field (one list per column, all of length num_rows).
hotel_id = [i + 1 for i in range(num_rows)]
hotel_name = [faker.company() for _ in range(num_rows)]
brand = [faker.company_suffix() for _ in range(num_rows)]
hotel_address = [faker.street_address() for _ in range(num_rows)]
hotel_area_name = [faker.city_suffix() for _ in range(num_rows)]
city = random.choices(cities, k=num_rows)  # Randomly select cities from the list
country = ["United States" for _ in range(num_rows)]  # All hotels are placed in the United States
amenities = [
    random.sample(['WiFi', 'Restaurant', 'Pool', 'Gym', 'Spa', 'Parking'], random.randint(1, 3))
    for _ in range(num_rows)
]

# Generate one marketing-style description per hotel from the fields above.
hotel_descriptions = []
for name, town, nation, perks in zip(hotel_name, city, country, amenities):
    location_type = random.choice(['urban', 'suburban', 'rural'])
    hotel_descriptions.append(
        f"{name} offers a {location_type} retreat with cozy accommodations. "
        f"Located in {town}, {nation}, our hotel provides easy access to local attractions. "
        f"Whether you're traveling for business or leisure, {name} is the perfect choice. "
        f"Indulge in our {', '.join(perks)} and experience unmatched hospitality. "
        "Book your stay today and discover the ultimate comfort and convenience."
    )

data = {
    'hotel_id': hotel_id,
    'hotel_name': hotel_name,
    'brand': brand,
    'hotel_address': hotel_address,
    'hotel_area_name': hotel_area_name,
    'city': city,
    'country': country,
    'amenities': amenities,
    'description': hotel_descriptions
}
df = pd.DataFrame(data)
# Keep only the first hotel for each name. The surviving rows keep their original
# hotel_id, so the ids are no longer contiguous after this step.
df = df.drop_duplicates(subset=['hotel_name'])

In [41]:
# Open a psycopg2 connection to the Cloud SQL PostgreSQL instance
# "hackathon-420400:us-central1:hackathon".
import psycopg2
from psycopg2.extras import RealDictCursor

# The database password is read from a local text file rather than typed into this cell.
with open("password.txt", "r") as file:
    password = file.read().strip()

conn = psycopg2.connect(
    dbname="vectordb",
    user="hackathon",
    password=password,
    host="10.69.160.3",
    port="5432",
)
# RealDictCursor yields rows keyed by column name (see the RealDictRow output further down).
cursor = conn.cursor(cursor_factory=RealDictCursor)

In [26]:
# (Re)create the "hotelsdata" table and load the synthetic hotel rows into it.
insert_sql = """
    INSERT INTO hotelsdata
    (hotel_id, hotel_name, brand, hotel_address, hotel_area_name, city, country, amenities, description)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
try:
    # Drop the table this cell actually creates, so the cell can be re-run without
    # hitting "relation already exists".
    cursor.execute("DROP TABLE IF EXISTS hotelsdata CASCADE;")
    # hotel_id is supplied explicitly from the DataFrame instead of being generated by
    # SERIAL: after drop_duplicates the DataFrame ids have gaps, and a generated sequence
    # would stop matching the ids that are embedded in the "content" strings.
    cursor.execute("""CREATE TABLE hotelsdata (
                    hotel_id INTEGER PRIMARY KEY,
                    hotel_name TEXT,
                    brand TEXT,
                    hotel_address TEXT,
                    hotel_area_name TEXT,
                    city TEXT,
                    country TEXT,
                    amenities TEXT[],
                    description TEXT
                );""")

    # Insert every DataFrame row; the Python list in 'amenities' is passed as the TEXT[] value.
    for _, row in df.iterrows():
        cursor.execute(
            insert_sql,
            (
                int(row['hotel_id']),
                row['hotel_name'],
                row['brand'],
                row['hotel_address'],
                row['hotel_area_name'],
                row['city'],
                row['country'],
                row['amenities'],
                row['description'],
            ),
        )
    # Commit the table creation and all rows together.
    conn.commit()
except Exception:
    # Leave the connection usable for the next cell instead of stuck in an aborted transaction.
    conn.rollback()
    raise

# Preview a few stored rows to confirm the load worked.
cursor.execute("SELECT * FROM hotelsdata ORDER BY hotel_id LIMIT 5;")
result = cursor.fetchall()
result


[RealDictRow([('hotel_id', 1), ('hotel_name', 'Pierce, Anthony and Richardson'), ('brand', 'PLC'), ('hotel_address', '092 Hernandez Drive'), ('hotel_area_name', 'chester'), ('city', 'Chicago'), ('country', 'United States'), ('amenities', ['Restaurant', 'Pool', 'WiFi']), ('description', "Pierce, Anthony and Richardson offers a rural retreat with cozy accommodations. Located in Chicago, United States, our hotel provides easy access to local attractions. Whether you're traveling for business or leisure, Pierce, Anthony and Richardson is the perfect choice. Indulge in our Restaurant, Pool, WiFi and experience unmatched hospitality. Book your stay today and discover the ultimate comfort and convenience.")]), RealDictRow([('hotel_id', 2), ('hotel_name', 'Bird and Sons'), ('brand', 'PLC'), ('hotel_address', '927 Julia Junction'), ('hotel_area_name', 'port'), ('city', 'Dallas'), ('country', 'United States'), ('amenities', ['Spa', 'WiFi']), ('description', "Bird and Sons offers a suburban retre

In [27]:
#Data Preprocessing and then constructs a list of dictionaries, each containing a hotel ID and a formatted content string,
#including details such as hotel name, brand, address, amenities, and description, following a predefined template.
import pandas as pd
# Optional translation table from short amenity codes to display names.
# The synthetic data already stores full names ('WiFi', 'Pool', ...), so the lookup
# below normally falls through and returns the value unchanged.
amenities_mapping = {
    "G": "Gym",
    "S": "Spa",
    "P": "Pool",
    "R": "Restaurant",
    "B": "Bar",
    "C": "Conference Room",
}
# Every hotel becomes one dash-separated text chunk; this string is what gets embedded.
TEMPLATE = "{hotel_id}-{hotel_name}-{brand}-{hotel_address}-{hotel_area_name}-{city}-{country}-{amenities}-{description}"

chunked = []
for _, row in df.iterrows():
    # dict.fromkeys removes duplicates while keeping first-seen order, so the generated
    # text is identical from one kernel session to the next (a set gives no such guarantee).
    mapped_amenities = dict.fromkeys(
        amenities_mapping.get(amenity, amenity) for amenity in row['amenities']
    )
    # Use a dedicated name: rebinding `amenities` here would overwrite the list of
    # amenity lists created in the data-generation cell.
    amenities_field = "{" + ", ".join(mapped_amenities) + "}"

    chunked.append(
        {
            "hotel_id": int(row['hotel_id']),
            "content": TEMPLATE.format(
                hotel_id=row['hotel_id'],
                hotel_name=row['hotel_name'],
                brand=row['brand'],
                hotel_address=row['hotel_address'],
                hotel_area_name=row['hotel_area_name'],
                city=row['city'],
                country=row['country'],
                amenities=amenities_field,
                description=row['description'],
            ),
        }
    )

In [28]:
#print(chunked[90])

{'hotel_id': 91, 'content': "91-Novak, Cabrera and Carrillo-LLC-26773 Murphy Prairie-furt-Dallas-United States-{WiFi}-Novak, Cabrera and Carrillo offers a rural retreat with cozy accommodations. Located in Dallas, United States, our hotel provides easy access to local attractions. Whether you're traveling for business or leisure, Novak, Cabrera and Carrillo is the perfect choice. Indulge in our WiFi and experience unmatched hospitality. Book your stay today and discover the ultimate comfort and convenience."}


In [29]:
#Retrieves Vector embeddings for each chunk using the embeddings service,
#and stores the generated embeddings along with other hotel details in a pandas DataFrame.
from langchain.embeddings import VertexAIEmbeddings
from google.cloud import aiplatform
import time
import pandas as pd
# Initialise the Vertex AI SDK for the project and region used throughout this notebook.
aiplatform.init(project="hackathon-420400", location="us-central1")
# Name the embedding model explicitly. The warning printed by this cell states that
# model_name will become a required argument and that the current implicit default is
# textembedding-gecko@001, so this pins the model the stored vectors were built with.
embeddings_service = VertexAIEmbeddings(model_name="textembedding-gecko@001")
# Helper function to retry failed API requests with exponential backoff.
def retry_with_backoff(func, *args, retry_delay=5, backoff_factor=2, max_attempts=10, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying with exponential backoff on failure.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        retry_delay: Base delay in seconds.
        backoff_factor: Multiplier applied per failed attempt; the wait after the
            k-th failure is ``retry_delay * backoff_factor ** k``.
        max_attempts: Total number of calls to make before giving up (at least 1).
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on the first successful call.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: The exception from the final attempt, re-raised once all
            attempts are exhausted. Returning ``None`` here would only surface
            later as a confusing error in the caller.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(f"error: {e}")
            if attempt == max_attempts:
                # Nothing left to retry: do not sleep, let the caller see the failure.
                raise
            wait = retry_delay * (backoff_factor**attempt)
            print(f"Retry after waiting for {wait} seconds...")
            time.sleep(wait)
# Embed the chunks a few at a time and attach each vector to the chunk it belongs to.
batch_size = 5
for start in range(0, len(chunked), batch_size):
    batch = chunked[start : start + batch_size]
    vectors = retry_with_backoff(
        embeddings_service.embed_documents,
        [item["content"] for item in batch],
    )
    # Vectors come back in request order, so pair them up positionally.
    for item, vector in zip(batch, vectors):
        item["embedding"] = vector

# One row per chunk: hotel_id, content and embedding.
hotel_embeddings = pd.DataFrame(chunked)
hotel_embeddings.head()

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


Unnamed: 0,hotel_id,content,embedding
0,1,"1-Pierce, Anthony and Richardson-PLC-092 Herna...","[-0.03707405924797058, -0.02454996295273304, -..."
1,2,2-Bird and Sons-PLC-927 Julia Junction-port-Da...,"[-0.015065439976751804, 0.002776271430775523, ..."
2,3,3-Lewis Group-LLC-73701 Smith Mountain-ton-Hou...,"[-0.05323795601725578, -0.029694894328713417, ..."
3,4,4-Waller Group-Ltd-485 William Terrace-ville-A...,"[-0.06092603877186775, -4.232280844007619e-05,..."
4,5,5-Stafford and Sons-and Sons-0166 Maria Trace-...,"[-0.03332320600748062, 0.002322334097698331, -..."


In [42]:
# Create the pgvector table "htlembeddings" and insert (hotel_id, content, embedding) rows.
# Clear any aborted transaction left behind by an earlier failed cell.
conn.rollback()
batch_size = 1000
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
cursor.execute("DROP TABLE IF EXISTS htlembeddings")
cursor.execute("""CREATE TABLE htlembeddings (
    hotel_id INTEGER PRIMARY KEY,
    content TEXT NOT NULL,
    embedding VECTOR(768) NOT NULL
                )""")
try:
    # Count inserted rows rather than relying on the DataFrame index label, so a
    # commit happens after every full batch regardless of how the frame is indexed.
    for inserted, (_, row) in enumerate(hotel_embeddings.iterrows(), start=1):
        cursor.execute(
            "INSERT INTO htlembeddings (hotel_id, content, embedding) VALUES (%s, %s, %s)",
            (int(row["hotel_id"]), row["content"], row["embedding"])
        )
        if inserted % batch_size == 0:
            conn.commit()  # Commit in batches
    conn.commit()  # Commit any remaining rows of the last partial batch
except Exception as e:
    print("Error occurred during insertion:", e)
    conn.rollback()  # Discard the uncommitted batch
finally:
    # Always release the cursor and connection, whether or not the insert succeeded.
    cursor.close()
    conn.close()

In [44]:
# Dependencies for constructing a conversation Chain
from IPython.display import Markdown
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
)
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_vertexai import ChatVertexAI, HarmBlockThreshold, HarmCategory
from vertexai.generative_models import Content, GenerativeModel, Part

In [45]:
# Chat model shared by the answering chain and the question-condensing chain.
# Other configurations tried earlier: "gemini-1.5-pro" and "gemini-pro", both with
# max_output_tokens=2048.
verbose = False
llm = ChatVertexAI(
    model_name="gemini-1.0-pro",
    temperature=0.1,
    max_output_tokens=500,
)
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate

In [46]:
# CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
# The standalone question must include relevant part of the CHAT_HISTORY, keeping all the possible details about previous questions and answers.

# If the follow up question is not asking anything, use the follow up question as standalone question.

# Beginning of the example 1.
# Input:
# =========
# CHAT_HISTORY:
# [{{"human": "Can you recommend a hotel in Boston?"}}, {{"assistant": "Sure, the hotel <EXAMPLE> is a good choice."}}]

# FOLLOW_UP_QUESTION:
# is there a hotel with wifi option?
# =========
# Example output:
# Can you tell me if the hotel <EXAMPLE> has a wifi option?
# =========
# End of the example 1.

# Beginning of the example 2.
# Input:
# =========
# CHAT_HISTORY:
# []

# FOLLOW_UP_QUESTION:
# is there a hotel with wifi option?
# =========
# Example output:
# is there a hotel with wifi option?
# =========
# End of the example 2.


# CHAT_HISTORY:
# {chat_history}

# FOLLOW_UP_QUESTION:
# {question}""")


# template = """
# INSTRUCTIONS:
# You're an chatbot called "hotels ragbot", deployed as webapp.
# "hotels ragbot" offers solution for booking hotels world-wide.

# Your goal is to help the user to know more about available hotels from the pgvector.
# Use the same language of the input question.

# If the user is asking a generic question, continue the conversation asking for more detail about the context.

# When suggesting an hotel , include the name, the area, the address and explain why it's a good choice. Do not repeat the same hotel. Just give unique outputs. Please donot provide any Additional Information.

# Use only the CONTEXT to suggest hotels. If the CONTEXT is not helpful, ignore it and say "I'm sorry, but I don't have information on your request at the moment. ".

# CONTEXT:
# {context}

# QUESTION:
# {question}"""


In [47]:
# Using the VertexAi method
#llm = ChatVertexAI(model_name="chat-bison", temperature=0.1, max_output_tokens=2048)
#llm_text = VertexAI(model_name="text-bison")
#llm = ChatVertexAI(model_name="gemini-pro", temperature=0.1, max_output_tokens=300)
#llm = ChatVertexAI(model_name="gemini-1.5-pro", temperature=0.1, max_output_tokens=300)

# Switch to the open ai llm to accurately simulate Penny
#llm = ChatOpenAI(model_name="gpt-3.5-turbo")
#verbose = True

# Prompt for the question-condensing step (used by `condense_question_chain` below).
# It takes two variables, {chat_history} and {question}, and asks the model to rewrite
# the follow-up question as a standalone question. Doubled braces ({{ }}) are literal
# braces in the rendered prompt, not template variables.
# NOTE(review): both few-shot examples are about restaurants / Japanese cuisine while
# the assistant answers hotel questions - confirm whether hotel examples were intended.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
The standalone question has to be made in first-person from the user point of view.


Beginning of the example 1.
Input:
=========
Chat history:
[{{"human": "Can you recommend a restaurant in Boston?"}}, {{"assistant": "Sure, the restaurant <EXAMPLE> is a good choice."}}]

Follow up question: is there a japanese cuisine option?
=========
Example output:
Can you tell me if the restaurant <EXAMPLE> has a japanese cuisine option?
=========
End of the example 1.

Beginning of the example 2.
Input:
=========
Chat history:
[]

Follow up question: is there a japanese cuisine option?
=========
Example output:
is there a japanese cuisine option?
=========
End of the example 2.



Chat History:
{chat_history}

Follow up question: {question}""")



# Answering prompt. It has three variables:
#   {context}      - filled with the retrieved documents (see document_variable_name="context"
#                    on the StuffDocumentsChain below)
#   {chat_history} - the conversation so far
#   {question}     - the question to answer
# It is wrapped into a ChatPromptTemplate in the next cell.
# NOTE(review): the prompt says the hotel list is "in HTML format", but the chunks built
# earlier in this notebook are dash-separated strings (see TEMPLATE) - confirm the wording.
template = """
You're an assistant called Hotel RAGbot.
You should offer solution for booking hotels.

Your goal is to help the user to answer questions about the website, hotel.
Don't make politicals, religious or any other assumptions and don't even talk about this, just focus on the "hotel RAGbot" offer.
Use the same language of the input question.


If there's no answer for the user, say "I don't know, please contact the support".
If the user is asking to policies, only use information provided in this prompt. Do not make up rules.

When suggesting an hotel or restaurant, include the name, the area, the address and explain why it's a good choice.
Do not make up any data about them, only use details provided in this instruction.

When the user asks about hotels, provide a list of available hotels with their names, locations, description in a bulleted format. Omit additional information like "Other factors to consider" and avoid asking clarifying questions or making suggestions.


The following list are hotels in HTML format. Use these items only if you need to search an hotel. This list is not exhaustive but only contains relevant content.
<context>
{context}
</context>

This is the chat history:
<chat-history>
{chat_history}
</chat-history>

This is the question that you need to answer (considering the chat history too):
Question:
{question}
"""


In [48]:
# Turn the answering template into a chat prompt.
prompt = ChatPromptTemplate.from_template(template)

# Conversation memory exposed to the chain under the "chat_history" key, kept as
# message objects (format_history below reads their .type and .content).
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)
# Start from an empty history.
memory.chat_memory.clear()


In [49]:
from langchain.chains.llm import LLMChain
# Chain that sends the filled-in answering prompt to the chat model.
llm_chain = LLMChain(prompt=prompt, llm=llm, verbose=verbose)


In [50]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
# Combine-documents chain: the retrieved documents are passed to `llm_chain`
# through the prompt variable named "context".
chain = StuffDocumentsChain(document_variable_name="context", llm_chain=llm_chain)


In [51]:
# Chain that rewrites a follow-up question into a standalone one using
# CONDENSE_QUESTION_PROMPT and the same chat model.
condense_question_chain = LLMChain(
    prompt=CONDENSE_QUESTION_PROMPT,
    llm=llm,
    verbose=verbose,
)



In [52]:
class HotelConversationMemory(ConversationBufferMemory):
    """Conversation buffer that also records which hotels have been mentioned.

    The hotel names are kept in ``context["hotels_mentioned"]``.

    NOTE(review): this class is not instantiated anywhere in this notebook; the
    conversation chain is built with a plain ConversationBufferMemory.
    """

    # Declared on the class so every instance actually has a `context` attribute;
    # nothing else in this notebook ever assigns `self.context`, so the method below
    # would otherwise fail on first use.
    # NOTE(review): assumes the base class is a pydantic model, which gives each
    # instance its own copy of this default - confirm against the installed LangChain.
    context: dict = {}

    def update_hotels_mentioned(self, hotels):
        """Append ``hotels`` to the list of hotels mentioned so far.

        Args:
            hotels: Iterable of hotel identifiers (e.g. names) to record.
        """
        self.context.setdefault("hotels_mentioned", []).extend(hotels)


In [53]:
class HotelConversationChain(LLMChain):
    """LLM chain variant that hides hotels already mentioned in the conversation.

    NOTE(review): a class with the same name is defined again in the next cell and
    replaces this one when both cells are run.
    """

    def respond(self, context):
        """Return the parent's response minus entries found in ``context["hotels_mentioned"]``.

        NOTE(review): relies on the parent class providing ``respond``; nothing in
        this notebook defines or calls it - confirm it exists on LLMChain.
        """
        already_mentioned = context.get("hotels_mentioned", [])
        candidates = super().respond(context)
        # Keep only the entries that have not come up before.
        return [hotel for hotel in candidates if hotel not in already_mentioned]


In [54]:
class HotelConversationChain(LLMChain):
    """LLM chain variant that hides hotels whose name was already mentioned.

    NOTE(review): this redefines the class from the previous cell; the only difference
    is that entries are compared by their "hotel_name" key. This later definition is
    the one in effect once both cells have run.
    """

    def respond(self, context):
        """Return the parent's response minus hotels named in ``context["hotels_mentioned"]``.

        NOTE(review): relies on the parent class providing ``respond``; nothing in
        this notebook defines or calls it - confirm it exists on LLMChain.
        """
        already_mentioned = context.get("hotels_mentioned", [])
        candidates = super().respond(context)
        # Each entry is indexed by "hotel_name" to decide whether it was seen before.
        return [hotel for hotel in candidates if hotel["hotel_name"] not in already_mentioned]


In [58]:
def format_history(chat_history):
    """Render the chat history as plain text, one quoted line per turn.

    Each turn must expose ``type`` and ``content``. Turns whose type is "human" are
    labelled "Human"; every other turn is labelled "Assistant (you)".

    Returns:
        The concatenated lines, or an empty string when the history is empty.
    """
    rendered_turns = []
    for turn in chat_history:
        speaker = "Human" if turn.type == "human" else "Assistant (you)"
        rendered_turns.append(speaker + ": '" + turn.content + "'\n")
    return "".join(rendered_turns)


In [59]:
from urllib.parse import quote_plus

from langchain.vectorstores.pgvector import PGVector, DistanceStrategy


class Document:
    """Minimal document wrapper exposing ``page_content`` and ``metadata``."""

    def __init__(self, content, metadata=None):
        self.page_content = content
        # Give each instance its own dict; a `{}` default in the signature would be
        # one shared object mutated by every Document.
        self.metadata = {} if metadata is None else metadata


# Wrap every hotel content string in a Document. This line must be active: without
# it `documents` is undefined and the cell fails on a fresh kernel.
documents = [Document(content) for content in hotel_embeddings["content"].tolist()]

# Build the connection URL from the password loaded in the connection cell instead of
# hardcoding the credential here. quote_plus escapes characters such as '#' or '@'
# that would otherwise be ambiguous inside a URL.
# NOTE(review): assumes password.txt holds the password for the "hackathon" user - confirm.
connection_string = f"postgresql://hackathon:{quote_plus(password)}@10.69.160.3:5432/vectordb"

# Create a PGVector store holding the documents and their embeddings.
# NOTE(review): from_documents appears to embed and insert the documents each time this
# cell runs; consider pre_delete_collection=True if re-runs produce duplicate hits - confirm.
db = PGVector.from_documents(
    documents=documents,
    embedding=embeddings_service,
    collection_name="htlembeddings",
    distance_strategy=DistanceStrategy.COSINE,
    connection_string=connection_string,
)

# Retriever returning the 3 closest documents for a query.
retriever = db.as_retriever(search_kwargs={"k": 3})


In [60]:
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain

# Conversational RAG chain: condense the follow-up question, retrieve matching hotel
# documents, then answer with the stuffed-documents chain. Chat history is stored in
# `memory` and rendered for the prompts by `format_history`.
conversation = ConversationalRetrievalChain(
    combine_docs_chain=chain,
    # Reuse the retriever built alongside the vector store so `k` is set in one place.
    retriever=retriever,
    question_generator=condense_question_chain,
    memory=memory,
    get_chat_history=format_history,
    #response_if_no_docs_found="I'm sorry, but I don't have information on your request at the moment."
)

In [None]:
# Interactive chat loop. Type "exit" or "quit" (or interrupt the kernel) to stop;
# without an exit path the cell could only be ended by interrupting it.
EXIT_COMMANDS = {"exit", "quit"}
while True:
    try:
        q = input("\nYou: ").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if not q:
        # Nothing was typed: ask again instead of sending an empty question to the model.
        continue
    if q.lower() in EXIT_COMMANDS:
        break
    answer = conversation.invoke({"question": q})
    print("\nHotel_RAGbot: ", answer["answer"])



You:  hotels near airport in Atlanta with pools



Hotel_RAGbot:  Yes, there are a few hotels near the Atlanta airport that have pools. Here are a few options:

* **James and Sons** (1371-James and Sons-Group-9716 Linda Knolls Suite 610-fort-Atlanta-United States): This hotel offers a rural retreat with cozy accommodations. It has a pool and is located near the Atlanta airport.
* **Bass and Sons** (819-Bass and Sons-LLC-641 Shawn Crescent Apt. 814-bury-Atlanta-United States): This hotel offers a urban retreat with cozy accommodations. It has a pool and is located near the Atlanta airport.

Both of these hotels are located within a short distance of the Atlanta airport and offer a variety of amenities, including a pool. They are also both highly rated by guests.

If you have any other questions, please let me know.



You:  do they have gym



Hotel_RAGbot:  Yes, both the James and Sons and Bass and Sons hotels near the Atlanta airport have gyms. This information is available on their websites.

* **James and Sons** (1371-James and Sons-Group-9716 Linda Knolls Suite 610-fort-Atlanta-United States): This hotel offers a rural retreat with cozy accommodations. It has a gym and a pool, and is located near the Atlanta airport.
* **Bass and Sons** (819-Bass and Sons-LLC-641 Shawn Crescent Apt. 814-bury-Atlanta-United States): This hotel offers a urban retreat with cozy accommodations. It has a gym and a pool, and is located near the Atlanta airport.

Both of these hotels are located within a short distance of the Atlanta airport and offer a variety of amenities, including a gym and a pool. They are also both highly rated by guests.

If you have any other questions, please let me know.



You:  do they have restuarant



Hotel_RAGbot:  I'm sorry, but I can't answer that question. The information provided in the chat history does not mention whether either of the hotels have a restaurant. 

Would you like me to search for hotels near the Atlanta airport with a restaurant? 



You:  yes



Hotel_RAGbot:  I'm sorry, but I can't answer that question. The information provided in the chat history does not mention whether either of the hotels have a restaurant. 

Would you like me to search for hotels near the Atlanta airport with a restaurant? 



You:  hotels near the Atlanta airport with a restaurant



Hotel_RAGbot:  I'm sorry, but I can't answer that question. The information provided in the chat history does not mention whether either of the hotels have a restaurant. 

Would you like me to search for hotels near the Atlanta airport with a restaurant? 



You:  hotels in Atlanta near airport with a restaurant



Hotel_RAGbot:  ## Hotels in Atlanta near the airport with a restaurant:

* **Garza, Parker and Martin** (1147-Garza, Parker and Martin-PLC-311 Matthew Summit Suite 496-mouth-Atlanta-United States): This hotel offers a urban retreat with cozy accommodations. It has a restaurant and is located near the Atlanta airport.
* **Vargas Ltd** (191-Vargas Ltd-LLC-01481 Aaron Light Suite 739-haven-Atlanta-United States): This hotel offers a rural retreat with cozy accommodations. It has a restaurant and is located near the Atlanta airport.

Both of these hotels are located within a short distance of the Atlanta airport and offer a variety of amenities, including a restaurant. They are also both highly rated by guests.

**Additional Information:**

* **Garza, Parker and Martin** also has a gym and a pool.
* **Vargas Ltd** also has a gym.

**Please note:** I do not have information on parking availability at these hotels. 

