In [1]:
# Uncomment to install the notebook's dependencies (run once, then restart the kernel).
# %pip install --upgrade langchain langchain_google_vertexai
# %pip install --upgrade --quiet google-cloud-storage
# %pip install --user --quiet unstructured pdf2image==1.16.3 pytesseract==0.3.10 pdfminer.six==20221105
# %pip install pypdf

In [1]:
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.vectorstores import BigQueryVectorSearch
from langchain.document_loaders import GCSFileLoader
# --- Notebook configuration -------------------------------------------------
# Placeholders: replace with your own project, bucket and object names.
PROJECT_ID = "project-id"
GCS_BUCKET_DOCS = "bucket-name"
PDF_BLOB = "newyork-city-tourism.pdf"

# Embedding client shared by the vector store for both indexing and querying.
embedding = VertexAIEmbeddings(
    project=PROJECT_ID,
    model_name="textembedding-gecko@latest",
)

In [2]:
from langchain_community.document_loaders import PyPDFLoader

def load_pdf(file_path):
    """Build a PyPDFLoader for the file at ``file_path``.

    Passed as ``loader_func`` to ``GCSFileLoader`` below.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        A ``PyPDFLoader`` bound to ``file_path``; nothing is read until its
        ``load()`` method is called.
    """
    pdf_loader = PyPDFLoader(file_path)
    return pdf_loader


# Fetch the PDF blob from the bucket and parse it with the loader built by load_pdf.
loader = GCSFileLoader(
    project_name=PROJECT_ID,
    bucket=GCS_BUCKET_DOCS,
    blob=PDF_BLOB,
    loader_func=load_pdf,
)
documents = loader.load()

  warn_deprecated(


In [3]:
# Reduce each document's metadata to two keys:
#   "source"        -> everything before the last "/" of the loader-reported URI
#   "document_name" -> the file name after the last "/"
#
# The previous version rebuilt "source" from GCS_BUCKET_DOCS and a fixed slice
# of the path. With a bare bucket name (no "gs://" prefix, no sub-folders) that
# always produced "bucket-name/": the scheme was lost and a dangling slash added.
# NOTE(review): assumes GCSFileLoader reports "source" as gs://<bucket>/<blob>;
# confirm against the installed langchain version.
for document in documents:
    doc_md = document.metadata
    if "document_name" in doc_md:
        # Already normalised by an earlier run of this cell. Splitting the
        # rewritten "source" a second time would yield a wrong document name,
        # so leave the document untouched to keep the cell re-runnable.
        continue
    # rpartition splits on the LAST "/", so nested folders stay in `source`.
    source, _, document_name = doc_md["source"].rpartition("/")
    document.metadata = {"source": source, "document_name": document_name}

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

# of documents loaded (pre-chunking) = 8


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into ~1000-character chunks with a 50-character
# overlap, trying the coarsest separator first (paragraph, line, sentence, ...).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_overlap=50,
    chunk_size=1000,
)
doc_splits = text_splitter.split_documents(documents)

# Record each chunk's position in the overall list (numbering runs across all
# documents, it does not restart per document).
for chunk_number, chunk_doc in enumerate(doc_splits):
    chunk_doc.metadata["chunk"] = chunk_number

print(f"# of documents = {len(doc_splits)}")

# of documents = 26


In [5]:
DATASET = "vector_store_exp"
TABLE = "nyc_tourism"

# Connection settings for the BigQuery-backed vector store.
vector_store_settings = dict(
    project_id=PROJECT_ID,
    dataset_name=DATASET,
    table_name=TABLE,
    location="US",
    embedding=embedding,
)
new_york_tourism = BigQueryVectorSearch(**vector_store_settings)

# Embed and insert every chunk; the returned list of row ids is the cell output.
# NOTE(review): re-running this cell presumably inserts the chunks again; confirm
# before executing it more than once against the same table.
new_york_tourism.add_documents(doc_splits)

  warn_deprecated(


['9728ea159ab7488e88b862f2dfc15bf2',
 'e34fd03ff5c145b19b9f0450271d5c87',
 'e260d11c2db5456cabb9c8e56bf09697',
 '08129d411a4748c5b1a8fcef5f367999',
 '914dbd0084a549b5a5bd095132417dff',
 '6a2ee14133884fe29134aeb6f8188081',
 'e999006f71d644c59f1254f5e9314fbc',
 'c4ffe7a877b8414aba081aa1a8cf2800',
 '953d180cafd3487389d7016805b1455d',
 '076b803cbf0a48919de91b74f8a5065c',
 'dfdd9746493749d39bb8b2b27ce5e47a',
 'd94349646e5d46438021017bfeee36fa',
 '5e13e7e97be943ceb96bcbbc1342b7bd',
 '44f38f807bd2494499cc23e380bac584',
 'f22e12f0c8c1465f9adc5c56051aee69',
 'bfee29ee79554dd0bf1a57d3b9a84a68',
 '3c57a8af105b40d5beb3683bc3e18e2e',
 '8eb67eb3d8fe4714bb395fe3bd9f67e0',
 'd510543c57f9484e802d5cdc39f84c41',
 '11499d03d025412c8861a364a381cd61',
 '9fc28861267c4de689566e71980d0da2',
 '7db26dfa22814fceb6c1498335cd9e12',
 '4a47ec65fa8f4e758514608b8cb865f7',
 '6a41e89a154a4207bd71fe073e71094b',
 'eacc9ec59d97494293e14d9c530095f4',
 '0f2e6898655f40d69649d7fd3fff686d']

In [7]:
# Sanity-check the index with a plain similarity search (no LLM involved).
query = "What are some of the attractions one can find in Central Park, New York City?"

new_york_tourism.similarity_search(query)

[Document(page_content='CENTRAL PARK ★\u2009★\u2009★\nWelcome to the lungs of New \nYork City, located between \nUpper East Side  and Upper West \nSide . This vast urban oasis more \nthan 4 km long by 1 km wide is a \nfavourite destination for locals \nand attracts millions of visitors \nevery year. Attractions include \na number of playgrounds for \nchildren, a carousel, a zoo, plazas, \ngardens, rolling meadows, lakes, \nfountains, terraces, a skating \nrink, wide promenades, a running \ntrack and walking paths, and \nmuch more.\nWWW.CENTRALPARKNYC.ORG\nUPPER WEST SIDE ★\u2009★\u2009\nAMERICAN MUSEUM OF \nNATURAL HISTORY ★\u2009★\u2009★\nThe American Museum of \nNatural History is one of the \nlargest museums in the world, \nand is internationally known \nfor its research. The museum \ncollections contain over 32 \nmillion specimens of plants, \nhumans, animals, fossils, \nminerals, rocks, meteorites, \nand human cultural artifacts. \nBe sure to visit the exceptional \nCultural Halls

In [8]:
from langchain_google_vertexai import VertexAI
from langchain.chains import RetrievalQA

# Text-generation model used by all the chains below.
llm = VertexAI(model_name="gemini-pro")

# Default retriever over the vector store (no custom search settings yet).
retriever = new_york_tourism.as_retriever()

In [9]:
search_query = "What days and times is the American Museum of Natural History open to the public?"

# "stuff" chain: all retrieved chunks are placed into a single prompt.
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
results = retrieval_qa.invoke(search_query)

# Print the answer between star rules, then each supporting chunk.
star_rule = "*" * 79
dash_rule = "-" * 79
print(star_rule)
print(results["result"])
print(star_rule)
for source_doc in results["source_documents"]:
    print(dash_rule)
    print(source_doc.page_content)

*******************************************************************************
The American Museum of Natural History is open from Wednesday to Sunday, from 10 am to 5:30 pm. It is closed on Thanksgiving and Christmas Day.
*******************************************************************************
-------------------------------------------------------------------------------
CENTRAL PARK ★ ★ ★
Welcome to the lungs of New 
York City, located between 
Upper East Side  and Upper West 
Side . This vast urban oasis more 
than 4 km long by 1 km wide is a 
favourite destination for locals 
and attracts millions of visitors 
every year. Attractions include 
a number of playgrounds for 
children, a carousel, a zoo, plazas, 
gardens, rolling meadows, lakes, 
fountains, terraces, a skating 
rink, wide promenades, a running 
track and walking paths, and 
much more.
WWW.CENTRALPARKNYC.ORG
UPPER WEST SIDE ★ ★ 
AMERICAN MUSEUM OF 
NATURAL HISTORY ★ ★ ★
The American Museum of 
Natural History is

In [10]:
from langchain.chains import RetrievalQAWithSourcesChain

search_query = """
        Which New York City neighborhood is known for its affluent history and stylish mansions along 5th Avenue?
        """
# Same "stuff" strategy as above, but the chain also reports which sources it used.
retrieval_qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever
)

# Use .invoke() rather than calling the chain object directly: direct calls are
# deprecated in LangChain and emit a deprecation warning. return_only_outputs=True
# keeps the input question out of the returned dict, exactly as before.
retrieval_qa_with_sources.invoke({"question": search_query}, return_only_outputs=True)

  warn_deprecated(


{'answer': 'Upper East Side', 'sources': ''}

In [11]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Buffer memory stores the running chat history under the "chat_history" key so
# that follow-up questions can refer back to earlier turns.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
conversational_retrieval = ConversationalRetrievalChain.from_llm(
    llm=llm, retriever=retriever, memory=memory
)

search_query = """What days and times is the American Museum of Natural History open to the public?"""

# .invoke() replaces the deprecated direct call on the chain object; the
# returned dict still carries the reply under "answer".
result = conversational_retrieval.invoke({"question": search_query})
print(result["answer"])

The American Museum of Natural History is open Wednesday-Sunday from 10 am to 5:30pm.


In [12]:
# Follow-up turn: the question only makes sense together with the chat history
# held in the chain's memory.
new_query = "What about Wednesday 7pm?"
# .invoke() replaces the deprecated direct call on the chain object.
result = conversational_retrieval.invoke({"question": new_query})
print(result["answer"])

The American Museum of Natural History closes at 5:30pm on Wednesdays.


In [14]:
# Unrelated question asked inside the same conversation.
# NOTE(review): the chain still has the museum turns in memory, which may
# influence how this question is interpreted; compare the answer with the
# memory-free RetrievalQAWithSourcesChain run on the same question.
search_query = """
        Which New York City neighborhood is known for its affluent history and stylish mansions along 5th Avenue?
        """
# .invoke() replaces the deprecated direct call on the chain object.
result = conversational_retrieval.invoke({"question": search_query})
print(result["answer"])

5th Avenue


In [15]:
from langchain.prompts import PromptTemplate

# Grounded-answer prompt. It has two placeholders:
#   {question} - the user's question (repeated before and after the context)
#   {context}  - the retrieved chunks, fenced by "=============" rules
template = """SYSTEM: You are an intelligent assistant helping the users with their questions on tourism.

Question: {question}

Strictly Use ONLY the following pieces of context to answer the question at the end. Think step-by-step and then answer.

Do not try to make up an answer:
 - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
 - If the context is empty, just say "I do not know the answer to that."

=============
{context}
=============

Question: {question}
Helpful Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

In [16]:
NUMBER_OF_RESULTS = 10
SEARCH_DISTANCE_THRESHOLD = 0.6

# Expose index to the retriever, this time with explicit search settings.
# NOTE(review): "search_distance" is passed through search_kwargs; confirm the
# vector store actually honours it.
retriever = new_york_tourism.as_retriever(
    search_type="similarity",
    search_kwargs=dict(
        k=NUMBER_OF_RESULTS,
        search_distance=SEARCH_DISTANCE_THRESHOLD,
    ),
)

# Custom prompt handed to the "stuff" documents chain.
qa_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

In [17]:
# Show the prompt template the QA chain will actually send to the LLM.
active_prompt = qa.combine_documents_chain.llm_chain.prompt
print(active_prompt.template)

SYSTEM: You are an intelligent assistant helping the users with their questions on tourism.

Question: {question}

Strictly Use ONLY the following pieces of context to answer the question at the end. Think step-by-step and then answer.

Do not try to make up an answer:
 - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
 - If the context is empty, just say "I do not know the answer to that."

{context}

Question: {question}
Helpful Answer:


In [18]:
# Switch on verbose tracing for the documents chain, its inner LLM chain and
# the LLM itself, so the fully formatted prompt is printed on every call.
stuff_chain = qa.combine_documents_chain
for traced_component in (stuff_chain, stuff_chain.llm_chain, stuff_chain.llm_chain.llm):
    traced_component.verbose = True

import textwrap


def formatter(result):
    """Print a QA chain result as a readable report.

    Args:
        result: Mapping with a "query" and a "result" entry. If it also has
            "source_documents", each one is printed as a numbered reference
            with whichever of the "score", "source" and "document_name"
            metadata entries it carries, followed by its wrapped content.

    Returns:
        None. The report is written to standard output.
    """
    dotted_rule = "." * 80
    dashed_rule = "-" * 80
    # Metadata key -> label shown in the report, in print order.
    metadata_labels = (
        ("score", "Matching Score"),
        ("source", "Document Source"),
        ("document_name", "Document Name"),
    )

    print(f"Query: {result['query']}")
    print(dotted_rule)
    if "source_documents" in result:
        for position, reference in enumerate(result["source_documents"]):
            print(dashed_rule)
            print(f"REFERENCE #{position}")
            print(dashed_rule)
            for key, label in metadata_labels:
                if key in reference.metadata:
                    print(f"{label}: {reference.metadata[key]}")
            print(dotted_rule)
            print(f"Content: \n{wrap(reference.page_content)}")
    print(dotted_rule)
    print(f"Response: {wrap(result['result'])}")
    print(dotted_rule)


def wrap(s):
    """Re-flow ``s`` into lines of at most 120 characters.

    Whitespace runs (including newlines) are collapsed by ``textwrap`` before
    wrapping. A single word longer than 120 characters is kept whole rather
    than split.

    Args:
        s: Text to wrap.

    Returns:
        The wrapped text as one string with lines joined by a newline; an
        empty string when ``s`` has no content.
    """
    return textwrap.fill(s, width=120, break_long_words=False)


def ask(query, qa=qa, k=NUMBER_OF_RESULTS, search_distance=SEARCH_DISTANCE_THRESHOLD):
    """Run ``query`` through a retrieval QA chain and print the formatted result.

    Args:
        query: Question to ask.
        qa: Chain to run. The default is bound when this function is defined,
            so rebinding the global ``qa`` later does not change it; re-run
            this cell or pass ``qa=`` explicitly.
        k: Value stored under ``"k"`` in the retriever's ``search_kwargs``.
        search_distance: Value stored under ``"search_distance"`` in the
            retriever's ``search_kwargs``.

    Returns:
        Whatever ``formatter`` returns for the chain's result.

    Note:
        The retriever's ``search_kwargs`` are changed in place, so the
        settings persist on ``qa.retriever`` after the call.
    """
    qa.retriever.search_kwargs["search_distance"] = search_distance
    qa.retriever.search_kwargs["k"] = k
    # .invoke() replaces the deprecated direct call on the chain object and
    # returns the same result dict.
    result = qa.invoke({"query": query})
    return formatter(result)

In [19]:
# Example run with the default result count and distance threshold.
rockefeller_question = "What can visitors see and do at Rockefeller Center in Midtown Manhattan?"
ask(rockefeller_question)




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSYSTEM: You are an intelligent assistant helping the users with their questions on tourism.

Question: What can visitors see and do at Rockefeller Center in Midtown Manhattan?

Strictly Use ONLY the following pieces of context to answer the question at the end. Think step-by-step and then answer.

Do not try to make up an answer:
 - If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
 - If the context is empty, just say "I do not know the answer to that."

City, make your way up to the 
70th floor to the T op of the Rock  
(daily from 10 am to 10 pm), 
where three floors of indoor and 
outdoor observation decks offer 
stunning panoramas of the city!
45 ROCKEFELLER PLAZA, NEW YORK 
212-332-6868
WWW.ROCKEFELLERCENTER.COM
© NYC - Rockefeller_Ce