<a href="https://colab.research.google.com/github/stigsfoot/datascience-2023/blob/main/%5BPalm_ChromaDB%5D_GCP_Generative_AI_Document_Retrieval_and_Question_Answering_with_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Palm-ChromaDB] Generative AI - Document Retrieval and Question Answering with LLMs


## Authenticate

In [None]:
# Colab-only step: import Colab's auth helper and authenticate the current user.
# NOTE(review): presumably this is what lets the later gcloud / Vertex AI calls
# run with the signed-in user's credentials - confirm against the Colab docs.
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
# Set the active project for the gcloud CLI.
# NOTE(review): the same project id is repeated as PROJECT_ID in the
# configuration cell further down; keep the two values in sync.
!gcloud config set project genai-sandbox

Updated property [core/project].


## Dependencies

In [None]:
!pip install google-cloud-aiplatform==1.25.0
!pip install langchain==0.0.187
!pip install xmltodict==0.13.0
!pip install unstructured==0.7.0 # used by langchain
!pip install pdf2image==1.16.3 #used by langchain
!pip install requests==2.31.0
!pip install beautifulsoup4==4.12.2
!pip install unstructured
!pip install chromadb
from langchain.vectorstores import Chroma


Collecting google-cloud-aiplatform==1.25.0
  Downloading google_cloud_aiplatform-1.25.0-py2.py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3 (from google-cloud-aiplatform==1.25.0)
  Downloading google_cloud_resource_manager-1.10.3-py2.py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.0/321.0 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting shapely<2.0.0 (from google-cloud-aiplatform==1.25.0)
  Downloading Shapely-1.8.5.post1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: shapely, google-cloud-resource-manager, google-cloud-aiplatform
  Attempting uninstall: shapely
    Found existing installation: shapely 2.0.1

Collecting langchain==0.0.187
  Downloading langchain-0.0.187-py3-none-any.whl (960 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m960.7/960.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain==0.0.187)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain==0.0.187)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic<2,>=1 (from langchain==0.0.187)
  Downloading pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.187)
  Downloading marshmallow-3

In [None]:
# Notebook-wide configuration.
# PROJECT_ID and REGION are passed to aiplatform.init in the next cell.
PROJECT_ID = 'genai-sandbox'
REGION = 'us-central1'
# NOTE(review): BUCKET, DIMENSIONS, DISPLAY_NAME and ENDPOINT are not referenced
# by any cell visible here - confirm whether they are still needed.
BUCKET = 'gs://noble-genai-workshops/embeddings'
DIMENSIONS=768
DISPLAY_NAME='palm-2-langchain-document-answering'
ENDPOINT='us-central1-aiplatform.googleapis.com'
# Model name handed to TextGenerationModel.from_pretrained in the prompt cell.
TEXT_GENERATION_MODEL='text-bison@001'

# Sitemap pages of the NRCS site; the sitemap-parsing cell assigns these same
# two values again before using them.
usda_sitemap_1='https://www.nrcs.usda.gov/sitemap.xml?page=1'
usda_sitemap_2='https://www.nrcs.usda.gov/sitemap.xml?page=2'

In [None]:
import os
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project and region from the
# configuration cell.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Documents
## Parse the sitemap

In [None]:
import requests
from bs4 import BeautifulSoup

def parse_sitemap(url, timeout=30):
    """Return every ``<loc>`` entry listed in one sitemap XML document.

    Args:
        url: Address of the sitemap XML to download.
        timeout: Seconds to wait for the server; forwarded to
            ``requests.get`` so a stalled connection cannot hang the cell.

    Returns:
        A list with the text of each ``<loc>`` element, in document order.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page into an
    # empty (and silently incomplete) URL list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "xml")
    return [element.text for element in soup.find_all("loc")]

def parse_sitemaps(urls):
    """Collect the entries of several sitemaps into one flat list.

    Args:
        urls: Iterable of sitemap addresses; each is handed to
            ``parse_sitemap``.

    Returns:
        One list holding every sitemap's entries, in the order the sitemaps
        were given.
    """
    return [
        page_url
        for sitemap_url in urls
        for page_url in parse_sitemap(sitemap_url)
    ]

# The two sitemap pages to crawl (same values as in the configuration cell).
usda_sitemap_1, usda_sitemap_2 = (
    'https://www.nrcs.usda.gov/sitemap.xml?page=1',
    'https://www.nrcs.usda.gov/sitemap.xml?page=2',
)
sitemap_urls = [usda_sitemap_1, usda_sitemap_2]

# Download both sitemaps and merge their entries into a single list.
sites = parse_sitemaps(sitemap_urls)


In [None]:
# Keep only the entries that do not start with 'http://default'.
sites_filtered = [url for url in sites if not url.startswith('http://default')]

# Preview the first few entries only; the full list holds thousands of URLs
# and would bury the rest of the notebook if displayed whole.
sites_filtered[:10]

['https://www.nrcs.usda.gov/',
 'https://www.nrcs.usda.gov/conservation-basics/conservation-by-state',
 'https://www.nrcs.usda.gov/contact/find-a-service-center',
 'https://www.nrcs.usda.gov/events',
 'https://www.nrcs.usda.gov/conservation-basics/conservation-by-state/alabama',
 'https://www.nrcs.usda.gov/conservation-basics/conservation-by-state/alaska',
 'https://www.nrcs.usda.gov/programs-initiatives/eqip-air-quality-initiative',
 'https://www.nrcs.usda.gov/programs-initiatives/watersmart',
 'https://www.nrcs.usda.gov/programs-initiatives/great-lakes-restoration-initiative',
 'https://www.nrcs.usda.gov/programs-initiatives/mississippi-river-basin-healthy-watersheds-initiative',
 'https://www.nrcs.usda.gov/programs-initiatives/national-water-quality-initiative',
 'https://www.nrcs.usda.gov/programs-initiatives/source-water-protection',
 'https://www.nrcs.usda.gov/programs-initiatives/lesser-prairie-chicken-initiative',
 'https://www.nrcs.usda.gov/programs-initiatives/longleaf-pine-i

In [None]:
# Number of URLs left after filtering.
len(sites_filtered)

5944

## Load page content using LangChain's UnstructuredURLLoader

In [None]:
# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' packages.
# NOTE(review): presumably required by unstructured when it partitions the
# pages loaded below - confirm.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# NOTE(review): unstructured is already installed by the Dependencies cell
# (pinned to 0.7.0), so this install looks redundant - confirm before removing.
!pip install unstructured -q

In [None]:
from langchain.document_loaders import UnstructuredURLLoader
# Smoke-test the loader on the first five page URLs before loading the whole
# site. The sample is taken from sites_filtered: sitemap_urls only holds the
# two sitemap XML addresses, not the pages they list.
small_sitemap_urls = sites_filtered[:5]
loader = UnstructuredURLLoader(urls=small_sitemap_urls)
documents = loader.load()

In [None]:
# sites_filtered already had the 'http://default' entries removed in the
# filtering cell, so it is copied as-is instead of being filtered again.
filtered_sitemap_urls = list(sites_filtered)

# Create the loader
loader = UnstructuredURLLoader(urls=filtered_sitemap_urls, mode='single')

# Load the documents (the loader logs an error for pages it cannot process)
documents = loader.load()

# Print the content of the first document to verify; guard against the case
# where nothing could be loaded so the cell does not die with an IndexError.
if documents:
    print(documents[0].page_content[:500])
else:
    print("No documents were loaded - check the loader errors above.")


ERROR:langchain.document_loaders.url:Error fetching or processing https://www.nrcs.usda.gov/resources/tech-tools/efh-2-software-version-2011, exeption: Invalid file. The FileType.UNK file type is not supported in partition.
ERROR:langchain.document_loaders.url:Error fetching or processing https://www.nrcs.usda.gov/resources/tech-tools/wintr-55-small-watershed-hydrology-version-20000, exeption: Invalid file. The FileType.UNK file type is not supported in partition.
ERROR:langchain.document_loaders.url:Error fetching or processing https://www.nrcs.usda.gov/resources/tech-tools/efh-2-software-version-2012, exeption: Invalid file. The FileType.UNK file type is not supported in partition.
ERROR:langchain.document_loaders.url:Error fetching or processing https://www.nrcs.usda.gov/resources/tech-tools/wintr-20-system-controllereditor-version-3301, exeption: Invalid file. The FileType.UNK file type is not supported in partition.


In [None]:
# Number of documents the loader returned.
len(documents)

## Chunking

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings; adjust them to suit the content being indexed.
chunk_size = 1000
chunk_overlap = 100

# Split on newlines using the size and overlap chosen above.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap)
document_chunks = text_splitter.split_documents(documents)

print(f"Number of documents: {len(documents)}")
print(f"Number of chunks: {len(document_chunks)}")

# Flatten every chunk into one string carrying its text and its 'source'
# metadata; from here on `document_chunks` holds plain strings.
document_chunks = [
    f"Context: {piece.page_content} Source: {piece.metadata['source']}"
    for piece in document_chunks
]




# Embeddings for documents



## Create embeddings for all document chunks

In [None]:
#!pip install shapely==1.7.1
#!pip install sentence_transformers

## ChromaDB Initialization

In [None]:
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformers
# model; it is handed to Chroma when the collection is built.
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [None]:
# Build the Chroma collection from the chunk strings produced in the Chunking
# section. `document_chunks` is a list of plain strings at this point (each
# one already embeds its "Source: ..." text), so from_texts is the matching
# constructor. The previous call passed `docs`, a name that is never defined.
db = Chroma.from_texts(texts=document_chunks, embedding=embedding_function)


In [None]:
# Sanity check: count the entries stored in the collection.
# NOTE(review): `_collection` has a leading underscore, i.e. it is a private
# attribute of the Chroma wrapper and may change between library versions.
document_count = db._collection.count()
print(f"There are {document_count} documents in the collection.")


In [None]:
# Quick retrieval check: run a similarity search for a fixed query and print
# the text of the first hit.
query = "What does NRCS mean?"
retrieved_docs = db.similarity_search(query)
print(retrieved_docs[0].page_content)


# Question and Answering with Palm model



In [None]:
# Example: "How can the CSP program help ranchers address challenges?"
# Example: What are some challenges that black farmers face in accessing agricultural resources and opportunities?

# Reads the question interactively; the value is used by the prompt cell below.
# NOTE(review): input() blocks a non-interactive "Run All" - consider a
# constant when the notebook must run unattended.
question = input("Please enter your question: ")

#### Prompt

In [None]:
from vertexai.preview.language_models import TextGenerationModel

# Look up the stored chunks most similar to the user's question.
chroma_docs = db.similarity_search(question)
query = question

# Join the retrieved chunk texts, one per line, to serve as prompt context.
chroma_context = "\n".join(doc.page_content for doc in chroma_docs)

prompt = f"""
Follow exactly those 3 steps:
1. Read the context below and aggregrate this data
Context : {chroma_context}
2. Answer the question using only this context
3. Show the source for your answers
User Question: {question}

If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
"""

# Send the assembled prompt to the configured text-generation model.
model = TextGenerationModel.from_pretrained(TEXT_GENERATION_MODEL)
response = model.predict(prompt, temperature=0.2, top_k=40, top_p=0.8, max_output_tokens=1024)

# Echo the question followed by the model's answer.
print(f"User Question:")
print(f"{question}\n")

print(f"Model Response:")
print(f"{response.text}\n")

# List the retrieved chunks that were used as context.
print(f"Context from ChromaDB:")
for idx, doc in enumerate(chroma_docs):
    print(f"Document {idx + 1}:")
    print(f"{doc.page_content}\n")


### Let's put the language model to work by summarizing the context

## Direct usage of ChromaDB (only to showcase the direct usage for document retrieval)

In [None]:
# Retrieval only: search ChromaDB with a fixed question, without calling the
# language model.
query = "What are some challenges that black farmers face in accessing agricultural resources and opportunities?"
chroma_docs = db.similarity_search(query)

# Show the question that was asked.
print(f"User Question:\n{query}\n")

# List the hits, numbered from 1.
print(f"Retrieved Documents from ChromaDB:")
for idx, doc in enumerate(chroma_docs, start=1):
    print(f"Document {idx}:")
    print(f"{doc.page_content}\n")

# Keep the combined text of all hits in a variable for later use.
response = "\n".join(doc.page_content for doc in chroma_docs)

# Show the combined text as well.
print(f"Aggregated Response:")
print(response)
