In [None]:
# !pip install -U langchain langchain-community langchainhub openai chromadb bs4

In [None]:
# !pip install -U gpt4all

In [None]:
# pip install tiktoken

In [None]:
# %pip install gpt4all > /dev/null

In [6]:
# !pip install huggingface_hub

In [95]:
# !pip install faiss-gpu

In [100]:
from getpass import getpass
import os
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub

## Setting up HuggingFace Hub
https://python.langchain.com/docs/integrations/llms/huggingface_hub

Get a token: https://huggingface.co/docs/api-inference/quicktour#get-your-api-token

In [1]:
# Ask for the Hugging Face token without echoing it. An explicit prompt is
# passed because getpass() otherwise shows the generic "Password: " label,
# which is misleading when an API token is expected.
HUGGINGFACEHUB_API_TOKEN = getpass("Hugging Face Hub API token: ")

········


In [2]:
# Export the token as an environment variable. The HuggingFaceHub LLMs in this
# notebook are constructed without an explicit token, so they presumably read
# it from here (see the integration docs linked above) -- TODO confirm.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

### Testing with an LLM
A quick smoke test to make sure that the Hugging Face API works.

In [4]:
# Sample question for the smoke test of the hosted model.
question = "Who won the FIFA World Cup in the year 1994? "

# Step-by-step answer prompt; `{question}` is substituted when the chain runs.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)

prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)

In [5]:
# Display the template object to confirm the `question` input variable is set.
prompt

PromptTemplate(input_variables=['question'], template="Question: {question}\n\nAnswer: Let's think step by step.")

In [97]:
# Smoke test: push one prompt through the hosted model to confirm that the
# token and the Hugging Face API work. The answer itself is not fact-checked.
repo_id = "google/flan-t5-xxl"

llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 64}
)
llm_chain = LLMChain(prompt=prompt, llm=llm)

# `Chain.run` is deprecated; `invoke` takes the prompt variables as a dict and
# returns a dict whose "text" entry holds the generated answer.
print(llm_chain.invoke({"question": question})["text"])



The 1994 FIFA World Cup was won by France. France is a country located in Europe. The answer: France.


## Setting up for Q&A
https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa

### Load

Example

In [8]:
# Restrict parsing to the post title, header, and content elements of the
# page's HTML.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content"),
)
loader = WebBaseLoader(
    bs_kwargs=dict(parse_only=bs4_strainer),
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
)
docs = loader.load()

In [9]:
len(docs[0].page_content)  # number of characters in the loaded page text

42824

In [12]:
# Number of Document objects the loader returned for the example page.
len(docs)

1

#### UCSD GIS Data page

In [13]:
# Keep only the elements whose CSS class is "s-lib-box-content" from the full
# HTML. Note that this rebinds `docs`, replacing the example page loaded above.
bs4_strainer = bs4.SoupStrainer(class_="s-lib-box-content")
loader = WebBaseLoader(
    web_paths=("https://ucsd.libguides.com/gis/gisdata",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

In [14]:
len(docs[0].page_content)  # number of characters in the GIS page text

5321

In [16]:
# Print the loaded page text to check what the strainer kept.
print(docs[0].page_content)


Browse the GIS data holdings on X drive

UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)
The data is organized geographically, roughly by continent, with topical data arrangement inside each folder.
Remember, if you are looking for data on a smaller area of geography, be sure to check the folder with the larger geography first.  An example would be if you are looking for data for only one of the states in the United States, you would want to look in the United States Data folder as well as the North America data folder.
VPN (Cisco AnyConnect Client)Download VPN software on your computer to get off-campus access for everything you'd get using the proxy. If you're having prob

The GIS data page is now loaded into `docs`.

### Split

We’ll split our documents into chunks of 1000 characters with 200 characters of overlap between chunks. The overlap helps mitigate the possibility of separating a statement from important context related to it.

In [18]:
# Chunks of up to 1000 characters with a 200-character overlap between
# neighbours. `add_start_index` is enabled; the chunk metadata shown later
# carries a `start_index` entry.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(docs)

In [20]:
len(all_splits)  # total number of chunks produced by the splitter

10

In [21]:
# Length in characters of the first chunk; a chunk can be far shorter than
# the configured chunk_size.
len(all_splits[0].page_content)

39

In [44]:
# Inspect the second chunk, including its `source` and `start_index` metadata.
all_splits[1]

Document(page_content="UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area of geography, be sure to check the folder with the larger geography first.\xa0 An example would be if you are looking for data for only one of the states in the United States, you would want to look in the United States Data folder as well as the North America data folder.", metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 42})

### Store
https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub

In [82]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Chroma, FAISS

# Embedding client for the Hugging Face Inference API, authenticated with the
# token collected at the top of the notebook.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    api_key=HUGGINGFACEHUB_API_TOKEN,
)

# Build the FAISS store from the chunks. The Chroma equivalent is kept for
# reference:
#   vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)
vectorstore = FAISS.from_documents(embedding=embeddings, documents=all_splits)


In [83]:
# Show the store object to confirm it was created.
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x7fa6f1b04880>

### Retrieve relevant documents

Options:
1. Chroma (example from LangChain)
2. FAISS (Facebook AI Similarity Search) - used here because it provides a similarity-score function


In [84]:
# Retrieve relevant documents given a query, i.e. a question.
# `k=2` asks for two documents per lookup (checked a few cells below).
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

In [90]:
# Question used for the retrieval checks and for the final RAG answer.
query = "What is X drive?"

In [91]:
# Fetch the chunks most similar to the query. `invoke` is the Runnable entry
# point (the same interface the RAG chain relies on when it pipes this
# retriever) and replaces the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke(query)

In [92]:
len(retrieved_docs)  # should equal the `k` passed in `search_kwargs` above

2

In [93]:
# Text of the second retrieved chunk (index 1).
print(retrieved_docs[1].page_content)

UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)
The data is organized geographically, roughly by continent, with topical data arrangement inside each folder.
Remember, if you are looking for data on a smaller area of geography, be sure to check the folder with the larger geography first.  An example would be if you are looking for data for only one of the states in the United States, you would want to look in the United States Data folder as well as the North America data folder.


#### To view scores

Note that **the returned distance score is L2 distance. Therefore, a lower score is better.**

In [94]:
# Query the store directly to get (Document, score) pairs. Per the note above,
# the score is an L2 distance, so smaller means a closer match.
vectorstore.similarity_search_with_score(query)

[(Document(page_content='Browse the GIS data holdings on X drive', metadata={'source': 'https://ucsd.libguides.com/gis/gisdata', 'start_index': 1}),
  0.84911835),
 (Document(page_content="UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab. You must be connected to UCSD VPN in order to browse. This is for BROWSING ONLY. You CANNOT DOWNLOAD any files. You must visit the Lab to access files until we are able to publish the data into an online portal (currently in development)\nThe data is organized geographically, roughly by continent, with topical data arrangement inside each folder.\nRemember, if you are looking for data on a smaller area of geography, be sure to check the folder with the larger geography first.\xa0 An example would be if you are looking for data for only one of the states in the United States, you would want to look in the United States Data folder as well as the North America data folder.", me

### Generate

In [112]:
# Model for the answer-generation step: same repo as the smoke test, but with
# temperature 0.1 and max_length 100 instead of 0.5 and 64.
repo_id = "google/flan-t5-xxl"

llm = HuggingFaceHub(
    model_kwargs=dict(temperature=0.1, max_length=100),
    repo_id=repo_id,
)



In [113]:
# Pull the "rlm/rag-prompt" template from the LangChain hub.
# NOTE: this rebinds `prompt`, replacing the PromptTemplate built for the
# earlier smoke test.
prompt = hub.pull("rlm/rag-prompt")

In [114]:
# Render the prompt with placeholder values to see the exact message it
# produces before wiring it into the chain.
example_messages = prompt.invoke(
    dict(context="filler context", question="filler question")
).to_messages()
example_messages

[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: filler question \nContext: filler context \nAnswer:")]

In [115]:
# Print the message content so the template's line breaks are rendered.
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


In [116]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line; an empty string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Assemble the RAG pipeline; each `|` feeds one stage's output into the next.
rag_chain = (
    # "context": the retriever's results piped through format_docs into one
    # string; "question": the chain input, presumably forwarded unchanged by
    # RunnablePassthrough.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt  # fill the RAG prompt with context and question
    | llm  # generate the answer with the hosted model
    | StrOutputParser()  # convert the model output to a plain string
)

In [117]:
# Run the chain on the query and print each streamed piece as it arrives,
# without inserting extra newlines between pieces.
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)

UC San Diego affiliate can browse the Library's geospatial data holdings that are hosted on the X drive in our Data & GIS Lab.