**Install Dependencies**

In [None]:
!pip install langchain-community
!pip install langchain
!pip install sentence-transformers
!pip install pdfplumber
!pip install chromadb
!pip install tiktoken

**Import Necessary Libraries**

In [2]:
import torch
from transformers import AutoTokenizer,pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders.pdf import PDFPlumberLoader
from langchain.text_splitter import CharacterTextSplitter,TokenTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

**Create Pipeline for Embedding and Text-Generation**

In [3]:
def create_sbert_mpnet():
    """Build the embedding model used to vectorise text.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-mpnet-base-v2``, configured for ``"cuda"``
        when ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": target_device},
    )

def create_flan_t5_base(load_in_8bit, model_name="google/flan-t5-xl", max_new_tokens=100):
    """Create a Hugging Face ``text2text-generation`` pipeline for FLAN-T5.

    NOTE: despite the function name, the default checkpoint is
    ``google/flan-t5-xl``. The name and default are kept so existing callers
    behave exactly as before; pass ``model_name`` to load another checkpoint.

    Args:
        load_in_8bit: Forwarded to the model loader as the ``load_in_8bit``
            entry of ``model_kwargs``.
        model_name: Model identifier used for both the tokenizer and the model.
        max_new_tokens: Upper bound on generated tokens, passed to the pipeline.

    Returns:
        The pipeline object returned by ``transformers.pipeline``. It is a
        plain HF pipeline; wrap it (e.g. with ``HuggingFacePipeline``) before
        using it as a LangChain LLM.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline(
        task="text2text-generation",
        model=model_name,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        device_map="auto",
        # NOTE(review): `temperature` is passed through model_kwargs and no
        # sampling flag is set here, so it presumably does not influence
        # decoding -- confirm before relying on it.
        model_kwargs={"load_in_8bit": load_in_8bit, "temperature": 0.95},
    )

# Instantiate both models once; later cells reuse `embedding` to build the
# vector store and `llm` to generate answers.
embedding = create_sbert_mpnet()
# load_in_8bit=False is forwarded to the model loader unchanged.
llm = create_flan_t5_base(load_in_8bit=False)

  return HuggingFaceEmbeddings(model_name=model, model_kwargs={"device": device})
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

**Use LLM to Ask a Question without RAG**

In [4]:
# Baseline: send the question straight to the generation pipeline, with no
# retrieved context. `_question` is reused by the RAG cells further down.
_question = "What is Grant of Patent in ArangoDB License?"

prompt = f"Answer the following question. \n{_question}"
res = llm(prompt)
print(res)



[{'generated_text': 'Grant of Patent in ArangoDB License is a clause in the license that allows the holder of the patent to assert the patent in the United States.'}]


**Use a PDF file as an external knowledge source and save it using the Chroma vector DB**

In [5]:
# Load the PDF (referenced by URL) into LangChain documents.
pdf_path = "https://arangodb.com/wp-content/uploads/2023/09/Download-Contributor-License-Agreement.pdf"
loader = PDFPlumberLoader(pdf_path)
documents = loader.load()

# Split the documents into token-based chunks: 200 tokens each, with a
# 10-token overlap between neighbouring chunks.
text_splitter = TokenTextSplitter(
    chunk_size=200,
    chunk_overlap=10,
)
texts = text_splitter.split_documents(documents)

# Embed every chunk and store it in a Chroma collection under ./chromadb.
persist_directory = "./chromadb"
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

**Print Chunks of extracted texts**

In [6]:
# Report how many chunks the splitter produced, then show the first four,
# each followed by a separator line.
print(len(texts))

separator = "##**##" * 20
for chunk in texts[:4]:
    print(chunk)
    print(separator)

11
page_content='ArangoDB GmbH
Individual Contributor License Agreement ("Agreement")
http://www.arangodb.com
Thank you for your interest in ArangoDB GmbH ("ArangoDB"). In order to
clarify the intellectual property license granted with Contributions
from any person or entity, ArangoDB must have a Contributor License
Agreement ("CLA") on file that has been signed by each Contributor,
indicating agreement to the license terms below. This license is for
your protection as a Contributor as well as the protection of ArangoDB
and its users; it does not change your rights to use your own
Contributions for any other purpose.
If you have not already done so, please complete and sign, then scan
and email a pdf file of this Agreement to cla@arangodb.com.
Alternatively, you may send it by facsimile to ArangoDB at +49-221' metadata={'source': 'https://arangodb.com/wp-content/uploads/2023/09/Download-Contributor-License-Agreement.pdf', 'file_path': 'https://arangodb.com/wp-content/uploads/2023/09/Do

**Print examples of data saved in the VectorDB**

In [7]:
# Fetch the stored records with their embeddings and metadata, then print the
# first `num` entries of each field.
vdb = vectordb.get(include=["embeddings", "documents", "metadatas"])
num = 2
for field in ("documents", "embeddings", "metadatas"):
    print(vdb[field][:num])

['ArangoDB GmbH\nIndividual Contributor License Agreement ("Agreement")\nhttp://www.arangodb.com\nThank you for your interest in ArangoDB GmbH ("ArangoDB"). In order to\nclarify the intellectual property license granted with Contributions\nfrom any person or entity, ArangoDB must have a Contributor License\nAgreement ("CLA") on file that has been signed by each Contributor,\nindicating agreement to the license terms below. This license is for\nyour protection as a Contributor as well as the protection of ArangoDB\nand its users; it does not change your rights to use your own\nContributions for any other purpose.\nIf you have not already done so, please complete and sign, then scan\nand email a pdf file of this Agreement to cla@arangodb.com.\nAlternatively, you may send it by facsimile to ArangoDB at +49-221', 'ile to ArangoDB at +49-221-\n2722999-88. If necessary, send an original signed Agreement to\nArangoDB GmbH, Hohenstaufenring 43-45, 50674 Cologne, Germany\nPlease read this docum

**Create RAG Framework**

In [8]:
# Prompt layout for the FLAN model: retrieved context first, then the
# question, then an "answer:" cue for the model to complete.
question_t5_template = """
context: {context}
question: {question}
answer:
"""
QUESTION_T5_PROMPT = PromptTemplate(
    template=question_t5_template, input_variables=["context", "question"]
)

# Wrap the transformers pipeline so LangChain can use it as an LLM.
hf_llm = HuggingFacePipeline(pipeline=llm)

# Retriever over the Chroma store that returns only the single top match (k=1).
retriever = vectordb.as_retriever(search_kwargs={"k": 1})

# Build the "stuff" QA chain with the custom prompt supplied at construction
# time (via chain_type_kwargs) instead of patching
# `qa.combine_documents_chain.llm_chain.prompt` afterwards.
qa = RetrievalQA.from_chain_type(
    llm=hf_llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": QUESTION_T5_PROMPT},
)

  hf_llm = HuggingFacePipeline(pipeline=llm)


**Run RAG Framework**

In [9]:
# Log the combine-documents chain's start/finish while it runs.
qa.combine_documents_chain.verbose = True
# Include the retrieved chunks in the result under 'source_documents'.
qa.return_source_documents = True
# Run the chain with .invoke(); calling the chain object directly is deprecated.
qa.invoke({"query": _question})


  qa({"query":_question,})




[1m> Entering new StuffDocumentsChain chain...[0m

[1m> Finished chain.[0m


{'query': 'What is Grant of Patent in ArangoDB License?',
 'result': 'a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work',
 'source_documents': [Document(metadata={'Author': 'Claudius Weinberger', 'CreationDate': "D:20151116212836Z00'00'", 'Creator': 'Word', 'Keywords': '', 'ModDate': "D:20151116212836Z00'00'", 'Producer': 'Mac OS X 10.11.1 Quartz PDFContext', 'Subject': '', 'Title': 'Ohne Titel', 'file_path': 'https://arangodb.com/wp-content/uploads/2023/09/Download-Contributor-License-Agreement.pdf', 'page': 1, 'source': 'https://arangodb.com/wp-content/uploads/2023/09/Download-Contributor-License-Agreement.pdf', 'total_pages': 4}, page_content='grant to ArangoDB and to recipients of software distributed by\nArangoDB a perpetual, worldwide, non-exclusive, no-charge,\nroyalty-free, irrevocable (except as stated in this section

**Run the Retriever for the question**

In [10]:
# Show what the retriever returns for the question. `.invoke()` replaces the
# deprecated `get_relevant_documents()`.
retriever.invoke(_question)

  retriever.get_relevant_documents(_question)


[Document(metadata={'Author': 'Claudius Weinberger', 'CreationDate': "D:20151116212836Z00'00'", 'Creator': 'Word', 'Keywords': '', 'ModDate': "D:20151116212836Z00'00'", 'Producer': 'Mac OS X 10.11.1 Quartz PDFContext', 'Subject': '', 'Title': 'Ohne Titel', 'file_path': 'https://arangodb.com/wp-content/uploads/2023/09/Download-Contributor-License-Agreement.pdf', 'page': 1, 'source': 'https://arangodb.com/wp-content/uploads/2023/09/Download-Contributor-License-Agreement.pdf', 'total_pages': 4}, page_content='grant to ArangoDB and to recipients of software distributed by\nArangoDB a perpetual, worldwide, non-exclusive, no-charge,\nroyalty-free, irrevocable (except as stated in this section)\npatent license to make, have made, use, offer to sell, sell,\nimport, and otherwise transfer the Work, where such license\napplies only to those patent claims licensable by You that are\nnecessarily infringed by Your Contribution(s) alone or by\ncombination of Your Contribution(s) with the Work to whi

In [11]:
# Fetch the retriever's matches for the question; `ret` is reused below.
ret = retriever.invoke(_question)
# Display the input dict the combine-documents chain builds from those
# matches. `_get_inputs` is a private helper, called here for inspection only.
qa.combine_documents_chain._get_inputs(ret)

{'context': 'grant to ArangoDB and to recipients of software distributed by\nArangoDB a perpetual, worldwide, non-exclusive, no-charge,\nroyalty-free, irrevocable (except as stated in this section)\npatent license to make, have made, use, offer to sell, sell,\nimport, and otherwise transfer the Work, where such license\napplies only to those patent claims licensable by You that are\nnecessarily infringed by Your Contribution(s) alone or by\ncombination of Your Contribution(s) with the Work to which such\nContribution(s) was submitted. If any entity institutes patent\nlitigation against You or any other entity (including a cross-\nclaim or counterclaim in a lawsuit) alleging that your\nContribution, or the Work to which you have contributed,\nconstitutes direct or contributory patent infringement, then any\npatent licenses granted to that entity under this Agreement for'}

**Add Context to the Prompt**

In [12]:
# Render the prompt template with the first retrieved chunk as context, to
# show the exact text that is handed to the model.
QUESTION_T5_PROMPT.format(
    context=ret[0].page_content,
    question=_question,
)

'\ncontext: grant to ArangoDB and to recipients of software distributed by\nArangoDB a perpetual, worldwide, non-exclusive, no-charge,\nroyalty-free, irrevocable (except as stated in this section)\npatent license to make, have made, use, offer to sell, sell,\nimport, and otherwise transfer the Work, where such license\napplies only to those patent claims licensable by You that are\nnecessarily infringed by Your Contribution(s) alone or by\ncombination of Your Contribution(s) with the Work to which such\nContribution(s) was submitted. If any entity institutes patent\nlitigation against You or any other entity (including a cross-\nclaim or counterclaim in a lawsuit) alleging that your\nContribution, or the Work to which you have contributed,\nconstitutes direct or contributory patent infringement, then any\npatent licenses granted to that entity under this Agreement for\nquestion: What is Grant of Patent in ArangoDB License?\nanswer:\n'