### 1. Installing required libraries

In [0]:
!pip install -U langchain-community
!pip install -q langchain=0.1.7
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu

### 2. Importing required libraries

In [0]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

### 3. Importing the external data source (Databricks Dolly dataset)
The `context` column of this dataset is the reference text from Wikipedia.

In [0]:
# Hugging Face Hub dataset to load, and the column whose text is used as
# each document's page content.
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"

# Build the loader and pull the dataset into `data`.
loader = HuggingFaceDatasetLoader(
    path=dataset_name,
    page_content_column=page_content_column,
)
data = loader.load()

# Preview the first two loaded entries.
data[:2]

### 4. Chunking the Dolly dataset because of its large size

In [0]:
# Split the loaded documents into chunks of 1000 characters, with
# consecutive chunks overlapping by 150 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)
docs = text_splitter.split_documents(data)

# Show the first chunk as a sanity check.
docs[0]

### 5. Defining the pretrained model used to embed the text that is stored in the vector DB

In [0]:
# Sentence-transformers checkpoint used to turn text into vectors.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Options forwarded to the encoder; embeddings are left un-normalized.
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    encode_kwargs=encode_kwargs,
)

In [0]:
# Each text is embedded as a list of 384 numbers. Embed one sample sentence
# and display only its first 10 values.
text = "This is a test document to check how ebdeddings look like by used model."
query_result = embeddings.embed_query(text)

query_result[:10]

### 6. Facebook AI Similarity Search (FAISS) is used as the vector storage system
You can ask your questions and it will search based on the most similar vectors, much like a Google search.

In [0]:
# Embed every chunk and index the vectors in a FAISS store.
db = FAISS.from_documents(documents=docs, embedding=embeddings)

# Query the store directly and print the text of the first hit.
question = "What is cheesemaking?"
searchDocs = db.similarity_search(query=question)
print(searchDocs[0].page_content)

In [0]:
# Create a retriever object from the vector db
retriever = db.as_retriever()

# Questions are passed to the retriever through get_relevant_documents to
# find the matching documents. The hits are stored under their own name:
# reusing `docs` here would overwrite the chunk list that the FAISS index is
# built from, so re-running the indexing cell would index only these hits.
relevant_docs = retriever.get_relevant_documents("What is Cheesemaking?")
print(relevant_docs[0].page_content)

### 7. Making a QA model LLM pipeline
with the TinyBERT model, adding some level of creativity via the temperature parameter

In [0]:
# Checkpoint used for the question-answering pipeline.
model_name = "Intel/dynamic_tinybert"

# Tokenizer for the same checkpoint, loaded with padding/truncation options
# and a maximum length of 512.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding=True,
    truncation=True,
    max_length=512,
)

# Transformers pipeline for the "question-answering" task.
question_answerer = pipeline(
    "question-answering",
    model=model_name,
    tokenizer=tokenizer,
    return_tensors='pt',
)

# Wrap the pipeline so LangChain chains can use it as an LLM.
# NOTE(review): HuggingFacePipeline appears to target text-generation-style
# tasks; confirm that a "question-answering" pipeline is accepted and that
# `temperature` has any effect for this model.
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

In [0]:
from langchain_core.prompts import PromptTemplate

# Prompt with a single `question` placeholder; no retrieved context is
# injected here, so the model only sees the question itself.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, don't try to make up an answer. Just say "I can't find the proper answer". If you find the answer, write the answer in a concise way with five sentences maximum.

Question: {question}

"""
prompt = PromptTemplate(
    template=template,
    input_variables=["question"],
)

# Pipe the formatted prompt straight into the LLM (no retriever involved).
chain = prompt | llm

question = "Who is Thomas Jefferson?"
print(chain.invoke({"question": question}))

##### There would be no output since the LLM model doesn't know the answer

In [0]:
# Reconfigure the vector-store retriever to return up to 4 relevant
# splits/documents per query.
retriever = db.as_retriever(search_kwargs={"k": 4})

# RetrievalQA answers a question by refining over the documents retrieved
# from the external source, so the model can draw on the indexed Dolly
# content. The source documents are returned alongside the answer.
QA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=retriever,
    return_source_documents=True,
)

In [0]:
# Ask the retrieval-augmented chain the same question and print only the
# answer text from the returned mapping.
question = "Who is Thomas Jefferson?"
result = QA.invoke(dict(query=question))
print(result["result"])

##### The result below is obtained by using the Dolly database:
"Thomas Jefferson (April 13, 1743 – July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams."