## Import Necessary Libraries

In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import PyPDF2

  from .autonotebook import tqdm as notebook_tqdm


## Load data in a dataloader

In [2]:
# Load the textbook PDF from the local "data" folder.
# The path uses a forward slash so that it resolves on Windows, Linux and macOS
# alike; a backslash is only treated as a directory separator on Windows.
loader = PyPDFLoader("data/ConceptsofBiology-WEB.pdf")
data = loader.load()

### Select the first two chapters for speed

In [46]:
# Pages 19 to 76 are the two chapters (assuming `data` holds one entry per page).
# Page numbers are 1-based while list indices are 0-based, hence the "- 1".
FIRST_PAGE = 19
LAST_PAGE = 76
# NOTE: `data` is rebound to a slice of itself, so re-running this cell without
# re-running the load cell slices the already-trimmed list a second time.
data = data[FIRST_PAGE - 1:LAST_PAGE]

## Split the data and do chunking

In [47]:

# Chunking settings for RecursiveCharacterTextSplitter: chunks of 1000 characters
# with a 150-character overlap between them.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split the selected documents held in 'data' into chunked documents.
docs = text_splitter.split_documents(data)

## Create embeddings model

In [48]:

# Identifier of the pre-trained embedding model.
modelPath = "sentence-transformers/all-MiniLM-L6-v2"
# modelPath = "google/flan-t5-large"

# Model configuration: run the computations on the CPU.
model_kwargs = dict(device="cpu")

# Encoding options: 'normalize_embeddings' is switched off.
encode_kwargs = dict(normalize_embeddings=False)

# Build the HuggingFaceEmbeddings instance from the three settings above.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=modelPath,
)



## ChromaDB (persistent) database

In [49]:
# import chromadb
# import os
# from langchain.vectorstores import Chroma
# from langchain.document_loaders import PyPDFLoader

# def load_chunk_persist_pdf(modelPath = "sentence-transformers/all-MiniLM-L6-v2", model_kwargs = {'device':'cpu'},\
#     encode_kwargs = {'normalize_embeddings': False}, pdf_folder_path= "data") -> Chroma:
#     pdf_folder_path = pdf_folder_path
#     documents = []
#     for file in os.listdir(pdf_folder_path):
#         if file.endswith('.pdf'):
#             pdf_path = os.path.join(pdf_folder_path, file)
#             loader = PyPDFLoader(pdf_path)
#             documents.extend(loader.load())
#     documents = documents[16:38]
#     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
#     chunked_documents = text_splitter.split_documents(documents)
#     client = chromadb.Client()
#     if client.list_collections():
#         consent_collection = client.create_collection("consent_collection")
#     else:
#         print("Collection already exists")
#     vectordb = Chroma.from_documents(
#         documents=chunked_documents,
#         embedding=HuggingFaceEmbeddings(
#     model_name=modelPath,     # Provide the pre-trained model's path
#     model_kwargs=model_kwargs, # Pass the model configuration options
#     encode_kwargs=encode_kwargs # Pass the encoding options
# ),
#         persist_directory="chroma_store"
#     )
#     vectordb.persist()
#     return vectordb

## Create Vector DB, FAISS (in-memory)

In [50]:
# Build the FAISS vector store from the chunked documents and the embeddings model.
db = FAISS.from_documents(documents=docs, embedding=embeddings)
# vectordb = load_chunk_persist_pdf()


## Create LLM instance

In [51]:
# Checkpoint used for answer generation.
model_name = "google/flan-t5-base"

# Load the tokenizer that belongs to the checkpoint.
# NOTE(review): padding/truncation/max_length are passed at load time here. The
# recorded run of this notebook still warns about a 1304-token input against a
# 512-token limit, so these settings evidently do not truncate the prompt -
# confirm whether they have any effect at all.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding=True,
    truncation=True,
    max_length=512,
)

# Text2text-generation pipeline built from the checkpoint and its tokenizer,
# with max_length set to 512.
question_answerer = pipeline(
    task="text2text-generation",
    model=model_name,
    tokenizer=tokenizer,
    max_length=512,
    # ,return_tensors='pt'
)

# Wrap the pipeline so LangChain can use it as an LLM.
# NOTE(review): temperature and max_length are handed over as model_kwargs;
# presumably these are not forwarded to generation when a ready-made pipeline
# is supplied - verify against the installed langchain version.
generation_settings = {"temperature": 0.4, "max_length": 512}
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs=generation_settings,
)

## Create retriever for question-answering

In [52]:
# Retriever over 'db', configured to fetch up to 4 relevant splits/documents per query.
retriever = db.as_retriever(search_kwargs=dict(k=4))

# Question-answering chain (qa) built with the RetrievalQA class from the language
# model (llm), the chain type "stuff" and the retriever above.
# return_source_documents=True asks the chain to return the source documents as well.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
)

In [53]:

# retriever = vectordb.as_retriever(search_kwargs={"k": 1})
# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

# def get_llm_response(query,qa):
#     # matching_docs = vectordb.similarity_search(query)
#     answer = qa.invoke({'query': query})
#     return answer

# questions = ["what is Biology? ",
#              "What are the properties of life? List them",
#              "What is chemotaxis?",
#              "What is adaptation in Biology? ",
#              "What does genes provide? ",
#              "What is an atom?",
#              "What is an organ system?"]
# for el in questions:
#     response = get_llm_response(el,qa)
#     print("Query: ",el)
#     print("Answer: ", response['result'])


## Test the model with sample questions

In [54]:
# Sample questions used to exercise the question-answering chain.
questions = [
    "what is Biology?",
    "What are the properties of life? List them",
    "What is chemotaxis?",
    "What is adaptation? ",
    "What does genes provide? Explain",
    "What is an atom?",
    "What is an organ system?",
]

# Run every question through the chain and print the query next to its answer.
for question in questions:
    result = qa.invoke({"query": question})
    print("Query: ", result["query"])
    print("Answer: ",  result["result"])

# result

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


Query:  what is Biology?
Answer:  Scienc e of life
Query:  What are the properties of life? List them
Answer:  All groups o f living or ganisms shar e se veral key char acteristics or functions: or der, sensitivity or response t o stimuli, r eproduction, adap tation, gr owth and de velopment, r egulation/homeos tasis, ener gy processing, and e volution.
Query:  What is chemotaxis?
Answer:  bact eria can mo ve toward or a way from chemicals (a process cal led chemotaxis) or light (phot otaxis).
Query:  What is adaptation? 
Answer:  All living or ganisms e xhibit a “ fit” to their en vironment.
Query:  What does genes provide? Explain
Answer:  Ins tructions nec essary for lif e.
Query:  What is an atom?
Answer:  smal lest component o f an element that r etains al l of the chemical pr oper ties o f that element.
Query:  What is an organ system?
Answer:  collections o f tissues gr ouped t ogether based on a c ommon function
