In [14]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings

In [15]:
from langchain_community.document_loaders import PyPDFLoader
import PyPDF2



# Load the textbook PDF; the loader result is kept in 'data' for the cells below.
# The path uses a forward slash: the previous backslash form contained "\C", which is
# not a valid string escape (Python warns about it) and only resolved as a path on Windows.
# Forward slashes are accepted on Windows as well.
loader = PyPDFLoader("data/ConceptsofBiology-WEB.pdf")
data = loader.load()

In [16]:
# Keep list items 18..35 of the loaded document list (PDF pages 19 to 36, assuming the
# loader returns one item per page in page order — TODO confirm).
# NOTE(review): this comment previously read "Pages 19 to 76 are the two chapters"; the
# slice end (36) does not match that range — confirm which one is intended.
# This cell reassigns 'data' from itself, so re-running it without reloading the PDF
# slices the already-sliced list.
data = data[18:36]

In [17]:

# Chunking configuration: chunk_size=1000 and chunk_overlap=150 (character counts),
# so neighbouring chunks share some text at their boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)

# Break the selected documents in 'data' into chunk-level documents.
docs = text_splitter.split_documents(data)

In [18]:
# Sanity check: how many chunks the splitter produced.
len(docs)

61

In [19]:

# Sentence-embedding model used to vectorise the chunks and the queries.
modelPath = "sentence-transformers/all-MiniLM-L6-v2"

# Model options (run on the CPU) and encoding options (embeddings are not normalised).
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

# Build the embedding wrapper from the settings above.
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)



In [20]:
# Smoke test: embed one short sentence and display the first three components
# of the resulting vector to confirm the embedding model loads and runs.
text = "This is a test substance."
query_result = embeddings.embed_query(text)
query_result[:3]

[0.02040591835975647, -0.033092256635427475, 0.003897689515724778]

In [21]:
# Build a FAISS vector store from the chunked documents, embedding them with 'embeddings'.
db = FAISS.from_documents(docs, embeddings)


In [22]:
# Specify the model name you want to use
model_name = "google/flan-t5-base"

# Load the tokenizer associated with the specified model.
# padding / truncation / max_length are encode-time options; supplying them to
# from_pretrained() did not truncate prompts (the recorded run still encoded a
# 1643-token input against a 512-token limit), so they are dropped here rather than
# left in place looking effective.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a text2text-generation pipeline using the model and tokenizer.
# max_length=512 is the generation length cap applied by the pipeline.
question_answerer = pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=tokenizer,
    max_length=512,
)

# Wrap the pipeline so LangChain can use it as an LLM.
# Generation settings belong on the pipeline above: when HuggingFacePipeline is given a
# ready-made pipeline, its model_kwargs are not forwarded to generation, so the former
# {"temperature": 0.1, "max_length": 512} entry had no effect and is removed.
# Decoding is left at the pipeline's defaults, exactly as before.
llm = HuggingFacePipeline(pipeline=question_answerer)

In [23]:
# Create a retriever over the FAISS store 'db' using its default search settings.
# The following cell reassigns 'retriever' with an explicit k=4, superseding this one.
retriever = db.as_retriever()

In [24]:
# Create a retriever object from the 'db' with a search configuration where it retrieves up to 4 relevant splits/documents.
retriever = db.as_retriever(search_kwargs={"k": 4})

# Create a question-answering instance (qa) using the RetrievalQA class.
# It is configured with the language model (llm), the "stuff" chain type, the retriever
# created above, and return_source_documents=True, so the source documents are returned
# alongside each answer.
# NOTE(review): the recorded run warns about a 1643-token input against a 512-token
# limit; presumably the four retrieved chunks placed into one prompt exceed the model's
# input length — consider a smaller k or chunk_size. Confirm before changing.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

In [25]:
# Sample questions to run through the retrieval QA chain.
questions = [
    "what is Biology?",
    "What are the properties of life? List them",
    "What is chemotaxis?",
    "What is adaptation? ",
    "What does genes provide? Explain",
    "What is an atom?",
    "What is an organ system?",
]

# Ask each question in turn and print the query next to the generated answer.
for question in questions:
    result = qa.invoke({'query': question})
    print("Query: ", result['query'])
    print("Answer: ", result['result'])

# result

Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 512). Running this sequence through the model will result in indexing errors


Query:  what is Biology?
Answer:  a broad br anch itself
Query:  What are the properties of life? List them
Answer:  All groups o f living or ganisms shar e se veral key char acteristics or functions: or der, sensitivity or response t o stimuli, r eproduction, adap tation, gr owth and de velopment, r egulation/homeos tasis, ener gy processing, and e volution.
Query:  What is chemotaxis?
Answer:  chemical
Query:  What is adaptation? 
Answer:  All living or ganisms e xhibit a “ fit” to their en vironment.
Query:  What does genes provide? Explain
Answer:  Ins tructions nec essary for lif e.
Query:  What is an atom?
Answer:  The atomis the smal lest and mos t fundamental unit o f mat ter that r etains the pr oper ties o f an element.
Query:  What is an organ system?
Answer:  collections o f tissues gr ouped t ogether based on a c ommon function
