In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

In [2]:
from langchain.embeddings import (
    LlamaCppEmbeddings, 
    HuggingFaceEmbeddings, 
    SentenceTransformerEmbeddings
)
from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
    GitLoader
  )
import pandas as pd
import nbformat
from nbconvert import PythonExporter
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [3]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'
#os.environ["OPENAI_API_KEY"] = "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"

In [7]:
# Splitting up the text into smaller chunks for indexing
text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200, #striding over the text
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [4]:

loader = PyPDFLoader("mahabharata.pdf")
pages = loader.load_and_split()

In [5]:
len(pages)


217

In [6]:
def get_pdf_splits(pdf_file):
  """Function takes in the pdf data and returns the  
  splits so for further processing can be done."""
  
  loader = PyPDFLoader(pdf_file)
  pages = loader.load_and_split()  

  textSplit = RecursiveCharacterTextSplitter(chunk_size=150,
                                             chunk_overlap=15,
                                             length_function=len)
  doc_list = []
  #Pages will be list of pages, so need to modify the loop
  for pg in pages:
    pg_splits = textSplit.split_text(pg.page_content.replace('\n', ''))
    doc_list.extend(pg_splits)

  return doc_list

In [7]:
def embed_index(doc_list, embed_fn, index_store):
  """Function takes in existing vector_store, 
  new doc_list and embedding function that is 
  initialized on appropriate model. Local or online. 
  New embedding is merged with the existing index. If no 
  index given a new one is created"""
  #check whether the doc_list is documents, or text
  try:
    faiss_db = FAISS.from_documents(doc_list, 
                              embed_fn)  
  except Exception as e:
    faiss_db = FAISS.from_texts(doc_list, 
                              embed_fn)
  
  if os.path.exists(index_store):
    local_db = FAISS.load_local(index_store,embed_fn)
    #merging the new embedding with the existing index store
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")


In [8]:
def get_docs_length(index_path, embed_fn):
  test_index = FAISS.load_local(index_path,
                              embeddings=embed_fn)
  test_dict = test_index.docstore._dict
  return len(test_dict.values())  
     

In [9]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [10]:
pdf_docs = get_pdf_splits("mahabharata.pdf")

embed_index(doc_list=pdf_docs,
            embed_fn=embeddings,
            index_store='pdf_book2')

New store created...


In [11]:
get_docs_length(index_path='pdf_book2',embed_fn=embeddings)


5513

In [12]:
docsearch = FAISS.load_local("pdf_book2",embeddings)


In [22]:
query = "how did abhimanyu die?"
docs = docsearch.similarity_search(query)


In [23]:
docs

[Document(page_content="main cause of Abhimanyu's death. He it was who had effectively prevented t he relief of Abhimanyu by the Pandavas, and thereby caused Abhimanyu to be", metadata={}),
 Document(page_content="him, but they fell like moths in the fire, one after another. Abhimanyu's shafts sea rched the weak points in the armor of his enemies. And the bodies", metadata={}),
 Document(page_content='THE DEATH OF ABH IMANYU  THE Pandavas, proceeding according to plan, had closely followed Abhimanyu when he b roke into the Kaurava formation. But', metadata={}),
 Document(page_content='Abhimanyu to be isolated, overpowered and slain.  We have seen how Yudhishthira in his anxiety sent first Satyaki and then Bhima to join Arjuna in his', metadata={})]

## Local LLM QA Chain

In [29]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-small'# go for a smaller model if you dont have the VRAM
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")

pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=200
)

local_llm = HuggingFacePipeline(pipeline=pipe)



In [25]:
from langchain import PromptTemplate

prompt_template = """Answer based on context:\n\n{context}\n\n{question}"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [30]:
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm=local_llm, prompt=PROMPT)


In [31]:
result = chain({"input_documents": docs, "question": query}, return_only_outputs=True)[
    "output_text"
]

In [32]:
result

'roke into the Kaurava formation'