In [18]:
!pip3 install -Uqqq pip
!pip3 install -qqq langchain==0.0.173
!pip3 install -qqq chromadb==0.3.23
!pip3 install -qqq pypdf==3.8.1
!pip3 install -qqq pygpt4all==1.1.0
!pip3 install -qqq pdf2image==1.16.3
!pip3 install sentence-transformers
!pip3 install pygpt4all


[0m

In [20]:
import chromadb
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings


In [21]:
class GPT4AllImpl:
  """Answer questions about a PDF with a local GPT4All model and Chroma retrieval.

  The pipeline is: load and chunk the PDF, embed the chunks, persist them in a
  Chroma store, build a ``RetrievalQA`` chain on top of that store, and run a
  query through the chain.

  Args:
    pdf_file: Path of the PDF to index. An empty value falls back to
      ``DEFAULT_PDF_FILE``.
    model_path: Path of the GPT4All model file. An empty value falls back to
      ``DEFAULT_MODEL_PATH``.
  """

  # Fallbacks used whenever the caller passes nothing (or an empty value).
  DEFAULT_PDF_FILE = "ms-financial-statement.pdf"
  DEFAULT_MODEL_PATH = "./ggml-gpt4all-j-v1.3-groovy.bin"
  DEFAULT_MODEL_N_CTX = 1000
  DEFAULT_PERSIST_DIRECTORY = "db"
  # Saving and loading must name the same collection, otherwise the loader
  # opens a different (empty) collection in the same directory.
  COLLECTION_NAME = "index_collection"
  # Instruction appended to every question before it is sent to the chain.
  QUERY_SUFFIX = "Extract it from the text."

  def __init__(self, pdf_file, model_path):
    self.pdf_file = pdf_file
    self.model_path = model_path

  # Load pdf
  def load_pdf_chunks(self, pdf_file=None):
    """Load a PDF and split it into chunks for embedding.

    Args:
      pdf_file: Path of the PDF. When omitted, ``self.pdf_file`` is used, and
        if that is empty too, ``DEFAULT_PDF_FILE``.

    Returns:
      The documents produced by ``RecursiveCharacterTextSplitter`` with
      ``chunk_size=1024`` and ``chunk_overlap=64``.
    """
    resolved_pdf_file = pdf_file or self.pdf_file or self.DEFAULT_PDF_FILE
    loader = PyPDFLoader(resolved_pdf_file)
    documents = loader.load_and_split()

    # Chunk into smaller parts using RecursiveCharacterTextSplitter.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    return text_splitter.split_documents(documents)

  # Create Embeddings
  def create_embeddings(self):
    """Return the ``HuggingFaceEmbeddings`` function used for indexing and querying."""
    print('entering create_embeddings()')
    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2")
    print('exiting create_embeddings()')
    return embedding_function

  # Save the generated embeddings
  def save_embeddings(self, texts, embedding_function, persist_directory,
                      collection_name=None):
    """Embed ``texts`` into a Chroma store and persist it.

    NOTE(review): ``Chroma.from_documents`` appears to add to an existing
    collection, so re-running against the same directory likely stores the
    chunks again -- confirm before relying on repeated runs.

    Args:
      texts: Chunked documents, as returned by ``load_pdf_chunks``.
      embedding_function: Embedding function from ``create_embeddings``.
      persist_directory: Directory the Chroma store is written to.
      collection_name: Collection to write into; defaults to
        ``COLLECTION_NAME`` so that ``retrieve_embeddings`` finds it.

    Returns:
      The populated ``Chroma`` vector store.
    """
    print('entering save_embeddings()')
    db = Chroma.from_documents(
        texts,
        embedding=embedding_function,
        persist_directory=persist_directory,
        collection_name=collection_name or self.COLLECTION_NAME)
    # Persist before announcing the exit so the log reflects completed work.
    db.persist()
    print('exiting save_embeddings()\n')
    return db

  # The embeddings are already generated and saved; retrieve them.
  def retrieve_embeddings(self, persist_directory, embedding_function,
                          collection_name=None):
    """Open a Chroma store previously written by ``save_embeddings``.

    Args:
      persist_directory: Directory the store was persisted to.
      embedding_function: Must be the same embedding function used when saving.
      collection_name: Collection to open; defaults to ``COLLECTION_NAME``.

    Returns:
      A ``Chroma`` vector store bound to the persisted collection.
    """
    db_loaded = Chroma(
        collection_name=collection_name or self.COLLECTION_NAME,
        embedding_function=embedding_function,
        persist_directory=persist_directory)
    return db_loaded

  # Create Chain
  def create_retrievalQA_chain(self, db_loaded, model_n_ctx=None, model_path=None):
    """Build a ``RetrievalQA`` chain over ``db_loaded`` using a GPT4All model.

    Args:
      db_loaded: Vector store exposing ``as_retriever``.
      model_n_ctx: Context size passed to ``GPT4All`` as ``n_ctx``; defaults
        to ``DEFAULT_MODEL_N_CTX``.
      model_path: Model file path; when omitted, ``self.model_path`` is used,
        and if that is empty too, ``DEFAULT_MODEL_PATH``.

    Returns:
      A "stuff" ``RetrievalQA`` chain that retrieves 3 documents per query and
      returns its source documents.
    """
    print('entering create_retrievalQA_chain()')
    # Honour the caller's values instead of overwriting them with constants.
    resolved_n_ctx = model_n_ctx or self.DEFAULT_MODEL_N_CTX
    resolved_model_path = model_path or self.model_path or self.DEFAULT_MODEL_PATH
    llm = GPT4All(model=resolved_model_path, n_ctx=resolved_n_ctx,
                  backend="gptj", verbose=False)

    # Pass the GPT4All model to a RetrievalQA chain.
    qa = RetrievalQA.from_chain_type(
      llm=llm,
      chain_type="stuff",
      retriever=db_loaded.as_retriever(search_kwargs={"k": 3}),
      return_source_documents=True,
      verbose=False)

    print('exiting create_retrievalQA_chain()\n')
    return qa

  # Reply to asked Question
  def retrieve_result(self, query_param, qa):
    """Run a question through the chain and return the answer text.

    ``QUERY_SUFFIX`` is appended to the question, separated by a single space,
    unless the question already ends with it.

    Args:
      query_param: The question to ask.
      qa: Callable chain returning a mapping with a ``"result"`` key.

    Returns:
      The value stored under ``"result"`` in the chain's response.
    """
    print('entering retrieve_result()')
    question = query_param.rstrip()
    if not question.endswith(self.QUERY_SUFFIX):
      # Keep a space between the question and the instruction; gluing them
      # together produced prompts like "...2022?Extract it from the text.".
      question = f"{question} {self.QUERY_SUFFIX}"

    res = qa(question)
    print('exiting retrieve_result()\n')
    return res["result"]

  # driver function
  def execute_c4a(self, cust_query):
    """Run the whole pipeline (load, embed, persist, build chain, query).

    Args:
      cust_query: The question to ask about the PDF.

    Returns:
      The answer text produced by ``retrieve_result``.
    """
    print('entering execute_c4a()')
    texts = self.load_pdf_chunks()
    embedding_function = self.create_embeddings()
    db = self.save_embeddings(texts, embedding_function, self.DEFAULT_PERSIST_DIRECTORY)
    qa = self.create_retrievalQA_chain(db)
    res = self.retrieve_result(cust_query, qa)
    print('exiting execute_c4a()\n')
    return res

  # Experimental: query an already persisted store without re-embedding.
  def execute_c4a_new(self, cust_query, persist_directory=None):
    """Answer ``cust_query`` from a store that was persisted earlier.

    Unlike ``execute_c4a`` this does not load or embed the PDF again; it
    expects ``save_embeddings`` (or ``execute_c4a``) to have populated the
    store beforehand.

    Args:
      cust_query: The question to ask.
      persist_directory: Directory of the persisted store; defaults to
        ``DEFAULT_PERSIST_DIRECTORY``.

    Returns:
      The answer text produced by ``retrieve_result``.
    """
    # The query embedding must come from the same model used at save time.
    embedding_function = self.create_embeddings()
    db_loaded = self.retrieve_embeddings(
        persist_directory or self.DEFAULT_PERSIST_DIRECTORY, embedding_function)
    qa = self.create_retrievalQA_chain(db_loaded)
    return self.retrieve_result(cust_query, qa)




# Function-wise starts

In [22]:
# Step-by-step run: create the helper, define the question, chunk the PDF.
# The class defined in this notebook is `GPT4AllImpl`; `GPT4AllImpl_Working`
# is not defined anywhere here and raises NameError on a fresh kernel.
c4a = GPT4AllImpl("ms-financial-statement.pdf", "./ggml-gpt4all-j-v1.3-groovy.bin")
cust_query = 'How much is the dividend per share during during 2022?'

texts = c4a.load_pdf_chunks()

In [23]:
# Build the HuggingFace embedding function (all-MiniLM-L6-v2) used to index the chunks.
embedding_function = c4a.create_embeddings()

entering create_embeddings()
exiting create_embeddings()


In [24]:
# Embed the chunks and persist the resulting Chroma store in the "db" directory.
db = c4a.save_embeddings(
    texts=texts,
    embedding_function=embedding_function,
    persist_directory="db",
)

entering save_embeddings()
exiting save_embeddings()



In [25]:
# # db_loaded = c4a.retrieve_embeddings('./db',embedding_function)

# db_loaded = Chroma(collection_name='index_collection',
#                    embedding_function=embedding_function, persist_directory= 'db' )

In [26]:
# db_loaded

In [27]:
# Build the RetrievalQA chain on top of the vector store created above.
qa = c4a.create_retrievalQA_chain(
    db_loaded=db,
    model_n_ctx=1000,
    model_path="./ggml-gpt4all-j-v1.3-groovy.bin",
)


entering create_retrievalQA_chain()
exiting create_retrievalQA_chain()



In [28]:
# Ask the question through the chain and keep only the answer text.
res = c4a.retrieve_result(query_param=cust_query, qa=qa)

entering retrieve_result()
exiting retrieve_result()



In [29]:
# Display the answer string returned by retrieve_result (the chain's "result" field).
res

'\n\nThe dividend declared on June 14, 2022 was included in other current liabilities as of June 30, 2022.\n\nTotal $2.24 $16,871\n\nThe dividend per share during 2022 is not provided in the given text.'

# Function-wise ends

# Driver method - one go call starts

In [14]:
# c4a = GPT4AllImpl("","")
# cust_query = 'How much is the dividend per share during during 2022?'
# result = c4a.execute_c4a(cust_query)

# result

In [15]:
# result

# Driver method - one go call ends