In [1]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path

In [2]:
# Read the PDF through the loader's load_and_split() and report how many
# Document objects it produced.
loader = PyPDFLoader("ms-financial-statement.pdf")
documents = loader.load_and_split()
document_count = len(documents)
print(document_count)

# Print the text of the first document as a quick check of the extraction.
first_document = documents[0]
print(first_document.page_content)

2
9   
Dividends  
Our Board of Directors declared the following dividends:  
  
Declaration Date  Record Date  Payment  Date  Dividend  
Per Share  Amount    
          
Fiscal Year 2022        (In millions)  
          
September  14, 2021   November  18, 2021    December  9, 2021   $   0.62  $ 4,652   
December  7, 2021   February  17, 2022    March  10, 2022    0.62   4,645   
March  14, 2022   May 19, 2022    June  9, 2022    0.62   4,632   
June  14, 2022   August  18, 2022    September  8, 2022    0.62   4,627     
Total      $   2.48  $   18,556             
          
Fiscal Year 2021          
          
September  15, 2020   November  19, 2020    December  10, 2020   $ 0.56  $ 4,230   
December  2, 2020   February  18, 2021    March  11, 2021    0.56   4,221   
March  16, 2021   May 20, 2021    June  10, 2021    0.56   4,214   
June  16, 2021   August  19, 2021    September  9, 2021    0.56   4,206     
          
Total      $ 2.24  $ 16,871             
The dividend declare

In [3]:
# Re-split the loaded documents into overlapping chunks for embedding:
# chunk_size=1024 with chunk_overlap=64 (counted in characters under the
# splitter's default length function), then print the number of chunks.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(documents)
chunk_count = len(texts)
print(chunk_count)

3


In [4]:
# Instantiate the Hugging Face embedding model that is later handed to the
# vector store to embed the chunks.

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Embed every chunk and store the vectors in a Chroma collection that uses
# the "db" directory for persistence.
# NOTE(review): this cell's execution count is out of sequence, so it was
# re-run; confirm that re-running against an existing "db" directory does not
# add the same chunks a second time.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="db",
)

In [6]:
# Load the local GPT4All-J model file that the QA chain will use as its LLM.
# NOTE(review): n_ctx=1000 is requested here, while the model's load log in
# this cell's recorded output reports n_ctx = 2048 — confirm which context
# size is actually in effect.

gpt4all_options = {
    "model": "./ggml-gpt4all-j-v1.3-groovy.bin",
    "n_ctx": 1000,
    "backend": "gptj",
    "verbose": False,
}
llm = GPT4All(**gpt4all_options)

Found model file at  ./ggml-gpt4all-j-v1.3-groovy.bin
gptj_model_load: loading model from './ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 5401.45 MB
gptj_model_load: kv self size  =  896.00 MB
gptj_model_load: ................................... done
gptj_model_load: model size =  3609.38 MB / num tensors = 285


In [7]:
# Build the question-answering chain: a retriever over the Chroma store
# (asking for k=3 results per query) combined with the local LLM through the
# "stuff" chain type. The chain is configured to return the source documents
# alongside the answer.

retriever = db.as_retriever(search_kwargs={"k": 3})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=False,
)

In [10]:
# Ask the chain a question about the document and print the generated answer.
# The prompt is a plain string rather than an f-string: it has no placeholders,
# and an f-string would try to interpret any literal braces added to the
# question later. The duplicated word "during" has also been removed so the
# retriever and the LLM receive a clean query.
# `res` is indexed by "result" for the answer text; the chain was built with
# return_source_documents=True, so the retrieved chunks are returned as well.
res = qa("""
    How much is the dividend per share during 2022?
    Extract it from the text.
""")
print(res["result"])

 The information provided does not contain enough details to extract an accurate answer on how many shares were issued and paid dividends in 2021 or 2020, nor what percentage of stock ownership each shareholder has that may have influenced the company's dividend policy during those years. Therefore I am unable provide a definitive answer based solely upon this text snippet alone as it lacks important context necessary for interpretation
 The information provided does not contain enough details to extract an accurate answer on how many shares were issued and paid dividends in 2021 or 2020, nor what percentage of stock ownership each shareholder has that may have influenced the company's dividend policy during those years. Therefore I am unable provide a definitive answer based solely upon this text snippet alone as it lacks important context necessary for interpretation
