In [29]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into os.environ.
load_dotenv()

# This notebook historically stored the key under OPEN_AI_API_KEY, but the cells
# visible here build OpenAIEmbeddings() / OpenAI() without passing a key, and
# those wrappers look the key up under the standard OPENAI_API_KEY variable.
# Accept either spelling (legacy name first, so the value of this variable is
# unchanged for existing setups) and make sure the standard name is populated.
OPENAI_API_KEY = os.environ.get('OPEN_AI_API_KEY') or os.environ.get('OPENAI_API_KEY')
if OPENAI_API_KEY:
    # setdefault: never overwrite a key already exported under the standard name.
    os.environ.setdefault('OPENAI_API_KEY', OPENAI_API_KEY)

In [38]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.document_loaders import DirectoryLoader

# Source document: a single newsletter PDF, given as a relative path.
pdf_path = "../data/lyn/april-2023-newsletter.pdf"
loader = PyMuPDFLoader(pdf_path)
# Alternative kept for reference: load every PDF found under ../data/lyn/ instead.
# loader = DirectoryLoader('../data/lyn/', glob="**/*.pdf", loader_cls=PyMuPDFLoader)
docs = loader.load()

# Cell output: number of document objects returned by the loader.
len(docs)

37

In [39]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# Split the loaded documents into chunks (target size 1000 characters, no overlap).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(docs)

# Embed each chunk with OpenAI embeddings and index the vectors in Chroma.
# No persist directory is given, so the index is transient (see the message
# printed below this cell) and is rebuilt every time the cell runs.
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(documents=texts, embedding=embeddings)

Using embedded DuckDB without persistence: data will be transient


In [40]:
# Retrieval QA chain with the chain's default prompt, answering from chunks
# fetched out of the Chroma index.
# NOTE(review): `qa` is rebound further down with a custom prompt before any
# query is run in the cells visible here, so this baseline chain looks unused.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [33]:
from langchain.prompts import PromptTemplate
# Custom prompt for the QA chain. It exposes two placeholders:
#   {context}  - filled with the retrieved chunks
#   {question} - filled with the user's query
# The wording is runtime text sent to the model; keep it exactly as written.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Summarize the given text

Let's think about this step by step.

{context}

Question: {question}
Answer:"""

# Wrap the raw string, declaring the placeholders it expects.
PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

In [41]:
# Rebuild the QA chain so it uses the custom PROMPT instead of the default one.
chain_type_kwargs = dict(prompt=PROMPT)
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)


In [42]:
# Sanity check: ask the chain for a high-level description of the indexed PDF.
query = 'What is this document about?'
qa.run(query)

" This document is about the U.S. federal budget from October 2022 to March 2023 and the policy choices that policymakers need to make in order to resolve a post-debt ceiling liquidity cliff. It also provides an overview of the author's three-pillar portfolio strategy for investing in this environment."

In [43]:
# Ask the chain for the document's main takeaways.
query = 'what are the biggest takeaways the paper makes?'
qa.run(query)
# TODO: it would be nice to be able to ask "what has come true and what has been
# proved false" and use wiki to verify the answer.

' The paper discusses the potential consequences of fiscal consolidations and the potential solutions to large public debt, such as inflation. It also discusses how to navigate the current investment environment, suggesting a three-pillar portfolio strategy with a focus on profitable equities, commodities, and cash. Lastly, it provides an update on the US federal budget and discusses potential policy changes as solutions to the post-debt ceiling liquidity cliff.'

In [44]:
# Ask the chain which date the document is from.
query = 'when is this document from'
qa.run(query)

' This document is from April 2023.'