In [1]:
from langchain.document_loaders import PyPDFLoader

pdf_path = "../data/paper.pdf"
loader = PyPDFLoader(pdf_path)
data = loader.load_and_split()  # already does the splitting
print(f'You have {len(data)} document(s) in your data.')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 22 document(s) in your data.
There are 3978 characters in your document


In [3]:
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in the environment variables. Check your .env file and make sure the variable name matches.")

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectoredb = Chroma().from_documents(
    data,
    embedding=embeddings,
    persist_directory="."
  )
vectoredb.persist()

Using embedded DuckDB without persistence: data will be transient
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction
Using embedded DuckDB with persistence: data will be stored in: .


In [5]:
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain

qa = ChatVectorDBChain.from_llm(OpenAI(temperature=0), vectoredb)



In [6]:
chat_history = []
query = "What is the paper about? What are the most salient quotes? What are its biggest weaknesses? What, if any, assumptions are made by the paper?"
result = qa({"question": query, "chat_history": chat_history})

In [7]:
result["answer"]

' This paper is about pre-training a model to detect mistakes in videos. The most salient quotes are "To address this, we pre-train our model with a masked modeling objective that encourages the step representations to capture the global context of the entire video" and "Since our model learns step representations “globally” from the whole video, it is able to capture these subtle differences." The biggest weaknesses of the paper are that it lacks a benchmark for detecting mistakes in videos and that it makes assumptions about the relationship between different steps in the same task. The paper makes the assumption that the way a step is situated in an overall task may contain important information about the step.'