# Step 1 - Data Ingestion

In [21]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Step 1.1 - Load PDFs (more than one PDF can be loaded)

In [22]:
# Paths of the PDF files to ingest; add more entries to load several documents.
pdfs = ["data/Deep_Work.pdf"]

In [23]:
# Load every file listed in `pdfs`. Each element of `all_pdfs` is whatever
# PyPDFLoader.load() returned for one file, in the same order as `pdfs`.
all_pdfs = [PyPDFLoader(pdf_path).load() for pdf_path in pdfs]

In [24]:
# Splitter configuration: chunk_size=1000 with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# chunked_all_pdfs is a list of lists: one list of chunks per loaded PDF.
chunked_all_pdfs = []
for loaded_pdf in all_pdfs:
    pdf_chunks = text_splitter.split_documents(loaded_pdf)
    # Report how many chunks this particular PDF produced.
    print(f"chunked_all_pdfs length: {len(pdf_chunks)}")
    chunked_all_pdfs.append(pdf_chunks)

chunked_all_pdfs length: 535


### Step 1.2 - Upsert PDF vector embeddings to Pinecone

In [25]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os

In [26]:
# Read the service credentials from the environment rather than hardcoding them.
# os.environ.get() yields None for any variable that is not set.
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV = (
    os.environ.get(variable)
    for variable in ('OPENAI_API_KEY', 'PINECONE_API_KEY', 'PINECONE_API_ENV')
)

In [27]:
# Embedding model object; it is passed to the Pinecone vector-store calls in the
# upsert and retrieval cells. Constructed with the key read from OPENAI_API_KEY.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [28]:
# Name of the Pinecone index targeted by the upsert and retrieval cells.
index_name = "langchain2"

# Configure the pinecone client with the credentials read from the environment.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

In [29]:
# Upsert every PDF's chunks to Pinecone via LangChain.
# from_documents() is given the chunk objects themselves instead of only their
# page_content, so each chunk's metadata is stored next to its vector rather
# than being discarded before the upsert.
# NOTE(review): presumably re-running this cell inserts the same chunks a
# second time — confirm whether the index should be cleared (or stable ids
# supplied) before a re-run.
for chunks in chunked_all_pdfs:
    Pinecone.from_documents(chunks, embeddings, index_name=index_name)

# Step 2 - Data Retrieval

### Step 2.1 - Retrieve the PDF vector embeddings from Pinecone

In [30]:
# Build a LangChain vector store on top of the existing index named by
# `index_name`, passing in the same `embeddings` object used for the upsert.
vectorstore = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

# Step 3 - Chat Q&A

### Step 3.1 - Ask questions about the PDF!

In [31]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [32]:
# OpenAI LLM created with temperature=0 (presumably to keep answers as
# repeatable as possible — confirm).
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
# Question-answering chain built with load_qa_chain's default settings; the
# question cells below call chain.run() with the retrieved documents.
chain = load_qa_chain(llm)

In [33]:
query = "What is this book about?"
# Fetch the stored chunks most similar to the question. The call matches every
# other question cell: no extra keyword arguments are passed to
# similarity_search().
docs = vectorstore.similarity_search(query)
# Let the QA chain answer the question from the retrieved chunks only.
chain.run(input_documents=docs, question=query)

' This book does not provide any information about its content.'

In [34]:
query = "What are the 5 top lessons from this book?"
# Fetch the stored chunks most similar to the question, then have the QA chain
# answer from those chunks.
docs = vectorstore.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The 5 top lessons from this book are to work deeply, embrace boredom, quit social media, drain the shallows, and focus on living a focused life.'

In [35]:
query = "Provide a summary of this book in 300 words"
# Only the chunks returned by similarity_search() are handed to the chain, so
# the summary is built from those retrieved passages rather than the full text.
docs = vectorstore.similarity_search(query)
chain.run(input_documents=docs, question=query)

" This book is about the concept of deep work and how to incorporate it into your life. It begins by discussing the value of deep work and how it is becoming increasingly rare in today's world. It then goes on to discuss four rules for incorporating deep work into your life: work deeply, embrace boredom, quit social media, and drain the shallows. The book then concludes with a discussion of how deep work can be meaningful and how it can help you achieve your goals. The book also includes notes, newsletters, and a copyright page. In summary, this book is about the concept of deep work and how to incorporate it into your life. It provides four rules for doing so and discusses the value and meaning of deep work."

In [36]:
query = "How many hours in a day can someone spend doing deep work?"
# Fetch the stored chunks most similar to the question, then have the QA chain
# answer from those chunks.
docs = vectorstore.similarity_search(query)
chain.run(input_documents=docs, question=query)

' According to the context, the most adept deep thinker cannot spend more than four hours in a state of true depth in a given day.'

In [37]:
query = "What does the author suggest are ways to ensure someone can get time to do deep work?"
# Fetch the stored chunks most similar to the question, then have the QA chain
# answer from those chunks.
docs = vectorstore.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The author suggests that someone can ensure they have time to do deep work by introducing artificial constraints on their schedule, blocking out deep work hours and preserving them against incursion, and ritualizing their work habits.'

In [38]:
query = "Can someone get burnt out when trying to do deep work?"
# Fetch the stored chunks most similar to the question, then have the QA chain
# answer from those chunks.
docs = vectorstore.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Yes, someone can get burnt out when trying to do deep work. Deep work requires intense, uninterrupted concentration on cognitively demanding concepts, which can be mentally and physically exhausting.'

In [39]:
query = "Provide a summary of this book using 1000 words"
# Only the chunks returned by similarity_search() are handed to the chain, so
# the summary is built from those retrieved passages rather than the full text.
# NOTE(review): the recorded answer is far shorter than 1000 words — presumably
# bounded by the LLM's completion-length limit; confirm against the OpenAI
# wrapper's settings.
docs = vectorstore.similarity_search(query)
chain.run(input_documents=docs, question=query)

" This book, written by Cal Newport, is about the concept of deep work and how to incorporate it into one's life. Deep work is defined as activities that require intense focus and concentration, and it is valuable because it produces results that are of high quality and can be completed quickly. Deep work is also rare, as most people are distracted by shallow activities such as social media and email. The book provides four rules for incorporating deep work into one's life: work deeply, embrace boredom, quit social media, and drain the shallows. By following these rules, one can become more productive and successful in their work. The book also provides an introduction to the concept of deep work, as well as a conclusion that summarizes the main points."