# Chapter 3

## Summarizing a document bigger than the LLM’s context window

In [34]:
# Read the whole novel into memory as a single string. The encoding is given
# explicitly so decoding does not depend on the platform default.
with open("./Moby-Dick.txt", mode='r', encoding='utf-8') as book_file:
    mobi_dick_book = book_file.read()

In [35]:
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document

# Split the book by token count: chunks of 3000 tokens, overlapping by 100.
text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=100)
text_chunks = text_splitter.split_text(mobi_dick_book)

# Wrap every chunk in a Document with empty metadata.
chunk_docs = [
    Document(page_content=chunk_text, metadata={})
    for chunk_text in text_chunks
]

In [36]:
from langchain_openai import ChatOpenAI
from langchain.chains import (
    LLMChain,
    MapReduceChain,
    StuffDocumentsChain, 
    ReduceDocumentsChain, 
    MapReduceDocumentsChain
)
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.prompts import PromptTemplate
import getpass

In [23]:
# Ask for the API key interactively so it is not hardcoded in the notebook.
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [37]:
# Chat model used by both the map chain and the reduce chain below.
model = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-4o-mini",
)

In [42]:
# Map
# Prompt applied to one chunk at a time; {chunk} is the placeholder for the
# chunk text (the map-reduce chain is configured with the same variable name).
map_prompt_template = """
Write a concise summary of the following text, and include the main details.
Text: {chunk}
"""

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["chunk"])
# Chain that sends the map prompt to the model.
map_chain = LLMChain(llm=model, prompt=map_prompt)

In [43]:
# Reduce
# Prompt for the reduce step: asks the model to condense several summaries
# into one. {summaries} is the placeholder for the summaries to combine.
reduce_prompt_template = """
Write a concise summary of the following summaries, and include the main details.
Text: {summaries}
"""

# Build the reduce prompt; its single input variable is "summaries".
reduce_prompt = PromptTemplate(
    template=reduce_prompt_template, input_variables=["summaries"]
)

# Chain that sends the reduce prompt to the model.
reduce_chain = LLMChain(llm=model, prompt=reduce_prompt)


In [46]:
# Feeds the documents it receives into the reduce prompt's {summaries} variable.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="summaries"
)

In [47]:
# Reduce step: the same stuff chain is passed both as the combine chain and
# as the collapse chain.
reduce_documents_chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain,
    collapse_documents_chain=combine_documents_chain,
    # LLM token limit you do not want to exceed
    token_max=4000,
)

In [48]:
# Map-reduce: ties the map chain and the reduce step together.
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain, 
    # The variable name used in the map chain
    document_variable_name="chunk",
    # Whether to return the output of the map steps (disabled here)
    return_intermediate_steps=False,
)

In [49]:
# Run the map-reduce chain over all chunk documents.
map_reduce_response = map_reduce_chain.invoke(chunk_docs)

In [50]:
# Show the final summary text from the chain response.
print(map_reduce_response['output_text'])

The Project Gutenberg eBook of "Moby-Dick; or The Whale" by Herman Melville, released in June 2001 and updated in August 2021, follows the narrator, Ishmael, as he embarks on a whaling voyage driven by a desire for adventure and introspection. Ishmael arrives in New Bedford on a cold December night, where he chooses to stay at the Spouter Inn, characterized by its chaotic atmosphere and a mysterious painting of a whale attacking a ship. Here, he encounters a harpooneer named Queequeg, whose intimidating appearance initially frightens him. Despite his anxiety and unease about sharing a bed with Queequeg, who practices unique rituals, Ishmael eventually comes to see him as a human being and recognizes their developing bond. The narrative blends humor and existential reflection, setting the stage for the larger themes of the novel.


## Summarizing across documents

In [52]:
from langchain.document_loaders import WikipediaLoader

# Load Wikipedia content for the query "Paestum", capped at 2 documents.
wikipedia_loader = WikipediaLoader(query="Paestum", load_max_docs=2)
wikipedia_docs = wikipedia_loader.load()

In [53]:
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader

# Load the same topic from three local files, one loader per file format.

# Word document
word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
word_docs = word_loader.load()

# PDF document
pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
pdf_docs = pdf_loader.load()

# Plain-text document
# NOTE(review): no encoding is passed here, unlike the explicit utf-8 used when
# reading Moby-Dick.txt — confirm the platform default decodes this file correctly.
txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
txt_docs = txt_loader.load()

In [54]:
# Concatenate the documents from every source into one list, Wikipedia first.
all_docs = wikipedia_docs + word_docs + pdf_docs + txt_docs

In [55]:
from langchain_openai import ChatOpenAI
from langchain.chains import (
    LLMChain,
    load_summarize_chain
)
from langchain_core.prompts import PromptTemplate
import getpass

In [56]:
# Ask for the API key interactively so it is not hardcoded in the notebook.
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [57]:
# Chat model used by the refine summarization chain below.
model = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-4o-mini",
)

In [58]:
# Summary prompt with a single {text} placeholder; it is passed to the
# summarization chain as question_prompt.
doc_summary_template = """Write a concise summary of the following text:
{text}
DOC SUMMARY:"""
doc_summary_prompt = PromptTemplate.from_template(doc_summary_template)

# Refine prompt: {existing_answer} is the placeholder for the summary produced
# so far and {text} for the content of the next document. The model is told to
# return the existing summary unchanged when the new document is not useful.
refine_summary_template = """
You must produce a final summary from a temporary final summary
which has been generated so far and from the content of an additional document.
This is the temporary final summary generated so far: {existing_answer}
This is the content of the additional document: {text}
Only use the content of the additional document if it is useful, 
otherwise return the temporary final summary as it is."""

# Prompt object built from the refine template; passed to the chain as refine_prompt.
refine_summary_prompt = PromptTemplate.from_template(refine_summary_template)

In [59]:
# Build a "refine" summarization chain from the two prompts defined above.
summary_refine_chain = load_summarize_chain(
    llm=model,
    chain_type="refine",
    question_prompt=doc_summary_prompt,
    refine_prompt=refine_summary_prompt,
    # Also return the intermediate summaries, not only the final one
    return_intermediate_steps=True,
    # Key under which the documents are passed in when invoking the chain
    input_key="input_documents",
    # Key under which the final summary is returned
    output_key="final_summary",
)

In [60]:
# Run the refine chain over all documents, asking for the chain outputs only.
summary_result = summary_refine_chain.invoke({"input_documents": all_docs}, return_only_outputs=True)

In [61]:
# Print the whole result dict, which includes the intermediate steps.
print(summary_result)

{'intermediate_steps': ['Paestum, an ancient Greek city founded around 600 BC by settlers from Sybaris, is located on the Tyrrhenian Sea in Magna Graecia. Originally named Poseidonia, it flourished for two centuries before being seized by the Lucanians in 400 BC and later taken over by the Romans in 273 BC, who renamed it Paestum. The city is renowned for its well-preserved Doric temples, dating from 550 to 450 BC, dedicated to Hera and Athena, as well as its intact defensive walls and amphitheatre. Paestum declined due to changes in trade routes and flooding, becoming largely forgotten until the 18th century. Today, the archaeological site, part of the comune of Capaccio Paestum in Campania, Italy, is a popular tourist destination with a modern resort nearby.', 'Paestum, an ancient Greek city founded around 600 BC by settlers from Sybaris, is located on the Tyrrhenian Sea in Magna Graecia. Originally named Poseidonia, it flourished for two centuries before being seized by the Lucanian

## Summarizing structured data

In [62]:
# Sample structured data: one dict per watch, all sharing the same keys.
# "status" is either "available" or "sold_yesterday".
watches = [
  {"brand": "Rolex", "model": "Submariner", "dial-size": 41, "dial-color": "black", "material": "steel", "status": "available"},
  {"brand": "Rolex", "model": "Daytona", "dial-size": 40, "dial-color": "black", "material": "steel", "status": "available"},
  {"brand": "Rolex", "model": "Daytona", "dial-size": 40, "dial-color": "white", "material": "gold", "status": "sold_yesterday"},
  {"brand": "Omega", "model": "Speedmaster Moonwatch", "dial-size": 42, "dial-color": "black", "material": "steel", "status": "available"},
  {"brand": "Omega", "model": "Seamaster", "dial-size": 43, "dial-color": "blue", "material": "steel", "status": "sold_yesterday"},
]

In [63]:
from langchain.schema import Document

# Turn each watch record into a one-sentence Document so the rows can be
# summarized like ordinary text.
row_docs = [
    Document(
        page_content=(
            f"We have {watch['status']} a {watch['material']} "
            f"{watch['brand']} {watch['model']}  with a "
            f"{watch['dial-color']} {watch['dial-size']}mm dial."
        ),
        metadata={},
    )
    for watch in watches
]

In [64]:
from langchain_openai import ChatOpenAI
from langchain.chains import (
    load_summarize_chain
)
import getpass

In [65]:
# Ask for the API key interactively so it is not hardcoded in the notebook.
OPENAI_API_KEY = getpass.getpass('Enter your OPENAI_API_KEY')

Enter your OPENAI_API_KEY ········


In [66]:
# Chat model used by the structured-data summarization chain below.
model = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-4o-mini",
)

In [67]:
# "stuff"-type summarization chain; no custom prompt is passed.
structured_data_summary_chain = load_summarize_chain(model, chain_type="stuff")

In [68]:
# Summarize the row documents and print the resulting summary text.
summary_result = structured_data_summary_chain.invoke(row_docs)
print(summary_result['output_text'])

Available watches include a steel Rolex Submariner (black 41mm dial), a steel Rolex Daytona (black 40mm dial), and a steel Omega Speedmaster Moonwatch (black 42mm dial). Recently sold items include a gold Rolex Daytona (white 40mm dial) and a steel Omega Seamaster (blue 43mm dial).
