In [2]:
from langchain.document_loaders import PyMuPDFLoader
# Load the paper with PyMuPDF and flatten it into a single string:
# the text of every loaded document, joined with newlines.
PDF_PATH = "1706.03762.pdf"
loader = PyMuPDFLoader(PDF_PATH)
data = loader.load()
doc_texts = [doc.page_content for doc in data]
file_content = '\n'.join(doc_texts)

In [63]:
from langchain.text_splitter import TokenTextSplitter
# Splitter settings kept as named values so they are easy to tune.
# NOTE(review): presumably both are measured in tokens, going by the
# splitter's name — confirm against the TokenTextSplitter docs.
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 2

text_splitter = TokenTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# The whole paper is passed as one input string; the splitter returns the
# list of chunk documents used by the cells below.
texts = text_splitter.create_documents([file_content])


In [64]:
# Prefix each chunk with a 1-based "PAGE n" header.
# The chunks are mutated in place, so the header check keeps this cell
# idempotent: re-running it without re-running the splitter cell no longer
# stacks a second "PAGE n" header on top of the first.
# NOTE(review): "PAGE" labels the splitter's chunks here, not PDF pages.
for i, t in enumerate(texts, start=1):
    header = 'PAGE %d\n\n' % i
    if not t.page_content.startswith(header):
        t.page_content = header + t.page_content

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
# Chat model shared by the summarize chain built later in the notebook.
# NOTE(review): temperature=0 is presumably chosen to make the outline as
# repeatable as possible between runs — confirm.
chat = ChatOpenAI(temperature=0)

In [75]:
from langchain.prompts import PromptTemplate

# Refine-step prompt template. It has two placeholders:
#   {existing_answer} - the section outline generated so far
#   {text}            - the additional paper content to refine it with
# The model is told to keep the existing answer unchanged when the new
# context adds nothing.
REFINE_PROMPT_TMPL = '''
As a knowledgeable AI, your task is to list all the sections of a scientific paper up to the provided point, excluding tables, figures, and references.
The paper's content will be presented sequentially, and you must accurately represent the relationships between sections and their respective subsections in your list.
You will be given an existing answer that has been generated so far.

EXISTING ANSWER:
{existing_answer}

Adhere to the given format and only refine the answer if necessary, using the additional context provided below.
Focus on identifying the section titles and their corresponding subsections that are explicitly mentioned in the text. Do not add extra details or overinterpret the content.
If the additional context is not helpful or no changes are required, you may return the original answer without any changes.

{text}

ANSWER:
'''
# Wrap the template; the declared variables must match the placeholders above.
REFINE_PROMPT = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=REFINE_PROMPT_TMPL,
)


# Question prompt template: same task description as the refine prompt but
# with no existing answer. Its only placeholder is {text}, which is fenced
# by dashed lines inside the prompt.
prompt_template = """
As a knowledgeable AI, your task is to list all the sections of a scientific paper up to the provided point, excluding tables, figures, and references.
The paper's content will be presented sequentially, and you must accurately represent the relationships between sections and their respective subsections in your list.
Focus on identifying the section titles and their corresponding subsections that are explicitly mentioned in the text. Do not add extra details or overinterpret the content.
-------------------
{text}
-------------------
ANSWER:
"""
# Wrap the template; "text" is the single input variable it expects.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])

In [53]:
# Build the "refine" summarize chain on top of the shared chat model.
# PROMPT is used as the question prompt and REFINE_PROMPT as the refine
# prompt; intermediate answers are requested so each step can be inspected.
refine_chain = load_summarize_chain(
    chat,
    chain_type="refine",
    question_prompt=PROMPT,
    refine_prompt=REFINE_PROMPT,
    return_intermediate_steps=True,
    verbose=True,
)

In [70]:
# Display how many chunk documents are in `texts`.
len(texts)

4

In [None]:
# Run the refine chain over all chunk documents, asking for only the
# chain's outputs in the response.
chain_inputs = {"input_documents": texts}
resp = refine_chain(chain_inputs, return_only_outputs=True)

In [74]:
# Print the answer recorded at each intermediate step, with blank lines
# after each one so consecutive outlines are easy to tell apart.
for step_answer in resp['intermediate_steps']:
    print(step_answer + '\n\n')


1. Abstract
2. Introduction
3. Background
   - Recurrent neural networks
   - Attention mechanisms
   - Self-attention
   - End-to-end memory networks
4. Model Architecture
   - Encoder and Decoder Stacks
   - Attention
      - Scaled Dot-Product Attention
      - Multi-Head Attention
      - Applications of Attention in our Model


1. Abstract
2. Introduction
3. Background
   - Recurrent neural networks
   - Attention mechanisms
   - Self-attention
   - End-to-end memory networks
4. Model Architecture
   - Encoder and Decoder Stacks
   - Attention
      - Scaled Dot-Product Attention
      - Multi-Head Attention
      - Applications of Attention in our Model
   - Position-wise Feed-Forward Networks
   - Embeddings and Softmax
   - Positional Encoding
5. Why Self-Attention
6. Training
   - Training Data and Batching
   - Hardware and Schedule
   - Optimizer
   - Regularization
7. Results
   - Machine Translation


1. Abstract
2. Introduction
3. Background
   - Recurrent neural networks