In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

In [2]:
import os

# Read the OpenAI API key from a file kept outside the notebook so the secret
# is never embedded in the notebook itself. The context manager closes the
# file handle as soon as the key has been read.
with open('../key.txt', 'r') as key_file:
    key = key_file.read().rstrip()  # drop trailing newline/whitespace

# Publish the key through the OPENAI_API_KEY environment variable
# (presumably where the OpenAI-backed objects created later look for it).
os.environ['OPENAI_API_KEY'] = key

In [3]:
# Open the PDF whose pages are read by the following cells.
pdf_path = "1706.03762.pdf"
pdf = PdfReader(pdf_path)

In [4]:
# Collect the text of every page, skipping pages for which extract_text()
# returns nothing (empty string or None).
page_texts = []
for page in pdf.pages:
    page_text = page.extract_text()
    if page_text:
        page_texts.append(page_text)

# Join once instead of growing a string page by page, and separate pages with
# a newline so the last line of one page is not fused with the first line of
# the next one.
raw_text = "\n".join(page_texts)

In [5]:
# Splitter settings: split on newlines and measure chunk length with len().
CHUNK_SEPARATOR = "\n"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = CharacterTextSplitter(
    separator=CHUNK_SEPARATOR,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

In [7]:
# Break the extracted text into chunks with the splitter configured earlier.
texts = text_splitter.split_text(raw_text)

# Fail early with a clear message when nothing was produced (for example a
# PDF with no extractable text) instead of letting a later cell fail on an
# empty list of chunks.
if not texts:
    raise ValueError(
        "No text chunks were produced; the PDF may contain no extractable text."
    )

In [8]:
# Create the OpenAI embeddings object with its default settings; it is passed
# to the vector store when the index is built.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set earlier in the notebook -- confirm.
embeddings = OpenAIEmbeddings()

In [9]:
# Build a FAISS vector store from the text chunks using the embeddings object;
# the query cells below call similarity_search() on it.
docsearch = FAISS.from_texts(texts, embeddings)

In [12]:
# Question-answering chain of type "stuff", driven by an OpenAI LLM created
# with temperature=0.
llm = OpenAI(temperature=0)
chain = load_qa_chain(llm, chain_type="stuff")

In [13]:
# Question 1: look up the chunks most similar to the question, then let the
# QA chain answer from them. The bare last expression displays the answer.
query = "Who are the authors of this article?"
relevant_docs = docsearch.similarity_search(query)
chain.run(
    input_documents=relevant_docs,
    question=query,
)

' The authors of this article are Jakob, Ashish, Noam, Niki, Llion, Lukasz, and Aidan.'

In [14]:
# Question 2: look up the chunks most similar to the question, then let the
# QA chain answer from them. The bare last expression displays the answer.
query = "How many layers does the encoder stack have ?"
relevant_docs = docsearch.similarity_search(query)
chain.run(
    input_documents=relevant_docs,
    question=query,
)

' The encoder stack has 6 layers.'

In [15]:
# Question 3: look up the chunks most similar to the question, then let the
# QA chain answer from them. The bare last expression displays the answer.
query = "What are the different ways the Transformer uses multi-head attention?"
relevant_docs = docsearch.similarity_search(query)
chain.run(
    input_documents=relevant_docs,
    question=query,
)

' The Transformer uses multi-head attention in three different ways: encoder-decoder attention layers, self-attention layers in the encoder, and self-attention layers in the decoder.'

In [16]:
# Question 4: look up the chunks most similar to the question, then let the
# QA chain answer from them. The bare last expression displays the answer.
# The question wording was corrected ("did was" -> "was") because this exact
# text is used both for the similarity search and as the question given to
# the chain.
query = "How many sentence pairs was the model trained on?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The model was trained on the WMT 2014 English-French dataset consisting of 36M sentences.'

In [17]:
# Question 5: look up the chunks most similar to the question, then let the
# QA chain answer from them. The bare last expression displays the answer.
query = "What is the url link of the code used?"
relevant_docs = docsearch.similarity_search(query)
chain.run(
    input_documents=relevant_docs,
    question=query,
)

' The url link of the code used is https://github.com/tensorflow/tensor2tensor.'