In [1]:
# https://github.com/kennethleungty/Llama-2-Open-Source-LLM-CPU-Inference/tree/main/data
# https://towardsdatascience.com/running-llama-2-on-cpu-inference-for-document-q-a-3d636037a3d8

In [2]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
# Collect every *.pdf found under data/ using PyPDFLoader
loader = DirectoryLoader(
    'data/',
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [4]:
# Break the loaded PDF documents into overlapping chunks
# (chunk_size=500, chunk_overlap=50)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

In [5]:
# Sentence-transformers embedding model, pinned to the CPU device
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Index the chunks with FAISS, then write the index to vectorstore/db_faiss
# (the same path the QA setup loads from later in the notebook)
vectorstore = FAISS.from_documents(texts, embeddings)
vectorstore.save_local('vectorstore/db_faiss')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# qa_template = """Use the following pieces of information to answer the user's question.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.
# Context: {context}
# Question: {question}
# Only return the helpful answer below and nothing else.
# Helpful answer:
# """

In [7]:
# qa_template = """
# You're helping a researcher to understand paper.
# Use the following pieces of information to answer the user's question.
# Context: {context}
# Question: {question}

# Pay attention: if the given context does not answer the question or is not convincing, you can give a helpful answer that comes from the paper, based on the question.
# Before you return helpful answer, you must mention paper title you used when you didn't use context. 
# Pay attention to not include citation number in your helpful answer.

# Pay attention to keep your answer short but clear.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.
# Only return the helpful answer below and nothing else.
# Helpful answer:
# """

In [8]:
# Prompt template for the QA chain. {context} and {question} are the two
# input variables declared on the PromptTemplate built in set_qa_prompt().
# A trailing backslash inside the triple-quoted string suppresses the newline,
# so each backslash-terminated line is joined with the line that follows it in
# the rendered prompt (e.g. "Context: {context}" continues the sentence before it).
# NOTE(review): the template asks for a helpful answer when the context is
# insufficient and also asks the model to say it doesn't know rather than make
# something up — confirm which instruction is meant to take precedence.
qa_template = """
You're helping machine learning researcher to understand paper. \
Use the following pieces of information to answer the user's question. \
Context: {context}
Question: {question}

Please pay attention to the context and the question provided. \
If the given context is helpful, please provide the answer based on that context. \
If the context is not sufficient to answer the question or is not convincing enough, \
you must mention the paper title and then provide a helpful answer. 
Make sure not to include citation numbers such as [15] or [54] in your response.

Please pay attention to keeping your answer short but clear. \
If you don't know the answer, just say that you don't know, don't try to make up an answer. \
Only provide the helpful answer below and nothing else.
Helpful answer:
"""

# Config: https://github.com/marella/ctransformers#config
# Download model from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main

from langchain.llms import CTransformers

# Llama-2-7B-Chat served locally through the CTransformers wrapper
llm = CTransformers(
    model='models/llama-2-7b-chat.ggmlv3.q8_0.bin',  # Location of downloaded GGML model
    model_type='llama',  # Model type Llama
    config={
        'max_new_tokens': 1000,
        'temperature': 0.01,
        "repetition_penalty": 2,
    },
)

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Wrap prompt template in a PromptTemplate object
def set_qa_prompt():
    """Build the prompt object used by the QA chain.

    Reads the module-level ``qa_template`` string at call time.

    Returns:
        A ``PromptTemplate`` over ``qa_template`` with the input variables
        ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=qa_template,
    )


# Build RetrievalQA object
def build_retrieval_qa(llm, prompt, vectordb, k=2):
    """Assemble a RetrievalQA chain over ``vectordb``.

    Args:
        llm: Language model object handed to ``RetrievalQA.from_chain_type``.
        prompt: Prompt object passed to the chain as ``chain_type_kwargs['prompt']``.
        vectordb: Vector store; must provide ``as_retriever(search_kwargs=...)``.
        k: Value forwarded to the retriever as ``search_kwargs['k']``.
            Defaults to 2, the value that was previously hard-coded, so
            existing three-argument calls behave exactly as before.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with
        ``chain_type='stuff'`` and ``return_source_documents=True``.
    """
    retriever = vectordb.as_retriever(search_kwargs={'k': k})
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={'prompt': prompt},
    )


# Instantiate QA object
def setup_dbqa():
    """Load the saved FAISS index and wire it into a QA chain.

    Creates a CPU ``HuggingFaceEmbeddings`` instance, loads the index stored at
    ``vectorstore/db_faiss`` with it, builds the prompt via ``set_qa_prompt()``
    and passes everything, together with the module-level ``llm``, to
    ``build_retrieval_qa``.

    Returns:
        Whatever ``build_retrieval_qa`` returns for those inputs.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    faiss_index = FAISS.load_local('vectorstore/db_faiss', embedder)
    return build_retrieval_qa(llm, set_qa_prompt(), faiss_index)

# Build the QA chain once when this cell runs; the query cells below call
# `dbqa` with a {'query': ...} dict.
dbqa = setup_dbqa()

In [9]:
# Ask a sample question. The returned dict is read in the next cell through
# its "result" and "source_documents" keys.
response = dbqa({'query': "What is positional encoding?"})

In [10]:
# Answer first, then a separator line
print(f'\nAnswer: {response["result"]}', '=' * 50, sep='\n')

# One section per retrieved chunk: text, source file, page, then a separator
source_docs = response['source_documents']
for i, doc in enumerate(source_docs):
    print(f'\nSource Document {i+1}\n')
    print(f'Source Text: {doc.page_content}')
    print(f'Document Name: {doc.metadata["source"]}')
    print(f'Page Number: {doc.metadata["page"]}\n', '=' * 50, sep='\n')


Answer: Positional encoding is a technique used in machine learning models such as encoder-decoders (like those described by [38]) or transformer architectures like BERT ([2]), where each position of input sequence has an associated unique fixed length vector, called the "position embedding". This allows model to differentiate between different positions within a sequential data.
In this context it is used in both encoder and decoders stacks with Pdrop=0:1 which means that dropout rate for all positional embeddings are set equal 0 or not dropped at any layer of the network, so they can be computed during training process as well

Source Document 1

Source Text: positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of
Pdrop= 0:1.
7
Document Name: data/Attention Is All You Need.pdf
Page Number: 6


Source Document 2

Source Text: position in the decoder to attend over all positions in the input sequence. This mimics the
typical encoder-decoder at

In [16]:
# Display the raw retrieved Document objects held by the current `response`.
# NOTE(review): this cell's execution count (16) is out of sequence, and its
# saved output shows BERT-paper chunks rather than the positional-encoding
# sources printed by the cell above — it appears to have been run after a later
# query. Re-run the notebook top to bottom to confirm.
response["source_documents"]

[Document(page_content='TPU-now-offers-preemptible-pricing-and-global-\navailability.html', metadata={'source': 'data/BERT- Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf', 'page': 12}),
 Document(page_content='that contain at least one of the provided possible answers.System Dev Test\nESIM+GloVe 51.9 52.7\nESIM+ELMo 59.1 59.2\nOpenAI GPT - 78.0\nBERT BASE 81.6 -\nBERT LARGE 86.6 86.3\nHuman (expert)y- 85.0\nHuman (5 annotations)y- 88.0\nTable 4: SWAG Dev and Test accuracies.yHuman per-\nformance is measured with 100 samples, as reported in\nthe SWAG paper.\n^si;j=maxj\x15iS\x01Ti+E\x01Tj. We predict a non-null\nanswer when ^si;j> s null+\x1c, where the thresh-\nold\x1cis selected on the dev set to maximize F1.', metadata={'source': 'data/BERT- Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf', 'page': 6})]

In [11]:
# Query the chain about Vision Transformer patches
response = dbqa({'query': "What is definition of patch in Vision Transformer?"})

# Answer first, then a separator line
print(f'\nAnswer: {response["result"]}', '=' * 50, sep='\n')

# One section per retrieved chunk: text, source file, page, then a separator
source_docs = response['source_documents']
for i, doc in enumerate(source_docs):
    print(f'\nSource Document {i+1}\n')
    print(f'Source Text: {doc.page_content}')
    print(f'Document Name: {doc.metadata["source"]}')
    print(f'Page Number: {doc.metadata["page"]}\n', '=' * 50, sep='\n')


Answer: In Vaswani et al.'s paper (2017), a patch is defined as "a small fixed-size subregion of interest within each image." Specifically in Vision Transformer, they use 8x4 non overlappingpatches for the input images and apply them through linear embeddings to create an output sequence.

Source Document 1

Source Text: scribed in Vaswani et al. (2017) and released in
thetensor2tensor library.1Because the use
of Transformers has become common and our im-
plementation is almost identical to the original,
we will omit an exhaustive background descrip-
tion of the model architecture and refer readers to
Vaswani et al. (2017) as well as excellent guides
such as “The Annotated Transformer.”2
In this work, we denote the number of layers
(i.e., Transformer blocks) as L, the hidden size as
Document Name: data/BERT- Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf
Page Number: 2


Source Document 2

Source Text: the effort to evaluate this idea. Ashish, with Illi

In [12]:
# Query the chain with a question about a minimum guarantee payable by adidas
response = dbqa({'query': "How much is the minimum guarantee payable by adidas?"})

# Answer first, then a separator line
print(f'\nAnswer: {response["result"]}', '=' * 50, sep='\n')

# One section per retrieved chunk: text, source file, page, then a separator
source_docs = response['source_documents']
for i, doc in enumerate(source_docs):
    print(f'\nSource Document {i+1}\n')
    print(f'Source Text: {doc.page_content}')
    print(f'Document Name: {doc.metadata["source"]}')
    print(f'Page Number: {doc.metadata["page"]}\n', '=' * 50, sep='\n')


Answer: The minimum guarantee is not mentioned in any of these papers or tables provided as context for this question; therefore I cannot give a specific amount payable by adidas without additional information that may be outside my knowledge cutoff date (2019-Aug).

Source Document 1

Source Text: TPU-now-offers-preemptible-pricing-and-global-
availability.html
Document Name: data/BERT- Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf
Page Number: 12


Source Document 2

Source Text: that contain at least one of the provided possible answers.System Dev Test
ESIM+GloVe 51.9 52.7
ESIM+ELMo 59.1 59.2
OpenAI GPT - 78.0
BERT BASE 81.6 -
BERT LARGE 86.6 86.3
Human (expert)y- 85.0
Human (5 annotations)y- 88.0
Table 4: SWAG Dev and Test accuracies.yHuman per-
formance is measured with 100 samples, as reported in
the SWAG paper.
^si;j=maxjiSTi+ETj. We predict a non-null
answer when ^si;j> s null+, where the thresh-
oldis selected on the dev set to maximize F

In [13]:
# response = dbqa({'query': "What is embeddings?"})
# print(f'\nAnswer: {response["result"]}')
# print('='*50) # Formatting separator

# source_docs = response['source_documents']
# for i, doc in enumerate(source_docs):
#     print(f'\nSource Document {i+1}\n')
#     print(f'Source Text: {doc.page_content}')
#     print(f'Document Name: {doc.metadata["source"]}')
#     print(f'Page Number: {doc.metadata["page"]}\n')
#     print('='* 50) # Formatting separator