# Small RAG from Local data without LLMs based on jaccard_similarity

In [1]:
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]

In [2]:
def jaccard_similarity(query, document):
    query = query.lower().split(" ")
    document = document.lower().split(" ")
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [3]:
def return_response(query, corpus):
    similarities = []
    for doc in corpus:
        similarity = jaccard_similarity(query, doc)
        similarities.append(similarity)
    return corpus_of_documents[similarities.index(max(similarities))]

In [4]:
user_prompt = "What is a leisure activity that you like?"

In [5]:
user_input = "I like to hike"

In [6]:
return_response(user_input, corpus_of_documents)

'Go for a hike and admire the natural scenery.'

### Problem with jaccard_similarity the meaning was different but the response was same because of common words

In [7]:
user_input = "I don't like to hike"

In [8]:
return_response(user_input, corpus_of_documents)

'Go for a hike and admire the natural scenery.'

### Now we are using llama3 for finiding best answer for  our query

In [9]:
import json
import requests

In [10]:
user_input = "I like to hike"
relevant_document = return_response(user_input, corpus_of_documents)
full_response = []

In [11]:
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

In [12]:
url = 'http://localhost:11434/api/generate'
data = {
    "model": "llama3",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}

In [13]:
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
try:
    count = 0
    for line in response.iter_lines():
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            
            full_response.append(decoded_line['response'])
finally:
    response.close()
print(''.join(full_response))

Try a challenging trail!


In [14]:
relevant_document

'Go for a hike and admire the natural scenery.'

In [15]:
user_input = "I don't like to hike"
relevant_document = return_response(user_input, corpus_of_documents)
full_response = []

In [16]:
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

url = 'http://localhost:11434/api/generate'

data = {
    "model": "llama3",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
try:
    for line in response.iter_lines():
        # filter out keep-alive new lines
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            # print(decoded_line['response'])  # uncomment to results, token by token
            full_response.append(decoded_line['response'])
finally:
    response.close()
print(''.join(full_response))

Try birdwatching instead.


In [17]:
full_response.clear()

### Using own PDF as  a documents

In [38]:
import time as timer

from langchain.llms import Ollama
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [39]:
pdf_path = "./AttentionIsAllYouNeed.pdf"
loader = PyPDFLoader(pdf_path)
data = loader.load()

In [70]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
all_splits = text_splitter.split_documents(data)

In [40]:
llm = Ollama(
    base_url='http://localhost:11434',
    model='llama3',
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
)

In [71]:
# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)



In [72]:
vectorstore = FAISS.from_documents(documents=all_splits, embedding=embeddings)

In [73]:
from langchain.prompts import PromptTemplate

prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

# Define RetrievalQA chain
chain = RetrievalQA.from_chain_type(
    llm, 
    retriever=vectorstore.as_retriever(),
    verbose=True,
    return_source_documents=True,
    chain_type_kwargs={"prompt":PROMPT}
)


In [74]:
query = "Explain about Encoder and Decoder Stacks in Transformers"

In [75]:
print(f"Query: {query}")
# docs = vectorstore.similarity_search(query)
# print(f"Docs (similarity search results): {docs}")

# Run the chain
start_t = timer.time()
response = chain({"query": query})
elapsed_t = timer.time() - start_t
print(f"\n\nElapsed time: {elapsed_t}")

Query: Explain about Encoder and Decoder Stacks in Transformers


[1m> Entering new RetrievalQA chain...[0m
According to the given context, the Transformer's architecture follows a stacked self-attention mechanism with point-wise, fully connected layers for both the encoder and decoder.

The Encoder Stack is composed of 6 identical layers, each containing two sub-layers: multi-head self-attention and a simple, position-wise fully connected feed-forward network. Each layer employs residual connections around the two sub-layers, followed by layer normalization.

Similarly, the Decoder Stack also has 6 identical layers, with an additional third sub-layer performing multi-head attention over the output of the encoder stack. The decoder layers also use residual connections and layer normalization.

In summary, both the Encoder and Decoder Stacks in Transformers consist of multiple identical layers, each containing self-attention mechanisms and feed-forward networks, with residual connecti

In [77]:
print(response['query'])

Explain about Encoder and Decoder Stacks in Transformers


In [81]:
print(len(response['source_documents']))

4


In [85]:
def print_source_docs(response):
    relevant_docs = response['source_documents']
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
    print("*" * 100)
    for i, doc in enumerate(relevant_docs):
        print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
        print("-"*100)
        print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 4 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: ./AttentionIsAllYouNeed.pdf, Page: 2
Content: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N= 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully connected feed-forward network. We employ a residual connection [ 11] around each of
the two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is
--------------------------------------------------------------------------

In [86]:
query = "Why we need scaled-dot product?"

# Run the chain
start_t = timer.time()
response = chain({"query": query})
elapsed_t = timer.time() - start_t
print(f"\n\nElapsed time: {elapsed_t}")



[1m> Entering new RetrievalQA chain...[0m
The answer is that without scaling, the dot products can grow large in magnitude as the value of `dk` increases, pushing the softmax function into regions where it has extremely small gradients. This makes the attention mechanism less effective for larger values of `dk`. To counteract this effect, we scale the dot products by 1/√dk, which helps to maintain a reasonable range of values and ensures that the attention mechanism remains effective even for large values of `dk`.
[1m> Finished chain.[0m


Elapsed time: 330.919091463089


In [87]:
relevant_docs = response['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')


There are 4 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: ./AttentionIsAllYouNeed.pdf, Page: 3
Content: much faster and more space-efficient in practice, since it can be implemented using highly optimized
matrix multiplication code.
While for small values of dkthe two mechanisms perform similarly, additive attention outperforms
dot product attention without scaling for larger values of dk[3]. We suspect that for large values of
dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has
extremely small gradients4. To counteract this effect, we scale the dot products by1√dk.
3.2.2 Multi-Head Attention
Instead of performing a single attention function with dmodel-dimensional keys, values and queries,
----------------------------------------------------------------------------------------------------
There a

2024-06-13 18:22:46.046 
  command:

    streamlit run C:\Users\Evoortsolutions\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


Collecting streamlit
  Downloading streamlit-1.35.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-win_amd64.whl.metadata (37 kB)
Collecting toolz (from altair<6,>=4.0->streamlit)
  Downloading toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downlo