# RAG

## Installation of libraries

In [1]:
!pip install -q transformers einops accelerate bitsandbytes
!pip install -q langchain langchain_community langchain-huggingface langchainhub langchain_chroma

In [2]:
import torch
import os
import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline

from langchain.prompts import PromptTemplate
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [3]:
# Ask for the Hugging Face access token only when it is not already set, so
# re-running this cell (or starting with HF_TOKEN pre-configured) does not
# prompt again or overwrite a valid token. getpass hides the typed value so
# the token never appears in the notebook output.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face token: ")

··········


## Loading the LLM

In [4]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Both downloads authenticate with the token stored in the environment.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],
    quantization_config=quantization_config,
)

# Text-generation pipeline: low-temperature sampling, up to 500 new tokens,
# and only the newly generated text is returned (the prompt is not echoed).
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.1,
    repetition_penalty=1.1,
    max_new_tokens=500,
    return_full_text=False,
)

# Expose the pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=pipe)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

## Template and Chain

In [5]:
# PHI 3
# Alternative prompt in the Phi-3 chat format, kept commented out for reference.
#template = """
#<|system|>
#You are a helpful virtual assistant. <|end|>
#<|user|>
#{question}<|end|>
#<|assistant|>
#"""

# LLAMA 3
# Prompt written with the Llama 3 special tokens: a fixed system message, a
# user turn holding the {question} placeholder, and an opened assistant turn
# for the model to complete.
# NOTE(review): <|begin_of_text|> is written out explicitly here; if the
# tokenizer also prepends its own BOS token the prompt would start with two of
# them -- confirm against the pipeline's tokenization settings.
template = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a helpful virtual assistant.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{question}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

# Show the raw template string (newlines appear as \n in the repr).
template

'\n<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant.\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n{question}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n'

In [6]:
# Wrap the raw string in a PromptTemplate; {question} becomes its input variable.
prompt = PromptTemplate.from_template(template)
# Display the resulting template object.
prompt

PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='\n<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant.\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n{question}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n')

In [7]:
# Baseline chain: the filled-in prompt is piped straight into the LLM.
chain = prompt | llm

# Ask a question whose answer the model cannot know without extra context.
chain.invoke({"question": "What day is today?"})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'I\'m happy to help! However, I\'m a large language model, I don\'t have the ability to know the current date or time. But you can easily find out what day it is by checking your device\'s calendar or asking a digital assistant like me: "What is the current date?"'


## Prompt for RAG

- Base prompt: https://smith.langchain.com/hub/rlm/rag-prompt

In [8]:
# RAG prompt in the same Llama 3 token layout as `template`: the system turn
# tells the model to answer from the retrieved context (and to admit when it
# does not know), and the user turn carries the {question} and {context}
# placeholders.
template_rag = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a helpful virtual assistant answering general questions.
Use the following bits of retrieved context to answer the question.
If you don't know the answer, just say you don't know. Keep your answer concise.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Question: {question}
Context: {context}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

In [9]:
# Build the RAG PromptTemplate; its input variables are `context` and `question`.
prompt_rag = PromptTemplate.from_template(template_rag)
# Print the template's fields to check the detected variables.
print(prompt_rag)

input_variables=['context', 'question'] input_types={} partial_variables={} template="\n<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant answering general questions.\nUse the following bits of retrieved context to answer the question.\nIf you don't know the answer, just say you don't know. Keep your answer concise.\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nQuestion: {question}\nContext: {context}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n"


## Defining the context

In [10]:
from datetime import date

# Today's date, used below as the fact the model should answer from.
day = date.today()
print(day)

2024-10-28


In [11]:
# Turn today's date into a one-sentence context for the RAG prompt.
context = f"You know that today is {day}"
print(context)

You know that today is 2024-10-28


## Chain Creation / Generation

In [12]:
# The answer has to come from `context`, since the model has no clock.
question = "What day is today? Return the date in format mm/dd/yyyy"

# RAG prompt -> LLM -> plain-string output.
chain_rag = prompt_rag | llm | StrOutputParser()

res = chain_rag.invoke(
    {
        "question": question,
        "context": context,
    }
)
res

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'Today is 10/28/2024.'

## RAG - Exploring

In [13]:
# Show the RAG prompt template that the following examples reuse.
prompt_rag

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="\n<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant answering general questions.\nUse the following bits of retrieved context to answer the question.\nIf you don't know the answer, just say you don't know. Keep your answer concise.\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nQuestion: {question}\nContext: {context}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n")

In [14]:
# Small hand-written context: one revenue figure per quarter.
context = """Quarterly revenue numbers:
1º: $42476.40
2º: $46212.97
3º: $41324.56
4º: $56430.24"""

# Alternative question to try:
#question = "What is the revenue for the second quarter?"
question = "Which quarter had the highest revenue?"

# Same RAG prompt -> LLM -> string pipeline as before, rebuilt in this cell.
chain_rag = prompt_rag | llm | StrOutputParser()

chain_rag.invoke(
    {
        "context": context,
        "question": question,
    }
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'The fourth quarter ($56430.24) had the highest revenue.'

## Debugging

- https://python.langchain.com/v0.2/docs/how_to/debugging/



In [15]:
# Turn on LangChain's global debug mode so the next chain run logs its
# intermediate steps (chain/prompt/llm start and end events).
from langchain.globals import set_debug
set_debug(True)

In [16]:
# With debug mode on, this call also prints the inputs and outputs of each step.
question = "Which quarter had the lowest revenue?"

chain_rag.invoke({"context": context, "question": question})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "context": "Quarterly revenue numbers:\n1º: $42476.40\n2º: $46212.97\n3º: $41324.56\n4º: $56430.24",
  "question": "Which quarter had the lowest revenue?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > prompt:PromptTemplate] Entering Prompt run with input:
[0m{
  "context": "Quarterly revenue numbers:\n1º: $42476.40\n2º: $46212.97\n3º: $41324.56\n4º: $56430.24",
  "question": "Which quarter had the lowest revenue?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > prompt:PromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[chain:RunnableSequence > llm:HuggingFacePipeline] Entering LLM run with input:
[0m{
  "prompts": [
    "<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant answering general questions.\nUse the following bits of retrieved context to answer the que

'The third quarter had the lowest revenue, with $41324.56.'

In [17]:
# Switch debug logging back off for the rest of the notebook.
set_debug(False)

## Application of RAG with larger contexts




## Indexing Steps





### 1 - Loading the content

> Document loaders: https://python.langchain.com/docs/integrations/document_loaders/


In [18]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma



In [19]:
# Fetch the BBC article and load it as LangChain Document objects.
loader = WebBaseLoader(
    web_paths=("https://www.bbc.com/news/entertainment-arts-68530499",),
)
docs = loader.load()

In [20]:
# Inspect the loaded documents (metadata plus page content).
docs

[Document(metadata={'source': 'https://www.bbc.com/news/entertainment-arts-68530499', 'title': 'The full list of winners and nominees at the Oscars 2024', 'description': "See who has won and been nominated for this year's coveted Academy Awards in Hollywood.", 'language': 'en-GB'}, page_content="The full list of winners and nominees at the Oscars 2024Skip to contentBritish Broadcasting CorporationHomeNewsUS ElectionSportBusinessInnovationCultureArtsTravelEarthVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifyUS ElectionElection pollsKamala HarrisDonald TrumpJD VanceTim WalzSportBusinessExecutive LoungeTechnology of BusinessWomen at the HelmFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts i

In [21]:
# Print the extracted text of the first document.
print(docs[0].page_content)

The full list of winners and nominees at the Oscars 2024Skip to contentBritish Broadcasting CorporationHomeNewsUS ElectionSportBusinessInnovationCultureArtsTravelEarthVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifyUS ElectionElection pollsKamala HarrisDonald TrumpJD VanceTim WalzSportBusinessExecutive LoungeTechnology of BusinessWomen at the HelmFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingVideoLive

In [22]:
# Length, in characters, of the first document's text.
len(docs[0].page_content)

6164

### 2 - Split


> More about text splitters: https://python.langchain.com/docs/how_to/#text-splitters

In [23]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunks of at most 1000 characters that overlap by 200; add_start_index
# records each chunk's offset in the source text as metadata["start_index"].
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
splits = text_splitter.split_documents(docs)

In [24]:
# Number of chunks produced by the splitter.
len(splits)

8

In [25]:
# Look at the second chunk (metadata and text).
splits[1]

Document(metadata={'source': 'https://www.bbc.com/news/entertainment-arts-68530499', 'title': 'The full list of winners and nominees at the Oscars 2024', 'description': "See who has won and been nominated for this year's coveted Academy Awards in Hollywood.", 'language': 'en-GB', 'start_index': 797}, page_content="AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingVideoLiveLive NewsLive SportHomeNewsUS ElectionSportBusinessInnovationCultureArtsTravelEarthVideoLiveAudioWeatherNewslettersThe full list of winners and nominees at the Oscars 2024Getty ImagesDa'Vine Joy Randolph won best supporting actress for The HoldoversHollywood's finest have been rewarded with golden statuettes at the Oscars in Los Angeles' Dolby Theatre.Here is the full list of winners, as well as all the nominees.Best pictureWinner: OppenheimerAmerican FictionAnatomy of a FallB

In [26]:
# Metadata of the second chunk, including the start_index added by the splitter.
splits[1].metadata

{'source': 'https://www.bbc.com/news/entertainment-arts-68530499',
 'title': 'The full list of winners and nominees at the Oscars 2024',
 'description': "See who has won and been nominated for this year's coveted Academy Awards in Hollywood.",
 'language': 'en-GB',
 'start_index': 797}

### 3 - Storage

#### Embeddings



In [27]:
# Embedding model used below both for the sample query and for indexing the
# chunks in the vector store.
hf_embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [32]:
# Embed a short sample string to see what the embedding model returns.
input_test = "just a simple test"
result = hf_embedding.embed_query(input_test)

In [29]:
# Print the embedding vector (a list of floats) for the sample string.
print(result)

[-0.054198604077100754, -0.09698554128408432, -0.023210447281599045, 0.0071061523631215096, -0.05736067518591881, 0.03741884231567383, 0.03521905839443207, 0.009271848946809769, 0.053111523389816284, 0.03640558570623398, 0.08860345929861069, -0.05440244451165199, 0.020299138501286507, -0.03145493194460869, 0.05896049365401268, 0.010907549411058426, 0.004335182253271341, -0.023187721148133278, 0.012392343021929264, -0.00016039671027101576, -0.041106026619672775, -0.016662200912833214, -0.03658305108547211, -0.02410758100450039, 0.026636894792318344, 0.07495388388633728, 0.0029555719811469316, 0.003283038502559066, 0.008013885468244553, -0.021078625693917274, 0.00030970890657044947, -0.00756420660763979, -0.010757217183709145, -0.026278896257281303, 1.718410544526705e-06, -0.036404259502887726, 0.03562411293387413, 0.0023064131382852793, -0.024003734812140465, 0.026083623990416527, -0.03695273771882057, 0.02897474355995655, -0.030239148065447807, 0.03874632343649864, -0.0234568789601326,

In [30]:
# Dimensionality of the embedding vector.
len(result)

768

#### Storing in the vector database

- https://python.langchain.com/docs/integrations/vectorstores/



In [33]:
# Embed every chunk with hf_embedding and store the vectors in a Chroma collection.
# NOTE(review): re-running this cell without first running the cleanup cell at
# the end of the notebook may index the same chunks again -- confirm.
vectorstore = Chroma.from_documents(documents=splits, embedding=hf_embedding)


## Retrieval and Generation Steps



### 4 - Retriever



In [34]:
# Similarity search over the vector store, returning the 6 closest chunks per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

### 5 - Generation


In [35]:
# Recall the RAG template string defined earlier.
template_rag

"\n<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant answering general questions.\nUse the following bits of retrieved context to answer the question.\nIf you don't know the answer, just say you don't know. Keep your answer concise.\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nQuestion: {question}\nContext: {context}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n"

In [36]:
# Build the RAG prompt again, this time naming the input variables explicitly.
prompt_rag = PromptTemplate(
    template=template_rag,
    input_variables=["context", "question"],
)
prompt_rag

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="\n<|begin_of_text|>\n<|start_header_id|>system<|end_header_id|>\nYou are a helpful virtual assistant answering general questions.\nUse the following bits of retrieved context to answer the question.\nIf you don't know the answer, just say you don't know. Keep your answer concise.\n<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\nQuestion: {question}\nContext: {context}\n<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n")

In [37]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` text attribute.

    Returns:
        The ``page_content`` values in input order, separated by blank lines
        (an empty string when ``docs`` is empty).
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

> Chain

In [38]:
# Full RAG pipeline, invoked with the question string:
#   - "context": the question goes to the retriever and the returned documents
#     are joined into one text block by format_docs;
#   - "question": RunnablePassthrough forwards the question unchanged.
# The two values fill prompt_rag, the prompt is sent to the LLM, and the
# output is parsed into a plain string.
chain_rag = ({"context": retriever | format_docs, "question": RunnablePassthrough()}
             | prompt_rag
             | llm
             | StrOutputParser())

> Generation  

In [39]:
# Test without RAG: the plain prompt | llm chain gets no retrieved context, so
# the model can only answer from what it learned during training.
# The input is passed as an explicit {"question": ...} mapping, matching the
# earlier chain.invoke call, rather than relying on a bare string being mapped
# to the template's single variable.
chain.invoke({"question": "Which film won the most Oscars at the 2024 awards?"})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


"I'm happy to help! However, I need to correct you - since we're in 2023, the 2024 Academy Awards have not yet taken place. The 97th Academy Awards, honoring the best films of 2023, will likely occur in March 2024. Once the ceremony happens, I can provide you with the information on which film wins the most Oscars. Would you like me to keep an eye out for the results and update you once they're announced?"

In [45]:
# Test with RAG: the question is used both to retrieve chunks from the vector
# store and as the question placed in the prompt.
chain_rag.invoke("who won the best actress award?")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'According to the provided context, Emma Stone won the Best Actress award for her role in "Poor Things".'

In [46]:
# A second retrieval-backed question against the same indexed article.
chain_rag.invoke("Who won the best actor award?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'According to the text, Cillian Murphy won the Best Actor award for his role in "Oppenheimer".'

> Cleaning up the vector store


In [None]:
# Remove the collection behind `vectorstore`; the index has to be rebuilt
# before chain_rag can be used again.
vectorstore.delete_collection()