# RAG using SambaNova and Meta AI Llama-3 




In [1]:
import os
import openai

import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader

from llama_index.llms.sambanovasystems import SambaNovaCloud

from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import Settings
import qdrant_client

In [2]:
# allows nested access to the event loop
nest_asyncio.apply()

In [3]:
# add your documents in this directory, you can drag & drop
input_dir_path = '/teamspace/studios/this_studio/test-dir'

In [4]:
collection_name="chat_with_docs"

client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333
)

def create_index(documents):
    vector_store = QdrantVectorStore(client=client, collection_name=collection_name)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
    )
    return index

In [5]:

# setup llm & embedding model
llm = SambaNovaCloud(model="Meta-Llama-3.3-70B-Instruct", temperature=0.7, top_p=0.01)

embed_model = HuggingFaceEmbedding( model_name="BAAI/bge-large-en-v1.5", trust_remote_code=True)

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
# load data
loader = SimpleDirectoryReader(
            input_dir = input_dir_path,
            required_exts=[".pdf"],
            recursive=True
        )
docs = loader.load_data()

# Creating an index over loaded data
Settings.embed_model = embed_model
try:
    index = create_index(docs)
    print('Using Qdrant collection')
except:
    index = VectorStoreIndex.from_documents(docs, show_progress=True)

# Create the query engine, where we use a cohere reranker on the fetched nodes
Settings.llm = llm
query_engine = index.as_query_engine()

# ====== Customise prompt template ======
qa_prompt_tmpl_str = (
"Context information is below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Given the context information above I want you to think step by step to answer the query in a crisp manner, incase case you don't know the answer say 'I don't know!'.\n"
"Query: {query_str}\n"
"Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# Generate the response
response = query_engine.query("What exactly is DSPy?",)

Parsing nodes:   0%|          | 0/17 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/26 [00:00<?, ?it/s]

In [7]:
display(Markdown(str(response)))

DSPy is a framework for programmatically solving advanced tasks with language and retrieval models through composing and declaring modules. It aims to replace brittle "prompt engineering" tricks with composable modules and automatic optimizers, allowing developers to define signatures that specify what a language model (LM) needs to do declaratively.

### ❗️❗️ Make sure you clear GPU memory by clicking on Restart button above, if you want to use Streamlit from here

In [1]:
# check GPU usage

!nvidia-smi

Fri Jan 10 09:13:51 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.42                 Driver Version: 537.42       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   44C    P5               5W /  75W |    132MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
!pip install llama-index-vector-stores-qdrant
!pip install llama-index qdrant_client



In [24]:
import os
from llama_index.llms.sambanovasystems import SambaNovaCloud
from dotenv import load_dotenv

load_dotenv()

os.getenv("SAMBANOVA_API_KEY")

'ff3c55a0-943b-438d-9f43-92db4192f433'

In [25]:
llm = SambaNovaCloud(model="Meta-Llama-3.3-70B-Instruct",
                     temperature=0.7,
                     top_p=0.01,
                    )

In [26]:

from llama_index.core.base.llms.types import (
    ChatMessage,
    MessageRole,
)

In [27]:
system_msg = ChatMessage(
    role=MessageRole.SYSTEM,
    content="You are a helpful assistant that translates English to French. Translate the user sentence.",
)
user_msg = ChatMessage(role=MessageRole.USER, content="I love programming.")

messages = [
    system_msg,
    user_msg,
]

response = llm.chat(messages)
print(response.message)

assistant: J'adore la programmation.


In [1]:
from rag_code import *
from llama_index.core import SimpleDirectoryReader

In [2]:
batch_size=32
loader = SimpleDirectoryReader(input_dir = "./docs/",
                               required_exts=[".pdf"],
                               recursive=True)

docs = loader.load_data()
documents = [doc.text for doc in docs]

# embed data    

embeddata = EmbedData(embed_model_name="BAAI/bge-large-en-v1.5", batch_size=batch_size)
embeddata.embed(documents)

# set up vector database
qdrant_vdb = QdrantVDB_QB(collection_name="chat-with-docs2",
                          batch_size=batch_size,
                          vector_dim=1024)

qdrant_vdb.define_client()
qdrant_vdb.create_collection()
qdrant_vdb.ingest_data(embeddata=embeddata)

# set up retriever
retriever = Retriever(vector_db=qdrant_vdb, embeddata=embeddata)

# set up rag
query_engine = RAG(retriever=retriever, llm_name="Meta-Llama-3.3-70B-Instruct")


In [3]:
messages = query_engine.query("What is DSPy?")

In [4]:
messages

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text="You are a helpful assistant that answers questions about the user's document.")]),
 ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='Context information is below.\n---------------------\nPreprint\n3.1 N ATURAL LANGUAGE SIGNATURES CAN ABSTRACT PROMPTING & FINETUNING\nInstead of free-form string prompts, DSPy programs use natural language signatures to assign work\nto the LM. A DSPy signature isnatural-language typed declaration of a function: a short declarative\nspec that tells DSPy what a text transformation needs to do (e.g., “consume questions and return\nanswers”), rather than how a specific LM should be prompted to implement that behavior. More\nformally, a DSPy signature is a tuple of input fields and output fields (and an optional instruction).\nA field consists offield name and optional metadata.4 In typi

In [45]:
# system_msg = ChatMessage(
#     role=MessageRole.SYSTEM,
#     content="You are a helpful assistant that translates English to French. Translate the user sentence.",
# )


# messages_new = [
#     system_msg,
#     user_msg,
# ]




<generator object llm_completion_callback.<locals>.wrap.<locals>.wrapped_llm_predict.<locals>.wrapped_gen at 0x35b122e60>

In [46]:
full_response = ""
        
user_msg = ChatMessage(role=MessageRole.USER, content="I love programming.")

# Simulate stream of response with milliseconds delay
streaming_response = query_engine.llm.stream_complete(user_msg.content)

for chunk in streaming_response:
    try:
        new_text = chunk.raw["choices"][0]["delta"]["content"]
        full_response += new_text
        print(new_text)
    except:
        pass


Programming can be a fascinating and rewarding 
field. There's 
something satisfying 
about 
bringing your ideas to 
life 
with code, 
and 
the 
constant 
learning 
and 
problem-solving can be really 
engaging.

What kind of programming do you enjoy most? Are 
you into web development, mobile app 
development, game development, or something 
else? Do you have a favorite programming language 
or 

technology stack?



In [41]:
ai_stream_msgs[7].raw["choices"][0]["delta"]["content"]

'to interact '

In [34]:
ai_stream_msgs[7])

{'message': ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00', 'finish_reason': None}, blocks=[TextBlock(block_type='text', text='DSPy appears to be a programming framework or system that utilizes natural language signatures to interact ')]),
 'raw': {'choices': [{'delta': {'content': 'to interact ',
     'role': 'assistant'},
    'finish_reason': None,
    'index': 0,
    'logprobs': None}],
  'created': 1735926260,
  'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00',
  'model': 'Meta-Llama-3.3-70B-Instruct',
  'object': 'chat.completion.chunk',
  'system_fingerprint': 'fastcoe'},
 'delta': 'to interact ',
 'logprobs': None,
 'additional_kwargs': {}}

In [6]:
full_response = ""
        
# Simulate stream of response with milliseconds delay
streaming_response = query_engine.llm.stream_chat(messages)
        
for chunk in streaming_response:
    full_response += chunk
    message_placeholder.markdown(full_response + "▌")

ai_stream_msgs = []
for stream in streaming_response:
    ai_stream_msgs.append(stream)

[ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00', 'finish_reason': None}, blocks=[TextBlock(block_type='text', text='')]), raw={'choices': [{'delta': {'content': '', 'role': 'assistant'}, 'finish_reason': None, 'index': 0, 'logprobs': None}], 'created': 1735926260, 'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00', 'model': 'Meta-Llama-3.3-70B-Instruct', 'object': 'chat.completion.chunk', 'system_fingerprint': 'fastcoe'}, delta='', logprobs=None, additional_kwargs={}),
 ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00', 'finish_reason': None}, blocks=[TextBlock(block_type='text', text='')]), raw={'choices': [{'delta': {'content': '', 'role': 'assistant'}, 'finish_reason': None, 'index': 0, 'logprobs': None}], 'created': 1735926260, 'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00', 'model': 'Meta-Llama-3.3-70B-Instru

In [7]:
ai_stream_msgs[-1]

ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={'id': '942f9445-47bf-48c0-b3ed-f33b7fd7bd00', 'finish_reason': None, 'usage': {'completion_tokens': 65, 'completion_tokens_after_first_per_sec': 198.08293442800777, 'completion_tokens_after_first_per_sec_first_ten': 200.5613091844793, 'completion_tokens_per_sec': 66.11378040544227, 'end_time': 1735926261.7462182, 'is_last_response': True, 'prompt_tokens': 2155, 'start_time': 1735926260.7630646, 'time_to_first_token': 0.6600565910339355, 'total_latency': 0.9831535816192627, 'total_tokens': 2220, 'total_tokens_per_sec': 2258.0398846166436}, 'model_name': 'Meta-Llama-3.3-70B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1735926260}, blocks=[TextBlock(block_type='text', text='DSPy appears to be a programming framework or system that utilizes natural language signatures to interact with Language Models (LMs), allowing for more abstract and flexible prompting and fine-tuning of these model

In [10]:
print(messages[1].content)

Context information is below.
---------------------
Preprint
3.1 N ATURAL LANGUAGE SIGNATURES CAN ABSTRACT PROMPTING & FINETUNING
Instead of free-form string prompts, DSPy programs use natural language signatures to assign work
to the LM. A DSPy signature isnatural-language typed declaration of a function: a short declarative
spec that tells DSPy what a text transformation needs to do (e.g., “consume questions and return
answers”), rather than how a specific LM should be prompted to implement that behavior. More
formally, a DSPy signature is a tuple of input fields and output fields (and an optional instruction).
A field consists offield name and optional metadata.4 In typical usage, the roles of fields are inferred
by DSPy as a function of field names. For instance, the DSPy compiler will use in-context learning
to interpret questiondifferently from answer and will iteratively refine its usage of these fields.
Signatures offer two benefits over prompts: they can be compiled into self-