# SambanNova Langchain Wrappers Usage

In [None]:
import os

from dotenv import load_dotenv
from langchain_embeddings import SambaStudioEmbeddings
from langchain_llms import SambaStudio, SambaNovaCloud
from langchain_chat_models import ChatSambaNovaCloud, ChatSambaStudio
from langchain_core.messages import SystemMessage, HumanMessage

current_dir = os.getcwd()
utils_dir = os.path.abspath(os.path.join(current_dir, '..'))
repo_dir = os.path.abspath(os.path.join(utils_dir, '..'))

load_dotenv(os.path.join(repo_dir, '.env'), override=True)

# SambaStudio LLM

## Non streaming

In [9]:
llm = SambaStudio(
    streaming=False,
    # base_uri="api/predict/generic",
    model_kwargs={
        'do_sample': False,
        'temperature': 0.01,
        'max_tokens_to_generate': 256,
        'process_prompt': False,
        'select_expert': 'Meta-Llama-3-70B-Instruct-4096',
    },
)

In [11]:
llm.invoke('tell me a 50 word tale')

' of a brave knight\nSir Valoric, the fearless knight, charged into the dark forest, his armor shining like the sun. He battled the dragon, its fiery breath singeing his beard, but he stood tall, his sword flashing in the moonlight, until the beast lay defeated at his feet, its treasure his noble reward.'

## Streaming

In [None]:
llm = SambaStudio(
    streaming=True,
    model_kwargs={
        'do_sample': False,
        'max_tokens_to_generate': 256,
        'temperature': 0.01,
        'process_prompt': False,
        'select_expert': 'Meta-Llama-3-70B-Instruct-4096',
    },
)

In [None]:
for chunk in llm.stream('tell me a 50 word tale'):
    print(chunk, end='', flush=True)

 of a character who is a master of disguise

Sure! Here is a 50-word tale of a character who is a master of disguise:

"Araxys, the skilled disguise artist, transformed into a stunning mermaid to infiltrate a pirate's lair. With a flick of her tail, she charmed the pirates and stole their treasure."

# SambaNovaCloud LLM

## Non Streaming

In [4]:
llm = SambaNovaCloud(model='llama3-70b')

In [5]:
import json

llm.invoke(json.dumps([{'role': 'user', 'content': 'hello'}]))

'Hello. How can I assist you today?'

In [6]:
llm.invoke('hello')

'Hello. How can I assist you today?'

## Streaming

In [7]:
for i in llm.stream('hello tell me a long story'):
    print(i)


Here's a long story 
for you:

Once upon 
a time, in a small village 
nestled in the rolling hills of 
rural France, there lived a 
young girl named Sophie. Sophie 
was a curious and adventurous 
child, with a mop of curly 
brown hair and a smile that 
could light up the darkest 
of rooms. She lived with 
her parents, Pierre and 
Colette, in a small stone cottage 
on the outskirts of 
the village.

Sophie's village was 
a charming 
place, filled with narrow 
cobblestone streets, quaint shops, 
and 
bustling cafes. The villagers 
were a tight-knit 
community, and everyone knew each 
other's names and stories. Sophie 
loved listening to the villagers' 
tales of 
old, which 
often featured brave knights, 
beautiful princesses, and 
magical creatures.

One day, while exploring 
the village, Sophie stumbled upon 
a small, mysterious shop tucked 
away on a quiet street. 
The sign above the door 
read "Curios 
and Wonders," and the 
windows were filled 
with a dazzling array of strange 
and 

# SambaStudio Chat Model

## Non Streaming

In [2]:
llm = ChatSambaStudio(
    model="Meta-Llama-3-70B-Instruct-4096",
    max_tokens=1024,
    temperature=0.3,
    top_k=1,
    top_p=0.01,
    do_sample = True,
    process_prompt = False,
)

In [3]:
llm.invoke("tell me a joke")

AIMessage(content="Here's one:\n\nWhy couldn't the bicycle stand up by itself?\n\n(Wait for it...)\n\nBecause it was two-tired!\n\nHope that made you laugh!", additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 14, 'completion_tokens': 32, 'total_tokens': 46, 'throughput_after_first_token': 70.33306137927788, 'time_to_first_token': 0.22188520431518555, 'model_execution_time': 0.5915548801422119}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprint': '', 'created': 1727913501}, id='126f946f-3cbf-4739-b00b-bf99fa864d48')

In [4]:
messages = [
    SystemMessage(content="You are a helpful assistant with pirate accent"),
    HumanMessage(content="tell me a joke"),
]
llm.invoke(messages)

AIMessage(content='Arrr, listen up, matey! Here be a joke fer ye:\n\nWhy did the pirate quit his job?\n\nBecause he was sick o\' all the arrrr-guments! (get it? arguments, but with a pirate "arrr" sound? Aye, I be a regular comedic genius, savvy?)\n\nSo, did I make ye laugh, or did I walk the plank?', additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 27, 'completion_tokens': 80, 'total_tokens': 107, 'throughput_after_first_token': 71.2460325330454, 'time_to_first_token': 0.21974945068359375, 'model_execution_time': 1.2584037780761719}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprint': '', 'created': 1727913503}, id='1f48b1e1-19cb-43a2-852d-c4b11a8ab3b4')

In [5]:
future_response = llm.ainvoke("tell me a joke")
await future_response

AIMessage(content="Here's one:\n\nWhy couldn't the bicycle stand up by itself?\n\n(Wait for it...)\n\nBecause it was two-tired!\n\nHope that made you laugh!", additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 14, 'completion_tokens': 32, 'total_tokens': 46, 'throughput_after_first_token': 70.43063268918901, 'time_to_first_token': 0.22060894966125488, 'model_execution_time': 0.5897665023803711}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprint': '', 'created': 1727913507}, id='5960e729-b41d-4631-ac90-af1e8188b654')

## Batching

In [6]:
llm.batch(["tell me a joke", "which is the capital of UK?"])

[AIMessage(content="Here's one:\n\nWhy couldn't the bicycle stand up by itself?\n\n(Wait for it...)\n\nBecause it was two-tired!\n\nHope that made you laugh!", additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 14, 'completion_tokens': 32, 'total_tokens': 46, 'throughput_after_first_token': 70.06468839908355, 'time_to_first_token': 0.2191941738128662, 'model_execution_time': 0.5902798175811768}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprint': '', 'created': 1727913510}, id='f6ef319e-4ba2-4117-8d63-f5823d0bc947'),
 AIMessage(content='The capital of the United Kingdom (UK) is London.', additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 17, 'completion_tokens': 12, 'total_tokens': 29, 'throughput_after_first_token': 62.08816650383397, 'time_to_first_token': 0.21888446807861328, 'model_execution_time': 0.315521240234375}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprint'

In [7]:
future_responses = llm.abatch(["tell me a joke", "which is the capital of UK?"])
await future_responses

[AIMessage(content="Here's one:\n\nWhy couldn't the bicycle stand up by itself?\n\n(Wait for it...)\n\nBecause it was two-tired!\n\nHope that made you laugh!", additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 14, 'completion_tokens': 32, 'total_tokens': 46, 'throughput_after_first_token': 70.03251686884015, 'time_to_first_token': 0.21899962425231934, 'model_execution_time': 0.5902557373046875}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprint': '', 'created': 1727913513}, id='33123501-e370-4a70-bda1-92c8af70865b'),
 AIMessage(content='The capital of the United Kingdom (UK) is London.', additional_kwargs={}, response_metadata={'finish_reason': None, 'usage': {'prompt_tokens': 17, 'completion_tokens': 12, 'total_tokens': 29, 'throughput_after_first_token': 62.36218692927395, 'time_to_first_token': 0.21871089935302734, 'model_execution_time': 0.3149230480194092}, 'model_name': 'Meta-Llama-3-70B-Instruct-4096', 'system_fingerprin

## Streaming

In [8]:
for chunk in llm.stream("tell me a joke"):
    print(chunk.content, end="")

Here's one:

Why couldn't the bicycle stand up by itself?

(Wait for it...)

Because it was two-tired!

Hope that made you laugh!

In [9]:
messages = [
    SystemMessage(content="You are a helpful assistant with pirate accent"),
    HumanMessage(content="tell me a joke"),
]
for chunk in llm.stream(messages):
    print(chunk.content)

Arrr, 
listen up, matey! Here 
be a joke fer ye:


Why did the pirate quit his job?


Because he was 
sick o' all 
the arrrr-guments! (get 
it? arguments, but with a 
pirate "arrr" 
sound? Aye, I be 
a regular comedic genius, savvy?)


So, did I make 
ye laugh, or did I 
walk the plank?



In [10]:
async for chunk in llm.astream("tell me a joke"):
    print(chunk.content)

Here's one:


Why couldn't the bicycle stand 
up by itself?

(Wait 
for it...)

Because it 
was two-tired!

Hope that 
made you laugh!



# SambaNova Cloud Chat Model

## Non Streaming

In [11]:
llm = ChatSambaNovaCloud(
    model= "llama3-405b",
    max_tokens=1024,
    temperature=0.7,
    top_k=1,
    top_p=0.01,
    stream_options={'include_usage':True}
    )

In [12]:
llm.invoke("tell me a joke")

AIMessage(content='A man walked into a library and asked the librarian, "Do you have any books on Pavlov\'s dogs and Schrödinger\'s cat?"\n\nThe librarian replied, "It rings a bell, but I\'m not sure if it\'s here or not."', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 146.6490519186912, 'completion_tokens_after_first_per_sec_first_ten': 172.75245626647495, 'completion_tokens_per_sec': 82.40550555345376, 'end_time': 1727913533.7847056, 'is_last_response': True, 'prompt_tokens': 39, 'start_time': 1727913533.0805886, 'time_to_first_token': 0.342710018157959, 'total_latency': 0.6552960222417659, 'total_tokens': 93, 'total_tokens_per_sec': 141.92059289761482}, 'model_name': 'llama3-405b', 'system_fingerprint': 'fastcoe', 'created': 1727913533}, id='2a8be2d0-8947-4296-ab85-d8b78eff390c')

In [13]:
messages = [
    SystemMessage(content="You are a helpful assistant with pirate accent"),
    HumanMessage(content="tell me a joke")
    ]
llm.invoke(messages)

AIMessage(content="Yer lookin' fer a joke, eh? Alright then, matey! Here be one fer ye:\n\nWhy did the pirate quit his job?\n\n(pause fer dramatic effect)\n\nBecause he was sick o' all the arrrr-guments!\n\nYarrr, hope that made ye laugh, me hearty!", additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 5.153846153846154, 'completion_tokens': 64, 'completion_tokens_after_first_per_sec': 112.16796935854752, 'completion_tokens_after_first_per_sec_first_ten': 129.12026053938416, 'completion_tokens_per_sec': 76.30859164791121, 'end_time': 1727913536.4593658, 'is_last_response': True, 'prompt_tokens': 47, 'start_time': 1727913535.5546703, 'time_to_first_token': 0.34303784370422363, 'total_latency': 0.8386997927480668, 'total_tokens': 111, 'total_tokens_per_sec': 132.347713639346}, 'model_name': 'llama3-405b', 'system_fingerprint': 'fastcoe', 'created': 1727913535}, id='e5c8ca43-48d2-441f-89ba-159cb59ebfc3')

In [14]:
future_response = llm.ainvoke("tell me a joke")
await(future_response) 

AIMessage(content='A man walked into a library and asked the librarian, "Do you have any books on Pavlov\'s dogs and Schrödinger\'s cat?"\n\nThe librarian replied, "It rings a bell, but I\'m not sure if it\'s here or not."', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 140.31482546341212, 'completion_tokens_after_first_per_sec_first_ten': 165.52344871132541, 'completion_tokens_per_sec': 79.00364415123241, 'end_time': 1727913538.4355943, 'is_last_response': True, 'prompt_tokens': 39, 'start_time': 1727913537.7005973, 'time_to_first_token': 0.35727500915527344, 'total_latency': 0.6835127743807705, 'total_tokens': 93, 'total_tokens_per_sec': 136.06183159378915}, 'model_name': 'llama3-405b', 'system_fingerprint': 'fastcoe', 'created': 1727913537}, id='1ec805fa-5723-4b39-b4ef-b8c569b9075a')

## Batching

In [15]:
llm = ChatSambaNovaCloud(
    model= "llama3-405b",
    streaming=False,
    max_tokens=1024,
    temperature=0.7,
    top_k=1,
    top_p=0.01,
    stream_options={'include_usage':True}
    )

In [16]:
llm.batch(["tell me a joke","which is the capital of UK?"])

[AIMessage(content='A man walked into a library and asked the librarian, "Do you have any books on Pavlov\'s dogs and Schrödinger\'s cat?"\n\nThe librarian replied, "It rings a bell, but I\'m not sure if it\'s here or not."', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 146.6462464047286, 'completion_tokens_after_first_per_sec_first_ten': 172.78143949380438, 'completion_tokens_per_sec': 82.37972702699832, 'end_time': 1727913541.350326, 'is_last_response': True, 'prompt_tokens': 39, 'start_time': 1727913540.6459446, 'time_to_first_token': 0.34296751022338867, 'total_latency': 0.6555010795593261, 'total_tokens': 93, 'total_tokens_per_sec': 141.8761965464971}, 'model_name': 'llama3-405b', 'system_fingerprint': 'fastcoe', 'created': 1727913540}, id='1ea6c4d0-a67f-4f76-b973-bbc68342dfe5'),
 AIMessage(content='The capital of the United Kingdom is London.', additional_kwa

In [17]:
future_responses = llm.abatch(["tell me a joke","which is the capital of UK?"])
await(future_responses)

[AIMessage(content='A man walked into a library and asked the librarian, "Do you have any books on Pavlov\'s dogs and Schrödinger\'s cat?"\n\nThe librarian replied, "It rings a bell, but I\'m not sure if it\'s here or not."', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 6.875, 'completion_tokens': 54, 'completion_tokens_after_first_per_sec': 116.25315057604108, 'completion_tokens_after_first_per_sec_first_ten': 136.77229629418824, 'completion_tokens_per_sec': 35.56309530157825, 'end_time': 1727913544.6331508, 'is_last_response': True, 'prompt_tokens': 39, 'start_time': 1727913543.053638, 'time_to_first_token': 1.1236112117767334, 'total_latency': 1.5184280092065985, 'total_tokens': 93, 'total_tokens_per_sec': 61.24755301938477}, 'model_name': 'llama3-405b', 'system_fingerprint': 'fastcoe', 'created': 1727913543}, id='d7ae271e-922e-4157-8659-7b68813ea8e5'),
 AIMessage(content='The capital of the United Kingdom is London.', additional_kwa

## Streaming

In [18]:
llm = ChatSambaNovaCloud(
    model= "llama3-405b",
    streaming=True,
    max_tokens=1024,
    temperature=0.7,
    top_k=1,
    top_p=0.01,
    stream_options={'include_usage':True}
    )

In [19]:
for chunk in llm.stream("tell me a joke"):
    print(chunk.content)


A man walked into a 
library and asked the 
librarian, "Do you have any books 
on Pavlov's dogs 
and Schrödinger's cat?"


The librarian 
replied, "It rings a bell, 
but I'm not sure 
if it's here 
or not."





In [20]:
messages = [
    SystemMessage(content="You are a helpful assistant with pirate accent"),
    HumanMessage(content="tell me a joke")
    ]
for chunk in llm.stream(messages):
    print(chunk.content)


Yer lookin' 
fer a joke, eh? 
Alright then, matey! 
Here be 
one fer ye:

Why did 
the pirate quit 
his job?

(pause fer 
dramatic effect)

Because 
he was sick o' all 
the arrrr-guments!




Yarrr, hope that made ye 
laugh, 
me hearty!





In [21]:
async for chunk in llm.astream("tell me a joke"):
    print(chunk.content)


A man walked into a 
library and asked the 
librarian, "Do you have any books 
on Pavlov's dogs 
and Schrödinger's cat?"


The librarian 
replied, "It rings a bell, 
but I'm not sure 
if it's here 
or not."





# Sambastudio Embeddings

In [None]:
embedding = SambaStudioEmbeddings(batch_size=1, model_kwargs={'select_expert': 'e5-mistral-7b-instruct'})
embedding.embed_documents(['tell me a 50 word tale', 'tell me a joke'])
embedding.embed_query('tell me a 50 word tale')

In [13]:
from langchain.schema import Document
from langchain.vectorstores import Chroma

docs = [
    'tell me a 50 word tale',
    'tell me a joke',
    'when was America discoverd?',
    'how to build an engine?',
    'give me 3 party activities',
    'give me three healty dishes',
]
docs = [Document(doc) for doc in docs]

query = 'prompt for generating something fun'

vectordb = Chroma.from_documents(docs, embedding)
retriever = vectordb.as_retriever()

retriever.get_relevant_documents(query)

  warn_deprecated(


[Document(page_content='tell me a 50 word tale'),
 Document(page_content='tell me a joke'),
 Document(page_content='give me 3 party activities'),
 Document(page_content='give me three healty dishes')]