<a href="https://colab.research.google.com/github/rvernica/notebook/blob/main/mongodb/VoyageAI-Quickstart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Voyage AI Quickstart Tutorial](https://docs.voyageai.com/docs/quickstart-tutorial)

In [1]:
pip install voyageai



In [2]:
import os
import google.colab.userdata
import numpy
import sklearn.metrics.pairwise
import google.generativeai

In [3]:
documents = [
    "The Mediterranean diet emphasizes fish, olive oil, and vegetables, believed to reduce chronic diseases.",
    "Photosynthesis in plants converts light energy into glucose and produces essential oxygen.",
    "20th-century innovations, from radios to smartphones, centered on electronic advancements.",
    "Rivers provide water, irrigation, and habitat for aquatic species, vital for ecosystems.",
    "Apple’s conference call to discuss fourth fiscal quarter results and business updates is scheduled for Thursday, November 2, 2023 at 2:00 p.m. PT / 5:00 p.m. ET.",
    "Shakespeare's works, like 'Hamlet' and 'A Midsummer Night's Dream,' endure in literature."
]

In [4]:
os.environ["VOYAGE_API_KEY"] = google.colab.userdata.get("VOYAGE_API_KEY")
os.environ["GOOGLE_API_KEY"] = google.colab.userdata.get("GOOGLE_API_KEY")

In [5]:
import voyageai

vo = voyageai.Client()
# This will automatically use the environment variable VOYAGE_API_KEY.
# Alternatively, you can use vo = voyageai.Client(api_key="<your secret key>")

# Embed the documents
documents_embeddings = vo.embed(
    documents, model="voyage-3.5", input_type="document"
).embeddings

In [6]:
# Print the tokenized results
tokenized = vo.tokenize(documents)
for i in range(len(documents)):
    print(tokenized[i].tokens)

# Count the total tokens
print(vo.count_tokens(documents))



['<s>', '▁The', '▁Mediter', 'rane', 'an', '▁di', 'et', '▁emphas', 'izes', '▁fish', ',', '▁o', 'live', '▁oil', ',', '▁and', '▁veget', 'ables', ',', '▁believed', '▁to', '▁reduce', '▁chron', 'ic', '▁dise', 'ases', '.']
['<s>', '▁Ph', 'otos', 'yn', 'thesis', '▁in', '▁plants', '▁converts', '▁light', '▁energy', '▁into', '▁gl', 'uc', 'ose', '▁and', '▁produces', '▁essential', '▁o', 'xygen', '.']
['<s>', '▁', '2', '0', 'th', '-', 'century', '▁innov', 'ations', ',', '▁from', '▁rad', 'ios', '▁to', '▁smart', 'ph', 'ones', ',', '▁centered', '▁on', '▁electronic', '▁adv', 'anc', 'ements', '.']
['<s>', '▁R', 'ivers', '▁provide', '▁water', ',', '▁ir', 'rig', 'ation', ',', '▁and', '▁habitat', '▁for', '▁aqu', 'atic', '▁species', ',', '▁vital', '▁for', '▁e', 'cos', 'ystem', 's', '.']
['<s>', '▁Apple', '’', 's', '▁conference', '▁call', '▁to', '▁discuss', '▁fourth', '▁fis', 'cal', '▁quarter', '▁results', '▁and', '▁business', '▁updates', '▁is', '▁scheduled', '▁for', '▁Th', 'urs', 'day', ',', '▁November', '▁'

In [7]:
query = "When is Apple's conference call scheduled?"

In [8]:
# Get the embedding of the query
query_embedding = vo.embed([query], model="voyage-3.5", input_type="query").embeddings[0]

In [9]:
# Compute the similarity
# Voyage embeddings are normalized to length 1, therefore dot-product and cosine
# similarity are the same.
similarities = numpy.dot(documents_embeddings, query_embedding)
print(similarities)

retrieved_id = numpy.argmax(similarities)
retrieved_doc = documents[retrieved_id]
print(retrieved_doc)

[0.12507503 0.17040602 0.21611312 0.1184509  0.6560744  0.14526141]
Apple’s conference call to discuss fourth fiscal quarter results and business updates is scheduled for Thursday, November 2, 2023 at 2:00 p.m. PT / 5:00 p.m. ET.


In [10]:
def k_nearest_neighbors(query_embedding, documents_embeddings, k=5):
    # Convert to numpy array
    query_embedding = numpy.array(query_embedding)
    documents_embeddings = numpy.array(documents_embeddings)

    # Reshape the query vector embedding to a matrix of shape (1, n) to make it
    # compatible with cosine_similarity
    query_embedding = query_embedding.reshape(1, -1)

    # Calculate the similarity for each item in data
    cosine_sim = sklearn.metrics.pairwise.cosine_similarity(query_embedding, documents_embeddings)

    # Sort the data by similarity in descending order and take the top k items
    sorted_indices = numpy.argsort(cosine_sim[0])[::-1]

    # Take the top k related embeddings
    top_k_related_indices = sorted_indices[:k]
    top_k_related_embeddings = documents_embeddings[sorted_indices[:k]]
    top_k_related_embeddings = [
        list(row[:]) for row in top_k_related_embeddings
    ]  # convert to list

    return top_k_related_embeddings, top_k_related_indices

In [11]:
# Use the k-nearest neighbor algorithm to identify the top-k documents with the highest similarity
retrieved_embds, retrieved_embd_indices = k_nearest_neighbors(
    query_embedding, documents_embeddings, k=3
)
retrieved_docs = [documents[index] for index in retrieved_embd_indices]
print(retrieved_docs)

['Apple’s conference call to discuss fourth fiscal quarter results and business updates is scheduled for Thursday, November 2, 2023 at 2:00 p.m. PT / 5:00 p.m. ET.', '20th-century innovations, from radios to smartphones, centered on electronic advancements.', 'Photosynthesis in plants converts light energy into glucose and produces essential oxygen.']


In [12]:
# Reranking
documents_reranked = vo.rerank(query, documents, model="rerank-2.5", top_k=3)

In [13]:
for r in documents_reranked.results:
    print(f"Document: {r.document}")
    print(f"Relevance Score: {r.relevance_score}")
    print(f"Index: {r.index}")
    print()

Document: Apple’s conference call to discuss fourth fiscal quarter results and business updates is scheduled for Thursday, November 2, 2023 at 2:00 p.m. PT / 5:00 p.m. ET.
Relevance Score: 0.94140625
Index: 4

Document: 20th-century innovations, from radios to smartphones, centered on electronic advancements.
Relevance Score: 0.283203125
Index: 2

Document: The Mediterranean diet emphasizes fish, olive oil, and vegetables, believed to reduce chronic diseases.
Relevance Score: 0.263671875
Index: 0



In [14]:
# Take the retrieved document and use it as a prompt for the text generation model
prompt = f"Based on the information: '{retrieved_doc}', generate a response of {query}"
print(prompt)

Based on the information: 'Apple’s conference call to discuss fourth fiscal quarter results and business updates is scheduled for Thursday, November 2, 2023 at 2:00 p.m. PT / 5:00 p.m. ET.', generate a response of When is Apple's conference call scheduled?


In [15]:
# Initialize the Gemini model
google.generativeai.configure(api_key=os.environ["GOOGLE_API_KEY"])
gemini_model = google.generativeai.GenerativeModel('gemini-2.5-flash')

# Generate the response using Gemini
response = gemini_model.generate_content(prompt)

print(response.text)

Apple's conference call is scheduled for Thursday, November 2, 2023 at 2:00 p.m. PT / 5:00 p.m. ET.
