In [45]:
import os

from dotenv import find_dotenv, load_dotenv
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [46]:
API_KEY = os.getenv("API_KEY")

In [47]:

import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('API_KEY')}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding

text = "The weather is sunny today. Better to go out for a walk!"

embedding = generate_embeddings(text)

In [48]:
embedding 

array([-0.0115451 , -0.0084069 , -0.03051822, ...,  0.00911516,
       -0.00591938,  0.0277543 ], shape=(1536,))

In [49]:
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
# model = SentenceTransformer(
#     "Qwen/Qwen3-Embedding-0.6B",
#     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
#     tokenizer_kwargs={"padding_side": "left"},
# )

# The queries and documents to embed
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Encode the queries and documents. Note that queries benefit from using a prompt
# Here we use the prompt called "query" stored under `model.prompts`, but you can
# also pass your own prompt via the `prompt` argument
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

# Compute the (cosine) similarity between the query and document embeddings
# similarity = model.similarity(query_embeddings, document_embeddings)
# print(similarity)
# tensor([[0.7646, 0.1414],
#         [0.1355, 0.6000]])


In [50]:
document_embeddings

array([[-0.04718243, -0.02091684,  0.00366517, ...,  0.05624258,
         0.07076075, -0.01714639],
       [-0.0531292 , -0.01502094, -0.0012658 , ...,  0.0036741 ,
        -0.02054259,  0.01957474]], shape=(2, 1024), dtype=float32)

In [51]:
len(document_embeddings[0])

1024

In [52]:
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

documents = [
    "my name is himanshu and i am a software engineer",
    "himanshu is my name and i am a software engineer",
    "apple is red and it is a fruit",
    "banana is yellow and it is also a fruit",
    "himanshu is taking a class for gen ai and will prepare for gen ai interviews",
]


document_embeddings = model.encode(documents)

In [53]:
document_embeddings

array([[ 0.00054985, -0.08300398, -0.00976096, ..., -0.01892373,
        -0.02088322, -0.01572801],
       [ 0.01099291, -0.07449266, -0.01088307, ..., -0.03351662,
        -0.01934452, -0.01644885],
       [-0.08274741,  0.05246587, -0.00250334, ..., -0.00065252,
         0.01755753,  0.01571289],
       [-0.05337092,  0.00404484, -0.00416533, ..., -0.01018745,
        -0.03779412,  0.0176566 ],
       [-0.02787851, -0.03393041, -0.01018015, ...,  0.00933281,
         0.00708564, -0.01425746]], shape=(5, 1024), dtype=float32)

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
sent1 = document_embeddings[0]
sent2 = document_embeddings[1]
sent3 = document_embeddings[2]
sent4 = document_embeddings[3]
sent5 = document_embeddings[4]

In [56]:
cosine_similarity(sent1.reshape(1, -1), sent2.reshape(1, -1))

array([[0.9306487]], dtype=float32)

In [57]:
cosine_similarity(sent2.reshape(1, -1), sent3.reshape(1, -1))

array([[0.39299363]], dtype=float32)

In [58]:
sent1.reshape(1, -1)

array([[ 0.00054985, -0.08300398, -0.00976096, ..., -0.01892373,
        -0.02088322, -0.01572801]], shape=(1, 1024), dtype=float32)

In [59]:
sent1

array([ 0.00054985, -0.08300398, -0.00976096, ..., -0.01892373,
       -0.02088322, -0.01572801], shape=(1024,), dtype=float32)

In [60]:
cosine_similarity(sent3.reshape(1, -1), sent4.reshape(1, -1))

array([[0.77635837]], dtype=float32)

In [61]:
## manual calculation
from numpy import dot
from numpy.linalg import norm

dot(sent1, sent2) / (norm(sent1) * norm(sent2))

np.float32(0.93064857)

In [62]:
query = "who is himanshu"

query_embedding = model.encode([query])

In [63]:
dist1 = cosine_similarity(query_embedding.reshape(1, -1), sent1.reshape(1, -1))
dist2 = cosine_similarity(query_embedding.reshape(1, -1), sent5.reshape(1, -1))
dist3 = cosine_similarity(query_embedding.reshape(1, -1), sent2.reshape(1, -1))
dist4 = cosine_similarity(query_embedding.reshape(1, -1), sent3.reshape(1, -1))
dist5 = cosine_similarity(query_embedding.reshape(1, -1), sent4.reshape(1, -1))

In [64]:
print(dist1, dist2, dist3, dist4, dist5)

[[0.7226063]] [[0.5987707]] [[0.64874417]] [[0.38814497]] [[0.38637748]]


In [65]:
# The distances show that sent1 and sent2 are the most similar to the query "who is himanshu"

In [66]:
## We will explore diferent vector databases next

#### We will explore diferent vector databases next

## FAISS

In [None]:
# pip install faiss-cpu
# pip install numpy requests 

In [67]:
data = """

Making an Impact
Helping Millions of Students Succeed
Sudhanshu's commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Sudhanshu himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.

In 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Sudhanshu remained focused on his mission. Even after the acquisition, iNeuron continued to offer some of the most affordable and accessible tech courses in the world.

The Entrepreneur and Teacher: Sudhanshu's Dual Legacy
Sudhanshu's journey isn't just one of entrepreneurial success; it's also a story of dedication to teaching. Throughout his career, he has remained a passionate educator, constantly looking for ways to empower others through knowledge. Whether teaching courses in Big Data, Data Science, or programming, Sudhanshu has always sought to make complex subjects accessible to learners at all levels.

His commitment to affordable education has earned him the respect and admiration of countless students. Many credit Sudhanshu with changing their lives, helping them secure jobs, improve their skills, and break free from the limitations of their backgrounds.
"""


In [68]:
clean_data = data.strip()
len(clean_data)

1498

In [69]:
max_char = 300
overlap = 100
chunks = []
i = 0
while i < len(clean_data):
    chunks.append(clean_data[i:i + max_char])
    i += max_char - overlap

In [70]:
chunks

["Making an Impact\nHelping Millions of Students Succeed\nSudhanshu's commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in toda",
 "1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Sudhanshu himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.\n\nIn 20",
 'backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.\n\nIn 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Sudhanshu remained focused on his mission. Even after the acquisition, iNeuron',
 "ignificant milestone, Sudhanshu remained focused on his mission. Even after t

In [71]:
len(chunks)

8

In [72]:

import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('API_KEY')}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding


In [73]:
for chunk in chunks:
    embedding = generate_embeddings(chunk)
    # Store or process the embedding as needed
    print(embedding)

[ 0.01636502 -0.01330146  0.04360536 ... -0.00716132 -0.00025957
  0.00777273]
[-0.02781193  0.00343053  0.04798149 ... -0.01521909 -0.01743826
 -0.00741914]
[-0.0211933  -0.03513513  0.01883369 ... -0.01879053  0.00953914
  0.02064656]
[ 0.00012402 -0.01615066  0.03857188 ... -0.03406656  0.00343498
  0.02556968]
[ 0.00538704 -0.01142304  0.04874471 ... -0.0355971   0.00306157
  0.00572053]
[-0.00912461 -0.01832682  0.04314627 ... -0.02185768 -0.00779892
 -0.01152378]
[-0.00252494 -0.02891203  0.04281055 ... -0.00076008 -0.01947349
  0.00868658]
[ 0.0317256   0.01637366  0.04584625 ... -0.01135022 -0.06958151
  0.01558773]


In [74]:
emb_list = []
meta = []

for idx, chunk in enumerate(chunks):
    vec = generate_embeddings(chunk)
    # Store or process the embedding as needed
    emb_list.append(vec.astype('float32'))
    meta.append({"id": idx, "text": chunk})
    

In [75]:
emb_list

[array([ 1.6688565e-02, -1.3501738e-02,  4.3314822e-02, ...,
        -7.1801147e-03, -1.3807718e-05,  7.8434953e-03],
       shape=(1536,), dtype=float32),
 array([-0.02781024,  0.003473  ,  0.04789979, ..., -0.01521816,
        -0.01742407, -0.00741869], shape=(1536,), dtype=float32),
 array([-0.0211933 , -0.03513513,  0.01883369, ..., -0.01879053,
         0.00953914,  0.02064656], shape=(1536,), dtype=float32),
 array([ 0.00011691, -0.01620286,  0.03854437, ..., -0.03406553,
         0.00343817,  0.02559526], shape=(1536,), dtype=float32),
 array([ 0.00537847, -0.01142924,  0.04893444, ..., -0.03565635,
         0.00286331,  0.00577165], shape=(1536,), dtype=float32),
 array([-0.00912461, -0.01832682,  0.04314627, ..., -0.02185768,
        -0.00779892, -0.01152378], shape=(1536,), dtype=float32),
 array([-0.00249095, -0.02888657,  0.04278532, ..., -0.00077873,
        -0.01946084,  0.00868023], shape=(1536,), dtype=float32),
 array([ 0.0317256 ,  0.01637366,  0.04584625, ..., -0.011

In [76]:
meta

[{'id': 0,
  'text': "Making an Impact\nHelping Millions of Students Succeed\nSudhanshu's commitment to affordable education wasn't just a business strategy—it was his life's mission. Over the years, iNeuron has helped over 1.5 million students from 34+ countries, providing them with the skills they need to succeed in toda"},
 {'id': 1,
  'text': "1.5 million students from 34+ countries, providing them with the skills they need to succeed in today's competitive job market. Many of these students, like Sudhanshu himself, came from disadvantaged backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.\n\nIn 20"},
 {'id': 2,
  'text': 'backgrounds. They saw iNeuron as a lifeline—an opportunity to rise above their circumstances.\n\nIn 2022, iNeuron was acquired by PhysicsWallah in a deal worth ₹250 crore. While this acquisition was a significant milestone, Sudhanshu remained focused on his mission. Even after the acquisition, iNeuron'},
 {'id': 3,
  'te

In [77]:
type(emb_list)

list

In [78]:
# convert into numpy compatible 2D array

xb = np.vstack(emb_list)

In [79]:
xb

array([[ 1.6688565e-02, -1.3501738e-02,  4.3314822e-02, ...,
        -7.1801147e-03, -1.3807718e-05,  7.8434953e-03],
       [-2.7810242e-02,  3.4729976e-03,  4.7899794e-02, ...,
        -1.5218164e-02, -1.7424075e-02, -7.4186907e-03],
       [-2.1193301e-02, -3.5135128e-02,  1.8833693e-02, ...,
        -1.8790530e-02,  9.5391441e-03,  2.0646563e-02],
       ...,
       [-9.1246096e-03, -1.8326821e-02,  4.3146275e-02, ...,
        -2.1857677e-02, -7.7989222e-03, -1.1523780e-02],
       [-2.4909484e-03, -2.8886573e-02,  4.2785320e-02, ...,
        -7.7872525e-04, -1.9460838e-02,  8.6802337e-03],
       [ 3.1725604e-02,  1.6373660e-02,  4.5846250e-02, ...,
        -1.1350221e-02, -6.9581509e-02,  1.5587725e-02]],
      shape=(8, 1536), dtype=float32)

In [80]:
xb.shape

(8, 1536)

In [81]:
import faiss

In [82]:
faiss.normalize_L2(xb) # Normalize the vectors to unit length

In [83]:
xb

array([[ 1.6688565e-02, -1.3501738e-02,  4.3314822e-02, ...,
        -7.1801147e-03, -1.3807718e-05,  7.8434953e-03],
       [-2.7810242e-02,  3.4729976e-03,  4.7899794e-02, ...,
        -1.5218164e-02, -1.7424075e-02, -7.4186907e-03],
       [-2.1193303e-02, -3.5135131e-02,  1.8833695e-02, ...,
        -1.8790532e-02,  9.5391450e-03,  2.0646565e-02],
       ...,
       [-9.1246096e-03, -1.8326821e-02,  4.3146275e-02, ...,
        -2.1857677e-02, -7.7989222e-03, -1.1523780e-02],
       [-2.4909484e-03, -2.8886573e-02,  4.2785320e-02, ...,
        -7.7872525e-04, -1.9460838e-02,  8.6802337e-03],
       [ 3.1725600e-02,  1.6373659e-02,  4.5846246e-02, ...,
        -1.1350220e-02, -6.9581501e-02,  1.5587723e-02]],
      shape=(8, 1536), dtype=float32)

In [84]:
d = xb.shape[1]  # dimension

In [85]:
d

1536

In [86]:
index = faiss.IndexFlatIP(d)  # setting the index dimension
index.add(xb)  # adding the vectors to the index. currently, this is stored in-memory

In [87]:
index.ntotal  # total number of vectors in the index

8

In [88]:
index_path = "index_sudhanshu.faiss"
meta_path = "meta_sudhanshu.jsonl"

In [89]:
faiss.write_index(index, index_path)  # write the index to disk

In [90]:
import json, os

In [91]:
# write the metadata to a jsonl file
with open(meta_path, 'w') as f:
    for item in meta:
        f.write(json.dumps(item) + "\n")

In [92]:
## search the vector db with a query
def search(query, k = 3):
    q = generate_embeddings(query).astype("float32").reshape(1, -1)
    faiss.normalize_L2(q)
    D, I = index.search(q, k)
    # return f"Document ID: {I[0]}, Distance: {D[0]}"
    return D, I

In [93]:
def get_result(D, I):
    results = []
    for i, distance in zip(I[0], D[0]):
        results.append((meta[i]['text'], distance))
    return results

In [94]:
query = "who is Sudhanshu?"
D, I = search(query)
results = get_result(D, I)
# print(*results, sep = "\n")
print(f"Query: {query}\n")
for text, distance in results:
    print(f"Text: {text}\nDistance: {distance}\n")

Query: who is Sudhanshu?

Text: preneur and Teacher: Sudhanshu's Dual Legacy
Sudhanshu's journey isn't just one of entrepreneurial success; it's also a story of dedication to teaching. Throughout his career, he has remained a passionate educator, constantly looking for ways to empower others through knowledge. Whether teaching cou
Distance: 0.5532038807868958

Text:  accessible to learners at all levels.

His commitment to affordable education has earned him the respect and admiration of countless students. Many credit Sudhanshu with changing their lives, helping them secure jobs, improve their skills, and break free from the limitations of their backgrounds.
Distance: 0.5193718671798706

Text: nate educator, constantly looking for ways to empower others through knowledge. Whether teaching courses in Big Data, Data Science, or programming, Sudhanshu has always sought to make complex subjects accessible to learners at all levels.

His commitment to affordable education has earned him th

## Qdrant Vector DB

In [61]:
# pip install qdrant-client

In [None]:
# API Key: *****
# Cluster Endpoint: https://bd599374-cdfa-4e92-a34b-2dcdd0783be1.europe-west3-0.gcp.cloud.qdrant.io

In [95]:
qdrant_api_key = os.getenv("API_KEY") 


In [96]:
import os
from dotenv import load_dotenv

from qdrant_client import QdrantClient

# Force reload in case env already loaded
load_dotenv(override=True)

qdrant_api_key = os.getenv("QDRANT_API_KEY")

qdrant_client = QdrantClient(
    url="https://bd599374-cdfa-4e92-a34b-2dcdd0783be1.europe-west3-0.gcp.cloud.qdrant.io", 
    api_key= qdrant_api_key,
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='sudhanshu_story')]


In [97]:
collection_name  = "sudhanshu_story"

In [98]:
from qdrant_client import models

qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)

  qdrant_client.recreate_collection(


True

In [99]:
points = []

for idx, (chunk, emb) in enumerate(zip(chunks, emb_list)):
    point = models.PointStruct(
        id=idx,
        vector=emb.astype("float32").tolist(),
        payload={"text": chunk}
    )
    points.append(point)    

In [100]:
points

[PointStruct(id=0, vector=[0.01668856479227543, -0.01350173819810152, 0.043314822018146515, 0.04922020807862282, 0.03803379833698273, -0.03889228776097298, 0.002807984361425042, 0.10999103635549545, -0.03028135374188423, 0.008136162534356117, 0.06170736625790596, -0.04729510471224785, -0.05936602130532265, -0.018938854336738586, 0.013150536455214024, -0.007485790178179741, -0.010796187445521355, -0.04180596023797989, 0.006383408326655626, 0.04516188055276871, 0.04458955302834511, 0.04321076348423958, 0.03186826407909393, -0.01036694087088108, 0.07721225172281265, -0.043054673820734024, 0.016480445861816406, 0.009586494415998459, -0.007199625950306654, -0.05754498019814491, 0.0021023298613727093, -0.010418971069157124, -0.03803379833698273, -0.009924687445163727, -0.015270751900970936, 0.010848216712474823, -0.03139999508857727, 0.0020990779157727957, -0.00337868626229465, 0.02354349195957184, 0.02516942471265793, -0.02102004736661911, 0.0366550087928772, 0.03831996023654938, -0.0329608

In [101]:
# embeddings are pushed to qdrant vector db in cloud
qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [102]:
query = "who is sudhanshu"

In [103]:
q_resp = generate_embeddings(query)

In [104]:
q_vec= np.array(q_resp).astype("float32")
q_vec

array([-2.1635571e-02, -2.5846353e-02, -5.9905625e-03, ...,
       -5.6839213e-03,  1.9030705e-02, -7.9870872e-05],
      shape=(1536,), dtype=float32)

In [105]:
qdrant_client.search(
    collection_name=collection_name,
    query_vector=q_vec,
    limit=3,
    with_payload=True
)

  qdrant_client.search(


[ScoredPoint(id=4, version=1, score=0.5387037, payload={'text': "preneur and Teacher: Sudhanshu's Dual Legacy\nSudhanshu's journey isn't just one of entrepreneurial success; it's also a story of dedication to teaching. Throughout his career, he has remained a passionate educator, constantly looking for ways to empower others through knowledge. Whether teaching cou"}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=5, version=1, score=0.5018914, payload={'text': 'nate educator, constantly looking for ways to empower others through knowledge. Whether teaching courses in Big Data, Data Science, or programming, Sudhanshu has always sought to make complex subjects accessible to learners at all levels.\n\nHis commitment to affordable education has earned him the re'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=6, version=1, score=0.50098884, payload={'text': ' accessible to learners at all levels.\n\nHis commitment to affordable education has earned him t

## Chroma DB

In [None]:
## pip install chromadb

## pip show chromadb    ## this will show chromadb version and location

In [106]:
import chromadb
client = chromadb.Client()  ## client means connection

In [107]:
texts = [
    "Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.",
    "His early life was marked by financial hardship and limited resources.",
    "He believed education could transform his life and pursued it relentlessly.",
    "Sudhanshu earned a Computer Science and Engineering degree.",
    "He worked at Wipro, Deloitte, Verizon Labs, and Ernst & Young.",
    "He gained expertise in SAP WebDynpro, Fiori UI5 HANA, Java, Big Data, and Data Analytics.",
    "Sudhanshu remained committed to making education accessible for everyone.",
    "His teaching empowered countless students to build their careers.",
    "Many students credit him with changing their lives through affordable learning.",
    "His journey is a testament to triumph over adversity and the power of knowledge."
]

In [108]:

import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('API_KEY')}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding


In [109]:
embeddings = [generate_embeddings(i).tolist() for i in texts]

In [110]:
embeddings[0]

[-0.024888895,
 -0.021206673,
 0.0047149123,
 0.018578006,
 0.0033718925,
 0.0052990606,
 0.023324212,
 0.029666394,
 0.0044854255,
 -0.033859745,
 0.008266743,
 -0.03031313,
 -0.017284535,
 -0.015991064,
 0.033400774,
 0.014541123,
 0.025598219,
 -0.06951366,
 -0.00032141202,
 0.012527897,
 0.07581412,
 0.0011356992,
 0.02186384,
 -0.015865888,
 0.0368848,
 -0.013956975,
 -0.053240955,
 -0.039888993,
 -0.014176031,
 -0.04994469,
 -0.023011275,
 0.0015972808,
 0.0042011747,
 -0.01134917,
 0.025034932,
 -0.019193448,
 -0.00964888,
 0.010702434,
 -0.0069941343,
 -0.0058362684,
 -0.019579403,
 -0.05908244,
 -0.02280265,
 0.04548013,
 -0.018317226,
 0.020080103,
 -0.0056902315,
 0.02518097,
 -0.04969434,
 -0.0054190196,
 -0.00793816,
 -0.020132259,
 0.061919734,
 -0.00044854253,
 0.008798735,
 -0.008715286,
 0.01570942,
 0.0278305,
 -0.0042272527,
 -0.039304845,
 0.0012843442,
 -0.018817924,
 0.013602314,
 0.003781318,
 0.0073383646,
 -0.04356078,
 0.017315827,
 -0.01733669,
 -0.046064276,

In [111]:
len(embeddings)

10

In [112]:
collection_name = "sudhanshu_collection"

In [113]:
collectiion = client.create_collection(name=collection_name)

In [114]:
collectiion.add(
    documents=texts,
    embeddings=embeddings,
    ids=[str(i) for i in range(len(texts))]
)

In [115]:
collectiion.count()

10

In [116]:
# details about data
collectiion.get()

{'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
 'embeddings': None,
 'documents': ['Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.',
  'His early life was marked by financial hardship and limited resources.',
  'He believed education could transform his life and pursued it relentlessly.',
  'Sudhanshu earned a Computer Science and Engineering degree.',
  'He worked at Wipro, Deloitte, Verizon Labs, and Ernst & Young.',
  'He gained expertise in SAP WebDynpro, Fiori UI5 HANA, Java, Big Data, and Data Analytics.',
  'Sudhanshu remained committed to making education accessible for everyone.',
  'His teaching empowered countless students to build their careers.',
  'Many students credit him with changing their lives through affordable learning.',
  'His journey is a testament to triumph over adversity and the power of knowledge.'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None, None, None, None, None, 

In [117]:
def query_result(query):
    query_embedding = generate_embeddings(query)
    results = collectiion.query(
        query_embeddings=[query_embedding],
        n_results=3
    )
    return results

In [118]:
query = "who is sudhanshu ?"
query_result(query)


{'ids': [['0', '3', '6']],
 'embeddings': None,
 'documents': [['Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.',
   'Sudhanshu earned a Computer Science and Engineering degree.',
   'Sudhanshu remained committed to making education accessible for everyone.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[0.8116778135299683, 0.9027023315429688, 1.0123217105865479]]}

In [119]:
"""
Chroma default metric: l2 (Euclidean distance).

You can choose: "l2", "cosine", or "ip" per collection via metadata={"hnsw:space": "..."}. 

For "cosine", results show cosine distance, not similarity → closer to 0 = more similar.
"""

'\nChroma default metric: l2 (Euclidean distance).\n\nYou can choose: "l2", "cosine", or "ip" per collection via metadata={"hnsw:space": "..."}. \n\nFor "cosine", results show cosine distance, not similarity → closer to 0 = more similar.\n'

In [120]:
# Create (or get) a collection configured to use cosine distance
collection = client.get_or_create_collection(
    name = collection_name,
    metadata={"hnsw:space": "cosine"}  # use "l2" or "ip" if you prefer
)

In [121]:
def query_result(query):
    query_embedding = generate_embeddings(query)
    results = collectiion.query(
        query_embeddings=[query_embedding],
        n_results=3
    )
    return results

In [122]:
query = "who is sudhanshu ?"
query_result(query)

{'ids': [['0', '3', '6']],
 'embeddings': None,
 'documents': [['Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.',
   'Sudhanshu earned a Computer Science and Engineering degree.',
   'Sudhanshu remained committed to making education accessible for everyone.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None, None]],
 'distances': [[0.8116778135299683, 0.9027023315429688, 1.0123217105865479]]}

In [123]:
## for client existing in Cloud

import chromadb

client = chromadb.CloudClient(
  api_key=os.getenv("CHROMA_API_KEY"),
  tenant='83d527bb-fc7e-4523-8a55-ef5c1912c580',
  database='test'
)

In [125]:
collection = client.create_collection(name="sudhanshu_story2")

In [126]:
collection.add(
    documents = texts,
    embeddings = embeddings,
    ids = [str(i) for i in range(len(texts))]
)

## Pinecone

In [95]:
# pip install pinecone

In [128]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("sudhstory")

In [129]:
texts = [
    "Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.",
    "His early life was marked by financial hardship and limited resources.",
    "He believed education could transform his life and pursued it relentlessly.",
    "Sudhanshu earned a Computer Science and Engineering degree.",
    "He worked at Wipro, Deloitte, Verizon Labs, and Ernst & Young.",
    "He gained expertise in SAP WebDynpro, Fiori UI5 HANA, Java, Big Data, and Data Analytics.",
    "Sudhanshu remained committed to making education accessible for everyone.",
    "His teaching empowered countless students to build their careers.",
    "Many students credit him with changing their lives through affordable learning.",
    "His journey is a testament to triumph over adversity and the power of knowledge."
]

In [130]:
import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('API_KEY')}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding

In [131]:
embeddings = [generate_embeddings(i).tolist() for i in texts]

In [132]:
index.upsert(
    vectors = [
        (
            str(i),
            embeddings[i],
            {"text": texts[i]}
        )
        for i in range(len(texts))
    ]
)

{'upserted_count': 10}

In [133]:
query = "sudhanshu worked in deloitte"

In [134]:
query_embed = generate_embeddings(query).tolist()

In [135]:
query_embed

[-0.024259316,
 -0.009171045,
 0.06313457,
 -0.027473774,
 -0.0071505285,
 0.03555584,
 -0.023012893,
 0.085229054,
 0.009695854,
 -0.056994304,
 0.011158761,
 -0.03778628,
 -0.023786988,
 -0.038232367,
 0.041958515,
 0.030753834,
 -0.0046380037,
 -0.06061549,
 -0.025400776,
 0.042457085,
 0.03510975,
 0.07536264,
 0.033640284,
 -0.0018155127,
 0.038678456,
 -0.03106872,
 -0.06161263,
 -0.01337608,
 0.026922725,
 -0.038967103,
 -0.03891462,
 -0.023498343,
 0.039885517,
 0.0071505285,
 -0.0031931375,
 0.0142092155,
 0.0038179888,
 0.05557732,
 0.01717439,
 -0.014602823,
 -0.013435122,
 -0.0062944335,
 0.0019516351,
 0.012431424,
 0.019155545,
 -0.017896002,
 -0.032433223,
 0.0053038555,
 -0.00832479,
 -0.0008716757,
 -0.017148148,
 0.0011890214,
 0.053740487,
 0.046025787,
 -0.014392899,
 0.0011775412,
 0.012457664,
 0.059880756,
 0.024797246,
 -0.022422483,
 0.011886934,
 -0.023419622,
 0.029887898,
 0.04670804,
 -0.0050020902,
 -0.025650062,
 -0.052638385,
 0.03007158,
 -0.029284367,


In [136]:
index.query(
    vector=query_embed,
    top_k=3,
    include_metadata=True
)

{'matches': [{'id': '3',
              'metadata': {'text': 'Sudhanshu earned a Computer Science and '
                                   'Engineering degree.'},
              'score': 0.570962965,
              'values': []},
             {'id': '4',
              'metadata': {'text': 'He worked at Wipro, Deloitte, Verizon '
                                   'Labs, and Ernst & Young.'},
              'score': 0.510824203,
              'values': []},
             {'id': '0',
              'metadata': {'text': 'Sudhanshu Kumar was born in Jamshedpur, '
                                   'Jharkhand, India, to a modest family.'},
              'score': 0.487629831,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 1}}

## Weaviate

In [137]:
import os
import weaviate
from weaviate.classes.init import Auth

In [None]:
# export WEAVIATE_URL="stlh8uytbgfwohutl2rq.c0.asia-southeast1.gcp.weaviate.cloud"
# export WEAVIATE_API_KEY="<Your API Key>"

In [None]:
# import os
# import weaviate
# from weaviate.classes.init import Auth

# # Best practice: store your credentials in environment variables
# weaviate_url = os.environ["WEAVIATE_URL"]
# weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# # Connect to Weaviate Cloud
# client = weaviate.connect_to_weaviate_cloud(
#     cluster_url=weaviate_url,
#     auth_credentials=Auth.api_key(weaviate_api_key),
# )

# print(client.is_ready())

In [138]:
weaviate_url = "https://stlh8uytbgfwohutl2rq.c0.asia-southeast1.gcp.weaviate.cloud"
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

In [None]:
# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

print(client.is_ready())

True


In [140]:
texts = [
    "Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.",
    "His early life was marked by financial hardship and limited resources.",
    "He believed education could transform his life and pursued it relentlessly.",
    "Sudhanshu earned a Computer Science and Engineering degree.",
    "He worked at Wipro, Deloitte, Verizon Labs, and Ernst & Young.",
    "He gained expertise in SAP WebDynpro, Fiori UI5 HANA, Java, Big Data, and Data Analytics.",
    "Sudhanshu remained committed to making education accessible for everyone.",
    "His teaching empowered countless students to build their careers.",
    "Many students credit him with changing their lives through affordable learning.",
    "His journey is a testament to triumph over adversity and the power of knowledge."
]

In [141]:
import requests
import numpy as np

def generate_embeddings(text):
    url = "https://api.euron.one/api/v1/euri/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('API_KEY')}"
    }
    payload = {
        "input": text,
        "model": "text-embedding-3-small"
    }

    response = requests.post(url, headers=headers, json=payload)
    data = response.json()
    
    embedding = np.array(data['data'][0]['embedding'])
    
    return embedding

In [142]:
embeddings = [generate_embeddings(i).tolist() for i in texts]

In [147]:
import weaviate
from weaviate.classes.config import Property, DataType, Configure

COLLECTION_NAME = "MyDocuments"  # change if you like

# You already have this:
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)
print(client.is_ready())

# Create collection if it does not exist yet
existing = [c for c in client.collections.list_all()]

if COLLECTION_NAME not in existing:
    client.collections.create(
        name=COLLECTION_NAME,
        properties=[
            Property(name="text", data_type=DataType.TEXT),
            Property(name="source_id", data_type=DataType.TEXT),
        ],
        # Important: tell Weaviate we will provide vectors ourselves
        vector_config=Configure.Vectors.self_provided(),
    )

collection = client.collections.get(COLLECTION_NAME)

# client.close()


True


In [148]:
from weaviate.util import generate_uuid5  # deterministic UUIDs

# texts = [
#     "first document text",
#     "second document text",
#     "third document text",
# ]
vectors = embeddings  # list[list[float]], same length as texts

assert len(texts) == len(vectors), "texts and embeddings must align"

with collection.batch.dynamic() as batch:
    for text, vector in zip(texts, vectors):
        # deterministic ID based on text (or some other stable key)
        obj_id = generate_uuid5(text)

        batch.add_object(
            uuid=obj_id,
            properties={
                "text": text,
                "source_id": obj_id,
            },
            vector=vector,
        )

        if batch.number_errors > 0:
            print("Errors in batch:", batch.batch_errors)

print("Batch upsert complete.")


Batch upsert complete.


In [149]:
query = "who is sudhanshu ?"
query_embedding = generate_embeddings(query).astype("float32")

In [150]:
query_embedding

array([-0.02320408, -0.0206725 , -0.00223581, ..., -0.00172695,
        0.01413363,  0.0011354 ], shape=(1536,), dtype=float32)

In [151]:
from weaviate.classes.query import MetadataQuery

COLLECTION_NAME = "MyDocuments"  # use the same name you used while inserting

# Get the collection
collection = client.collections.get(COLLECTION_NAME)

# Ensure query_embedding is a plain list, not a numpy array
query_vector = query_embedding.tolist()

# Perform similarity search (vector search)
result = collection.query.near_vector(
    near_vector=query_vector,
    limit=3,  # top 3 closest
    return_metadata=MetadataQuery(distance=True)  # so we also see distances
)

# Print results
for rank, obj in enumerate(result.objects, start=1):
    text = obj.properties.get("text")
    distance = obj.metadata.distance  # smaller = more similar
    print(f"Rank {rank}:")
    print(f"  Distance: {distance}")
    print(f"  Text: {text}")
    print("-" * 40)


Rank 1:
  Distance: 0.4058385491371155
  Text: Sudhanshu Kumar was born in Jamshedpur, Jharkhand, India, to a modest family.
----------------------------------------
Rank 2:
  Distance: 0.45135098695755005
  Text: Sudhanshu earned a Computer Science and Engineering degree.
----------------------------------------
Rank 3:
  Distance: 0.5061606764793396
  Text: Sudhanshu remained committed to making education accessible for everyone.
----------------------------------------
