In [1]:
import os
from dotenv import load_dotenv
# from pymilvus import connections

# # If using Docker standalone Milvus
# connections.connect("default", host="127.0.0.1", port="19530")

from pymilvus import connections

# Read the Milvus endpoint and API key from the local dotenv file; values in
# the file take precedence over variables already set in the process.
load_dotenv(override=True, dotenv_path="../.env.local")

milvus_uri = os.getenv("MILVUS_URI")
milvus_token = os.getenv("MILVUS_API_KEY")

# Fail fast with an actionable message instead of handing `uri=None` to the
# client and printing a misleading "Connected" line (or a cryptic error).
if not milvus_uri:
    raise RuntimeError(
        "MILVUS_URI is not set. Define it (and MILVUS_API_KEY) in ../.env.local "
        "or in the environment before running this cell."
    )

# Register the connection under the "default" alias used by the ORM-style
# `db` / `Collection` calls in the following cells.
connections.connect(
    alias="default",
    uri=milvus_uri,
    token=milvus_token,
)

print("Connected to Milvus on Zilliz Cloud")

Connected to Milvus on Zilliz Cloud


In [2]:
from pymilvus import db
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType

# Name of the Milvus database that holds the demo collection.
db_name = "rag_db"

# 1. Create the database only when it is not listed yet, so the cell works on
#    a fresh deployment and can also be re-run without a "database already
#    exists" error.
if db_name not in db.list_database():
    db.create_database(db_name)

# 2. Switch to that database
db.using_database(db_name)

# ----- Create schema -----
# doc_id is a caller-supplied primary key (auto_id=False); the scalar VARCHAR
# fields carry the document metadata and text.
fields = [
    FieldSchema("doc_id", DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema("title", DataType.VARCHAR, max_length=200),
    FieldSchema("domain", DataType.VARCHAR, max_length=100),
    FieldSchema("content", DataType.VARCHAR, max_length=2000),
    # dim must equal the size of the vectors inserted later (the encoder used
    # in this notebook yields 384 values per text).
    FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=384),
]

schema = CollectionSchema(fields, description="Motorcycle types with embeddings")
collection = Collection("motorcycle_type", schema)

# ----- Create index -----
# IVF_FLAT index scored with cosine similarity; the search cell must use the
# same metric_type.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [3]:
# ----- Example data -----
# Three hand-written documents; each carries an id, descriptive metadata and
# the text that gets embedded.
content_chunks = [
    dict(
        doc_id=1,
        section="Cost of maintenance",
        title="Cost of Accessories and Maintenance",
        domain="Finance",
        content="Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compared to cruisers,ADVs, and sport bikes.",
    ),
    dict(
        doc_id=2,
        section="Different motorcycle types",
        title="Types of Motorcycles",
        domain="Motorcycle Types",
        content="There are several types of motorcycles including dirt bikes, cruisers, ADVs, and sport bikes.",
    ),
    dict(
        doc_id=3,
        section="Riding positions",
        title="Riding experience",
        domain="Riding Motorcycles",
        content="ADVS offer an upright riding position suitable for long-distance touring, while sport bikes have a forward-leaning position designed for speed and agility.Dirt bikes provide a standing position for better control on rough terrains.",
    ),
]

# Only the text of each document is embedded.
content_chunks_list = [entry["content"] for entry in content_chunks]
print(content_chunks_list)

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding row per document; the shape is shown as the cell result.
doc_vectors = model.encode(content_chunks_list)
doc_vectors.shape

['Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compared to cruisers,ADVs, and sport bikes.', 'There are several types of motorcycles including dirt bikes, cruisers, ADVs, and sport bikes.', 'ADVS offer an upright riding position suitable for long-distance touring, while sport bikes have a forward-leaning position designed for speed and agility.Dirt bikes provide a standing position for better control on rough terrains.']


  from .autonotebook import tqdm as notebook_tqdm


(3, 384)

In [4]:
# ---- Build columnar data ----
# One list per schema field, in the order the fields were declared.
# Primary keys come from each document's own "doc_id" rather than from its
# position in the list, so ids stay stable if the list is reordered.
doc_ids = [int(doc["doc_id"]) for doc in content_chunks]               # INT64
titles = [str(doc["title"]) for doc in content_chunks]                 # VARCHAR
domains = [str(doc["domain"]) for doc in content_chunks]               # VARCHAR
content = [str(doc["content"]) for doc in content_chunks]              # VARCHAR
# Plain Python floats, one list per document.
embeddings = [list(map(float, vec)) for vec in doc_vectors]            # FLOAT_VECTOR(384)

# Every document needs exactly one vector, otherwise the columns misalign.
assert len(embeddings) == len(doc_ids), "documents and embeddings are out of sync"

# ---- Write column-wise ----
# upsert (not insert) keeps the cell idempotent: re-running it replaces the
# rows with the same doc_id instead of storing duplicates.
collection.upsert([doc_ids, titles, domains, content, embeddings])
collection.flush()

print(f"Successfully inserted {len(doc_ids)} documents into Milvus.")

Successfully inserted 3 documents into Milvus.


In [5]:
# The collection has to be loaded before it can be searched or queried.
collection.load()

# Fetch up to five rows whose id is positive, including the stored vectors.
res = collection.query(
    expr="doc_id > 0",
    output_fields=[
        "doc_id",
        "title",
        "domain",
        "content",
        "embedding",
    ],
    limit=5,
)
print(res)

data: ["{'doc_id': 1, 'title': 'Cost of Accessories and Maintenance', 'domain': 'Finance', 'content': 'Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compared to cruisers,ADVs, and sport bikes.', 'embedding': [-0.007442957255989313, 0.021816488355398178, 0.09829466044902802, -0.019319824874401093, 0.06200375780463219, -0.08512988686561584, -0.05971066281199455, 0.06615879386663437, -0.004948060028254986, -0.006398465018719435, -0.011155018582940102, 0.04371156170964241, 0.05143973231315613, 0.0360875241458416, 0.002581691136583686, -0.011653424240648746, 0.11827302724123001, 0.0037483246996998787, -0.08007146418094635, 0.09391702711582184, 0.01224276889115572, 0.07015281170606613, 0.06309502571821213, 0.05272732302546501, -0.052285533398389816, -0.008503812365233898, -0.040759045630693436, -0.0025368379428982735, -0.047886766493320465, 0.012694009579718113, -0.06674415618181229, 0.07359975576400757, -0.003905358724296093, -0.02502173744

In [6]:
# Inspect the collection created above: its schema, the number of stored
# entities and its description, printed in that order.
# (To inspect a different collection, instantiate it first, e.g.
#  collection = Collection("demo_collection").)
for attribute in ("schema", "num_entities", "description"):
    print(getattr(collection, attribute))

{'auto_id': False, 'description': 'Motorcycle types with embeddings', 'fields': [{'name': 'doc_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 200}}, {'name': 'domain', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 100}}, {'name': 'content', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2000}}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': False, 'enable_namespace': False}
3
Motorcycle types with embeddings


In [7]:
# Display results: one labelled line per scalar field, then a short preview
# of the vector and a separator.
scalar_fields = (
    ("Doc ID", "doc_id"),
    ("Title", "title"),
    ("Domain", "domain"),
    ("Content", "content"),
)
for row in res:
    for label, key in scalar_fields:
        print(f"{label}: {row[key]}")
    # Only the first 5 embedding values are shown for readability
    print(f"Embedding (first 5): {row['embedding'][:5]}")
    print("-" * 80)

Doc ID: 1
Title: Cost of Accessories and Maintenance
Domain: Finance
Content: Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compared to cruisers,ADVs, and sport bikes.
Embedding (first 5): [-0.007442957255989313, 0.021816488355398178, 0.09829466044902802, -0.019319824874401093, 0.06200375780463219]
--------------------------------------------------------------------------------
Doc ID: 2
Title: Types of Motorcycles
Domain: Motorcycle Types
Content: There are several types of motorcycles including dirt bikes, cruisers, ADVs, and sport bikes.
Embedding (first 5): [0.003236997639760375, 0.048829589039087296, -0.0003227116249036044, -0.021307695657014847, -0.013017075136303902]
--------------------------------------------------------------------------------
Doc ID: 3
Title: Riding experience
Domain: Riding Motorcycles
Content: ADVS offer an upright riding position suitable for long-distance touring, while sport bikes have a forward-leaning

In [8]:
# Embed the user question with the same model that embedded the documents.
query = "What types of motorcycles are there?"
# encode() receives a one-element batch; unpack its single row.
(query_vector,) = model.encode([query])
query_vector[:5]  # preview: first 5 values only

array([-0.00600575,  0.06411073, -0.02026865,  0.00098146, -0.07386595],
      dtype=float32)

In [9]:
# Retrieve the 3 nearest documents by cosine similarity over the whole
# collection. No scalar filter is applied here; pass e.g.
# expr='domain == "Finance"' to restrict the search to a single domain.
results = collection.search(
    data=[query_vector],
    anns_field="embedding",
    param={"metric_type": "COSINE", "params": {"nprobe": 10}},
    limit=3,
    output_fields=["doc_id", "title", "domain", "content"],
)

# results[0] holds the hits for the single query vector.
# The loop variable is named `hit` (not `res`) so it does not overwrite the
# query result `res` that the display cell above still depends on.
context_parts = []
for hit in results[0]:
    entity = hit.entity
    print(f"doc_id={entity.get('doc_id')}, "
          f"title={entity.get('title')}, "
          f"domain={entity.get('domain')}, "
          f"content={entity.get('content')}, "
          f"score={hit.distance}")
    # Each retrieved passage is preceded by a " -- " separator line.
    context_parts.append(f"\n -- \n {entity.get('content')} ")

# Context handed to the LLM: all retrieved passages, best match first.
context_string = "".join(context_parts)

print("\nContext String for RAG:\n", context_string)

doc_id=2, title=Types of Motorcycles, domain=Motorcycle Types, content=There are several types of motorcycles including dirt bikes, cruisers, ADVs, and sport bikes., score=0.8455122113227844
doc_id=1, title=Cost of Accessories and Maintenance, domain=Finance, content=Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compared to cruisers,ADVs, and sport bikes., score=0.4227937161922455
doc_id=3, title=Riding experience, domain=Riding Motorcycles, content=ADVS offer an upright riding position suitable for long-distance touring, while sport bikes have a forward-leaning position designed for speed and agility.Dirt bikes provide a standing position for better control on rough terrains., score=0.32379022240638733

Context String for RAG:
 
 -- 
 There are several types of motorcycles including dirt bikes, cruisers, ADVs, and sport bikes. 
 -- 
 Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compa

In [10]:
from llm_utility import ask_question_open_ai
  
# Ask the LLM the very question that was embedded for retrieval (`query` from
# the encoding cell). Re-typing the literal here would let the question and
# the retrieved `context_string` silently drift apart.
response = ask_question_open_ai(query, context_string)
response

'- Dirt bikes\n- Cruisers\n- ADVs (adventure motorcycles)\n- Sport bikes'

In [11]:
# Recap of the whole RAG round trip: question, retrieved context, answer.
print(
    f"User query: {query}",
    f"Context: {context_string}",
    f"\n\nOpen AI Response: {response}",
    sep="\n",
)

User query: What types of motorcycles are there?
Context: 
 -- 
 There are several types of motorcycles including dirt bikes, cruisers, ADVs, and sport bikes. 
 -- 
 Depending on the motorcycle type, maintenance costs vary.Dirt bikes have lower maintenance costs compared to cruisers,ADVs, and sport bikes. 
 -- 
 ADVS offer an upright riding position suitable for long-distance touring, while sport bikes have a forward-leaning position designed for speed and agility.Dirt bikes provide a standing position for better control on rough terrains. 


Open AI Response: - Dirt bikes
- Cruisers
- ADVs (adventure motorcycles)
- Sport bikes


In [None]:
# RAG Evaluation - retrieval metrics
# Context_Recall: Did the system retrieve all relevant documents?
# Context_Precision: What proportion of retrieved documents are relevant?

# Worked example: the retriever returned passages unrelated to the question.
# NOTE: the name `input` shadows the Python builtin of the same name.
input = "What types of motorcycle are there?"
expected_context = (
    "There are several types of motorcycles including "
    "dirt bikes, cruisers, ADVs, and sport bikes."
)
# Retrieved passages, separated by " -- " lines (whitespace written out explicitly).
actual_context = (
    "\n Employees must submit a leave request for approval. "
    "\n -- "
    "\n Employees are paid bi-weekly via direct deposit. "
    "\n "
)
# Context_Recall: 0% (the expected passage is not among the retrieved ones)


# input = "What’s the leave policy?"
# expected_context = ["Employees are entitled to 20 days of paid leave annually."," They must submit a leave request for approval."]
# actual_context = ["Employees must submit a leave request for approval. ", " Employees are paid bi-weekly via direct deposit. ", " Company internet must be used for work-related tasks only. "]
# #Context_Recall: 1 out of 2 expected = 1/2 = 50%


# input = "What’s the leave policy?"
# expected_context = ["Employees are entitled to 20 days of paid leave annually."," They must submit a leave request for approval."]
# actual_context = ["They are entitled to 20 days of paid leave in a year."," They must submit a leave request for approval."]
# #Context_Recall: 2 out of 2 expected = 2/2 = 100%

In [None]:
# # Context_Precision: What proportion of retrieved documents are relevant?
# input = "What’s the leave policy?"
# expected_context = ["Employees are entitled to 20 days of paid leave annually."," They must submit a leave request for approval."]
# actual_context = ["Employees must submit a leave request for approval. ", " Employees are paid bi-weekly via direct deposit. ", " Company internet must be used for work-related tasks only. "]
# #Context_Precision: 1 out of 3 retrieved = 1/3 = 33%


# input = "What’s the leave policy?"
# expected_context = ["Employees are entitled to 20 days of paid leave annually."," They must submit a leave request for approval."]
# actual_context = ["They are entitled to 20 days of paid leave in a year."," They must submit a leave request for approval.", " Company internet must be used for work-related tasks only. "]
# #Context_Precision: 2 out of 3 retrieved = 2/3 = 67%

In [None]:
##### RAG Evaluation
### Retrieval Metrics
# Context_Recall: Did the system retrieve all relevant documents?
# Context_Precision: What proportion of retrieved documents are relevant?

### Generative Metrics
# Faithfulness: Is the generated output consistent with the provided context?
# Accuracy: Does the generated output match the ground-truth answer?


# F1 = 2 * (Context_Precision * Context_Recall) / (Context_Precision + Context_Recall)  (not used in this notebook)

In [None]:
### Generative Metrics - worked example
# Accuracy

# NOTE: the name `input` shadows the Python builtin of the same name.
input = "What's the leave policy?"
context = [
    "Employees are entitled to 20 days of paid leave annually.",
    " They must submit a leave request for approval.",
]

# Ground truth, i.e. the expected LLM output (scored with an LLM as a judge).
groundtruth = (
    " \n"
    "    Based on the provided context, the leave policy states that employees are entitled to 20 days of paid leave annually \n"
    "    and must submit a leave request for approval."
)

# What the model actually answered: it mentions only the approval request.
actual_llm_output = (
    "\n"
    "    Based on the provided context, the leave policy states that employees must submit a leave request for approval. \n"
    "    No other details are given. If you have more of the policy, I can summarize that as well."
)

# The answer covers one of the two facts stated in the ground truth.
accuracy = 0.50


# Faithfulness - Is the generated output consistent with the provided context?
faithfulness = 0.50