In [108]:
from pymilvus import connections

# Open the "default" connection alias against the Milvus server on localhost.
connections.connect(
    alias="default",
    host="127.0.0.1",
    port="19530",
)

In [109]:
from pymilvus import db
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType

# 1. Make sure the database exists before switching to it.
#    Calling create_database() unconditionally fails when the database is
#    already there, while skipping it fails on a fresh server, so check first.
DB_NAME = "rag_db"
if DB_NAME not in db.list_database():
    db.create_database(DB_NAME)

# 2. Switch to that database
db.using_database(DB_NAME)

# ----- Create schema -----
# doc_id is a caller-supplied primary key (auto_id=False); the embedding
# dimension (384) must match the output size of the embedding model.
fields = [
    FieldSchema("doc_id", DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema("title", DataType.VARCHAR, max_length=200),
    FieldSchema("domain", DataType.VARCHAR, max_length=100),
    FieldSchema("content", DataType.VARCHAR, max_length=2000),
    FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=384),
]

schema = CollectionSchema(fields, description="Policy documents with embeddings")
collection = Collection("policy_docs_7", schema)

# ----- Create index -----
# IVF_FLAT index over the embedding field, compared with cosine similarity.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128},
}
collection.create_index(field_name="embedding", index_params=index_params)

Status(code=0, message=)

In [110]:
# ----- Example data -----
# Field names shared by every row below, in the order the values are listed.
_chunk_fields = ("doc_id", "section", "title", "domain", "content")

# One tuple per example chunk: (doc_id, section, title, domain, content).
_chunk_rows = [
    (
        1,
        "face packs",
        "Applying fruit face packs",
        "Beauty tips",
        "Applying fruit face packs on the face is a natural and refreshing way to glow the skin.",
    ),
    (
        2,
        "cosmetic treatment",
        "chemical peel on skin",
        "Beauty tips",
        "A chemical peel is a cosmetic treatment that uses a chemical solution to remove the outer layer of dead skin cells.",
    ),
    (
        3,
        "Investment in stocks",
        "Stocks to make more money",
        "Stock market",
        "Investing in stocks with strong growth potential, like technology or healthcare companies, can help grow your wealth over time. Diversifying across sectors and holding for the long term reduces risk while maximizing returns.",
    ),
    (
        4,
        "Investment",
        "mutual funds",
        "Stock market",
        "Mutual funds pool money from multiple investors to invest in a diversified portfolio of stocks, bonds, or other assets. They offer professional management and risk spreading, making investing easier for individuals.",
    ),
    (
        5,
        "Hair Application",
        "Products to make for hair care",
        "Haircare",
        "Hair care involves maintaining healthy, strong, and shiny hair through proper cleansing, conditioning, and nourishment. Using the right products and treatments can prevent damage, promote growth, and enhance overall hair appearance.",
    ),
    (
        6,
        "mud face packs",
        "Applying mud face packs",
        "Beauty tips",
        "Mud face packs cleanse and detoxify the skin, removing impurities while nourishing and refreshing for a healthy glow.",
    ),
    (
        7,
        "skin treatment",
        "Microdermabrasion",
        "Beauty tips",
        "Gently exfoliates to improve texture and radiance.",
    ),
    (
        8,
        "skin laser treatment",
        "Laser Treatments",
        "Beauty tips",
        "Reduce pigmentation, fine lines, and uneven tone.",
    ),
    (
        9,
        "Hydrafacial / Oxygen Therapy",
        "skin therapy",
        "Beauty tips",
        "Hydrates and rejuvenates skin for instant glow.",
    ),
]

# Expand each row into a dict keyed by field name.
content_chunks = [dict(zip(_chunk_fields, row)) for row in _chunk_rows]


# Only the text body of each chunk is embedded.
content_chunks_list = [entry["content"] for entry in content_chunks]
print(content_chunks_list)

from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model and encode every chunk in one batch;
# the cell displays the shape of the resulting embedding matrix.
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_vectors = model.encode(content_chunks_list)
doc_vectors.shape

['Applying fruit face packs on the face is a natural and refreshing way to glow the skin.', 'A chemical peel is a cosmetic treatment that uses a chemical solution to remove the outer layer of dead skin cells.', 'Investing in stocks with strong growth potential, like technology or healthcare companies, can help grow your wealth over time. Diversifying across sectors and holding for the long term reduces risk while maximizing returns.', 'Mutual funds pool money from multiple investors to invest in a diversified portfolio of stocks, bonds, or other assets. They offer professional management and risk spreading, making investing easier for individuals.', 'Hair care involves maintaining healthy, strong, and shiny hair through proper cleansing, conditioning, and nourishment. Using the right products and treatments can prevent damage, promote growth, and enhance overall hair appearance.', 'Mud face packs cleanse and detoxify the skin, removing impurities while nourishing and refreshing for a h

(9, 384)

In [111]:
# ---- Build columnar data ----
# The primary key is read from each chunk rather than regenerated from the
# list position, so stored IDs stay correct if chunks are reordered or their
# doc_id values are not exactly 1..N.
doc_ids = [int(doc["doc_id"]) for doc in content_chunks]               # INT64
titles = [str(doc["title"]) for doc in content_chunks]                 # VARCHAR
domains = [str(doc["domain"]) for doc in content_chunks]               # VARCHAR
content = [str(doc["content"]) for doc in content_chunks]              # VARCHAR
embeddings = [list(map(float, vec)) for vec in doc_vectors]            # FLOAT_VECTOR(384)

# Every column needs exactly one entry per document, and primary keys must be unique.
assert len(embeddings) == len(doc_ids), "embedding count does not match document count"
assert len(set(doc_ids)) == len(doc_ids), "duplicate doc_id in content_chunks"

# ---- Insert column-wise ----
# Columns are passed in schema field order: doc_id, title, domain, content, embedding.
# NOTE(review): re-running this cell calls insert() again with the same primary
# keys — confirm whether that is acceptable for this collection.
collection.insert([doc_ids, titles, domains, content, embeddings])
collection.flush()

print(f"Successfully inserted {len(doc_ids)} documents into Milvus.")

Successfully inserted 9 documents into Milvus.


In [112]:
# The collection has to be loaded before it can be searched or queried.
collection.load()

# Fetch the rows whose doc_id is positive, including the stored embedding.
res = collection.query(
    expr="doc_id > 0",
    output_fields=["doc_id", "title", "domain", "content", "embedding"],
)
print(res)

data: ["{'doc_id': 1, 'title': 'Applying fruit face packs', 'domain': 'Beauty tips', 'content': 'Applying fruit face packs on the face is a natural and refreshing way to glow the skin.', 'embedding': [-0.03787931799888611, 0.010238511487841606, -0.025692349299788475, 0.0499643012881279, 0.03625587746500969, 0.037938810884952545, 0.07119961827993393, 0.0023059803061187267, -0.04571813717484474, 0.008909271098673344, 0.10150840878486633, -0.042613666504621506, -0.016550103202462196, 0.009247465059161186, 0.15519021451473236, 0.05009282007813454, 0.031562089920043945, 0.03355611860752106, 0.03746639937162399, -0.06924903392791748, 0.0603962279856205, -0.04284476488828659, -0.07724182307720184, -0.037022363394498825, 0.0012194616720080376, -0.010085610672831535, -0.023902414366602898, 0.020580770447850227, 0.02485944703221321, -0.0251773651689291, 0.04257441684603691, 0.015089735388755798, 0.026829242706298828, 0.019675536081194878, -0.07590028643608093, -0.014166619628667831, 0.0264973938

In [113]:
# Print each queried record as a short block followed by a divider line.
for record in res:
    print(
        f"Doc ID: {record['doc_id']}",
        f"Title: {record['title']}",
        f"Domain: {record['domain']}",
        f"Content: {record['content']}",
        # Only the first 5 embedding values are shown, for readability
        f"Embedding (first 5): {record['embedding'][:5]}",
        "-" * 80,
        sep="\n",
    )

Doc ID: 1
Title: Applying fruit face packs
Domain: Beauty tips
Content: Applying fruit face packs on the face is a natural and refreshing way to glow the skin.
Embedding (first 5): [-0.03787931799888611, 0.010238511487841606, -0.025692349299788475, 0.0499643012881279, 0.03625587746500969]
--------------------------------------------------------------------------------
Doc ID: 2
Title: chemical peel on skin
Domain: Beauty tips
Content: A chemical peel is a cosmetic treatment that uses a chemical solution to remove the outer layer of dead skin cells.
Embedding (first 5): [-0.012566202320158482, 0.01626780442893505, 0.02784735895693302, 0.060777872800827026, 0.02240844815969467]
--------------------------------------------------------------------------------
Doc ID: 3
Title: Stocks to make more money
Domain: Stock market
Content: Investing in stocks with strong growth potential, like technology or healthcare companies, can help grow your wealth over time. Diversifying across sectors and h

In [114]:
import numpy as np

# Embed the user question with the same model that produced doc_vectors.
query = "Tips to help skin quality?"
(query_vector,) = model.encode([query])  # batch of one -> unpack its single row
query_vector[:5]  # preview the first 5 values only

array([0.01582566, 0.03938188, 0.07516891, 0.03077848, 0.00057622],
      dtype=float32)

In [115]:
# NOTE(review): numpy is already imported in an earlier cell, so this install
# runs after first use, and no version is pinned — consider moving it to the
# top of the notebook (with a pinned version) or removing it.
%pip install numpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Score the query against every document embedding using the model's
# similarity function.
similarities = model.similarity(query_vector, doc_vectors)

# Flatten to a 1-D numpy array. reshape(-1) is used instead of squeeze():
# with exactly one document, squeeze() would collapse the scores to a 0-d
# array, which the ranking/indexing below cannot handle.
similarities = np.asarray(similarities).reshape(-1)

# Indices of the 3 highest scores, best first (fewer if there are < 3 documents).
top_3_indices = np.argsort(similarities)[::-1][:3]
top_scores = similarities[top_3_indices]

In [117]:
# Display the similarity scores selected by top_3_indices.
top_scores

array([0.577085  , 0.5431898 , 0.42794243], dtype=float32)

In [118]:
# Text of each top-ranked chunk, in ranking order.
top_docs = [content_chunks[i]['content'] for i in top_3_indices]

# Join the chunks with a "---" separator line to form the context passed to
# the LLM. The separator is a plain string: it has no placeholders, so the
# f-prefix was unnecessary.
context = "\n---\n".join(top_docs)
context

'Gently exfoliates to improve texture and radiance.\n---\nReduce pigmentation, fine lines, and uneven tone.\n---\nHydrates and rejuvenates skin for instant glow.'

In [119]:
# Search for closest match only in the 'Beauty tips' domain.
# NOTE: the Milvus search below is wrapped in a triple-quoted string, so it is
# NOT executed — the cell only evaluates to that string and echoes it as output.
# If it is re-enabled, be aware that it accumulates the retrieved text in
# `context_sring` (sic), whereas the LLM call later in the notebook reads
# `context`, and that its loop variable `res` would overwrite the earlier
# query result of the same name.
"""
results = collection.search(
    data=[query_vector],
    anns_field="embedding",
    param={"metric_type": "COSINE", "params": {"nprobe": 10}},
    limit=3,
    expr='domain == "Beauty tips"',
    output_fields=["doc_id", "title", "domain", "content"]
)

context_sring = ""
for res in results[0]:
    print(f"doc_id={res.entity.get('doc_id')}, "
          f"title={res.entity.get('title')}, "
          f"domain={res.entity.get('domain')}, "
          f"content={res.entity.get('content')}, "
          f"score={res.distance}")
    context_sring += f"\n -- \n {res.entity.get('content')} " # Append content to context string

print("\nContext String for RAG:\n", context_sring)  ##
"""

'\nresults = collection.search(\n    data=[query_vector],\n    anns_field="embedding",\n    param={"metric_type": "COSINE", "params": {"nprobe": 10}},\n    limit=3,\n    expr=\'domain == "Beauty tips"\',\n    output_fields=["doc_id", "title", "domain", "content"]\n)\n\ncontext_sring = ""\nfor res in results[0]:\n    print(f"doc_id={res.entity.get(\'doc_id\')}, "\n          f"title={res.entity.get(\'title\')}, "\n          f"domain={res.entity.get(\'domain\')}, "\n          f"content={res.entity.get(\'content\')}, "\n          f"score={res.distance}")\n    context_sring += f"\n -- \n {res.entity.get(\'content\')} " # Append content to context string\n\nprint("\nContext String for RAG:\n", context_sring)  ##\n'

In [120]:
from model_utility import ask_question_open_ai 

# Ask the LLM the same question that was embedded for retrieval. `query` is
# reused from the query-embedding cell instead of being re-typed here, so the
# question and the retrieved `context` cannot silently drift apart.
response = ask_question_open_ai(query, context)
response

'- Gently exfoliate to improve texture and radiance.\n- Focus on reducing pigmentation, fine lines, and uneven tone.\n- Hydrate and rejuvenate the skin for an instant glow.'

In [121]:
# Recap of the RAG round trip: the question and the retrieved context first...
print(
    f"User query: {query}",
    f"Context: {context}",
    sep="\n",
)

# ...then the model's answer, set apart by two blank lines.
print(f"\n\nOpen AI Response: {response}")

User query: Tips to help skin quality?
Context: Gently exfoliates to improve texture and radiance.
---
Reduce pigmentation, fine lines, and uneven tone.
---
Hydrates and rejuvenates skin for instant glow.


Open AI Response: - Gently exfoliate to improve texture and radiance.
- Focus on reducing pigmentation, fine lines, and uneven tone.
- Hydrate and rejuvenate the skin for an instant glow.
