In [None]:
!pip install pymilvus sentence_transformers langchain

## Test Milvus deployment

In [45]:
from pymilvus import connections, utility

import os  # needed only for the optional environment overrides below

# Milvus endpoint used by every connect() call in this notebook.
# Set the MILVUS_HOST / MILVUS_PORT environment variables to target a
# deployment without editing this cell; when they are unset the original
# placeholder values are used unchanged.
MILVUS_HOST = os.environ.get("MILVUS_HOST", "HOST")  # or replace with external route URL if accessible
MILVUS_PORT = os.environ.get("MILVUS_PORT", "19530")  # Default port for Milvus gRPC

def test_milvus_connection():
    """Connect to Milvus and report whether the server is reachable.

    Opens the "default" connection with MILVUS_HOST / MILVUS_PORT and then
    probes the server by asking whether "example_collection" exists. The
    outcome is printed in every case.

    Returns:
        bool: True when both the connection and the probe succeeded, False
        when either raised (the error is printed, not propagated).
    """
    try:
        # connect() lives inside the try so that an unreachable or
        # misconfigured server is reported by the handler below instead of
        # escaping as a traceback before the check starts.
        connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

        # Probe the server with a cheap round trip: does the sample
        # collection exist?
        if utility.has_collection("example_collection"):
            print("Connection to Milvus successful, example collection exists.")
        else:
            print("Connection to Milvus successful, example collection does not exist.")
    except Exception as e:
        print("Failed to connect to Milvus:", e)
        return False
    return True

def create_sample_collection():
    """Create the "example_collection" test collection and return its handle.

    The schema consists of an auto-generated INT64 primary key named "id"
    and a 128-dimensional float vector field named "vector".
    """
    from pymilvus import Collection, FieldSchema, CollectionSchema, DataType

    # Schema: primary key first, then the vector column.
    primary_key = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=128)
    sample_schema = CollectionSchema([primary_key, vector_field], "Sample schema for testing")

    # Constructing the Collection with a schema creates it under this name.
    sample = Collection("example_collection", sample_schema)
    print("Collection created:", sample.name)
    return sample

def insert_sample_data(collection):
    """Insert ten random 128-dimensional float32 vectors into ``collection``.

    Only a single column (the vectors) is passed to ``collection.insert``.
    A confirmation naming the collection is printed afterwards.
    """
    import numpy as np

    # 10 rows x 128 dims of uniform random values, cast to float32.
    sample_vectors = np.random.random([10, 128]).astype("float32")

    collection.insert([sample_vectors])
    print("Sample data inserted into collection:", collection.name)

if __name__ == "__main__":
    print("Testing connection to Milvus:")
    test_milvus_connection()

    # Seed the server with the sample collection the first time through;
    # on later runs only report that it is already there.
    collection_name = "example_collection"
    if utility.has_collection(collection_name):
        print("Collection already exists.")
    else:
        print("Creating collection and inserting sample data.")
        collection = create_sample_collection()
        insert_sample_data(collection)

Testing connection to Milvus:
Connection to Milvus successful, example collection does not exist.
Creating collection and inserting sample data.
Collection created: example_collection
Sample data inserted into collection: example_collection


## Create Milvus Collection

In [46]:
import os
from pathlib import Path

import markdown
from langchain.text_splitter import MarkdownTextSplitter
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
from sentence_transformers import SentenceTransformer
# Vectorization model: the sentence-transformers encoder used to embed text chunks.
# The collection schema defined further down in this cell declares the
# embedding field with dim=1024 for the vectors this model produces.
model = SentenceTransformer("WhereIsAI/UAE-Large-V1")

# Connect to Milvus on the "default" alias, using the MILVUS_HOST / MILVUS_PORT
# values defined in the earlier connection-test cell.
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

def read_md_files(md_dir):
    """Read every Markdown file under ``md_dir`` and render each one to HTML.

    Args:
        md_dir: Directory searched recursively for ``*.md`` files.

    Returns:
        list[str]: One HTML string per file, ordered by file path. The list
        is empty when no Markdown files are found.
    """
    # rglob() yields files in filesystem order, which can differ between
    # machines and runs; sorting makes the ingestion order reproducible.
    md_paths = sorted(Path(md_dir).rglob("*.md"))

    # NOTE(review): the rendered HTML (not the Markdown source) is what
    # chunk_text() later feeds to MarkdownTextSplitter, so stored chunks
    # contain HTML tags. Confirm this is intended before changing it.
    return [
        markdown.markdown(md_path.read_text(encoding="utf-8"))
        for md_path in md_paths
    ]


def chunk_text(text, chunk_size=1000, chunk_overlap=500):
    """Split ``text`` into overlapping chunks with MarkdownTextSplitter.

    Args:
        text: The document text to split.
        chunk_size: Chunk size handed to the splitter (default 1000).
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter (default 500).

    Returns:
        Whatever ``MarkdownTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return MarkdownTextSplitter(**splitter_options).split_text(text)

def vectorize_and_insert(collection, md_dir):
    """Chunk, vectorize, and insert Markdown text data into Milvus.

    Every file returned by ``read_md_files(md_dir)`` is split with
    ``chunk_text`` and embedded with the module-level ``model``. All chunks
    and their embeddings are then inserted in a single call.

    Args:
        collection: Collection handle whose ``insert`` receives two columns:
            the text chunks, then the matching embeddings.
        md_dir: Directory handed to ``read_md_files``.

    Returns:
        None. A summary line is printed; when nothing was found to insert,
        a notice is printed and ``collection.insert`` is not called.
    """
    all_text_chunks = []
    all_vectors = []

    for md_text in read_md_files(md_dir):
        chunks = chunk_text(md_text)
        if not chunks:
            # Nothing to encode for a document that produced no chunks.
            continue
        embeddings = model.encode(chunks, show_progress_bar=True)

        # Keep the two columns aligned: chunk i pairs with embedding i.
        all_text_chunks.extend(chunks)
        all_vectors.extend(embeddings)

    if not all_text_chunks:
        # No files (or only empty ones): skip the insert instead of sending
        # empty columns to Milvus.
        print(f"No text chunks found under '{md_dir}'; nothing inserted into Milvus.")
        return

    # Insert into Milvus
    collection.insert([all_text_chunks, all_vectors])
    print(f"Inserted {len(all_vectors)} vectors into Milvus.")


def create_collection(collection_name, markdown_directory):
    """Open or create a collection, make sure it is indexed, then ingest Markdown.

    Args:
        collection_name: Name of the Milvus collection to open or create.
        markdown_directory: Directory passed on to ``vectorize_and_insert``.

    The schema has an auto-generated INT64 primary key ("id"), a VARCHAR
    "text_chunk" field (max_length=1024) and a 1024-dimensional float vector
    field ("embedding"). If the collection reports no index, an IVF_FLAT / L2
    index with nlist=128 is created on "embedding".
    """
    chunk_schema = CollectionSchema(
        [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="text_chunk", dtype=DataType.VARCHAR, max_length=1024),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024),
        ],
        "Schema for Markdown file vectors",
    )

    # An existing collection is opened by name alone; the schema is only
    # supplied when the collection has to be created.
    if utility.has_collection(collection_name):
        target = Collection(name=collection_name)
    else:
        target = Collection(name=collection_name, schema=chunk_schema)

    # Create the vector index only when the collection does not have one yet.
    if target.has_index():
        print("Index already exists on 'embedding' field.")
    else:
        target.create_index(
            field_name="embedding",
            index_params={
                "metric_type": "L2",
                "index_type": "IVF_FLAT",
                "params": {"nlist": 128},
            },
        )
        print("Index created on 'embedding' field.")

    vectorize_and_insert(target, markdown_directory)

In [47]:
# Build (or reuse) the "product_details" collection and ingest the Markdown
# files found under ../docs/product_details.
collection_name = "product_details"
markdown_directory = "../docs/product_details"
create_collection(collection_name, markdown_directory)

Index created on 'embedding' field.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserted 226 vectors into Milvus.


In [48]:
# Build (or reuse) the "HR_policies" collection and ingest the Markdown
# files found under ../docs/HR_policies.
collection_name = "HR_policies"
markdown_directory = "../docs/HR_policies"
create_collection(collection_name, markdown_directory)

Index created on 'embedding' field.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Inserted 72 vectors into Milvus.


In [49]:
# Build (or reuse) the "customer_accounts" collection and ingest the Markdown
# files found under ../docs/customer_accounts.
collection_name = "customer_accounts"
markdown_directory = "../docs/customer_accounts"
create_collection(collection_name, markdown_directory)

Index created on 'embedding' field.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inserted 75 vectors into Milvus.


In [31]:
def delete_collection(collection_name, host=MILVUS_HOST, port=MILVUS_PORT):
    """
    Deletes a specified collection in Milvus.

    Parameters:
        collection_name (str): The name of the collection to delete.
        host (str): The host address of the Milvus server. Defaults to MILVUS_HOST.
        port (str): The port of the Milvus server. Defaults to MILVUS_PORT.
    """
    # Connect to Milvus. host/port are passed by keyword, as in the other
    # connect() calls in this notebook: pymilvus takes the server address as
    # keyword arguments, and the positional slots after the alias are for
    # credentials, so positional host/port would not select the server.
    connections.connect("default", host=host, port=port)

    # Drop the collection
    collection = Collection(collection_name)
    collection.drop()
    print(f"Collection '{collection_name}' has been successfully deleted.")

# delete_collection("customer_accounts")
# delete_collection("product_details")
# delete_collection("HR_policies")


## Query Milvus Collection

In [50]:
from pymilvus import Collection
from sentence_transformers import SentenceTransformer

# Initialize the embedding model (same as used for data insertion)
# NOTE(review): this rebinds the global `model` to a new instance of the same
# model name that the ingestion cell already loaded.
model = SentenceTransformer("WhereIsAI/UAE-Large-V1")

def query_milvus(collection_name, query_text, top_k=3):
    """
    Embed a query string and return its nearest stored chunks from Milvus.

    Parameters:
    - collection_name (str): Name of the Milvus collection to search.
    - query_text (str): The text to embed and search for.
    - top_k (int): Maximum number of matches to return (default is 3).

    Returns:
    - List of dictionaries with the keys "id", "text_chunk" and "score",
      where "score" is the distance reported by the search (L2 metric, so
      lower means closer).
    """
    target = Collection(name=collection_name)
    # Ensure the collection is loaded into memory before searching.
    target.load()

    # encode() takes a batch; a single text goes in, so take the first vector.
    query_vector = model.encode([query_text])[0]

    hits_per_query = target.search(
        data=[query_vector],
        anns_field="embedding",
        param={"metric_type": "L2", "params": {"nprobe": 10}},
        limit=top_k,
        output_fields=["text_chunk"],  # return the stored text with each hit
    )

    # One query vector was sent, so only the first hit list is relevant.
    return [
        {
            "id": hit.id,
            "text_chunk": hit.entity.get("text_chunk"),
            "score": hit.distance,
        }
        for hit in hits_per_query[0]
    ]


In [51]:
# Example query against the "product_details" collection; prints the list of
# matches returned by query_milvus (default top_k).
collection_name = "product_details"
query_text = "What is CloudForge Migrate?"
top_matches = query_milvus(collection_name, query_text)
print("Top matches:", top_matches)

Top matches: [{'id': 453745768502133076, 'text_chunk': '<h2><strong>1. CloudForge Migrate</strong></h2>\n<h3><strong>Detailed Description</strong></h3>\n<p><strong>CloudForge Migrate</strong> is an all-in-one cloud migration platform designed to simplify the complex process of moving applications, data, and infrastructure to the cloud. It provides a seamless transition from on-premises or legacy systems to modern cloud environments, supporting public, private, and hybrid cloud models across major providers like AWS, Microsoft Azure, and Google Cloud Platform.</p>\n<p><strong>Key Components:</strong></p>\n<ul>\n<li><strong>Migration Assessment Module:</strong> Analyzes your existing IT landscape to create a detailed migration plan, identifying dependencies, potential risks, and optimization opportunities.</li>\n<li><strong>Data Migration Engine:</strong> Handles the secure transfer of databases and files, ensuring data integrity and minimal downtime.</li>', 'score': 88.76605224609375}, 

In [52]:
# Example query against the "HR_policies" collection; prints the list of
# matches returned by query_milvus (default top_k).
collection_name = "HR_policies"
query_text = "How can I join the first day?"
top_matches = query_milvus(collection_name, query_text)
print("Top matches:", top_matches)

Top matches: [{'id': 453745768502133143, 'text_chunk': '<li>Prepare any questions for the <strong>HR Benefits Session</strong> on your first day.</li>\n<li>Decide on health insurance plan options and dependent coverage.</li>\n</ul>\n<h3><strong>3. First Day Logistics</strong></h3>\n<ul>\n<li><strong>Start Date and Time:</strong> Confirmed with your HR representative.</li>\n<li><strong>Office Address:</strong> 1234 Innovation Drive, Tech City, USA.</li>\n<li><strong>Parking and Transportation:</strong></li>\n<li>Parking pass details (if applicable).</li>\n<li>Public transportation options.</li>\n<li><strong>Dress Code:</strong> Business casual attire recommended for the first day.</li>\n<li><strong>Contact Information:</strong></li>\n<li><strong>HR Representative:</strong> [Name], [Email], [Phone].</li>\n</ul>\n<hr />\n<h2><strong>First Day Orientation</strong></h2>\n<h3><strong>1. Welcome Session</strong></h3>\n<ul>\n<li><strong>Time:</strong> 9:00 AM</li>\n<li><strong>Location:</stron

In [53]:
# Example query against the "customer_accounts" collection; prints the list
# of matches returned by query_milvus (default top_k).
collection_name = "customer_accounts"
query_text = "top accounts?"
top_matches = query_milvus(collection_name, query_text)
print("Top matches:", top_matches)

Top matches: [{'id': 453745768502133235, 'text_chunk': '<strong>Account Number:</strong> CF-MS-2023<br />\n<strong>Period Covered:</strong> January 1, 2023 – December 31, 2023</p>\n<hr />\n<h2><strong>Account Summary</strong></h2>\n<p>| Description                               | Amount (USD)   |\n|-------------------------------------------|----------------|\n| <strong>Opening Balance (January 1, 2023)</strong>     | $0.00          |\n| <strong>Total Invoiced</strong>                        | $495,000.00    |\n| <strong>Total Payments Received</strong>               | $495,000.00    |\n| <strong>Closing Balance (December 31, 2023)</strong>   | <strong>$0.00</strong>      |</p>\n<hr />\n<h2><strong>Detailed Transaction History</strong></h2>\n<h3><strong>Invoices Issued</strong></h3>\n<p>| Date         | Invoice #     | Description                                              | Amount (USD)   |\n|--------------|---------------|----------------------------------------------------------|-