In [None]:
from pymilvus import MilvusClient

# Bind an actual client instance. The previous code assigned the MilvusClient
# class itself, so the method call that followed had no `self` and raised a
# TypeError before anything reached the server.
# NOTE(review): no URI is given, so pymilvus' default endpoint is used — confirm
# this is the same server that `connections.connect()` targets further down.
client = MilvusClient()
# NOTE(review): the argument-less `client.create_collection()` call was removed:
# it named no collection and therefore could not succeed. The demo collection is
# created further down through the ORM `Collection(...)` API.

In [68]:
from datasets import load_dataset

from pymilvus import (
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    AnnSearchRequest,
    RRFRanker,
    connections,
)

from pymilvus.model.hybrid import BGEM3EmbeddingFunction

In [69]:
# Load the ESCI training split, keep only its first 500 rows, and from those
# retain the rows whose product locale is "us".
dataset = (
    load_dataset("tasksource/esci", split="train")
    .select(range(500))
    .filter(lambda row: row["product_locale"] == "us")
)
dataset

Dataset({
    features: ['example_id', 'query', 'query_id', 'product_id', 'product_locale', 'esci_label', 'small_version', 'large_version', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', 'product_text'],
    num_rows: 427
})

In [70]:
# Columns that must be jointly unique and non-null for a row to be kept.
key_columns = ["product_text", "product_title", "product_bullet_point", "product_brand"]

# Keep the untouched pandas copy around as `source_df`; `df` is the cleaned view.
source_df = dataset.to_pandas()
df = source_df.drop_duplicates(subset=key_columns).dropna(subset=key_columns)
df.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,product_title,product_description,product_bullet_point,product_brand,product_color,product_text
0,0,revent 80 cfm,0,B000MOO21W,us,Irrelevant,0,1,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,,WhisperCeiling fans feature a totally enclosed...,Panasonic,White,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...
2,1,revent 80 cfm,0,B07X3Y6B1V,us,Exact,0,1,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,Homewerks 7141-80 Bathroom Fan Integrated LED ...
3,2,revent 80 cfm,0,B07WDM7MQQ,us,Exact,0,1,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...
4,3,revent 80 cfm,0,B07RH6Z8KW,us,Exact,0,1,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White,Delta Electronics RAD80L BreezRadiance 80 CFM ...
5,4,revent 80 cfm,0,B07QJ7WYFQ,us,Exact,0,1,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White,Panasonic FV-08VRE2 Ventilation Fan with Reces...


In [71]:
# Number of rows left after de-duplication and null filtering.
len(df)

143

In [72]:
# Text that is embedded and stored for each product: title, full product text
# and bullet points, joined with newlines (in that order).
df["merged_text"] = df["product_title"].str.cat(
    [df["product_text"], df["product_bullet_point"]], sep="\n"
)

In [73]:
# Plain Python list of the merged strings; this is what gets passed to the
# embedding function.
docs = df["merged_text"].tolist()
len(docs)

143

In [74]:
# BGE-M3 embedding function, run on CPU with fp16 disabled.
ef = BGEM3EmbeddingFunction(device="cpu", use_fp16=False)
# Dense vector dimensionality reported by the embedder; used for the schema.
dense_dim = ef.dim["dense"]

query = "Do you have an example of a Panasonic product?"

# Each call returns a dict with "dense" and "sparse" entries.
docs_embeddings = ef(docs)
query_embeddings = ef([query])

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [75]:
# Inspect the embedding of the sample query: a dict with a "dense" list of
# arrays and a "sparse" matrix (one row per query).
query_embeddings

{'dense': [array([-0.02941401, -0.02504556, -0.0574033 , ..., -0.05055605,
         -0.06463919, -0.00945763], dtype=float32)],
 'sparse': <1x250002 sparse array of type '<class 'numpy.float32'>'
 	with 10 stored elements in Compressed Sparse Row format>}

In [76]:
# Open the default pymilvus connection (no arguments, so library defaults apply).
connections.connect()

fields = [
    # Use auto generated id as primary key
    FieldSchema(
        name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100
    ),
    # Raw document text; returned with search hits through `output_fields`.
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
    # One sparse and one dense vector per document, both produced by `ef`.
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
]
schema = CollectionSchema(fields, "")
col = Collection("sparse_dense_demo", schema)

# Index each vector field with the metric that the search requests use later.
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
dense_index = {"index_type": "FLAT", "metric_type": "COSINE"}
col.create_index("sparse_vector", sparse_index)
col.create_index("dense_vector", dense_index)

# Milvus only serves search requests on a loaded collection. Without this call
# the search cells below fail on a fresh kernel, so load explicitly once both
# indexes exist.
col.load()

Status(code=0, message=)

In [78]:
# Column-oriented payload, in schema order for the fields that are not
# auto-generated: text, sparse_vector, dense_vector ("pk" uses auto_id).
text_column = df["merged_text"].tolist()
sparse_column = docs_embeddings["sparse"]
dense_column = docs_embeddings["dense"]

entities = [text_column, sparse_column, dense_column]
col.insert(entities)

(insert count: 143, delete count: 0, upsert count: 0, timestamp: 449418190134706177, success count: 143, err count: 0)

In [79]:
def query_hybrid_search(query: str, limit: int = 2):
    """Run a sparse + dense hybrid search for a single query string.

    The query is embedded with ``ef``; one ANN request is issued against
    ``sparse_vector`` (IP metric) and one against ``dense_vector`` (COSINE
    metric), and the two candidate lists are fused with ``RRFRanker``.

    Args:
        query: Free-text query to embed and search for.
        limit: Number of hits requested from each ANN request and returned
            after re-ranking. Defaults to 2, the previously hard-coded value.

    Returns:
        The result object produced by ``col.hybrid_search``; each hit carries
        the stored ``text`` field.
    """
    query_embeddings = ef([query])

    sparse_req = AnnSearchRequest(
        query_embeddings["sparse"], "sparse_vector", {"metric_type": "IP"}, limit=limit
    )
    dense_req = AnnSearchRequest(
        query_embeddings["dense"], "dense_vector", {"metric_type": "COSINE"}, limit=limit
    )

    return col.hybrid_search(
        [sparse_req, dense_req], rerank=RRFRanker(), limit=limit, output_fields=["text"]
    )

In [80]:
# Hybrid search for a brand-specific question. Only one query is sent, and
# `[0]` selects the first entry of the returned results (presumably the hit
# list for that single query).
query_hybrid_search("Do you have a Homewerks product?")[0]

['id: 449353344520491318, distance: 0.032786883413791656, entity: {\'text\': "Homewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM\\nHomewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM\\nHomewerks\\n80 CFM\\nNone\\nOUTSTANDING PERFORMANCE: This Homewerk\'s bath fan ensures comfort in your home by quietly eliminating moisture and humidity in the bathroom. This exhaust fan is 1.1 sones at 80 CFM which means it’s able to manage spaces up to 80 square feet and is very quiet..\\nBATH FANS HELPS REMOVE HARSH ODOR: When cleaning the bathroom or toilet, harsh chemicals are used and they can leave an obnoxious odor behind. Homewerk’s bathroom fans can help remove this odor with its powerful ventilation\\nBUILD QUALITY: Designed to be corrosion resistant with its galvanized steel construction featuring a modern style round shape and has an 4000K Cool White Light LED Light. AC motor.\\nEASY IN

In [90]:
def query_dense_search(query: str, limit: int = 2):
    """Run a dense-only vector search for a single query string.

    The query is embedded with ``ef`` and only its dense embedding is searched
    against the ``dense_vector`` field using the COSINE metric.

    Args:
        query: Free-text query to embed and search for.
        limit: Maximum number of hits to return. Defaults to 2, the previously
            hard-coded value.

    Returns:
        The result object produced by ``col.search``; each hit carries the
        stored ``text`` field.
    """
    query_embeddings = ef([query])
    search_param = {
        "data": query_embeddings["dense"],
        "anns_field": "dense_vector",
        "param": {"metric_type": "COSINE"},
        "limit": limit,
        "output_fields": ["text"],
    }
    return col.search(**search_param)

In [123]:
# Dense-only baseline for a keyword-style query; compare its top hit with the
# hybrid result for the same query.
query_dense_search("shipping included")

['[\'id: 449353344520491390, distance: 0.5341320037841797, entity: {\\\'text\\\': \\\'BAZIC Self Seal White Envelope 3 5/8" x 6 1/2" #6, No Window Mailing Envelopes, Peel & Seal Mailer for Business Invoice Check (100/Pack), 1-Pack\\\\nBAZIC Self Seal White Envelope 3 5/8" x 6 1/2" #6, No Window Mailing Envelopes, Peel & Seal Mailer for Business Invoice Check (100/Pack), 1-Pack\\\\nBAZIC Products\\\\n#6 3/4 (100-count)\\\\n<p><strong>BACK TO BAZIC</strong></p> <p>Our goal is to provide each customer with long-lasting supplies at an affordable cost. Since 1998, we’ve delivered on this promise and will only continue to improve every year. We’ve built our brand on integrity and quality, so customers know exactly what to expect.</p> <p><strong>COMMITTED TO VALUES</strong></p> <p>We are a value-driven company, guided by the principles of excellence through strong product design at low cost. Our commitment to these values is reflected in our dedication to improving current products and develo

In [122]:
# Same query through the hybrid path; here the top hit's text contains the
# literal phrase "shipping included".
query_hybrid_search("shipping included")

['[\'id: 449353344520491358, distance: 0.016393441706895828, entity: {\\\'text\\\': \\\'ASURION 4 Year Home Improvement Protection Plan $20-29.99\\\\nASURION 4 Year Home Improvement Protection Plan $20-29.99\\\\nASURION\\\\nNone\\\\nAsurion is taking the guesswork out of finding product protection plans to fit your needs. Products fail - often at the most inconvenient time. It’s a good thing you’re covered because no other plan can protect your stuff the way an Asurion Protection Plan can. Simply put, Asurion Protection Plans cover your products when you need it most with a fast and easy claims process. Buy a protection plan from a company that you know and trust. Add an Asurion Protection Plan to your cart today! Please see "User Guide [pdf]" below for detailed terms and conditions related to this plan.\\\\nNO ADDITIONAL COST: You pay $0 for repairs – parts, labor and shipping included.\\\\nCOVERAGE: Plan starts on the date of purchase. Drops, spills and cracked screens due to normal 

In [92]:
# Preview the first 20 cleaned rows, now including the `merged_text` column.
df.head(20)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,product_title,product_description,product_bullet_point,product_brand,product_color,product_text,merged_text
0,0,revent 80 cfm,0,B000MOO21W,us,Irrelevant,0,1,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,,WhisperCeiling fans feature a totally enclosed...,Panasonic,White,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...
2,1,revent 80 cfm,0,B07X3Y6B1V,us,Exact,0,1,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,Homewerks 7141-80 Bathroom Fan Integrated LED ...,Homewerks 7141-80 Bathroom Fan Integrated LED ...
3,2,revent 80 cfm,0,B07WDM7MQQ,us,Exact,0,1,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...
4,3,revent 80 cfm,0,B07RH6Z8KW,us,Exact,0,1,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White,Delta Electronics RAD80L BreezRadiance 80 CFM ...,Delta Electronics RAD80L BreezRadiance 80 CFM ...
5,4,revent 80 cfm,0,B07QJ7WYFQ,us,Exact,0,1,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White,Panasonic FV-08VRE2 Ventilation Fan with Reces...,Panasonic FV-08VRE2 Ventilation Fan with Reces...
6,5,revent 80 cfm,0,B076Q7V5WX,us,Exact,0,1,Panasonic FV-0511VQ1 WhisperCeiling DC Ventila...,,Installation: Features a 4-inch or 6-inch duct...,Panasonic,White,Panasonic FV-0511VQ1 WhisperCeiling DC Ventila...,Panasonic FV-0511VQ1 WhisperCeiling DC Ventila...
11,6,revent 80 cfm,0,B075ZBF9HG,us,Exact,0,1,Panasonic FV-0510VSL1 WhisperValue DC Ventilat...,,Installation: Features a low profile can ideal...,Panasonic,White,Panasonic FV-0510VSL1 WhisperValue DC Ventilat...,Panasonic FV-0510VSL1 WhisperValue DC Ventilat...
14,7,revent 80 cfm,0,B06W2LB17J,us,Exact,0,1,Panasonic FV-0510VS1 WhisperValue DC Ventilati...,,Installation: Features a low profile can ideal...,Panasonic,White,Panasonic FV-0510VS1 WhisperValue DC Ventilati...,Panasonic FV-0510VS1 WhisperValue DC Ventilati...
17,8,revent 80 cfm,0,B07JY1PQNT,us,Exact,0,1,Aero Pure ABF80 L5 W ABF80L5 Ceiling Mount 80 ...,,"Quiet 0.3 Sones, 80 CFM fan with choice of thr...",Aero Pure,White,Aero Pure ABF80 L5 W ABF80L5 Ceiling Mount 80 ...,Aero Pure ABF80 L5 W ABF80L5 Ceiling Mount 80 ...
18,9,revent 80 cfm,0,B01MZIK0PI,us,Exact,0,1,Delta Electronics (Americas) Ltd. RAD80 Delta ...,,Quiet operation at 1.5 Sones\nPrecision engine...,DELTA ELECTRONICS (AMERICAS) LTD.,With Heater,Delta Electronics (Americas) Ltd. RAD80 Delta ...,Delta Electronics (Americas) Ltd. RAD80 Delta ...
