In [68]:
from datasets import load_dataset

from pymilvus import (
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    AnnSearchRequest,
    RRFRanker,
    connections,
    utility,
)

from pymilvus.model.hybrid import BGEM3EmbeddingFunction

In [69]:
# Take the first 500 rows of the `tasksource/esci` train split and keep only
# the rows whose product locale is "us".
dataset = (
    load_dataset("tasksource/esci", split="train")
    .select(range(500))
    .filter(lambda row: row["product_locale"] == "us")
)
dataset

Dataset({
    features: ['example_id', 'query', 'query_id', 'product_id', 'product_locale', 'esci_label', 'small_version', 'large_version', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', 'product_text'],
    num_rows: 427
})

In [70]:
source_df = dataset.to_pandas()

# The same four text columns drive both the de-duplication and the null filter.
required_text_columns = [
    "product_text",
    "product_title",
    "product_bullet_point",
    "product_brand",
]
df = source_df.drop_duplicates(subset=required_text_columns).dropna(
    subset=required_text_columns
)
df.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,product_title,product_description,product_bullet_point,product_brand,product_color,product_text
0,0,revent 80 cfm,0,B000MOO21W,us,Irrelevant,0,1,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,,WhisperCeiling fans feature a totally enclosed...,Panasonic,White,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...
2,1,revent 80 cfm,0,B07X3Y6B1V,us,Exact,0,1,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,Homewerks 7141-80 Bathroom Fan Integrated LED ...
3,2,revent 80 cfm,0,B07WDM7MQQ,us,Exact,0,1,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...
4,3,revent 80 cfm,0,B07RH6Z8KW,us,Exact,0,1,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White,Delta Electronics RAD80L BreezRadiance 80 CFM ...
5,4,revent 80 cfm,0,B07QJ7WYFQ,us,Exact,0,1,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White,Panasonic FV-08VRE2 Ventilation Fan with Reces...


In [71]:
# Number of rows left in the cleaned frame.
df.shape[0]

143

In [72]:
# Text that is embedded and stored in the collection: title, product text and
# bullet points, joined with newlines.
df = df.assign(
    merged_text=df["product_title"]
    + "\n"
    + df["product_text"]
    + "\n"
    + df["product_bullet_point"]
)

In [73]:
# Plain Python list of the merged texts, one entry per row of `df`.
docs = df.loc[:, "merged_text"].tolist()
len(docs)

143

In [74]:
# Sample question used to inspect what the embedding function returns.
query = "Do you have an example of a Panasonic product?"

# BGE-M3 embedding function, run on CPU in full precision.
ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")

# Dense vector width; the collection schema needs it for the dense field.
dense_dim = ef.dim["dense"]

# Embed every document, then the single sample query (passed as a 1-item list).
docs_embeddings = ef(docs)
query_embeddings = ef([query])

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [75]:
# Display the embedding result for the sample query; later cells read its
# "sparse" and "dense" entries.
query_embeddings

{'dense': [array([-0.02941401, -0.02504556, -0.0574033 , ..., -0.05055605,
         -0.06463919, -0.00945763], dtype=float32)],
 'sparse': <1x250002 sparse array of type '<class 'numpy.float32'>'
 	with 10 stored elements in Compressed Sparse Row format>}

In [76]:
# Connect to Milvus using the default connection settings.
connections.connect()

COLLECTION_NAME = "sparse_dense_demo"

fields = [
    # Use auto generated id as primary key
    FieldSchema(
        name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100
    ),
    # Raw merged product text, returned to the caller with each hit.
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
]
schema = CollectionSchema(fields, "")

# Make the cell idempotent: without this, re-running the notebook reuses the
# existing collection and the insert cell appends the same documents again.
# NOTE: this deletes any data already stored under COLLECTION_NAME.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)
col = Collection(COLLECTION_NAME, schema)

# One index per vector field; the metric types must match the ones used in the
# search requests (IP for sparse, COSINE for dense).
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
dense_index = {"index_type": "FLAT", "metric_type": "COSINE"}
col.create_index("sparse_vector", sparse_index)
dense_index_status = col.create_index("dense_vector", dense_index)

# A collection has to be loaded before it can be searched; do it here so the
# notebook works on a fresh kernel with Restart & Run All.
col.load()

# Keep showing the status of the last index creation as the cell output.
dense_index_status

Status(code=0, message=)

In [78]:
# Column-oriented payload, one list per non-auto field in schema order:
# text, sparse_vector, dense_vector. The primary key is auto-generated.
text_column = df["merged_text"].to_list()
sparse_column = docs_embeddings["sparse"]
dense_column = docs_embeddings["dense"]

entities = [text_column, sparse_column, dense_column]
col.insert(entities)

(insert count: 143, delete count: 0, upsert count: 0, timestamp: 449418190134706177, success count: 143, err count: 0)

In [79]:
def query_hybrid_search(query: str, limit: int = 2):
    """Run a sparse + dense hybrid search for a single text query.

    The query is embedded with the notebook-level embedding function ``ef``.
    One ANN request is built for the ``sparse_vector`` field (IP metric) and
    one for the ``dense_vector`` field (COSINE metric); the two result lists
    are fused with ``RRFRanker`` by ``col.hybrid_search``.

    Args:
        query: Natural-language query text.
        limit: Maximum number of hits requested from each ANN request and
            from the fused result. Defaults to 2, the previously hard-coded
            value.

    Returns:
        Whatever ``col.hybrid_search`` returns, with the ``text`` field
        included in the output fields.

    Raises:
        ValueError: If ``limit`` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    # Local name chosen so it does not shadow the notebook-level
    # `query_embeddings` variable.
    embeddings = ef([query])

    sparse_req = AnnSearchRequest(
        embeddings["sparse"], "sparse_vector", {"metric_type": "IP"}, limit=limit
    )
    dense_req = AnnSearchRequest(
        embeddings["dense"], "dense_vector", {"metric_type": "COSINE"}, limit=limit
    )

    return col.hybrid_search(
        [sparse_req, dense_req],
        rerank=RRFRanker(),
        limit=limit,
        output_fields=["text"],
    )

In [80]:
# Run the hybrid search and show the first element of the result
# (presumably the hit list for the single query that was passed in).
homewerks_results = query_hybrid_search("Do you have a Homewerks product?")
homewerks_results[0]

['id: 449353344520491318, distance: 0.032786883413791656, entity: {\'text\': "Homewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM\\nHomewerks 7141-80 Bathroom Fan Integrated LED Light Ceiling Mount Exhaust Ventilation, 1.1 Sones, 80 CFM\\nHomewerks\\n80 CFM\\nNone\\nOUTSTANDING PERFORMANCE: This Homewerk\'s bath fan ensures comfort in your home by quietly eliminating moisture and humidity in the bathroom. This exhaust fan is 1.1 sones at 80 CFM which means it’s able to manage spaces up to 80 square feet and is very quiet..\\nBATH FANS HELPS REMOVE HARSH ODOR: When cleaning the bathroom or toilet, harsh chemicals are used and they can leave an obnoxious odor behind. Homewerk’s bathroom fans can help remove this odor with its powerful ventilation\\nBUILD QUALITY: Designed to be corrosion resistant with its galvanized steel construction featuring a modern style round shape and has an 4000K Cool White Light LED Light. AC motor.\\nEASY IN