### Import Dependencies

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, SparseVectorParams, Document, Prefetch, FusionQuery
from qdrant_client import models

import pandas as pd
import openai
import fastembed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Client for the Qdrant instance at http://localhost:6333; the collection, upsert and query cells below all go through it.
qdrant_client = QdrantClient(url="http://localhost:6333")

### Create Qdrant collection for hybrid search

In [3]:
# Create the collection only when it is missing, so re-running the notebook
# against a Qdrant instance that already holds it does not fail.
# Each point gets a named dense vector (1536 dimensions, cosine distance)
# and a named sparse vector "bm25" configured with the IDF modifier.
if not qdrant_client.collection_exists(
    collection_name="Amazon-items-collection-01-hybrid-search"
):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-01-hybrid-search",
        vectors_config={
            "text-embedding-3-small": VectorParams(size=1536, distance=Distance.COSINE)
        },
        sparse_vectors_config={
            "bm25": SparseVectorParams(modifier=models.Modifier.IDF)
        }
    )

# Final expression: displays True once the collection is in place.
qdrant_client.collection_exists(
    collection_name="Amazon-items-collection-01-hybrid-search"
)

True

In [4]:
# Index the "parent_asin" payload field with the KEYWORD schema.
# NOTE(review): presumably so points can be filtered by exact product id — confirm against the queries that use it.
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-hybrid-search",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embedding Functions

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single string through the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = openai.embeddings.create(model=model, input=[text])
    first_item = result.data[0]
    return first_item.embedding

In [6]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    """Embed a list of strings, sending at most ``batch_size`` per request.

    Args:
        text_list: Strings to embed. An empty list returns ``[]`` without
            issuing any request.
        model: Name of the embedding model to request.
        batch_size: Maximum number of strings per request; must be positive.

    Returns:
        A list with the ``embedding`` of every response item, concatenated
        batch by batch in the order the batches were sent.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently return no embeddings.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total = len(text_list)
    if total == 0:
        return []

    # A single request needs no progress log.
    report_progress = total > batch_size

    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = text_list[start:start + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend(item.embedding for item in response.data)
        if report_progress:
            # Report items actually sent so a final partial batch is not overstated.
            print(f"Processed {min(start + batch_size, total)} of {total}")

    return all_embeddings

### Process and Embed Amazon Items Data

In [7]:
# Load the item metadata file; lines=True makes pandas read it as JSON Lines (one JSON object per line).
df_items = pd.read_json("../../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

In [8]:
df_items.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Electronics,2 Pack-iPhone Earbuds Wired Lightning Headphon...,3.4,598,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],WASABI MANGO,"[Electronics, Headphones, Earbuds & Accessorie...",{'Product Dimensions': '23.62 x 19.69 x 27.56 ...,B0B1ZVC7GJ,,,
1,Computers,"Mini PC 16GB DDR4 256GB M.2 SSD,Quad-Core 2.7G...",4.3,450,[【Meet to Sufficient Memory Storage】This Mini ...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],OUVISLITE,"[Electronics, Computers & Accessories, Compute...","{'Screen Resolution': '3840 x 2160', 'Max Scre...",B0B1HNV2V9,,,
2,Computers,Samsers Foldable Bluetooth Keyboard with Touch...,4.4,308,[【Full-size Folding Wireless Keyboard】 Samsers...,[],48.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Love this keyboard!', 'url': 'http...",Samsers,"[Electronics, Computers & Accessories, Compute...",{'Product Dimensions': '13.5 x 4.5 x 0.4 inche...,B0C2Q8BDTX,,,
3,Computers,"Rolling Laptop Bag Women with Wheels, Rolling ...",4.5,152,[MOBILE OFFICE: EMPSIGN rolling bag with lapto...,[rolling laptop bag],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Elegant Laptop Tote Bag for Women ...,Ytonet,"[Electronics, Computers & Accessories, Laptop ...","{'Brand': 'Ytonet', 'Item model number': '1332...",B092Z9CTNK,,,
4,All Electronics,"Wireless Mouse, 2.4G Silent Mouse with USB Rec...",4.6,677,[Plug & Play Super Easy to Use- Just plug and ...,[],10.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],MagoFeliz,"[Electronics, Computers & Accessories, Compute...",{'Product Dimensions': '4.69 x 2.6 x 0.01 inch...,B0C77L1G7V,,,


In [9]:
len(df_items)

1000

In [10]:
def preprocess_description(row):
    """Build the text for one item: its title followed by its features.

    Args:
        row: Mapping-like record with a ``title`` value and an iterable of
            feature strings under ``features``.

    Returns:
        The title, a single space, then the features joined by spaces.
        With no features the result ends in that single space.
    """
    title = row["title"]
    features_text = " ".join(row["features"])
    return "{} {}".format(title, features_text)

In [11]:
def extract_first_large_image(row):
    """Return the "large" URL of the item's first image, or "" if there is none.

    Args:
        row: Mapping-like record whose ``images`` value is expected to be a
            list of dicts.

    Returns:
        The ``large`` value of the first image. An empty string is returned
        when ``images`` is empty or not a list/tuple, or when the first
        image has no non-empty ``large`` value.
    """
    images = row["images"]
    # An item without images would otherwise raise IndexError on ``[0]``.
    if not isinstance(images, (list, tuple)) or not images:
        return ""
    # ``or ""`` also covers an explicit null stored under "large".
    return images[0].get("large", "") or ""

In [12]:
# Replace "description" with the title-plus-features text built by preprocess_description.
# NOTE: this overwrites the "description" values that were loaded from the file.
df_items["description"] = df_items.apply(preprocess_description, axis=1)
# Add an "image" column with the value returned by extract_first_large_image for each row.
df_items["image"] = df_items.apply(extract_first_large_image, axis=1)

In [13]:
# Keep only the listed columns and turn each row into a plain dict keyed by column name.
data_to_embed = (
    df_items[
        [
            "description",
            "image",
            "rating_number",
            "price",
            "average_rating",
            "parent_asin",
        ]
    ]
    .to_dict(orient="records")
)

In [14]:
# Preview the first records only; displaying the whole list would dump every record into the cell output.
data_to_embed[:2]

[{'description': '2 Pack-iPhone Earbuds Wired Lightning Headphone【Apple MFi Certified】in-Ear Headset Stereo Noise Canceling with Built-in Microphone & Volume Control Compatible with iPhone 13/12/11/SE/X/XR/8/7-All iOS ',
  'image': 'https://m.media-amazon.com/images/I/31eziY1O3EL._AC_.jpg',
  'rating_number': 598,
  'price': nan,
  'average_rating': 3.4,
  'parent_asin': 'B0B1ZVC7GJ'},
 {'description': 'Mini PC 16GB DDR4 256GB M.2 SSD,Quad-Core 2.7GHz Processor Windows 11 Pro Mini Computers, Small Form Factor Desktop PC Support 2TB Expansion, Win 10, 4K, Dual HDMI, 2.4G/5G WiFi, BT, Gigabit Ethernet „ÄêMeet to Sufficient Memory Storage„ÄëThis Mini Computer equiped with 16GB DDR4 and 256GB SSD. While ensuring cost-effectiveness, the larger capacity brings you a better multitasking experience. Newbie friendly Storage upgrade:1)2.5 inch SATA SSD/HDD up to 2TB(not include). 2)replace the M.2 SATA HDD. Expand your applications, files, video or audio freely. „ÄêJ4125 Mini PC with Windows

In [15]:
# The text of every record, in the same order as data_to_embed.
text_to_embed = [record["description"] for record in data_to_embed]

In [16]:
# Preview the first texts only; displaying the whole list would dump every description into the cell output.
text_to_embed[:2]

['2 Pack-iPhone Earbuds Wired Lightning Headphone【Apple MFi Certified】in-Ear Headset Stereo Noise Canceling with Built-in Microphone & Volume Control Compatible with iPhone 13/12/11/SE/X/XR/8/7-All iOS ',
 'Mini PC 16GB DDR4 256GB M.2 SSD,Quad-Core 2.7GHz Processor Windows 11 Pro Mini Computers, Small Form Factor Desktop PC Support 2TB Expansion, Win 10, 4K, Dual HDMI, 2.4G/5G WiFi, BT, Gigabit Ethernet „ÄêMeet to Sufficient Memory Storage„ÄëThis Mini Computer equiped with 16GB DDR4 and 256GB SSD. While ensuring cost-effectiveness, the larger capacity brings you a better multitasking experience. Newbie friendly Storage upgrade:1)2.5 inch SATA SSD/HDD up to 2TB(not include). 2)replace the M.2 SATA HDD. Expand your applications, files, video or audio freely. „ÄêJ4125 Mini PC with Windows 11 Pro„ÄëCeleron Gemini Lake J4125 Quad-Core processor with base frequency of 2.0 GHz and up to 2.7 GHz in burst. Let the minis to run multiple applications, Web pages, full HD video or light game sm

In [17]:
# Embed every text with get_embeddings_batch using its default model and batch size; this issues OpenAI API requests.
embeddings = get_embeddings_batch(text_to_embed)

Processed 100 of 1000
Processed 200 of 1000
Processed 300 of 1000
Processed 400 of 1000
Processed 500 of 1000
Processed 600 of 1000
Processed 700 of 1000
Processed 800 of 1000
Processed 900 of 1000
Processed 1000 of 1000


In [18]:
len(embeddings)

1000

In [19]:
# zip() silently drops trailing items when its inputs differ in length, which
# would leave some records without a point; fail loudly instead.
assert len(embeddings) == len(data_to_embed), (
    f"{len(embeddings)} embeddings for {len(data_to_embed)} records"
)

# One point per record, with ids 1..N in list order. Each point carries the
# dense embedding under "text-embedding-3-small", a Document built from the
# same text under "bm25" (model "qdrant/bm25"), and the record as payload.
pointstructs = [
    PointStruct(
        id=point_id,
        vector={
            "text-embedding-3-small": embedding,
            "bm25": Document(
                text=record["description"],
                model="qdrant/bm25"
            )
        },
        payload=record
    )
    for point_id, (embedding, record) in enumerate(
        zip(embeddings, data_to_embed), start=1
    )
]

In [20]:
pointstructs[0].vector

{'text-embedding-3-small': [0.05254307761788368,
  0.01494694221764803,
  -0.0773656815290451,
  0.0035389363765716553,
  -0.027663283050060272,
  0.005161842796951532,
  0.02434597723186016,
  0.053915757685899734,
  0.020876150578260422,
  -0.06829074770212173,
  0.017415856942534447,
  0.020552046597003937,
  -0.0472048856317997,
  0.015175722539424896,
  0.010295087471604347,
  0.02257293276488781,
  -0.05345819517970085,
  0.0012535222340375185,
  -0.015595152042806149,
  0.0428580678999424,
  0.0033745011314749718,
  0.04678545519709587,
  0.02308768779039383,
  0.03542272746562958,
  0.030732743442058563,
  0.00010172952897846699,
  0.01069545280188322,
  -0.005118946544826031,
  0.026824424043297768,
  0.015919256955385208,
  0.00534772640094161,
  -0.010819374583661556,
  0.04731927439570427,
  -0.019198432564735413,
  -0.01963692717254162,
  0.0009532488766126335,
  0.009770801290869713,
  -0.03586122393608093,
  0.009990048594772816,
  -0.01916983537375927,
  0.0078881345689

In [21]:
# Upload the first 500 points; wait=True asks the call to return only after the write has been applied.
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search",
    points=pointstructs[0:500],
    wait=True
)

Fetching 18 files: 100%|██████████| 18/18 [00:00<00:00, 47.15it/s]


UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
# Upload the remaining points (list index 500 onward), again waiting for the write to be applied.
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search",
    points=pointstructs[500:],
    wait=True
)

UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

### Hybrid Retrieval

In [23]:
def retrieve_data(
    query,
    qdrant_client,
    k=5,
    collection_name="Amazon-items-collection-01-hybrid-search",
    prefetch_limit=20,
):
    """Run a hybrid (dense + BM25) search and return the fused top-``k`` hits.

    The query is embedded with ``get_embedding`` for the dense branch and
    passed as a ``Document`` (model ``qdrant/bm25``) for the sparse branch.
    The two candidate lists are combined with reciprocal rank fusion.

    Args:
        query: The search text.
        qdrant_client: Client used to call ``query_points``.
        k: Number of fused results to return.
        collection_name: Collection to search. It must expose the
            ``text-embedding-3-small`` and ``bm25`` named vectors.
        prefetch_limit: Minimum number of candidates each branch fetches
            before fusion. It is raised to ``k`` when ``k`` is larger, so
            neither branch supplies fewer candidates than are requested.

    Returns:
        A dict of four parallel lists, in result order:
        ``retrieved_context_ids`` (payload ``parent_asin``),
        ``retrieved_context`` (payload ``description``),
        ``retrieved_context_ratings`` (payload ``average_rating``) and
        ``similarity_scores`` (the ``score`` of each returned point, which
        for a fusion query is the fused score, not a raw similarity).
    """
    query_embedding = get_embedding(query)

    # With a fixed limit of 20 per branch, any k above 20 would be fused from
    # fewer candidates per branch than the number of results requested.
    candidate_limit = max(prefetch_limit, k)

    results = qdrant_client.query_points(
        collection_name=collection_name,
        prefetch=[
            Prefetch(
                query=query_embedding,
                using="text-embedding-3-small",
                limit=candidate_limit
            ),
            Prefetch(
                query=Document(
                    text=query,
                    model="qdrant/bm25"
                ),
                using="bm25",
                limit=candidate_limit
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )

    points = results.points
    return {
        "retrieved_context_ids": [point.payload["parent_asin"] for point in points],
        "retrieved_context": [point.payload["description"] for point in points],
        "retrieved_context_ratings": [point.payload["average_rating"] for point in points],
        "similarity_scores": [point.score for point in points],
    }

In [24]:
# Try the hybrid retriever on a sample query, asking for the top 20 fused results.
results = retrieve_data("Can I get some tablet?", qdrant_client, k=20)

In [25]:
results

{'retrieved_context_ids': ['B0BTPK1R2D',
  'B0B4JSD5Z8',
  'B0BVZ512TS',
  'B09ZLFV5PC',
  'B09P29VXG1',
  'B0C3LXVGBW',
  'B0CKGB5463',
  'B09F8TLBZL',
  'B0BLVY5ZFX',
  'B0C69LMFL9',
  'B09RN3KN5C',
  'B0B9JMKFK8',
  'B09VP8SZSR',
  'B0C78B1BTB',
  'B09TFS9298',
  'B0B4CJ52DL',
  'B0B44TGKRX',
  'B09W9JMJ3L',
  'B0BZCM9CBR',
  'B0C35RS6MS'],
 'retrieved_context': ['Android Tablet, 10 inch Tablets, 2GB+32GB Computer Tablet Support 512GB Expand, 2MP + 8MP Camera, IPS Screen, WiFi, Bluetooth, 6000mAh, Google GMS Certified Tableta Black „ÄêAndroid Tablet„ÄëTablet featuring Android 10 operating system, More stable and smaller occupy. Powerful Quad core Processor which is more energy-saving. More Quicker and faster When open apps. Smooth when surfing ineternet.Google GMS Certified Tablet so that you can get apps such as Netflix, TikTok, Yahoo, Twitter from the Google Play. „Äê10 inch IPS Display„ÄëTablet equipped with a resolution 1280x800 IPS Screen. 10 inch Large screen, wide viewing ang