In [1]:
!pip install transformers
!pip install qdrant-client

Collecting qdrant-client
  Obtaining dependency information for qdrant-client from https://files.pythonhosted.org/packages/c6/2e/abb6befe93d321904d05d6667762f9a9fb59af1b144de535f50e1ec0cc14/qdrant_client-1.6.9-py3-none-any.whl.metadata
  Downloading qdrant_client-1.6.9-py3-none-any.whl.metadata (9.3 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Obtaining dependency information for grpcio-tools>=1.41.0 from https://files.pythonhosted.org/packages/d3/df/a51af6461494efc9ecf5379967aed75006b8b58a2fb3dfc6a0f48a7d9d11/grpcio_tools-1.59.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading grpcio_tools-1.59.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting httpx[http2]>=0.14.0 (from qdrant-client)
  Obtaining dependency information for httpx[http2]>=0.14.0 from https://files.pythonhosted.org/packages/a2/65/6940eeb21dcb2953778a6895281c179efd9100463ff08cb6232bb6480da7/httpx-0.25.2-py3-none-any.whl.metadata
  Downl

In [2]:
import pandas as pd
import os
from dotenv import load_dotenv
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from qdrant_client import QdrantClient
from qdrant_client.http import models
import csv

In [3]:
# Load the dataset
file_path = '/kaggle/input/chaabi/bigBasketProducts.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [4]:
print(data.describe())

             index    sale_price  market_price        rating
count  27555.00000  27555.000000  27555.000000  18929.000000
mean   13778.00000    322.514808    382.056664      3.943410
std     7954.58767    486.263116    581.730717      0.739063
min        1.00000      2.450000      3.000000      1.000000
25%     6889.50000     95.000000    100.000000      3.700000
50%    13778.00000    190.000000    220.000000      4.100000
75%    20666.50000    359.000000    425.000000      4.300000
max    27555.00000  12500.000000  12500.000000      5.000000


In [5]:
data.isna().sum()

index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [6]:
data = data[data["description"].notnull()]

In [7]:
data.shape

(27440, 10)

In [8]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Check if multiple GPUs are available and wrap the model using DataParallel
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)
device = ("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to GPU
model.to(device)
texts = list(data['description'])
def generate_embeddings(texts, batch_size=32, model_name='bert-base-uncased'):
    # Load tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()  # Put the model in evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        # Tokenize and encode the batch of texts
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Extract the embeddings for the [CLS] token (first token)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Convert each embedding in the batch to a list of floats and append to the result
        for embedding in cls_embeddings:
            all_embeddings.append(embedding.cpu().tolist())

    return all_embeddings

vector_list = generate_embeddings(texts)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using 2 GPUs


In [9]:
np.savetxt("vectors.csv",
        vector_list,
        delimiter =", ",
        fmt ='% s')

In [10]:
# with open('vectors.csv', newline='') as f:
#     reader = csv.reader(f)
#     vector_list = list(reader)
# vector_list = [[float(j) for j in i] for i in vector_list]

In [11]:
load_dotenv()
client = QdrantClient(
    url="https://f92f74ee-f795-441b-a5b9-2783cb082d6e.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="FWFmcohveoQQQltuEQyCsiaAgCEP4K4ZaDWCb6IcMWSzFWWgCO9lJw",
)

In [12]:
collection_name = 'Big_Basket'

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE)
)

True

In [14]:
# Upload data to Qdrant
size = len(vector_list)
ids = list(range(size))

# Convert PyTorch tensors to lists of floats
batch_size = 500
for i in range(0, size, batch_size):
    j = min(size, i+batch_size)
    payload = data[['product', 'brand', 'sale_price', 'market_price']].iloc[i:j].to_dict(orient="records")

    client.upsert(collection_name=collection_name,
                points=models.Batch(
                    ids=ids[i:j],
                    vectors=vector_list[i:j],
                    payloads=payload
                    )
                )

UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Format error in JSON body: expected value at line 1 column 8103333"},"time":0.0}'

In [15]:
client.search(
          collection_name="Big_Basket",
          query_vector=generate_embeddings("soap")[0],
          limit=5)

[ScoredPoint(id=13765, version=27, score=0.9401583, payload={'brand': 'Weikfield', 'market_price': 55.0, 'product': 'Soya Sauce', 'sale_price': 49.0}, vector=None),
 ScoredPoint(id=4126, version=8, score=0.93682706, payload={'brand': 'TrueSouth', 'market_price': 150.0, 'product': 'Coffee Decoction - Ready to Use Filter, Eighty20', 'sale_price': 150.0}, vector=None),
 ScoredPoint(id=5035, version=10, score=0.9306706, payload={'brand': 'Tiffany', 'market_price': 225.0, 'product': 'Sugarfree Cookies - Oatmeal', 'sale_price': 225.0}, vector=None),
 ScoredPoint(id=2617, version=5, score=0.92474127, payload={'brand': 'Priya', 'market_price': 90.0, 'product': 'Pickle - Gongura (With Garlic)', 'sale_price': 90.0}, vector=None),
 ScoredPoint(id=80, version=0, score=0.92017555, payload={'brand': 'Sanjay ', 'market_price': 65.0, 'product': 'Chips - Topica', 'sale_price': 65.0}, vector=None)]