# Imports

In [None]:
!pip install datasets sentence-transformers --upgrade
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloa

# Load Pretrained and Fine-tuned Models

In [None]:
# Mount Google Drive so the fine-tuned model, dataset and embeddings under
# /content/drive are readable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# The tokenizer and the token-classification model are both loaded from the same Drive folder.
PRICE_MODEL_DIR = '/content/drive/MyDrive/Swup Swap/fine_tuned_price_model'

price_tokenizer = AutoTokenizer.from_pretrained(PRICE_MODEL_DIR)
price_model = AutoModelForTokenClassification.from_pretrained(PRICE_MODEL_DIR)

# Sentence encoder used to embed query text for similarity search.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Load Dataset and Precomputed Embeddings

In [None]:
# Load the listings and keep only rows whose Price parses as a number.
df = pd.read_csv("/content/drive/MyDrive/Datasets/swap_dataset.csv")
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
# Re-number rows 0..n-1 after dropping, because later cells pair row position
# with the row of the same position in `text_embeddings`.
df = df.dropna(subset=['Price']).reset_index(drop=True)

text_embeddings = np.load("/content/drive/MyDrive/Swup Swap/sbert_text_embeddings.npy")

# The embeddings come from a separate precomputed file; if it was built from a
# different row set than the filtered frame above, every vector would be
# attached to the wrong listing. Fail loudly instead of uploading bad data.
# NOTE(review): equal length is necessary but not sufficient - confirm the
# embeddings were computed after the same Price filtering.
if len(text_embeddings) != len(df):
    raise ValueError(
        f"Embedding count ({len(text_embeddings)}) does not match "
        f"dataset row count ({len(df)}); embeddings and rows are misaligned."
    )

  df = pd.read_csv("/content/drive/MyDrive/Datasets/swap_dataset.csv")


# Price Extraction with Fine-tuned Model

In [None]:
def extract_price_with_finetuned_model(text):
    """Extract a numeric price from free text using the fine-tuned token classifier.

    Tokens predicted as label 1 or 2 are concatenated, stripped of everything
    except digits and dots, and parsed as a float. If that does not yield a
    valid number, the regex fallback is used instead.

    Args:
        text: Listing or query text that may mention a price.

    Returns:
        The extracted price as a float.

    Raises:
        ValueError: Propagated from ``extract_price_regex_fallback`` when
            neither the model nor the regex patterns find a price.
    """
    inputs = price_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = price_model(**inputs)
    predicted_labels = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Rebuild the token list from the ids that were actually fed to the model.
    # Calling tokenize(text) instead yields a sequence without special tokens
    # and without truncation, so its positions do not line up with the logits.
    tokens = price_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    special_tokens = set(price_tokenizer.all_special_tokens)

    # NOTE(review): labels 1 and 2 are presumably the begin/inside price tags -
    # confirm against the fine-tuned model's id2label mapping.
    price_tokens = [
        token.replace('##', '')
        for token, label in zip(tokens, predicted_labels)
        if label in (1, 2) and token not in special_tokens
    ]
    price_str = re.sub(r'[^\d.]', '', ''.join(price_tokens))
    try:
        return float(price_str)
    except ValueError:
        # Empty or malformed digit string (e.g. "" or "6.5.6"): fall back to regex.
        return extract_price_regex_fallback(text)

# Fallback Price Extraction Using Regex

In [None]:
def extract_price_regex_fallback(text):
    """Extract a price from text using currency-keyword regex patterns.

    Patterns are tried in a fixed order and matched case-insensitively; the
    first pattern that matches anywhere in the text decides the result.
    Amounts may contain thousands separators ("1,200") and a decimal part
    ("12.50").

    Args:
        text: Text that may mention a price next to "$", "dollar(s)", "USD",
            "pound(s)" or "eur(o)".

    Returns:
        The matched amount as a float.

    Raises:
        ValueError: If no pattern matches.
    """
    # One amount: comma-grouped thousands or plain digits, with optional decimals.
    amount = r'(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)'
    # Matching is case-insensitive, so one spelling per keyword is enough.
    patterns = [
        amount + r'\$',
        r'\$' + amount,
        amount + r'\s*dollars?',
        amount + r'\s*USD',
        amount + r'\s*pounds?',
        amount + r'\s*euro?',
        r'\$\s*' + amount,
        amount + r'\s*\$',
        r'USD\s*' + amount,
        r'dollars?\s*' + amount,
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return float(match.group(1).replace(',', ''))
    raise ValueError("No price found in text")

# Connect to Astra DB

In [None]:
!pip install cassandra-driver

Collecting cassandra-driver
  Downloading cassandra_driver-3.29.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting geomet<0.3,>=0.1 (from cassandra-driver)
  Downloading geomet-0.2.1.post1-py3-none-any.whl.metadata (1.0 kB)
Downloading cassandra_driver-3.29.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading geomet-0.2.1.post1-py3-none-any.whl (18 kB)
Installing collected packages: geomet, cassandra-driver
Successfully installed cassandra-driver-3.29.2 geomet-0.2.1.post1


### 1. Connect to Qdrant

In [None]:
!pip install -q qdrant-client transformers torch

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install --upgrade qdrant-client




In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
import os

In [None]:
# Read the Qdrant API key from Colab's secret store rather than hard-coding it.
from google.colab import userdata
QDRANT_API_KEY= userdata.get('QDRANT_API_KEY')

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import PayloadSchemaType, CollectionStatus

# Endpoint of the Qdrant Cloud cluster this notebook talks to.
QDRANT_URL = "https://69814424-871c-4965-b522-59c83512349d.europe-west3-0.gcp.cloud.qdrant.io:6333"

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Listing the collections doubles as a connectivity/authentication check.
existing_collections = client.get_collections()
print(existing_collections)

collections=[CollectionDescription(name='swap-embeddings')]


# Create Qdrant Collection

In [None]:
COLLECTION_NAME = "swap-embeddings"
EMBEDDING_DIM = 384  # 384 for MiniLM

# `recreate_collection` is deprecated in qdrant-client; do the same thing
# explicitly: drop the collection if it already exists, then create it fresh.
# WARNING: this discards every point previously stored in the collection.
if client.collection_exists(collection_name=COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
)


  client.recreate_collection(


True

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm
import uuid
from qdrant_client.models import PointStruct

BATCH_SIZE = 100

n_rows = len(df)
# Rows and embeddings are paired by position; refuse to upload if they cannot line up.
if len(text_embeddings) != n_rows:
    raise ValueError(
        f"Embedding count ({len(text_embeddings)}) does not match "
        f"dataset row count ({n_rows}); refusing to upload misaligned data."
    )

# tqdm shows progress over the total number of rows, advanced once per batch.
with tqdm(total=n_rows, desc="Uploading embeddings") as progress:
    for start in range(0, n_rows, BATCH_SIZE):
        stop = min(start + BATCH_SIZE, n_rows)

        # L2-normalize this batch only, to avoid a second full-size copy in
        # memory. Zero vectors are left as-is instead of dividing by zero,
        # which would put NaNs in the request.
        batch_vectors = text_embeddings[start:stop]
        norms = np.linalg.norm(batch_vectors, axis=1, keepdims=True)
        batch_vectors = batch_vectors / np.where(norms == 0, 1.0, norms)

        # Positional slicing (iloc) keeps rows and vectors paired by position,
        # independent of whatever labels the DataFrame index carries.
        batch_rows = df.iloc[start:stop]

        points_batch = [
            PointStruct(
                # Deterministic id derived from the row position: re-running
                # this cell overwrites the same points instead of adding
                # duplicates under fresh random ids.
                id=str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{COLLECTION_NAME}/{position}")),
                vector=vector.tolist(),
                payload={
                    "Description": description,
                    "Price": float(price),  # plain Python float for the payload
                    "Category": category,
                },
            )
            for position, vector, description, price, category in zip(
                range(start, stop),
                batch_vectors,
                batch_rows["Description"],
                batch_rows["Price"],
                batch_rows["Category"],
            )
        ]

        client.upsert(collection_name=COLLECTION_NAME, points=points_batch)
        progress.update(stop - start)

print("\n \n All embeddings uploaded in batches.")


Uploading embeddings: 100%|██████████| 295618/295618 [13:09<00:00, 374.42it/s]


 
 All embeddings uploaded in batches.





# Create Payload Indexes for Category and Price

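Payload is metadata (additional information) associated with each vector point in the collection.
It is used for filtering, re-ranking, and displaying information with each recommendation. The indexes created below are on the two payload fields used as search filters: `Category` and `Price`.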

In [None]:
from qdrant_client.models import PayloadSchemaType

# Payload fields the recommendation queries filter on, with the index type for each.
payload_index_schemas = {
    "Category": PayloadSchemaType.KEYWORD,
    "Price": PayloadSchemaType.FLOAT,
}

index_results = [
    client.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name=field_name,
        field_schema=field_schema,
    )
    for field_name, field_schema in payload_index_schemas.items()
]

# Show the result of the last index request as the cell output.
index_results[-1]


UpdateResult(operation_id=2960, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
from qdrant_client.models import Filter, FieldCondition, Range, MatchValue

def get_recommendations_qdrant(query_text, category_preference, tolerance=500, limit=10,
                               price_weight=0.7, text_weight=0.3):
    """Recommend listings from Qdrant by combining price closeness and text similarity.

    The price is extracted from ``query_text``; candidates are restricted to
    ``category_preference`` and to prices within ``tolerance`` of the query
    price, then re-ranked by
    ``price_weight * 1 / (1 + |price - query_price|) + text_weight * score``,
    where ``score`` is the similarity score returned by Qdrant.

    Args:
        query_text: Free-text description of the offered item, including a price.
        category_preference: Exact value the ``Category`` payload must match.
        tolerance: Half-width of the accepted price window around the query price.
        limit: Number of rows to return; ``limit * 3`` candidates are fetched.
        price_weight: Weight of the price-closeness term.
        text_weight: Weight of the Qdrant similarity score.

    Returns:
        DataFrame with columns Description, Price, Category and combined_score,
        sorted by combined_score descending. Empty (same columns) when no price
        can be extracted or no candidate passes the filter.
    """
    result_columns = ["Description", "Price", "Category", "combined_score"]

    try:
        query_price = extract_price_with_finetuned_model(query_text)
    except ValueError:
        # No price could be found in the query: nothing to rank against.
        return pd.DataFrame(columns=result_columns)

    min_price = query_price - tolerance
    max_price = query_price + tolerance

    query_embedding = sbert_model.encode([query_text])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # `client.search` is deprecated in qdrant-client; `query_points` is its
    # replacement and wraps the hits in a response object.
    response = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_embedding.tolist(),
        limit=limit * 3,  # over-fetch so re-ranking by price has room to reorder
        with_payload=True,
        query_filter=Filter(
            must=[
                FieldCondition(key="Category", match=MatchValue(value=category_preference)),
                FieldCondition(key="Price", range=Range(gte=min_price, lte=max_price)),
            ]
        ),
    )
    search_results = response.points

    if not search_results:
        return pd.DataFrame(columns=result_columns)

    result_rows = []
    for res in search_results:
        payload = res.payload
        price = float(payload["Price"])
        price_sim = 1 / (1 + abs(price - query_price))
        result_rows.append({
            "Description": payload["Description"],
            "Price": price,
            "Category": payload["Category"],
            "combined_score": price_sim * price_weight + res.score * text_weight,
        })

    result_df = pd.DataFrame(result_rows)
    return result_df.sort_values(by="combined_score", ascending=False).head(limit)


In [None]:
# Try the un-normalized ranking on a sample listing.
query = "necklace gold color for 656 usd new not used"
category = "Clothes"
recommendations = get_recommendations_qdrant(query_text=query, category_preference=category)
print(recommendations)


  search_results = client.search(


                                          Description   Price Category  \
28  Blue floral print medium-coverage braNon padde...   659.0  Clothes   
18  Pink printed straight kurta, has a boat neck, ...   666.0  Clothes   
11  Maroon and Yellow colourblocked sweater, has a...   674.0  Clothes   
2   Set content: 2 Door curtainsColour: BrownPatte...   629.0  Clothes   
0   Blue solid T-shirt, has a polo collar, short s...   519.0  Clothes   
1   Purple & beige checked casual shirt, has a spr...  1034.0  Clothes   
22  Navy Blue embroidered straight kurta, has a ma...   683.0  Clothes   
7   Grey printed casual shirt, has a spread collar...   612.0  Clothes   
9   Blue printed casual shirt, has a spread collar...   612.0  Clothes   
12  Black solid T-shirt, has a round neck, and lon...   699.0  Clothes   

    combined_score  
28        0.262486  
18        0.155721  
11        0.132162  
2         0.127390  
0         0.119987  
1         0.116356  
22        0.115627  
7         0.11316

Note: In Faiss we used cosine similarity, so the price similarity is combined with the cosine similarity to give a score between 0 and 1

Whereas in Qdrant the score is the result of the internal cosine distance conversion, so the values may fall into a lower range (roughly 0.1 to 0.4), depending on how Qdrant computes the internal similarity.

Score ranges are not directly comparable between Faiss and Qdrant due to the difference in how the similarity is calculated and normalized.

Below, the text scores are normalized and a different price-similarity function is used (giving more credit to items whose price is within roughly ±100–200 of the query price).

In [None]:
from qdrant_client.models import Filter, FieldCondition, Range, MatchValue

def normalize(score, min_s, max_s):
    """Min-max scale ``score`` using the bounds ``min_s`` and ``max_s``.

    Returns 0.0 when both bounds are equal, since the range is empty and the
    usual formula would divide by zero.
    """
    has_range = max_s != min_s
    return (score - min_s) / (max_s - min_s) if has_range else 0.0

def get_recommendations_qdrant_normalized(query_text, category_preference, tolerance=500, limit=10,
                                price_weight=0.7, text_weight=0.3, price_scale=200):
    """Recommend listings from Qdrant using normalized text scores and exponential price decay.

    The price is extracted from ``query_text``; candidates are restricted to
    ``category_preference`` and to prices within ``tolerance`` of the query
    price. Each candidate's Qdrant score is min-max normalized across the
    fetched candidates, and price closeness is
    ``exp(-|price - query_price| / price_scale)``. The two are blended as
    ``price_weight * price_similarity + text_weight * normalized_score``.

    Args:
        query_text: Free-text description of the offered item, including a price.
        category_preference: Exact value the ``Category`` payload must match.
        tolerance: Half-width of the accepted price window around the query price.
        limit: Number of rows to return; ``limit * 5`` candidates are fetched.
        price_weight: Weight of the price-similarity term.
        text_weight: Weight of the normalized text-similarity term.
        price_scale: Price difference at which price similarity decays to 1/e.

    Returns:
        DataFrame with columns Description, Price, Category, price_similarity
        and combined_score, sorted by combined_score descending. Empty (same
        columns) when no price can be extracted or no candidate passes the filter.
    """
    # Same schema for empty and non-empty results, so downstream column access never fails.
    result_columns = ["Description", "Price", "Category", "price_similarity", "combined_score"]

    try:
        query_price = extract_price_with_finetuned_model(query_text)
    except ValueError:
        # No price could be found in the query: nothing to rank against.
        return pd.DataFrame(columns=result_columns)

    min_price = query_price - tolerance
    max_price = query_price + tolerance

    query_embedding = sbert_model.encode([query_text])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    # `client.search` is deprecated in qdrant-client; `query_points` is its
    # replacement and wraps the hits in a response object.
    response = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_embedding.tolist(),
        limit=limit * 5,  # over-fetch so re-ranking has room to reorder
        with_payload=True,
        query_filter=Filter(
            must=[
                FieldCondition(key="Category", match=MatchValue(value=category_preference)),
                FieldCondition(key="Price", range=Range(gte=min_price, lte=max_price)),
            ]
        ),
    )
    search_results = response.points

    if not search_results:
        return pd.DataFrame(columns=result_columns)

    # Bounds for min-max scaling are taken over this candidate set only, so
    # the normalized text score is relative to the current query's results.
    scores = [res.score for res in search_results]
    min_score, max_score = min(scores), max(scores)

    result_rows = []
    for res in search_results:
        payload = res.payload
        price = float(payload["Price"])
        price_sim = np.exp(-abs(price - query_price) / price_scale)
        text_sim_norm = normalize(res.score, min_score, max_score)

        result_rows.append({
            "Description": payload["Description"],
            "Price": price,
            "Category": payload["Category"],
            "price_similarity": price_sim,
            "combined_score": price_weight * price_sim + text_weight * text_sim_norm,
        })

    result_df = pd.DataFrame(result_rows)
    return result_df.sort_values(by="combined_score", ascending=False).head(limit)


In [None]:
# Same sample listing as before, now ranked with normalized scores.
query = "necklace gold color for 656 usd new not used"
category = "Clothes"
recommendations = get_recommendations_qdrant_normalized(query_text=query, category_preference=category)
print(recommendations)


  search_results = client.search(


                                          Description  Price Category  \
2   Set content: 2 Door curtainsColour: BrownPatte...  629.0  Clothes   
11  Maroon and Yellow colourblocked sweater, has a...  674.0  Clothes   
18  Pink printed straight kurta, has a boat neck, ...  666.0  Clothes   
28  Blue floral print medium-coverage braNon padde...  659.0  Clothes   
7   Grey printed casual shirt, has a spread collar...  612.0  Clothes   
9   Blue printed casual shirt, has a spread collar...  612.0  Clothes   
22  Navy Blue embroidered straight kurta, has a ma...  683.0  Clothes   
12  Black solid T-shirt, has a round neck, and lon...  699.0  Clothes   
0   Blue solid T-shirt, has a polo collar, short s...  519.0  Clothes   
46  Red self-design T-shirt, has a polo collar, sh...  674.0  Clothes   

    price_similarity  combined_score  
2           0.873716        0.796322  
11          0.913931        0.759394  
18          0.951229        0.755727  
28          0.985112        0.737111  
7

In [None]:
# Sample query: a leather bag in the Clothes category.
query = "vintage leather bag brown color for 645 usd like new"
category = "Clothes"
recommendations = get_recommendations_qdrant_normalized(query_text=query, category_preference=category)
print(recommendations)

  search_results = client.search(


                                          Description  Price Category  \
0   Blue self-design formal shirt, has a spread co...  649.0  Clothes   
3   A pair of white & black round-toe flats, has r...  661.0  Clothes   
7   Red printed lounge t-shirts, has a round neck,...  616.0  Clothes   
18  White and Black striped straight kurta, has a ...  679.0  Clothes   
38  A pair of blue open-toed flats, has mid-top st...  649.0  Clothes   
42  Purple self-design casual shirt, has a spread ...  647.0  Clothes   
35  Set content: 2 Door curtainsColour: BrownPatte...  629.0  Clothes   
11  Blue solid sling bag, has a button closure1 ma...  721.0  Clothes   
10  Red printed woven A-line dress, has a round ne...  739.0  Clothes   
21  Pink solid knitted mid-rise track pants, has a...  704.0  Clothes   

    price_similarity  combined_score  
0           0.980199        0.986139  
3           0.923116        0.933461  
7           0.865022        0.856116  
18          0.843665        0.736478  
3

In [None]:
# Sample query: a camera in the Electronics category.
query = "DSLR camera with lens kit 660 dollars mint condition"
category = "Electronics"
recommendations = get_recommendations_qdrant_normalized(query_text=query, category_preference=category)
print(recommendations)

  search_results = client.search(


                                          Description       Price  \
11  3 Year Manufacturer Warranty   1 yr. Bravia Co...  649.990000   
2   Brand: Sony, OS: Android, Screen size: 15.37 i...  548.342670   
0   Brand: Sony, OS: Android, Screen size: 10.16 i...  401.566204   
5   Brand: Sony, OS: Android, Screen size: 12.7 in...  476.813901   
3   Brand: Sony, OS: Android, Screen size: 12.7 in...  453.935085   
1   Brand: Sony, OS: Android, Screen size: 10.16 i...  378.940329   
9   Indoor / Outdoor 2K Wi-Fi NVR 4 Camera CCTV Se...  499.990000   
6   Brand: Sony, OS: Android, Screen size: 12.7 in...  454.329478   
28  Brand: Sony, OS: Android, Screen size: 15.24 i...  534.041854   
24  Brand: Sony, OS: Android, Screen size: 12.7 in...  510.758050   

       Category  price_similarity  combined_score  
11  Electronics          0.951182        0.767404  
2   Electronics          0.572189        0.624838  
0   Electronics          0.274674        0.492272  
5   Electronics          0.40014

In [None]:
# Sample query: a kitchen appliance in the Electronics category.
query = "kitchen blender high power for 655 usd unopened box"
category = "Electronics"
recommendations = get_recommendations_qdrant_normalized(query_text=query, category_preference=category)
print(recommendations)

  search_results = client.search(


                                          Description   Price     Category  \
0   Crystal Processor 4K\nCrystal Display   Slim D...  649.99  Electronics   
43  3 Year Manufacturer Warranty   1 yr. Bravia Co...  649.99  Electronics   
3   α5 Gen 5 AI Processor 4K   60Hz Refresh Rate  ...  579.99  Electronics   
22  a5 AI Processor Gen6   60Hz Refresh Rate   AI ...  679.99  Electronics   
15  LG Dryer was ordered (1 item(s)) for $600.00 o...  600.00  Electronics   
17  LG Dryer was ordered (1 item(s)) for $600.00 o...  600.00  Electronics   
18  LG Dryer was ordered (1 item(s)) for $600.00 o...  600.00  Electronics   
19  LG Dryer was ordered (1 item(s)) for $600.00 o...  600.00  Electronics   
20  LG Dryer was ordered (1 item(s)) for $600.00 o...  600.00  Electronics   
21  LG Dryer was ordered (1 item(s)) for $600.00 o...  600.00  Electronics   

    price_similarity  combined_score  
0           0.975261        0.982683  
43          0.975261        0.688107  
3           0.687255    