In [8]:
import torch
from transformers import CLIPProcessor, CLIPModel
from faker import Faker
import numpy as np
from tqdm import tqdm

In [9]:
qdrant_api_token = os.getenv("QDRANT_TOKEN")

In [45]:
model.eval()
torch.no_grad()

<torch.autograd.grad_mode.no_grad at 0x35cb6a640>

In [46]:
record = {
    'brand': fake.company(),
    'model': fake.word(),
    'color': fake.color_name(),
    'storage': f"{fake.random_int(64, 512)} GB",
    'condition': fake.random_element(['New', 'Used', 'Refurbished']),
    'price': round(fake.random_number(digits=3), 2),
    'city': fake.city(),
    'description': fake.sentence(),
    'id': fake.uuid4()
    }

In [47]:
def generate_text(record):
    fields = [
        f"brand: {record.get('brand', 'N/A')}Model: {record.get('model', 'N/A')}",
        f"color: {record.get('color', 'N/A')}",
        f"storage: {record.get('storage', 'N/A')}",
        f"condition: {record.get('condition', 'N/A')}",
        f"price: ${record.get('price', 'N/A')}",
        f"city: {record.get('city', 'N/A')}",
        f"description: {record.get('description', '')}"
    ]
    return ". ".join(fields)

In [48]:
generate_text(record).lower()

'brand: gibbs, davis and wheelermodel: down. color: darkmagenta. storage: 236 gb. condition: used. price: $369. city: hooperborough. description: beautiful particularly indicate strong use machine majority father.'

In [49]:
# Function to generate a single embedding
def generate_embedding(fields_concat):
    inputs = processor(text=fields_concat, images=None, return_tensors="pt", padding=True)
    outputs = model.get_text_features(**inputs)
    return outputs.detach().numpy().flatten()

In [53]:
indexed_vectors = []

# Number of vectors and points
num_vectors = 200_000

In [54]:
for _ in tqdm(range(num_vectors), desc="Generating indexed vectors"):
    # Create a randomized record similar to the provided format using Faker
    record = {
        'brand': fake.company(),
        'model': fake.word(),
        'color': fake.color_name(),
        'storage': f"{fake.random_int(64, 512)} GB",
        'condition': fake.random_element(['New', 'Used', 'Refurbished']),
        'price': round(fake.random_number(digits=3), 2),
        'city': fake.city(),
        'description': fake.sentence(),
        'id': fake.uuid4()
    }
    text = generate_text(record)
    embedding = generate_embedding(text)
    indexed_vectors.append({"id": record['id'], "vector": embedding, "record": record})

Generating indexed vectors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 200000/200000 [44:11<00:00, 75.42it/s]


In [57]:
indexed_vectors[1]

{'id': '39471b24-32bf-43d5-9844-5a0497c92ecc',
 'vector': array([ 1.16839908e-01, -1.51083916e-01,  1.45644024e-02, -5.71777523e-02,
        -4.76212144e-01,  1.76852793e-01, -6.06795371e-01,  1.25966072e-01,
        -5.70282459e-01,  5.20936251e-02, -2.26912513e-01,  2.77590483e-01,
         4.78487611e-02, -5.46748042e-02, -4.21057865e-02,  1.68138117e-01,
         1.66653171e-02,  2.80171573e-01, -2.00785458e-01, -4.21660617e-02,
         3.99372965e-01, -2.90792882e-01,  3.18693459e-01, -1.38133675e-01,
        -1.67281538e-01, -1.37304142e-02, -1.62990153e-01, -4.43391919e-01,
        -4.57302034e-02, -1.63907498e-01, -1.54298976e-01,  1.19418927e-01,
         2.32386857e-01,  1.10357948e-01, -1.58626765e-01,  3.14723372e-01,
        -5.25415018e-02,  1.02685004e-01, -3.40776563e-01, -4.11175877e-01,
         7.40989596e-02,  9.77465734e-02,  2.27091238e-02,  2.48404682e-01,
        -3.38782758e-01, -1.64552718e-01, -2.43533939e-01,  2.53480375e-01,
         1.27060741e-01,  1.535

In [59]:
np.save("indexed_vectors.npy", indexed_vectors)

In [105]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams

qdrant_client = QdrantClient(
    url="https://f2216b0f-7418-45f1-963b-9c367be87692.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key=qdrant_api_token,
)

print(qdrant_client.get_collections())

collections=[]


In [106]:
collection_name = "random_data_clip_collection"

# Define the vector configuration
vector_config = {
    "size": 512,  
    "distance": "Cosine"  
}

if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name,
        vectors_config=VectorParams(size=512, distance='Cosine')
    )

In [107]:
BATCH_SIZE = 150  # Reduce batch size
points_to_upsert = [
    {
        'id': vector_data['id'],
        'vector': vector_data['vector'],
        'payload': vector_data['record']
    } for vector_data in indexed_vectors
]

# Upsert points in batches with retry mechanism
total_points = len(points_to_upsert)
num_batches = math.ceil(total_points / BATCH_SIZE)

for i in range(num_batches):
    batch = points_to_upsert[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    success = False
    retries = 3  # Number of retries
    while not success and retries > 0:
        try:
            qdrant_client.upsert(
                collection_name,
                points=batch
            )
            success = True
        except Exception as e:
            print(f"Error during upsert: {e}. Retrying in 5 seconds...")
            time.sleep(5)
            retries -= 1
    if not success:
        print("Failed to upsert batch after 3 retries.")

In [140]:
query = 'brand is Macdonald, Wilson and Mosley, storage is equal to 353 GB, color is Thistle, city equal to Trevinoland, description is: Support get pass great'
embedded_query =  generate_embedding(query)

In [141]:
responses = qdrant_client.search(
    collection_name=collection_name,
    query_vector=embedded_query,
    limit = 3)

In [142]:
responses

[ScoredPoint(id='12e86092-399e-4d1e-b390-af058c0989f1', version=428, score=0.8161876, payload={'brand': 'Brown, Torres and Mueller', 'model': 'last', 'color': 'Thistle', 'storage': '93 GB', 'condition': 'Used', 'price': 50, 'city': 'Mcdonaldfurt', 'description': 'Better discussion although cause.', 'id': '12e86092-399e-4d1e-b390-af058c0989f1'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='851f6e59-4f11-45ec-8862-2831d0172056', version=528, score=0.81456876, payload={'brand': 'Mcdonald, Payne and Thompson', 'model': 'possible', 'color': 'MediumVioletRed', 'storage': '416 GB', 'condition': 'New', 'price': 707, 'city': 'Benitezhaven', 'description': 'Perhaps organization happy full address.', 'id': '851f6e59-4f11-45ec-8862-2831d0172056'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='c5e50085-862f-4dc5-9fc1-3a594b2526a6', version=476, score=0.8135719, payload={'brand': 'Mcdonald-Martinez', 'model': 'work', 'color': 'SlateGray', 'storage': '410 GB', '

In [143]:
for value in responses:
    print(value.score, value.payload)

0.8161876 {'brand': 'Brown, Torres and Mueller', 'model': 'last', 'color': 'Thistle', 'storage': '93 GB', 'condition': 'Used', 'price': 50, 'city': 'Mcdonaldfurt', 'description': 'Better discussion although cause.', 'id': '12e86092-399e-4d1e-b390-af058c0989f1'}
0.81456876 {'brand': 'Mcdonald, Payne and Thompson', 'model': 'possible', 'color': 'MediumVioletRed', 'storage': '416 GB', 'condition': 'New', 'price': 707, 'city': 'Benitezhaven', 'description': 'Perhaps organization happy full address.', 'id': '851f6e59-4f11-45ec-8862-2831d0172056'}
0.8135719 {'brand': 'Mcdonald-Martinez', 'model': 'work', 'color': 'SlateGray', 'storage': '410 GB', 'condition': 'Used', 'price': 127, 'city': 'North Vernonmouth', 'description': 'Role voice however party song wind beautiful mind.', 'id': 'c5e50085-862f-4dc5-9fc1-3a594b2526a6'}
