In [1]:
# read json file
import json

# Load the JSON file into `json_data`; later cells read its 'rows' list.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, and json.load parses directly from the
# file handle instead of going through an intermediate string.
with open('diabetes_org_processed_embedded.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

# Register the "default" connection against a Milvus server on localhost.
connections.connect("default", host="localhost", port="19530")

# Declared fields: an auto-generated INT64 primary key and a 1024-dimensional
# float vector. Dynamic fields are enabled on the schema, so every other key of
# an inserted row is expected to be stored as dynamic data.
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
fields = [id_field, vector_field]

schema = CollectionSchema(
    fields=fields,
    description="ada demo database",
    enable_dynamic_field=True,
)

# Handle to the "ada_demo" collection used by all following cells.
ada_demo = Collection(name="ada_demo", schema=schema)


In [2]:
from tqdm.notebook import tqdm

from pymilvus import MilvusException

# Insert the rows one at a time so that a single rejected row does not abort
# the whole load. Rejected rows are collected in `failed_rows` for a later
# retry; the matching error text is kept in `failed_row_errors` (same order)
# so the reason for each rejection is not lost.
failed_rows = []
failed_row_errors = []

print(f"Processing {len(json_data['rows'])} rows")

for row in tqdm(json_data['rows'], desc="Processing rows in json_data part "):
    try:
        ada_demo.insert(row)
    # MilvusException derives from Exception, so catching Exception alone
    # covers everything the original (Exception, MilvusException) tuple did.
    except Exception as e:
        failed_rows.append(row)
        # Store the message rather than the exception object to avoid keeping
        # tracebacks alive for every failed row.
        failed_row_errors.append(str(e))

# Flush once after the loop rather than per row.
ada_demo.flush()


Processing 25146 rows


Processing rows in json_data part :   0%|          | 0/25146 [00:00<?, ?it/s]

RPC error: [insert_rows], <ParamError: (code=1, message=Collection field dim is 1024, but entities field dim is 0)>, <Time:{'RPC start': '2024-02-28 04:48:03.569390', 'RPC error': '2024-02-28 04:48:03.569732'}>
RPC error: [insert_rows], <MilvusException: (code=1100, message=the length (96168) of dynamic field exceeds max length (65536): invalid parameter[expected=valid length dynamic field][actual=length exceeds max length])>, <Time:{'RPC start': '2024-02-28 04:48:03.905917', 'RPC error': '2024-02-28 04:48:03.910975'}>
RPC error: [insert_rows], <MilvusException: (code=1100, message=the length (95914) of dynamic field exceeds max length (65536): invalid parameter[expected=valid length dynamic field][actual=length exceeds max length])>, <Time:{'RPC start': '2024-02-28 04:48:03.911613', 'RPC error': '2024-02-28 04:48:03.916092'}>
RPC error: [insert_rows], <MilvusException: (code=1100, message=the length (95792) of dynamic field exceeds max length (65536): invalid parameter[expected=valid 

In [3]:
# Report how many rows are currently recorded as failed inserts.
failed_count = len(failed_rows)
print(f"Failed to insert {failed_count} rows")

Failed to insert 373 rows


## Fix failed rows

In [4]:
# Collect the key set of every failed row.
all_keys = [set(row) for row in failed_rows]

# extract only the keys available in all rows
# set.intersection() needs at least one argument, so fall back to an empty set
# when there are no failed rows instead of raising TypeError.
common_keys = set.intersection(*all_keys) if all_keys else set()
print(common_keys)

{'summary', 'meta', 'title', 'processed_text', 'url', 'idxd', 'text', 'level_1', 'level_4', 'level_3', 'vector', 'level_2'}


In [5]:
# Exclude 'processed_text' and 'meta' from the keys to keep.
# NOTE(review): presumably these are the large fields that pushed the dynamic
# field past Milvus' length limit reported by the first insert pass - confirm.
# discard() is used instead of remove() so the cell can be re-run without a
# KeyError once the keys are already gone.
common_keys.discard('processed_text')
common_keys.discard('meta')

# now only keep the common keys in the failed rows, drop everything else
# The rows are trimmed in place; they are the same dict objects that were
# appended from json_data['rows'], so json_data sees the change as well.
for row in failed_rows:
    # The set difference is materialised before deleting, so the dict is not
    # modified while its keys are being iterated.
    for key in row.keys() - common_keys:
        del row[key]

In [6]:
from tqdm.notebook import tqdm

from pymilvus import MilvusException

# Retry the previously rejected rows. `prev_rows` keeps the rows being retried;
# `failed_rows` is reset and refilled with the rows that are rejected again,
# and `retry_errors` holds the matching error text in the same order.
prev_rows = failed_rows.copy()

failed_rows = []
retry_errors = []

print(f"Processing {len(prev_rows)} rows")

for row in tqdm(prev_rows, desc="Processing rows in json_data part "):
    try:
        ada_demo.insert(row)
    # MilvusException derives from Exception, so catching Exception alone
    # covers everything the original (Exception, MilvusException) tuple did.
    except Exception as e:
        failed_rows.append(row)
        # Store the message rather than the exception object to avoid keeping
        # tracebacks alive for every failed row.
        retry_errors.append(str(e))

# Flush once after the loop rather than per row.
ada_demo.flush()


Processing 373 rows


Processing rows in json_data part :   0%|          | 0/373 [00:00<?, ?it/s]

RPC error: [insert_rows], <ParamError: (code=1, message=Collection field dim is 1024, but entities field dim is 0)>, <Time:{'RPC start': '2024-02-28 04:49:42.940394', 'RPC error': '2024-02-28 04:49:42.940621'}>


In [7]:
# Index parameters for the "vector" field: FLAT index type with the COSINE metric.
index = {"index_type": "FLAT", "metric_type": "COSINE"}

# Kept as the cell's last expression so the returned status is displayed.
ada_demo.create_index(field_name="vector", index_params=index)

Status(code=0, message=)

In [8]:
import random
import time

# Assuming ada_demo and json_data are already defined/imported.

# Measures the average latency of single-vector searches against ada_demo,
# using vectors taken from json_data as queries.
NUM_SEARCHES = 1000
TOP_K = 5

# Load the collection so it can be searched
ada_demo.load()

# Only rows with a non-empty vector are usable as queries; the insert log
# shows at least one row whose vector has dimension 0, which would make the
# search call fail if it were picked.
query_vectors = [row['vector'] for row in json_data['rows'] if row.get('vector')]
if not query_vectors:
    raise ValueError("json_data['rows'] contains no non-empty vector to search with")

# The search parameters never change, so build them once outside the loop.
search_params = {"metric_type": "COSINE"}

# Initialize a list to store the duration of each search
search_times = []

for _ in range(NUM_SEARCHES):
    # Pick a random vector; search() takes a list of query vectors
    vector_to_search = [random.choice(query_vectors)]

    # Start timing. perf_counter() is monotonic and high-resolution, unlike
    # time.time(), which can jump when the system clock is adjusted.
    start_time = time.perf_counter()

    # Execute the search
    ada_demo.search(vector_to_search, "vector", search_params, limit=TOP_K, output_fields=["idxd"])

    # End timing and calculate the duration
    end_time = time.perf_counter()
    search_times.append((end_time - start_time) * 1000)  # Convert to milliseconds

# Calculate the average time taken
average_time_ms = sum(search_times) / len(search_times)

print(f"Average time taken per search: {average_time_ms:.2f} ms")


Average time taken per search: 23.13 ms
