In [1]:
from pymilvus import (
    connections,
    Collection,
    FieldSchema,
    CollectionSchema,
    DataType,
    utility,
)
import pandas as pd
from scipy import sparse
import numpy as np
import gc

# Connect to Milvus up front; the later cells create a collection, insert rows
# and build indexes against this server.
connections.connect(uri="http://localhost:19530")  # Replace with your Milvus server URI

In [3]:
# All VARCHAR fields in this collection share the same max_length.
VARCHAR_MAX_LENGTH = 8012

# Primary key; values are supplied at insert time (insert_to_collection
# renames the per-chunk "chunk_id" column to "id").
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
url_field = FieldSchema(
    name="url", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH
)
# One 1024-dimensional dense vector and one sparse vector per row.
dense_field = FieldSchema(
    name="content_dense", dtype=DataType.FLOAT_VECTOR, dim=1024
)
sparse_field = FieldSchema(
    name="content_sparse", dtype=DataType.SPARSE_FLOAT_VECTOR
)
title_field = FieldSchema(
    name="title_text", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH
)
text_field = FieldSchema(
    name="content_text", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH
)

# Field order is kept exactly as declared originally.
fields = [
    id_field,
    url_field,
    dense_field,
    sparse_field,
    title_field,
    text_field,
]

# Dynamic fields are disabled, so only the six fields above are accepted.
schema = CollectionSchema(fields=fields, enable_dynamic_field=False)

collection = Collection(name="bat_dong_san", schema=schema)

In [4]:
from glob import glob

# Every entry directly under ../bds, ordered as plain strings (lexicographic,
# not numeric: e.g. "10000_15000" sorts before "5000_10000").
folders = sorted(glob("../bds/*"))
folders

['../bds\\0_5000',
 '../bds\\10000_15000',
 '../bds\\15000_25000',
 '../bds\\5000_10000']

In [5]:
def load_data(folder, chunk_id_stride=10000):
    """Load one data folder into a frame with one row per text chunk.

    Reads ``encoded_data.parquet`` from ``folder``, drops the ``content``,
    ``content_text_raw`` and ``content_text_list`` columns and explodes
    ``content_text`` so each list element becomes its own row. The rows of
    ``content_sparse.npz`` and ``content_dense.npy`` are then attached
    positionally as the ``content_sparse`` and ``content_dense`` columns, and
    a ``chunk_id`` column is derived from the document ``id``.

    Args:
        folder: Directory containing the three files named above.
        chunk_id_stride: Multiplier applied to ``id`` when building
            ``chunk_id`` (``id * chunk_id_stride + n`` for the n-th chunk of
            a document, counting from 1). Defaults to 10000.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index and the added columns
        ``content_sparse``, ``content_dense`` and ``chunk_id``.

    Raises:
        ValueError: If ``chunk_id_stride`` is not positive, if the vector
            files do not have exactly one row per exploded text row, or if a
            document has more chunks than ``chunk_id_stride`` (which would
            make chunk ids of neighbouring documents collide).
    """
    if chunk_id_stride <= 0:
        raise ValueError(
            f"chunk_id_stride must be a positive integer, got {chunk_id_stride}"
        )

    df = (
        pd.read_parquet(f"{folder}/encoded_data.parquet")
        .drop(columns=["content", "content_text_raw", "content_text_list"])
        .explode("content_text")
        # explode() repeats the parent row's index label for every chunk;
        # reset it so the returned frame has unique labels.
        .reset_index(drop=True)
    )

    content_sparse = sparse.load_npz(f"{folder}/content_sparse.npz")
    content_dense = np.load(f"{folder}/content_dense.npy")

    # Vectors are matched to text rows purely by position, so the counts must
    # agree. NOTE(review): this assumes the vector files were written in the
    # same exploded order as the parquet rows -- confirm against the encoder.
    n_rows = len(df)
    if content_sparse.shape[0] != n_rows or len(content_dense) != n_rows:
        raise ValueError(
            f"{folder}: {n_rows} text rows but {content_sparse.shape[0]} sparse "
            f"and {len(content_dense)} dense vectors"
        )

    # 1-based position of each chunk within its document.
    chunk_number = df.groupby("id").cumcount() + 1
    if n_rows and chunk_number.max() > chunk_id_stride:
        raise ValueError(
            f"{folder}: a document has {int(chunk_number.max())} chunks, more than "
            f"chunk_id_stride={chunk_id_stride}; chunk ids would collide"
        )

    df["content_sparse"] = list(content_sparse)
    df["content_dense"] = list(content_dense)
    df["chunk_id"] = df["id"] * chunk_id_stride + chunk_number
    return df

In [6]:
def insert_to_collection(df, collection, batch_size=1000):
    """Insert the rows of ``df`` into ``collection`` in fixed-size batches.

    Each batch is reduced to the six stored columns, ``chunk_id`` and
    ``title`` are renamed to ``id`` and ``title_text``, and the rows are
    handed to ``collection.insert`` as a list of per-row dicts. A progress
    line with the row range of the batch is printed before every insert.

    Args:
        df: DataFrame providing the columns ``chunk_id``, ``url``,
            ``content_sparse``, ``content_dense``, ``title`` and
            ``content_text``.
        collection: Object exposing ``insert(rows)``.
        batch_size: Maximum number of rows per insert call; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: Raised by pandas when a batch is selected and one of the
            required columns is missing.
    """
    # A negative step would make range() empty and silently insert nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    source_columns = [
        "chunk_id",
        "url",
        "content_sparse",
        "content_dense",
        "title",
        "content_text",
    ]
    field_names = {"title": "title_text", "chunk_id": "id"}
    total_rows = len(df)

    for start in range(0, total_rows, batch_size):
        # Clamp the end so the progress line of the last batch is accurate.
        stop = min(start + batch_size, total_rows)
        print(f"Inserting {start}-{stop}...")
        batch = df.iloc[start:stop][source_columns].rename(columns=field_names)
        collection.insert(batch.to_dict("records"))

In [7]:
# Process one source folder at a time so only a single folder's frame is held
# in memory. The slice start can be raised to skip folders that were already
# inserted.
for folder in folders[0:]:
    chunk_frame = load_data(folder)
    print(folder, chunk_frame.shape)
    insert_to_collection(chunk_frame, collection)
    # Completion message followed by one blank line.
    print("complete inserted df", end="\n\n")
    # Release the frame before the next folder is loaded.
    del chunk_frame
    gc.collect()

../bds\0_5000 (58272, 9)
Inserting 0-1000...
Inserting 1000-2000...
Inserting 2000-3000...
Inserting 3000-4000...
Inserting 4000-5000...
Inserting 5000-6000...
Inserting 6000-7000...
Inserting 7000-8000...
Inserting 8000-9000...
Inserting 9000-10000...
Inserting 10000-11000...
Inserting 11000-12000...
Inserting 12000-13000...
Inserting 13000-14000...
Inserting 14000-15000...
Inserting 15000-16000...
Inserting 16000-17000...
Inserting 17000-18000...
Inserting 18000-19000...
Inserting 19000-20000...
Inserting 20000-21000...
Inserting 21000-22000...
Inserting 22000-23000...
Inserting 23000-24000...
Inserting 24000-25000...
Inserting 25000-26000...
Inserting 26000-27000...
Inserting 27000-28000...
Inserting 28000-29000...
Inserting 29000-30000...
Inserting 30000-31000...
Inserting 31000-32000...
Inserting 32000-33000...
Inserting 33000-34000...
Inserting 34000-35000...
Inserting 35000-36000...
Inserting 36000-37000...
Inserting 37000-38000...
Inserting 38000-39000...
Inserting 39000-40000.

In [8]:
# HNSW index with inner-product metric for the 1024-dimensional dense vectors.
dense_index = dict(index_type="HNSW", metric_type="IP", M=64, efConstruction=80)

# SPARSE_WAND index with inner-product metric for the sparse vectors.
sparse_index = dict(index_type="SPARSE_WAND", metric_type="IP")

collection.create_index(field_name="content_dense", index_params=dense_index)
# Kept as the last expression so the cell still displays the returned status.
collection.create_index(field_name="content_sparse", index_params=sparse_index)

Status(code=0, message=)

In [9]:
# NOTE(review): DataType is already imported in the first cell; only
# MilvusClient is new here.
from pymilvus import MilvusClient, DataType

# Second handle to the same server URI as the initial connection, this time
# through the MilvusClient interface.
client = MilvusClient(uri="http://localhost:19530")

# List the index names defined on the collection to confirm that both the
# sparse and the dense index were created.
res = client.list_indexes(collection_name="bat_dong_san")
print(res)

['content_sparse', 'content_dense']
