In [1]:
# Make sure a Milvus server is already running
from pymilvus import connections, utility
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import normalize

In [2]:
# Read the data: load the pickled DataFrame of article chunks.
# NOTE(review): the path is blank here — presumably redacted; point it at the real pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
chunks = pd.read_pickle('')

# Print (rows, columns), then show the first rows as the cell's rich output
print(chunks.shape)
chunks.head()

(2288880, 20)


Unnamed: 0,paragraph_id,article_id,estimatedPublishedDate,relevancy_rank,brand,title,text,minilm_embeddings,anti_trans_legislation,covid_19,cybersecurity,data_privacy_gdpr,diversity_inclusion,gen_z,inflation,minimum_wage,ukraine_russia,vaccine,waste_reduction,work_from_home
0,46679388833_0_Visa,46679388833,2022-01-01 00:18:41,1,Visa,Visa Inc. (NYSE:V) Stock Position Lifted by AR...,Further Reading: Why do company’s buyback thei...,"[0.11455674, -0.03840628, -0.025011988, -0.047...",False,False,False,False,False,False,False,False,False,False,False,False
1,46679404279_4_Microsoft,46679404279,2022-01-01 00:06:58,2,Microsoft,Windows 11 Sun Valley 2 to be finalized by sum...,The post Windows 11 Sun Valley 2 to be finaliz...,"[-0.028817097, 0.015809987, 0.08862166, 0.0697...",False,False,False,False,False,False,False,False,False,False,False,False
2,46679513049_0_Twitter,46679513049,2022-01-01 00:21:14,2,Twitter,"Magic re-sign Tim Frazier, Freddie Gillespie","Magic re-sign Tim Frazier, Freddie Gillespie :...","[0.008570003, 0.028852804, -0.02051428, -0.038...",False,False,False,False,False,False,False,False,False,False,False,False
3,46679574842_7_Netflix,46679574842,2022-01-01 01:00:39,3,Netflix,Media And Entertainment Stocks Close The Book ...,But Disney had its worst year since 2008. Disc...,"[0.05456229, -0.1281895, 0.031874202, -0.01364...",False,False,False,False,False,False,False,False,False,False,False,False
4,46679601791_1_Ford,46679601791,2022-01-01 01:08:30,3,Ford,Ontario will stop collecting COVID-19 numbers ...,One of the steps is to allow school boards to ...,"[0.03479814, 0.0059085214, -0.005924947, -0.00...",False,False,False,False,False,False,False,False,False,False,False,False


# Milvus

In [3]:
# Connect to the Milvus server and register the connection under the 'default' alias
# NOTE(review): host is blank here — presumably redacted; set it to the server address.
host = ''
connections.connect(alias='default', host=host, port='19530')

# Name of the collection that will hold the chunk embeddings
collection_name = 'chunks'

# Embedding size, taken from the first row *by position* (.iloc) so the lookup
# does not depend on the DataFrame index happening to contain the label 0
emb_dim = len(chunks['minilm_embeddings'].iloc[0])
print(emb_dim)

384


In [4]:
# Start from a clean slate: if a collection with this name already exists,
# show what is currently on the server and then drop the old collection
collection_exists = utility.has_collection(collection_name)
if collection_exists:
    existing_collections = utility.list_collections()
    print(existing_collections)
    utility.drop_collection(collection_name)

['chunks']


In [5]:
# Describe the collection layout: an INT64 primary key plus one float-vector field.
# NOTE(review): despite the variable name, the upload cell fills 'id' with the
# DataFrame index, not the article_id column.
article_id = FieldSchema(
    name='id',
    dtype=DataType.INT64,
    is_primary=True,
)
chunk_embedding = FieldSchema(
    name='minilm_embeddings',
    dtype=DataType.FLOAT_VECTOR,
    dim=emb_dim,
)

# Assemble the schema; the upload cell supplies its columns in this same order
fields = [article_id, chunk_embedding]
schema = CollectionSchema(fields=fields, description='article chunks')

# Create the collection on the 'default' connection with 10 shards
collection = Collection(
    name=collection_name,
    schema=schema,
    using='default',
    shards_num=10,
)

In [6]:
# Stream the embeddings into Milvus in fixed-size batches
batch_size = 16

batch_starts = range(0, len(chunks), batch_size)
for start in tqdm(batch_starts):
    batch = chunks.iloc[start:start + batch_size]

    # First column: primary keys, taken from the DataFrame index
    primary_keys = batch.index.tolist()

    # Second column: embeddings, normalized row-wise so that inner-product (IP)
    # distance can be used
    # https://milvus.io/docs/v2.0.0/metric.md#Inner-product-IP
    embedding_rows = batch['minilm_embeddings'].values.tolist()
    unit_embeddings = normalize(embedding_rows, axis=1).tolist()

    # Insert column-wise: [ids, vectors]
    collection.insert([primary_keys, unit_embeddings])

  0%|          | 0/143055 [00:00<?, ?it/s]

In [None]:
# Add an ANN index to the collection.
# The upload cell normalizes every embedding specifically so that inner-product
# (IP) distance can be used, so the index is built with the IP metric to match.
# Searches against this index must request the same metric.
index_params = {
    "metric_type": "IP",
    "index_type": "IVF_PQ",
    # nlist: number of IVF clusters; m: number of PQ sub-vectors
    # (the vector dimension must be divisible by m)
    "params": {"nlist": 1024, "m": 8},
}

collection.create_index(field_name='minilm_embeddings', index_params=index_params)