In [1]:
import sys, os, time, pprint
import numpy as np

In [31]:
from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(file_path="gnews.csv", source_column="news")

docs = loader.load()

In [32]:
num_documents = len(docs)
print(f"loaded {num_documents} documents")

loaded 365 documents


In [45]:
docs[0]

Document(page_content='news: CNN Russian President Vladimir Putin has replaced his defense minister and close ally Sergei Shoigu with a civilian economist, a major reshuffle of military leadership more than twoyears after Moscows grinding war against Ukraine has sent defense spending soaring. Andrey Belousov, a civilian who served as former first deputy prime minister and specializes in economics, was appointed to the top defense post, Kremlin spokesperson Dmitry Peskov said on Sunday. Shoigu was relieved of his position by presidential decree, Peskov said, but he will remain an influential part of Putins administration as Secretary of Russias Security Council, replacing former FSB head Nikolai Patrushev, who would transfer to another job. Shoigu will also become deputy in Russias MilitaryIndustrial Commission, Peskov said, as Putin embarks on a fifth term as president. The shakeup comes as Russia launched its most serious crossborder ground assault since Ukraine recaptured the norther

In [33]:
import os
import pymilvus
print(f"pymilvus version: {pymilvus.__version__}")
from pymilvus import connections, utility
TOKEN = "54cf1327985f3196180bd468479b0b226780034e860ed0864db2b961d76e300d2c842a1473fe28321d8e86d4806ccb8584dfbcd5"
CLUSTER_ENDPOINT="https://in03-1a9964c822484de.api.gcp-us-west1.zillizcloud.com:443"

connections.connect(
  alias='default',
  #  Public endpoint obtained from Zilliz Cloud
  uri=CLUSTER_ENDPOINT,
  # API key or a colon-separated cluster username and password
  token=TOKEN,
)

# Use no-schema Milvus client uses flexible json key:value format.
# https://milvus.io/docs/using_milvusclient.md
mc = MilvusClient(
    uri=CLUSTER_ENDPOINT,
    # API key or a colon-separated cluster username and password
    token=TOKEN)

# Check if the server is ready and get colleciton name.
print(f"Type of server: {utility.get_server_version()}")

pymilvus version: 2.4.1
Type of server: Zilliz Cloud Vector Database(Compatible with Milvus 2.3)


In [15]:
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
import torch

# Initialize torch settings
DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(f"device: {DEVICE}")

# Initialize a Milvus built-in sparse-dense-reranking encoder.
# https://huggingface.co/BAAI/bge-m3
embedding_model = BGEM3EmbeddingFunction(use_fp16=False, device=DEVICE)
EMBEDDING_DIM = embedding_model.dim['dense']
print(f"dense_dim: {EMBEDDING_DIM}")
print(f"sparse_dim: {embedding_model.dim['sparse']}")
print(f"colbert_dim: {embedding_model.dim['colbert_vecs']}")

device: cpu


  from .autonotebook import tqdm as notebook_tqdm
Fetching 30 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:51<00:00,  1.71s/it]


dense_dim: 1024
sparse_dim: 250002
colbert_dim: 1024


In [42]:
from pymilvus import MilvusClient

# Set the Milvus collection name.
COLLECTION_NAME = "news"

# Specify the data schema for the new Collection.
MAX_LENGTH = 65535
fields = [
    # Use auto generated id as primary key
    FieldSchema(name="id", dtype=DataType.INT64,
                is_primary=True, auto_id=True, max_length=100),
    # FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
                dim=EMBEDDING_DIM),
    FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=MAX_LENGTH),
    FieldSchema(name="news", dtype=DataType.VARCHAR, max_length=MAX_LENGTH),
]
schema = CollectionSchema(fields, "")

# Check if collection already exists, if so drop it.
has = utility.has_collection(COLLECTION_NAME)
if has:
    drop_result = utility.drop_collection(COLLECTION_NAME)
    print(f"Successfully dropped collection: `{COLLECTION_NAME}`")

# Create the collection.
schema = CollectionSchema(fields, "")
col = Collection(COLLECTION_NAME, schema, consistency_level="Eventually")

# Add custom HNSW search index to the collection.
# M = max number graph connections per layer. Large M = denser graph.
# Choice of M: 4~64, larger M for larger data and larger embedding lengths.
M = 16
# efConstruction = num_candidate_nearest_neighbors per layer. 
# Use Rule of thumb: int. 8~512, efConstruction = M * 2.
efConstruction = M * 2
# Create the search index for local Milvus server.
INDEX_PARAMS = dict({
    'M': M,               
    "efConstruction": efConstruction })

# Create indices for the vector fields. 
# The indices will pre-load data into memory for efficient search.
# sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
dense_index = {"index_type": "HNSW", "metric_type": "COSINE", "params": INDEX_PARAMS}
# col.create_index("sparse_vector", sparse_index)
col.create_index("dense_vector", dense_index)
col.load()

print(f"Successfully created collection: `{COLLECTION_NAME}`")

Successfully dropped collection: `news`
Successfully created collection: `news`


In [35]:
# # STEP 4. PREPARE DATA: CHUNK AND EMBED
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define chunk size 512 and overlap 10% chunk_size.
chunk_size = 300
chunk_overlap = np.round(chunk_size * 0.10, 0)
print(f"chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}")

# Create an instance of the RecursiveCharacterTextSplitter
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap,
    length_function = len,  # using built-in Python len function
)

# Split the documents further into smaller, recursive chunks.
chunks = child_splitter.split_documents(docs)

end_time = time.time()
print(f"chunking time: {end_time - start_time}")
print(f"docs: {len(docs)}, split into: {len(html_header_splits)}")
print(f"split into chunks: {len(chunks)}, type: list of {type(chunks[0])}") 

# Inspect a chunk.
print()
print("Looking at a sample chunk...")
print(chunks[0].page_content[:100])
print(chunks[0].metadata)

# # TODO - Uncomment to print child splits with their associated header metadata for debugging.
# print()
# for child in chunks:
#     print(f"Content: {child.page_content}")
#     print(f"Metadata: {child.metadata}")
#     print()

chunk_size: 300, chunk_overlap: 30.0
chunking time: 112.8498809337616
docs: 365, split into: 365
split into chunks: 5535, type: list of <class 'langchain_core.documents.base.Document'>

Looking at a sample chunk...
news: CNN Russian President Vladimir Putin has replaced his defense minister and close ally Sergei S
{'source': 'CNN Russian President Vladimir Putin has replaced his defense minister and close ally Sergei Shoigu with a civilian economist, a major reshuffle of military leadership more than twoyears after Moscows grinding war against Ukraine has sent defense spending soaring. Andrey Belousov, a civilian who served as former first deputy prime minister and specializes in economics, was appointed to the top defense post, Kremlin spokesperson Dmitry Peskov said on Sunday. Shoigu was relieved of his position by presidential decree, Peskov said, but he will remain an influential part of Putins administration as Secretary of Russias Security Council, replacing former FSB head Nikol

In [36]:
list_of_strings = [doc.page_content for doc in chunks if hasattr(doc, 'page_content')]

# Embedding inference using the Milvus built-in sparse-dense-reranking encoder.
start_time = time.time()
embeddings = embedding_model(list_of_strings)
end_time = time.time()

print(f"Embedding time for {len(list_of_strings)} chunks: ", end="")
print(f"{np.round(end_time - start_time, 2)} seconds")

Inference Embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 346/346 [22:56<00:00,  3.98s/it]


Embedding time for 5535 chunks: 1378.55 seconds


In [43]:
dict_list = []
for chunk, dense in zip(chunks, embeddings["dense"]):
    # Assemble embedding vector, original text chunk, metadata.
    chunk_dict = {
        'chunk': chunk.page_content,
        'news': chunk.metadata.get('source', ""),
        'dense_vector': dense
    }
    dict_list.append(chunk_dict)

# TODO - Uncomment to inspect the first chunk and its metadata.
print(len(dict_list))
print(type(dict_list[0]), len(dict_list[0]))
pprint.pprint(dict_list[0])

5535
<class 'dict'> 3
{'chunk': 'news: CNN Russian President Vladimir Putin has replaced his defense '
          'minister and close ally Sergei Shoigu with a civilian economist, a '
          'major reshuffle of military leadership more than twoyears after '
          'Moscows grinding war against Ukraine has sent defense spending '
          'soaring. Andrey Belousov, a civilian',
 'dense_vector': array([-0.04367027, -0.02070321, -0.03113122, ...,  0.00263912,
       -0.01077201, -0.03370891], dtype=float32),
 'news': 'CNN Russian President Vladimir Putin has replaced his defense '
         'minister and close ally Sergei Shoigu with a civilian economist, a '
         'major reshuffle of military leadership more than twoyears after '
         'Moscows grinding war against Ukraine has sent defense spending '
         'soaring. Andrey Belousov, a civilian who served as former first '
         'deputy prime minister and specializes in economics, was appointed to '
         'the top defe

In [44]:
# Insert data into the Milvus collection.
print("Start inserting entities")
start_time = time.time()
col.insert(dict_list)

end_time = time.time()
print(f"Milvus insert time for {len(dict_list)} vectors: ", end="")
print(f"{np.round(end_time - start_time, 2)} seconds")
col.flush()

Start inserting entities
Milvus insert time for 5535 vectors: 11.9 seconds
