# Handy Utils to do Vector Search on Collections

## Configuration

In [1]:
class MyConfig:
    """Empty container; the notebook settings are attached to an instance as attributes."""


MY_CONFIG = MyConfig()

## Embedding settings -- these have to match the embeddings the documents
## are stored with in the vector DB!
MY_CONFIG.EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
MY_CONFIG.EMBEDDING_LENGTH = 384

## Vector DB location and the collection to search
MY_CONFIG.COLLECTION_NAME = 'dataprepkit_granite_docs'
MY_CONFIG.DB_URI = './rag_demo_dataprepkit_1.db'  # For embedded instance
#MY_CONFIG.DB_URI = 'http://localhost:19530'  # For Docker instance

## Connect to Vector Database

Milvus can run as an embedded instance and is easy to use.


In [2]:
from pymilvus import MilvusClient

# Create the client for the Milvus instance that MY_CONFIG.DB_URI points to
milvus_client = MilvusClient(uri=MY_CONFIG.DB_URI)

print("✅ Connected to Milvus instance:", MY_CONFIG.DB_URI)

✅ Connected to Milvus instance: ./rag_demo_dataprepkit_1.db


## Setup Embeddings

There are two ways to compute embeddings:

1. use sentence transformers directly
2. use Milvus model wrapper

In [3]:
## Option 1 - use sentence transformers directly

# Models are fetched from https://huggingface.co/ unless HF_ENDPOINT says otherwise.
# If the connection to https://huggingface.co/ fails, uncomment the HF_ENDPOINT
# line below to go through a mirror instead. Keep it above the
# sentence_transformers import so the setting is in place when the library loads.
# It is left commented out so that an HF_ENDPOINT already set in the environment
# is not overwritten and downloads are not silently redirected.
import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from sentence_transformers import SentenceTransformer

# Load the model named in the config; it must be the same model that was used
# to embed the documents stored in the vector DB.
embedding_model = SentenceTransformer(MY_CONFIG.EMBEDDING_MODEL)

def get_embeddings (str):
    """Encode ``str`` with the module-level ``embedding_model``.

    The input is passed to ``embedding_model.encode`` with
    ``normalize_embeddings=True`` and the result is returned unchanged.
    """
    # NOTE: the parameter name shadows the built-in ``str``; it is kept as-is
    # so that existing callers (including keyword callers) keep working.
    return embedding_model.encode(str, normalize_embeddings=True)

  from tqdm.autonotebook import tqdm, trange


In [4]:
## Option 2 - Milvus model
from pymilvus import model

# If the connection to https://huggingface.co/ fails, uncomment the HF_ENDPOINT
# line below to download the model through a mirror instead. It is left
# commented out so that an HF_ENDPOINT already set in the environment is not
# overwritten and downloads are not silently redirected.
import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'


# embedding_fn = model.DefaultEmbeddingFunction()

## initialize the SentenceTransformerEmbeddingFunction
## (same model name as Option 1, taken from the config)
embedding_fn = model.dense.SentenceTransformerEmbeddingFunction(
    model_name = MY_CONFIG.EMBEDDING_MODEL,
    device='cpu' # this will work on all devices (KIS)
)

In [5]:
# Test Embeddings
# Both embedding options are run on the same text. Each vector must have the
# length declared in MY_CONFIG.EMBEDDING_LENGTH; a mismatch between the model
# and the configured length fails here instead of going unnoticed.
text = 'Paris 2024 Olympics'

# Option 1: a single string in, a single vector out
embeddings = get_embeddings(text)
assert len(embeddings) == MY_CONFIG.EMBEDDING_LENGTH, (
    f"sentence transformer embedding length {len(embeddings)} "
    f"!= configured EMBEDDING_LENGTH {MY_CONFIG.EMBEDDING_LENGTH}"
)
print ('sentence transformer : embeddings len =', len(embeddings))
print ('sentence transformer : embeddings[:5] = ', embeddings[:5])

# Option 2: a list of strings in, one vector per string out (we look at the first)
embeddings = embedding_fn([text])
assert len(embeddings[0]) == MY_CONFIG.EMBEDDING_LENGTH, (
    f"milvus model wrapper embedding length {len(embeddings[0])} "
    f"!= configured EMBEDDING_LENGTH {MY_CONFIG.EMBEDDING_LENGTH}"
)
print ('milvus model wrapper : embeddings len =', len(embeddings[0]))
print ('milvus model wrapper  : embeddings[:5] = ', embeddings[0][:5])

sentence transformer : embeddings len = 384
sentence transformer : embeddings[:5] =  [-0.02412123 -0.02083506  0.03565466  0.00688349  0.02383429]
milvus model wrapper : embeddings len = 384
milvus model wrapper  : embeddings[:5] =  [-0.02412122 -0.0208351   0.03565468  0.00688352  0.02383425]


## Do a Vector Search

We will run a few sample queries to verify the data stored in the collection.

In [6]:
import random


## helper function to perform vector search
def do_vector_search(query, limit=5):
    """Embed ``query`` and search the configured Milvus collection with it.

    Args:
        query: Text to search for; it is embedded with ``get_embeddings``.
        limit: Maximum number of entities to return. Defaults to 5, the value
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        The value returned by ``milvus_client.search`` for the single query
        vector, with the ``filename``, ``page_number`` and ``text`` fields
        requested for each hit.
    """
    query_vectors = [get_embeddings(query)]  # Option 1 - using sentence transformers
    # query_vectors = embedding_fn([query])  # using Milvus model

    results = milvus_client.search(
        collection_name=MY_CONFIG.COLLECTION_NAME,  # target collection
        data=query_vectors,  # query vectors
        limit=limit,  # number of returned entities
        output_fields=["filename", "page_number", "text"],  # specifies fields to be returned
    )
    return results
## ----

def print_search_results(results):
    """Print the hits found for the first query of a vector search.

    Args:
        results: Sequence with one entry per query vector; only the first
            entry is printed. Each hit is read as ``hit['distance']`` and
            ``hit['entity'][...]`` for the ``filename``, ``page_number`` and
            ``text`` fields. An empty (or ``None``) ``results`` is reported as
            zero hits instead of raising ``IndexError``.
    """
    # Guard the [0] lookup: with no per-query entry there is nothing to index.
    hits = results[0] if results else []
    print ('num results : ', len(hits))

    for i, hit in enumerate(hits):
        entity = hit['entity']
        print (f'------ result {i+1} --------')
        print ('search score:', hit['distance'])
        print ('filename:', entity['filename'])
        print ('page number:', entity['page_number'])
        print ('text:\n', entity['text'])
        print()

In [7]:
# Sample query 1: embed the question, search the collection, print the hits
query = "Overview of the Granite Pre-Training Dataset"

results = do_vector_search (query)
print_search_results(results)

num results :  5
------ result 1 --------
search score: 0.8775781989097595
filename: Granite_Foundation_Models.pdf
page number: 9
text:
 B. Granite Model Evaluation and Comparison
TABLE II GRANITE.13B GENERAL KNOWLEDGE PERFORMANCE DURING TRAINING

------ result 2 --------
search score: 0.8280450105667114
filename: Granite_Foundation_Models.pdf
page number: 9
text:
 B. Granite Model Evaluation and Comparison
For the SocialStigmaQA benchmark, we tested a variety of the Granite, llama-2, and flan-ul2 models. We examine whether the inclusion of specific personal attributes in the prompt leads

------ result 3 --------
search score: 0.8216305375099182
filename: Granite_Foundation_Models.pdf
page number: 8
text:
 B. Granite Model Evaluation and Comparison
Fig. 6. Granite.13b General Knowledge Performance during Training.

------ result 4 --------
search score: 0.8175002336502075
filename: Granite_Foundation_Models.pdf
page number: 2
text:
 C. Organization of Report
The remainder of this repo

In [8]:
# Sample query 2: embed the question, search the collection, print the hits
query = "How is data de-duplicated?"

results = do_vector_search (query)
print_search_results(results)

num results :  5
------ result 1 --------
search score: 0.812159538269043
filename: Granite_Foundation_Models.pdf
page number: 4
text:
 B. Pre-Processing Pipeline
2) Data De-Duplication: Data de-duplication aims to identify and remove duplicate documents. De-duplication is performed on a per-dataset basis and is essential to ensuring the trained model does not learn artificial linguistic patterns due to repeated data in the dataset.

------ result 2 --------
search score: 0.7404550909996033
filename: Granite_Foundation_Models.pdf
page number: 4
text:
 B. Pre-Processing Pipeline
Two techniques are used: exact and fuzzy de-duplication, both of which use hash-based methods. As the name suggests, exact de-duplication removes exact duplicates among the documents in the dataset. Each document is hashed and documents with the same hash are fused to one. For example, if 50 documents in a dataset have the same hash, a single document will be used. Fuzzy de-duplication finds the Jaccard similari

In [9]:
# Close the Milvus client now that the searches are done
milvus_client.close()