# Load Processed Data into Vector Database

## Configuration

In [1]:
class MyConfig:
    """Empty namespace class; the notebook's settings are attached as attributes below."""


# Single settings object shared by the cells of this notebook
MY_CONFIG = MyConfig()

# Directory that is scanned for processed `.parquet` files
MY_CONFIG.INPUT_DATA_DIR = 'data/granite-docs/output_final/'
# MY_CONFIG.INPUT_DATA_REMOTE = "https://github.com/sujee/data-prep-kit-examples/blob/main/requirements.txt"
# Name passed to the vector database client (embedded vector db)
MY_CONFIG.DB_NAME = "rag_demo.db"
# Collection that the rows are inserted into and searched
MY_CONFIG.COLLECTION_NAME = "docs"

## Figure out Runtime

In [2]:
# Detect the runtime: a non-empty COLAB_RELEASE_TAG environment variable is
# treated as the marker for Google Colab
import os

MY_CONFIG.RUNNING_IN_COLAB = bool(os.getenv("COLAB_RELEASE_TAG"))
print("Running in Colab" if MY_CONFIG.RUNNING_IN_COLAB else "NOT running in Colab")

NOT running in Colab


## Install Dependencies (If required)

**A note for Google Colab Users**

After installing the dependencies, if you get errors while loading libraries, **restart the runtime** and **run the notebook** again.

In [3]:
if MY_CONFIG.RUNNING_IN_COLAB:
  !pip install pymilvus  'pymilvus[model]'  datasets  sentence-transformers

## Load Parquet Data

Load all `.parquet` files found in the configured input directory (`MY_CONFIG.INPUT_DATA_DIR`) into a single DataFrame.

In [4]:
import pandas as pd
import glob

print ('Loading data from : ', MY_CONFIG.INPUT_DATA_DIR)

# Collect every Parquet file in the input directory. glob returns matches in
# arbitrary order, so sort them to keep the row order stable between runs.
parquet_files = sorted(glob.glob(f'{MY_CONFIG.INPUT_DATA_DIR}/*.parquet'))
print ("Number of parquet files to read : ", len(parquet_files))
print ()

# Fail early with an actionable message: pd.concat() on an empty list would
# otherwise raise an opaque "No objects to concatenate" error.
if not parquet_files:
    raise FileNotFoundError(
        f"No .parquet files found in '{MY_CONFIG.INPUT_DATA_DIR}'. "
        "Check MY_CONFIG.INPUT_DATA_DIR and make sure the processed data exists."
    )

# Read each file into its own DataFrame, reporting the row count as we go
dfs = []
for file in parquet_files:
    df = pd.read_parquet(file)
    print (f"Read file: '{file}'.  number of rows = {df.shape[0]}")
    dfs.append(df)

# Stack the per-file frames into one frame with a fresh 0..n-1 index
data_df = pd.concat(dfs, ignore_index=True)

print (f"\nTotal number of rows = {data_df.shape[0]}")

Loading data from :  data/granite-docs/output_final/
Number of parquet files to read :  1

Read file: 'data/granite-docs/output_final/Granite_Foundation_Models.parquet'.  number of rows = 235

Total number of rows = 235


In [5]:

## Shape the data

# Rename the 'embeddings' column to 'vector' to match the default schema.
# The guard makes the rename a no-op when the cell is run a second time.
if 'vector' not in data_df.columns and 'embeddings' in data_df.columns:
    data_df = data_df.rename( columns= {'embeddings' : 'vector'})

# Measure the embedding size on the 'vector' column (i.e. after the rename) so
# that re-running this cell does not fail looking for the old 'embeddings' name
MY_CONFIG.EMBEDDING_LENGTH =  len(data_df.iloc[0]['vector'])
print ('embedding length: ', MY_CONFIG.EMBEDDING_LENGTH)

# info() prints its report itself and returns None, so it is not wrapped in print()
data_df.info()
data_df.head(3)

embedding length:  384
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   filename                      235 non-null    object 
 1   num_pages                     235 non-null    int64  
 2   num_tables                    235 non-null    int64  
 3   num_doc_elements              235 non-null    int64  
 4   document_id                   235 non-null    object 
 5   ext                           235 non-null    object 
 6   hash                          235 non-null    object 
 7   size                          235 non-null    int64  
 8   date_acquired                 235 non-null    object 
 9   pdf_convert_time              235 non-null    float64
 10  source_filename               235 non-null    object 
 11  contents                      235 non-null    object 
 12  doc_path                      235 non-nul

Unnamed: 0,filename,num_pages,num_tables,num_doc_elements,document_id,ext,hash,size,date_acquired,pdf_convert_time,...,docq_symbol_to_word_ratio,docq_sentence_count,docq_lorem_ipsum_ratio,docq_curly_bracket_ratio,docq_contain_bad_word,docq_bullet_point_ratio,docq_ellipsis_line_ratio,docq_alphabet_word_ratio,docq_contain_common_en_words,vector
0,Granite%20Foundation%20Models.pdf,20,13,445,d141f01c-c838-49b0-a8f2-0c80b3761d88,pdf,cfce6b11703e9b81d1958b54b135bc4a5c5d8771032e6b...,455938,2024-07-29T22:05:04.500431,37.934728,...,0.0,2,0.0,0.0,False,0.0,0.0,1.0,True,"[-0.007855909, 0.018679393, 0.042436924, -0.01..."
1,Granite%20Foundation%20Models.pdf,20,13,445,d141f01c-c838-49b0-a8f2-0c80b3761d88,pdf,cfce6b11703e9b81d1958b54b135bc4a5c5d8771032e6b...,455938,2024-07-29T22:05:04.500431,37.934728,...,0.0,1,0.0,0.0,False,0.0,0.0,1.0,False,"[-0.0035767434, 0.009818679, 0.03441954, -0.00..."
2,Granite%20Foundation%20Models.pdf,20,13,445,d141f01c-c838-49b0-a8f2-0c80b3761d88,pdf,cfce6b11703e9b81d1958b54b135bc4a5c5d8771032e6b...,455938,2024-07-29T22:05:04.500431,37.934728,...,0.0,6,0.0,0.0,False,0.0,0.0,1.0,True,"[-0.022207903, 0.0050711804, 0.022928528, -0.0..."


## Connect to Vector Database

Milvus can run embedded in this notebook, which makes it easy to use.


In [6]:
from pymilvus import MilvusClient

# Open the vector database named by MY_CONFIG.DB_NAME (the config cell describes it as an embedded db).
# NOTE(review): presumably the database file is created on first use if it does not exist yet — confirm
client = MilvusClient(MY_CONFIG.DB_NAME)

## Create A Collection



In [7]:
# Start from a clean slate: an existing collection with the same name is
# dropped before the new one is created, so re-running this cell reloads from scratch
target_collection = MY_CONFIG.COLLECTION_NAME

collection_exists = client.has_collection(collection_name=target_collection)
if collection_exists:
    client.drop_collection(collection_name=target_collection)

# The vector dimension comes from the embedding length measured on the loaded data
client.create_collection(
    collection_name=target_collection,
    dimension=MY_CONFIG.EMBEDDING_LENGTH,
    auto_id=True,
)
print ("Initialized vector db:", MY_CONFIG.DB_NAME, ", collection: ", target_collection)

Initialized vector db: rag_demo.db , collection:  docs


In [8]:
#data_df.to_dict('records')[:1]

In [9]:
# One dict per DataFrame row, keyed by column name
rows = data_df.to_dict(orient='records')

res = client.insert(
    collection_name=MY_CONFIG.COLLECTION_NAME,
    data=rows,
)
print('inserted # rows', res['insert_count'])

# Left as the cell's last expression so the collection stats are displayed
client.get_collection_stats(MY_CONFIG.COLLECTION_NAME)

inserted # rows 235


{'row_count': 235}

## Do A Simple Vector Search

We run a sample query against the collection to verify that the data was loaded correctly.

In [10]:
from pymilvus import model
import random

# If the connection to https://huggingface.co/ fails, uncomment the following two lines to use a mirror
# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Alternative: embedding_fn = model.DefaultEmbeddingFunction()

## Initialize the SentenceTransformerEmbeddingFunction used to embed search queries.
## NOTE(review): presumably this has to be the same model that produced the stored
## vectors (the loaded data reports an embedding length of 384) — confirm
embedding_fn = model.dense.SentenceTransformerEmbeddingFunction(
    model_name='BAAI/bge-small-en-v1.5',
    device='cpu' # CPU is chosen so the cell runs on any machine (keep it simple)
)

## helper function to perform vector search
def do_vector_search(query, limit=5, output_fields=None):
    """Embed a text query and search the configured collection for similar rows.

    Relies on the notebook-level ``embedding_fn``, ``client`` and ``MY_CONFIG``.

    Args:
        query: the text to search for.
        limit: maximum number of hits to return. Defaults to 5, the value
            that used to be hard-coded.
        output_fields: names of the stored fields to return with each hit.
            Defaults to ``["filename", "page_number", "text"]``.

    Returns:
        Whatever ``client.search`` returns, unchanged.
    """
    if output_fields is None:
        # NOTE(review): the data preview lists a 'contents' column; confirm that
        # 'page_number' and 'text' really exist in the collection
        output_fields = ["filename", "page_number", "text"]

    # query_vectors = embedding_fn.encode_queries([query])
    # The embedding function takes a list of texts, hence the one-element list
    query_vectors = embedding_fn([query])

    return client.search(
        collection_name=MY_CONFIG.COLLECTION_NAME,  # target collection
        data=query_vectors,  # query vectors
        limit=limit,  # number of returned entities
        output_fields=list(output_fields),  # specifies fields to be returned
    )
## ----

def print_search_results(results):
    """Print the hits of the first query in a search result.

    Only ``results[0]`` is reported, i.e. the hits for the first query vector.
    An empty or missing result set is reported as zero hits instead of
    raising an IndexError.

    Args:
        results: sequence with one list of hits per query; each hit is
            indexed as ``hit['distance']`` and ``hit['entity']['filename']``.
    """
    # Guard against an empty result set before indexing the first query's hits
    hits = results[0] if results is not None and len(results) > 0 else []
    print ('num results : ', len(hits))

    for rank, hit in enumerate(hits, start=1):
        print (rank)
        print ('search score:', hit['distance'])
        print ('filename:', hit['entity']['filename'])
        # print ('text:', hit['entity']['text'])
        ## TODO : print text and page number
        print()

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Sample question used as a smoke test of the freshly loaded collection
query = "Overview of the Granite Pre-Training Dataset"

# Run the search, then print each hit's score and source filename
results = do_vector_search (query)
print_search_results(results)

num results :  5
1
search score: 0.8775781393051147
filename: Granite%20Foundation%20Models.pdf

2
search score: 0.8280448913574219
filename: Granite%20Foundation%20Models.pdf

3
search score: 0.8216304779052734
filename: Granite%20Foundation%20Models.pdf

4
search score: 0.8174999952316284
filename: Granite%20Foundation%20Models.pdf

5
search score: 0.8016889691352844
filename: Granite%20Foundation%20Models.pdf

