# Save Markdown text into Vector DB

## Step-1: Config

In [2]:
from my_config import MY_CONFIG

## Step-2: Read Markdown

In [3]:
import os
import glob

# Count the markdown files under the output directory so the number can be
# reported next to the number of loaded documents.
# glob only descends into sub-directories when the pattern itself contains
# '**' (recursive=True alone has no effect on a plain '*.md' pattern).
# '**' also matches zero directories, so top-level files are still counted.
pattern = os.path.join(MY_CONFIG.OUTPUT_DIR_MARKDOWN, '**', '*.md')
md_file_count = len(glob.glob(pattern, recursive=True))

In [4]:
from llama_index.core import SimpleDirectoryReader

# Build a reader over the markdown output directory (recursive=True) and
# load its contents into `documents`.
reader = SimpleDirectoryReader(
    input_dir=MY_CONFIG.OUTPUT_DIR_MARKDOWN,
    recursive=True,
)
documents = reader.load_data()

# Report the number of loaded documents next to the markdown file count
# computed in the previous cell.
print(f"Loaded {len(documents)} chunks from {md_file_count} files")


Loaded 87 chunks from 87 files


In [5]:
# Peek at the first loaded document as a quick sanity check
sample_doc = documents[0]
print(sample_doc)

Doc ID: dfc07b1b-aa9a-4816-9b9e-c0412f6663af
Text: # Building the open future of AI  We are technology developers,
researchers, industry leaders and advocates who collaborate to advance
safe, responsible AI rooted in open innovation.  ![Conference
Speaker](https://images.prismic.io/ai-alliance/Zy08cq8jQArT0jJI_Imagef
romNotion.jpeg?auto=format%2Ccompress&fit=max&w=3840)  ![Skills &
Education](htt...


## Step-3: Setup Embedding Model

In [6]:
# Route Hugging Face downloads through a mirror, for networks that cannot
# reach https://huggingface.co/ directly.
# setdefault keeps any HF_ENDPOINT that is already exported in the
# environment instead of overwriting it; comment out the last line of this
# cell to use the default Hugging Face endpoint.
# NOTE(review): presumably this has to run before the Hugging Face libraries
# are imported for the setting to take effect - confirm.
import os
os.environ.setdefault('HF_ENDPOINT', 'https://hf-mirror.com')

In [7]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Instantiate the embedding model named in MY_CONFIG.EMBEDDING_MODEL and
# assign it to Settings.embed_model.
embed_model = HuggingFaceEmbedding(model_name=MY_CONFIG.EMBEDDING_MODEL)
Settings.embed_model = embed_model

modules.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/60.6M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf-mirror.com/repos/a0/2e/a02ee443e67b75c0e5078b7d33aa3daaa68bfa0fbc3fe193b20f62ee34097d28/3b1fcdc9c5eb954f603bc386474e321505ff29c6c67f21e3aa8db3d2d1a533cf?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1740526098&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MDUyNjA5OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2EwLzJlL2EwMmVlNDQzZTY3Yjc1YzBlNTA3OGI3ZDMzYWEzZGFhYTY4YmZhMGZiYzNmZTE5M2IyMGY2MmVlMzQwOTdkMjgvM2IxZmNkYzljNWViOTU0ZjYwM2JjMzg2NDc0ZTMyMTUwNWZmMjljNmM2N2YyMWUzYWE4ZGIzZDJkMWE1MzNjZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=oCvtVABFZ-W24C%7Eh96jx3Z02kIKNFBYUSf89zmzjR7kaUHKD3-VZ6sgIw26-aSoaWMJt5g-SuDxP5ZDESS8ZRN0hKgsEk7A%7EJ8b9zGCqB9C%7EOn5y1OZ-yI3kw9mLJHLg7tiA%7EFkZenDGTvjpUO8RFiTKUbl1aMEq8evEuK%7EQ%7E4Zi5T2AQBcV3gr2jetSmdy7FCli9P-gVMVuz9-v8sCk431AkaF%7Es1UB1usFjw9IhRH03l

model.safetensors:   0%|          | 0.00/60.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

## Step-4: Connect to Milvus

In [8]:
# Open the Milvus-backed vector store and wrap it in a storage context
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

# NOTE(review): the same store is constructed again after the clean-up cell
# further down, so this first connection looks redundant - confirm before
# removing it.
milvus_store_options = dict(
    uri=MY_CONFIG.DB_URI,
    dim=MY_CONFIG.EMBEDDING_LENGTH,
    collection_name=MY_CONFIG.COLLECTION_NAME,
    overwrite=True,
)
vector_store = MilvusVectorStore(**milvus_store_options)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print("✅ Connected Llama-index to Milvus instance: ", MY_CONFIG.DB_URI)

2025-02-25 14:40:41,556 [DEBUG][_create_connection]: Created new connection using: bf69c03e00f94f7aa4c5a72b362ab6c8 (async_milvus_client.py:600)


✅ Connected Llama-index to Milvus instance:  ./rag_website.db


In [9]:
## Remove any collection left over from a previous run

from pymilvus import MilvusClient

milvus_client = MilvusClient(MY_CONFIG.DB_URI)
print("✅ Connected to Milvus instance: ", MY_CONFIG.DB_URI)

# Drop the configured collection only when it is already present
target_collection = MY_CONFIG.COLLECTION_NAME
collection_exists = milvus_client.has_collection(collection_name=target_collection)
if collection_exists:
    milvus_client.drop_collection(collection_name=target_collection)
    print('✅ Cleared collection :', target_collection)
    

✅ Connected to Milvus instance:  ./rag_website.db
✅ Cleared collection : pages


In [10]:
# Connect llama-index to the vector db (runs after the clean-up cell above)
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

# Vector store settings all come from MY_CONFIG; overwrite=True is passed
# through to MilvusVectorStore unchanged.
vector_store = MilvusVectorStore(
    collection_name=MY_CONFIG.COLLECTION_NAME,
    uri=MY_CONFIG.DB_URI,
    dim=MY_CONFIG.EMBEDDING_LENGTH,
    overwrite=True,
)

# The storage context is what the indexing step receives later on
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

print("✅ Connected Llama-index to Milvus instance: ", MY_CONFIG.DB_URI)

2025-02-25 14:40:42,708 [DEBUG][_create_connection]: Created new connection using: 74d5f252a97c4989bf5ce1870a2c2d80 (async_milvus_client.py:600)


✅ Connected Llama-index to Milvus instance:  ./rag_website.db


## Step-5: Save to DB

In [11]:
%%time

# Build a VectorStoreIndex from the loaded documents, using the storage
# context that wraps the Milvus vector store.

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)
print("✅ Saved documents to db ", MY_CONFIG.DB_URI)

✅ Saved documents to db  ./rag_website.db
CPU times: user 975 ms, sys: 168 ms, total: 1.14 s
Wall time: 2.4 s
