# Save Markdown text into Vector DB

In [1]:
from my_config import MY_CONFIG

In [2]:
import os
import glob

pattern = os.path.join(MY_CONFIG.OUTPUT_DIR_MARKDOWN, '*.md')

md_file_count = len(glob.glob(pattern, recursive=True)) 

In [3]:
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_dir=MY_CONFIG.OUTPUT_DIR_MARKDOWN, recursive=True )
documents = reader.load_data()

print (f"Loaded {len(documents)} chunks from {md_file_count} files")


Loaded 74 chunks from 20 files


In [4]:
for doc in documents:
    print (doc)
    print ('---')

Doc ID: c444d3d4-f3a9-4e86-9cbe-1f86ea82f226
Text: !
---
Doc ID: 072ecafe-306c-4138-96a2-ef1eb56a7d01
Text: Open Source AI Demo Night  The AI Alliance, in collaboration
with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in
San Francisco, bringing together more than 200+ developers and
innovators to showcase and celebrate the latest advances in open-
source AI.
---
Doc ID: 9d05c74f-113f-4632-a0d6-e62616b56a77
Text: AI Alliance members plan to start or enhance projects that meet
the following objectives:   Deploy benchmarks, tools, and other
resources that enable the responsible development and use of AI
systems at a global scale, including the creation of a catalog of
vetted safety, security, and trust tools. Support the advocacy and
enablement of these to...
---
Doc ID: 9fc93e5e-c6c0-4cc8-b57e-b5b687b5cef9
Text: Meet the AI Alliance  The AI Alliance is a collaborative network
of companies, startups, universities, research institutions,
government organizations, and non-p

## Step-3: Setup Embedding Model

In [5]:
# If connection to https://huggingface.co/ failed, uncomment the following path
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

Settings.embed_model = HuggingFaceEmbedding(
    model_name = MY_CONFIG.EMBEDDING_MODEL
)

## Step-4: Connect to Milvus

In [7]:
# connect to vector db
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

vector_store = MilvusVectorStore(
    uri = MY_CONFIG.DB_URI ,
    dim = MY_CONFIG.EMBEDDING_LENGTH , 
    collection_name = MY_CONFIG.COLLECTION_NAME,
    overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print ("✅ Connected Llama-index to Milvus instance: ", MY_CONFIG.DB_URI )

✅ Connected Llama-index to Milvus instance:  ./rag_html.db


In [8]:
from pymilvus import MilvusClient

milvus_client = MilvusClient(MY_CONFIG.DB_URI)
print ("✅ Connected to Milvus instance: ", MY_CONFIG.DB_URI )

# if we already have a collection, clear it first
if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):
    milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)
    print ('✅ Cleared collection :', MY_CONFIG.COLLECTION_NAME)
    

✅ Connected to Milvus instance:  ./rag_html.db
✅ Cleared collection : docs


In [9]:
# connect to vector db
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

vector_store = MilvusVectorStore(
    uri = MY_CONFIG.DB_URI ,
    dim = MY_CONFIG.EMBEDDING_LENGTH , 
    collection_name = MY_CONFIG.COLLECTION_NAME,
    overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print ("✅ Connected Llama-index to Milvus instance: ", MY_CONFIG.DB_URI )

✅ Connected Llama-index to Milvus instance:  ./rag_html.db


## Step-5: Create Index and Save to DB

In [10]:
%%time

# create an index

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
print ("✅ Created index:", index )
print ("✅ Saved index to db ", MY_CONFIG.DB_URI )

✅ Created index: <llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x749f6d347aa0>
✅ Saved index to db  ./rag_html.db
CPU times: user 532 ms, sys: 129 ms, total: 661 ms
Wall time: 657 ms
