# Save Markdown text into Vector DB

## Step-1: Config

In [1]:
from my_config import MY_CONFIG

In [2]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

## Step-2: Read Markdown

In [3]:
import os
import glob

pattern = os.path.join(MY_CONFIG.OUTPUT_DIR, '*.md')
md_file_count = len(glob.glob(pattern, recursive=True)) 

In [4]:
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_dir=MY_CONFIG.OUTPUT_DIR, recursive=True )
documents = reader.load_data()

print (f"Loaded {len(documents)} documents from {md_file_count} files")


Loaded 88 documents from 88 files


In [5]:
## Inspect a sample doc
print (documents[0])

Doc ID: 843dc764-d1e3-4b54-8a42-39b8f7c626a4
Text: # Building the open future of AI  We are technology developers,
researchers, industry leaders and advocates who collaborate to advance
safe, responsible AI rooted in open innovation.  <!-- image -->  ####
Skills &amp; Education  <!-- image -->  Supporting global AI skill-
building, education, and exploratory research.  #### Trust &amp;
Safety  <!...


## Step-3: Create Chunks

In [6]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} chunks from {len(documents)} documents")

Created 218 chunks from 88 documents


## Step-4: Setup Embedding Model

In [7]:
# If connection to https://huggingface.co/ failed, uncomment the following path
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

Settings.embed_model = HuggingFaceEmbedding(
    model_name = MY_CONFIG.EMBEDDING_MODEL
)

  from .autonotebook import tqdm as notebook_tqdm


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: ibm-granite/granite-embedding-30m-english
Load pretrained SentenceTransformer: ibm-granite/granite-embedding-30m-english
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


## Step-5: Initialize Weaviate Local

We will use an embedded local instance for convenience.

In [None]:
import weaviate

client = weaviate.connect_to_embedded(persistence_data_path=MY_CONFIG.DB_URI)

INFO:weaviate-client:Started /home/sujee/.cache/weaviate-embedded: process ID 949109
Started /home/sujee/.cache/weaviate-embedded: process ID 949109


{"action":"startup","build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2025-03-17T23:02:30-07:00"}
{"action":"startup","auto_schema_enabled":true,"build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2025-03-17T23:02:30-07:00"}
{"build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2025-03-17T23:02:30-07:00"}
{"build_git_commit":"'\"$GITHASH\

INFO:httpx:HTTP Request: GET http://localhost:8079/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
HTTP Request: GET http://localhost:8079/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8079/v1/meta "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8079/v1/.well-known/ready "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/.well-known/ready "HTTP/1.1 200 OK"


{"build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","docker_image_tag":"localhost","level":"info","msg":"configured versions","server_version":"1.26.6","time":"2025-03-17T23:02:32-07:00"}
{"action":"grpc_startup","build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"grpc server listening at [::]:50050","time":"2025-03-17T23:02:32-07:00"}
{"address":"192.168.86.21:60187","build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"current Leader","time":"2025-03-17T23:02:32-07:00"}
{"build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"attempting to join","remoteNodes":["192.168.86.21:60187"],"time":"2025-03-17T23:02:32-07:00"}
{"action":"raft","build

INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"


{"action":"telemetry_push","build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:b1d0b33d-0602-42e8-b38c-14820ea2bd69 Type:INIT Version:1.26.6 NumObjects:0 OS:linux Arch:amd64 UsedModules:[]}","time":"2025-03-17T23:02:32-07:00"}


INFO:httpx:HTTP Request: GET http://localhost:8079/v1/nodes "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/nodes "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8079/v1/nodes "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/nodes "HTTP/1.1 200 OK"


In [10]:
print(f"Client: {weaviate.__version__}, Server: {client.get_meta().get('version')}")

INFO:httpx:HTTP Request: GET http://localhost:8079/v1/meta "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/meta "HTTP/1.1 200 OK"
Client: 4.11.1, Server: 1.26.6


## Step-6: Connect LlamaIndex & weaviate

In [11]:
# connect llama-index to vector db

from llama_index.core import StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex",
    overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print ("✅ Connected Llama-index to weaviate local instance" )

INFO:httpx:HTTP Request: GET http://localhost:8079/v1/schema/LlamaIndex "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/schema/LlamaIndex "HTTP/1.1 200 OK"
✅ Connected Llama-index to weaviate local instance


## Step-7: Save Documents to Vector DB

In [12]:
%%time 

# save chunks into vector db

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
    )

print(f"Successfully stored {len(nodes)} chunks to vectorDB: ", MY_CONFIG.DB_URI)


{"action":"hnsw_prefill_cache_async","build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2025-03-17T23:02:32-07:00","wait_for_cache_prefill":false}
{"build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","level":"info","msg":"Completed loading shard llamaindex_hD2iLoxRhqpF in 26.957664ms","time":"2025-03-17T23:02:32-07:00"}
{"action":"hnsw_vector_cache_prefill","build_git_commit":"'\"$GITHASH\"'","build_go_version":"go1.22.0","build_image_tag":"localhost","build_wv_version":"1.26.6","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2025-03-17T23:02:32-07:00","took":2212375}
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.81it/s]
Batches: 100%|██████

INFO:httpx:HTTP Request: GET http://localhost:8079/v1/schema "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:8079/v1/schema "HTTP/1.1 200 OK"





Successfully stored 218 chunks to vectorDB:  ./rag_website.db
CPU times: user 967 ms, sys: 163 ms, total: 1.13 s
Wall time: 1.82 s
