### Imports

In [2]:
from typing import List, Tuple, Dict

In [1]:
from config.settings import settings

### Document Loading

In [3]:
from src.ingestion.DocumentLoader import DocumentLoader

In [4]:
loader = DocumentLoader()

In [5]:
folder_name = "index"

In [6]:
files = loader.list_filenames(folder_name)
files

[METRICS] list_filenames: time=0.00s, count=1


['Graph_Databases_for_Beginners.pdf']

In [7]:
docs = loader.load_documents(subdir=folder_name,file_names=files)
# print(type(docs[0].page_content))

Files:   0%|          | 0/1 [00:00<?, ?file/s]

[METRICS] load_documents: time=2.80s, count=46


### Chunking

In [8]:
from src.ingestion.DocumentChunker import DocumentChunker

In [9]:
chunker = DocumentChunker(
    hf_embedding_model="sentence-transformers/all-mpnet-base-v2",
    chunk_size=300,
    chunk_overlap=80
)

In [10]:
chunks = chunker.chunk_documents(docs)
token_count = chunker.get_docs_token_count(chunks)

Chunking documents:   0%|          | 0/46 [00:00<?, ?doc/s]

[METRICS] chunk_documents: time=0.47s, count=88
[METRICS] get_docs_token_count: time=0.18s, count=88


In [11]:
print(len(chunks))
print(token_count)

88
26370


### Embedding

In [12]:
from src.ingestion.HuggingFaceEmbedder import HuggingFaceEmbedder

In [13]:
embedder = HuggingFaceEmbedder("sentence-transformers/all-mpnet-base-v2")

In [16]:
v1  = embedder.embed_query(chunks[0].page_content)
print("dimension",len(v1))

[METRICS] embed_query: time=0.12s, count=32
dimension 768


### Vector Store Management

In [17]:
from src.ingestion.VectorStoreManager import VectorStoreManager

In [18]:
vsm = VectorStoreManager(embedding_function=embedder,index_name=folder_name)

INFO:src.ingestion.VectorStoreManager:VectorStoreManager initialized for index 'index'


In [19]:
vsm.create_index()

INFO:src.ingestion.VectorStoreManager:Created FAISS index 'index' with dim=768


[METRICS] embed_query: time=0.13s, count=4


In [20]:
vsm.add_documents(chunks)

Adding documents:   0%|          | 0/88 [00:00<?, ?it/s]

INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.25s, count=32


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.45s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.79s, count=262


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.45s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.71s, count=274


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.27s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.68s, count=211


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.25s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.54s, count=161


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.37s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.25s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.56s, count=145


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.43s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.52s, count=84


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.40s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.82s, count=275


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.39s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.11s, count=278


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.12s, count=368


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.16s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.45s, count=130


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.16s, count=353


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.19s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.74s, count=241


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.21s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.56s, count=169


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.22s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.41s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.46s, count=156


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.23s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.58s, count=200


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.24s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.51s, count=115


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.38s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.58s, count=172


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.34s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.42s, count=131


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.26s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.78s, count=295


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.31s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.91s, count=335


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.29s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.67s, count=158


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.01s, count=346


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.08s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.79s, count=156


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.26s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.41s, count=132


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.23s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.47s, count=195


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.50s, count=253


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.90s, count=379


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.88s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.53s, count=260


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.09s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.34s, count=160


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.89s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.98s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.47s, count=182


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.17s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.58s, count=232


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.01s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.49s, count=168


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.73s, count=323


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.00s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.72s, count=272


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.93s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.41s, count=164


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.84s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.89s, count=369


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.25s, count=383


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.94s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.70s, count=290


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.92s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.59s, count=266


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.84s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.28s, count=115


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.02s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.84s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.37s, count=164


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=1.01s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.44s, count=212


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.84s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.32s, count=110


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.83s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.54s, count=279


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.HuggingFaceEmbedder:Embedding 1 documents with model 'sentence-transformers/all-mpnet-base-v2'


[METRICS] embed_documents: time=0.85s, count=386


Embedding documents:   0%|          | 0/1 [00:00<?, ?doc/s]

INFO:src.ingestion.HuggingFaceEmbedder:Successfully embedded 1/1 documents
INFO:src.ingestion.VectorStoreManager:Added 88 documents to 'index'


[METRICS] embed_documents: time=0.47s, count=203
[METRICS] add_documents: time=77.86s, count=88


['63d838ee-78cd-45b8-af93-226a74c4d2b5',
 '1ef52a96-48f5-4124-80a8-f3a2abe09be9',
 '9bf4de6c-19af-4c7c-99c3-ba376b6d7c2b',
 'c0040e18-c85d-473f-87a1-80a2f34b1eba',
 '0c519145-6db0-4cf3-bda3-4528fb05f9a8',
 'd69e49af-738f-4366-a25a-65d4e1942508',
 '9d7945b3-8998-480e-b3e6-4a651a9fc472',
 'ec848d3b-be3a-4b9e-85d9-f7b4951b80e2',
 '9b27b542-6e30-422e-a282-1c6b9a4aa509',
 '9b90b154-853f-4bd7-917e-ff7a7c45446c',
 '3bb06f79-e803-4025-9d3a-68379ca5d0e3',
 '9d4a2f52-e54d-429e-bd7e-c377b6748910',
 '2a69635c-ec0c-4c00-8c3d-2555148a4e33',
 'f8831c58-3cba-4765-bbcd-0d7731ffd4e8',
 '11fe2f42-fde7-4558-827d-52ff7043437a',
 '949348df-9138-4a17-9755-a3a0a6c6b315',
 '31807813-c912-4e7d-9630-6fa899cee4a3',
 '57e10dde-c34e-449f-900f-920d57c9ae8d',
 '4b90a600-21c1-4fbb-b43b-86f65bd841b7',
 'e845dfcb-43cc-4133-8539-8a5774d93177',
 'db35dd6f-fa6e-42b9-bbba-82f24fc80742',
 'd8f80142-5b44-4f60-a33c-fb3ec752a7a0',
 'ca186dbb-c1f0-4d8b-a2ce-068e62f71f11',
 'df4660ed-e267-497f-aa95-64efbce749ef',
 '87390c73-4db9-

In [None]:
# vsm.save_local()

INFO:src.ingestion.VectorStoreManager:Saved index 'index' to '/home/ashmit/work/SEM_VIII/EnhancedRAG/context/faiss_indexes'


In [None]:
# vsm.load_local(allow_pickle=True)

INFO:src.ingestion.VectorStoreManager:Loaded index 'index' from disk


### Retrieval

In [21]:
retrieved = vsm.similarity_search_with_score(query="experience at dolf", k=2)

[METRICS] embed_query: time=0.08s, count=6
[METRICS] similarity_search_with_score: time=0.09s, count=2


In [26]:
retriever = vsm.retriever(search_type = "similarity", search_kwargs = {"k":2})

INFO:src.ingestion.VectorStoreManager:Created retriever for 'index' with {'search_type': 'similarity', 'search_kwargs': {'k': 2}}


In [23]:
retrieved = retriever.invoke("projects by snakalp")

[METRICS] embed_query: time=0.17s, count=8


In [24]:
texts = [ret.page_content for ret in retrieved]
texts

['- time recommendations, graph - based search or supply - chain management, be sure to review all the different ways in which graph technology can work for your company. and while our customers span several continents and professional fields, they all agree that using the neo4j graph database is a critical component of their business success and competitiveness. are you a developer eager to learn more about making the switch? with so many ways to quickly get started, mastering graph database development is one of the best time investments you can make. other resources videos : • intro to neo4j and graph databases • intro to graph databases episode # 1 - evolution of dbs books : • o ’ reilly book : graph databases • learning neo4j trainings : • online training : getting started with neo4j • classroom trainings whether you need a solution that provides real - time recommendations, graph - based search or supply - chain management, be sure to review all the different ways in which graph 

### Supported LLMs (as of 06/05/2025)

In [None]:
hf_llms=[
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
]

In [56]:
from huggingface_hub import InferenceClient
from config.settings import settings

client = InferenceClient(
    provider="hf-inference",
    api_key=settings.HF_TOKEN.get_secret_value(),
)

def get_answer(
    sys_prompt: str,
    query: str,
    model: str = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
) -> str:
    """
    Send a system + user prompt to the specified model via HF Inference,
    returning the assistant’s content string.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user",   "content": query}
        ]
    )
    return response.choices[0].message.content


In [57]:
get_answer(sys_prompt="you are a helpful assistant who answers the users query concisely", query="what are the top houses in game of thrones")

'A concise question about Westeros! Here are the **Top Houses in Game of Thrones**, in no particular order, highlighting their **Sigil**, **Motto**, and **Notable Members**:\n\n1. **House Stark**\n\t* **Sigil**: Direwolf\n\t* **Motto**: "Winter is Coming"\n\t* **Notable Members**: Eddard (Ned), Robb, Sansa, Arya, Bran, Jon Snow\n\n2. **House Lannister**\n\t* **Sigil**: Lion\n\t* **Motto**: "Hear Me Roar!"\n\t* **Notable Members**: Cersei, Jaime, Tyrion, Tywin, Kevan\n\n3. **House Targaryen**\n\t* **Sigil**: Dragon\n\t* **Motto**: "Fire and Blood"\n\t* **Notable Members**: Daenerys, Viserys, Rhaegar, Aerys II (Mad King), Jon Snow (Aegon Targaryen)\n\n4. **House Baratheon**\n\t* **Sigil**: Stag\n\t* **Motto**: "Ours is the Fury"\n\t* **Notable Members**: Robert, Stannis, Renly, Joffrey, Myrcella, Tommen\n\n5. **House Tyrell**\n\t* **Sigil**: Rose\n\t* **Motto**: "Growing Strong"\n\t* **Notable Members**: Mace, Loras, Margaery, Olenna (Queen of Thorns)\n\n6. **House Greyjoy**\n\t* **Sigil

### Generation Pipeline

In [27]:
from src.generation.HuggingFaceLLM import HuggingFaceLLM

In [28]:
pg_llm = HuggingFaceLLM(model_name="meta-llama/Llama-3.1-8B-Instruct")

In [29]:
# pg_llm.get_answer(sys_prompt="you are a helpful assistant that answers concisely", user_prompt="what is quantum computing ?", max_tokens = 200)

In [30]:
from src.generation.PromptAugmentor import PromptAugmentor

In [31]:
augmentor = PromptAugmentor(client=pg_llm)

In [32]:
prompts = augmentor.generate(query="what is a graph db and how is it different from a regular VectorDB ?", synthetic_count=2)
prompts

INFO:src.generation.PromptAugmentor:Requesting synthetic prompt 1/2
INFO:src.generation.HuggingFaceLLM:get_answer called
INFO:src.generation.HuggingFaceLLM:API call successful: model=meta-llama/Llama-3.1-8B-Instruct messages=2
INFO:src.generation.HuggingFaceLLM:get_answer returning 136 characters
INFO:src.generation.PromptAugmentor:Generated prompt #1: 'What is the key difference between a Graph Database and a traditional Vector Database in terms of data structure and query capabilities?'
INFO:src.generation.PromptAugmentor:Requesting synthetic prompt 2/2
INFO:src.generation.HuggingFaceLLM:get_answer called
INFO:src.generation.HuggingFaceLLM:API call successful: model=meta-llama/Llama-3.1-8B-Instruct messages=2
INFO:src.generation.HuggingFaceLLM:get_answer returning 102 characters
INFO:src.generation.PromptAugmentor:Generated prompt #2: 'What are the primary use cases for Graph Databases versus Vector Databases in real-world applications?'


['What is the key difference between a Graph Database and a traditional Vector Database in terms of data structure and query capabilities?',
 'What are the primary use cases for Graph Databases versus Vector Databases in real-world applications?',
 'what is a graph db and how is it different from a regular VectorDB ?']

In [33]:
retriever = vsm.retriever(search_type="similarity", search_kwargs={"k": 4})
prompt_chunks = [(p, retriever.invoke(p)) for p in prompts]

INFO:src.ingestion.VectorStoreManager:Created retriever for 'index' with {'search_type': 'similarity', 'search_kwargs': {'k': 4}}


[METRICS] embed_query: time=0.14s, count=25
[METRICS] embed_query: time=0.23s, count=20
[METRICS] embed_query: time=0.28s, count=18


In [46]:
from src.generation.Fusion import FusionSummarizer
from src.generation.Prompts import Prompts

In [47]:
fusion_summarizer = FusionSummarizer(fusion_llm=pg_llm,sys_prompt=Prompts.MERGE_FUSION_SYS_PROMPT)

In [48]:
summaries = fusion_summarizer.summarize(prompt_chunks=prompt_chunks)

INFO:src.generation.Fusion:Generating summary for prompt 1: 'What is the key difference between a Graph Databas...'
INFO:src.generation.HuggingFaceLLM:get_answer called
INFO:src.generation.HuggingFaceLLM:API call successful: model=meta-llama/Llama-3.1-8B-Instruct messages=2
INFO:src.generation.HuggingFaceLLM:get_answer returning 1111 characters
INFO:src.generation.Fusion:Generating summary for prompt 2: 'What are the primary use cases for Graph Databases...'
INFO:src.generation.HuggingFaceLLM:get_answer called
INFO:src.generation.HuggingFaceLLM:API call successful: model=meta-llama/Llama-3.1-8B-Instruct messages=2
INFO:src.generation.HuggingFaceLLM:get_answer returning 1111 characters
INFO:src.generation.Fusion:Generating summary for prompt 3: 'what is a graph db and how is it different from a ...'
INFO:src.generation.HuggingFaceLLM:get_answer called
INFO:src.generation.HuggingFaceLLM:API call successful: model=meta-llama/Llama-3.1-8B-Instruct messages=2
INFO:src.generation.HuggingFace

In [41]:
all_summaries = "\n\n".join(summaries)

In [42]:
final_llm = HuggingFaceLLM(model_name="meta-llama/Llama-3.3-70B-Instruct")

INFO:src.generation.HuggingFaceLLM:Creating new HuggingFaceLLM for model: meta-llama/Llama-3.3-70B-Instruct
INFO:src.generation.HuggingFaceLLM:InferenceClient initialized for model: meta-llama/Llama-3.3-70B-Instruct


In [43]:
final_answer = final_llm.get_answer(sys_prompt=Prompts.FINAL_ANS_SYS_PROMPT,user_prompt="User Question: \nwhat is a graph db and how is it different from a regular VectorDB ? \n\n Context: \n"+all_summaries,max_tokens = 400, temperature = 0.7)

INFO:src.generation.HuggingFaceLLM:get_answer called
INFO:src.generation.HuggingFaceLLM:API call successful: model=meta-llama/Llama-3.3-70B-Instruct messages=2
INFO:src.generation.HuggingFaceLLM:get_answer returning 599 characters


In [45]:
print(final_answer)

A graph database is an online, operational database management system that operates on a graph data model, storing data as nodes and relationships. It is different from a regular VectorDB in terms of data model, storage, and processing. Graph databases use a graph data model, native graph storage, and native graph processing, whereas VectorDBs are suited for dense, high-dimensional data and are often used in scenarios such as recommendation systems, image and speech recognition, and Natural Language Processing (NLP). Insufficient information is available to provide a more detailed comparison.
