### Imports

In [1]:
from src.ingestion.loader import DocumentLoader
from src.ingestion.chunker import DocumentChunker
from src.ingestion.HuggingFaceEmbedder import HuggingFaceEmbedder
from config.settings import settings

### Document Loading

In [2]:
loader = DocumentLoader()

In [3]:
files = loader.list_filenames("pdfs")
files

[METRICS] list_filenames: time=0.00s, count=3


['Graph_Databases_for_Beginners.pdf',
 'Project_4_Sankalp_Mane.pdf',
 'requirements.txt']

In [4]:
docs = loader.load_documents("pdfs",file_names=files)
# print(type(docs[0].page_content))

[METRICS] load_documents: time=2.34s, count=70


### Chunking

In [5]:
chunker = DocumentChunker(
    hf_embedding_model="sentence-transformers/all-mpnet-base-v2",
    chunk_size=300,
    chunk_overlap=80
)

In [6]:
chunks = chunker.chunk_documents(docs)
token_count = chunker.get_docs_token_count(chunks)

[METRICS] chunk_documents: time=0.27s, count=122
[METRICS] get_docs_token_count: time=0.10s, count=122


In [7]:
print(len(chunks))
print(token_count)

122
33989


### Embedding

In [8]:
embedder = HuggingFaceEmbedder("sentence-transformers/all-mpnet-base-v2")

In [9]:
v1  = embedder.embed_query(chunks[0].page_content)
print("dimension",len(v1))

[METRICS] embed_query: time=0.15s, count=32
dimension 768


In [10]:
embeddings = embedder.embed_documents(chunks[0].page_content)

[METRICS] embed_documents: time=0.65s, count=32


In [13]:
texts = []
for i in range(0,len(chunks)):
    text = chunks[i].page_content
    texts.append(text)

In [17]:
other = embedder.embed_documents(texts)

[METRICS] embed_documents: time=54.94s, count=33989


  other = embedder.embed_documents(texts)


In [19]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from uuid import uuid4
from langchain_core.documents import Document
from langchain_community.vectorstores.utils import DistanceStrategy

index = faiss.IndexFlatL2(len(embedder.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embedder,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    distance_strategy=DistanceStrategy.COSINE
)

[METRICS] embed_query: time=0.09s, count=4


In [20]:
uuids = [str(uuid4()) for _ in range(len(chunks))]
vector_store.add_documents(documents=chunks, ids=uuids)

[METRICS] embed_documents: time=53.23s, count=33989


['eb7c745e-9b17-4764-a3dd-431248a03326',
 '3a0b136a-30b6-4fbd-8176-62deded3da0f',
 'a335d43e-9383-46fa-b3d4-81c403073044',
 '3d1ede65-772a-4d05-a8aa-df680cad92ce',
 '63ad2d3f-ce78-46a1-abad-d2ddaac85d77',
 '7ffadda4-67cb-43a0-85f1-0b49afdde4fb',
 '166db3db-0025-4cfb-b702-9524e53bbfc5',
 '6787978b-f895-49b3-ad96-f202707460f9',
 '203744b3-3a4d-4e11-8809-d7cbb056aaba',
 '17a5c391-c217-4667-ae0f-518ccc9a31b9',
 '7c9e88af-8a74-4cf2-abaa-6d1e6f8bce27',
 'abc94eb6-f6bb-4bee-b544-0a36ca57d39f',
 'd39210b4-9aab-4ce4-be5f-985b6af4077e',
 '4a626a30-7018-4ea6-bf65-89cb2a4851f6',
 '22b71c96-8c05-41ac-8df5-1e250bcde28b',
 '75d2141c-6a6e-4ad5-a1d9-d01a62f91c28',
 '94a01db3-5111-47f1-98ac-9d1b74793127',
 '5e10a849-ad83-43b9-acf0-38f16d20e452',
 '87ac2ee3-2c8e-4b93-9c06-2f7945adb7d2',
 '9153ac61-e3b7-4c95-8ec4-82d8fccfd9ad',
 'bdc9224f-90b2-4a82-b5c7-92b6d83179ae',
 '2f04b832-296b-4c10-92fa-b3fe312dd34b',
 '65d90a78-634f-4af6-b475-ddca0c4248e7',
 '45dfbd85-5f43-4a2d-bca5-98235d962a1a',
 '1a0e5a60-6e92-

In [21]:
vector_store.save_local(folder_path=settings.FAISS_INDEXES, index_name="pdfs")

In [None]:
# vector_store.load_local(folder_path=settings.FAISS_INDEXES, index_name="pdfs", embeddings=embedder, allow_dangerous_deserialization=True)