### Imports

In [1]:
from src.ingestion.loader import DocumentLoader
from src.ingestion.chunker import DocumentChunker
from src.ingestion.HuggingFaceEmbedder import HuggingFaceEmbedder
from config.settings import settings

### Document Loading

In [2]:
# Single loader instance reused by every document-loading cell below.
loader = DocumentLoader()

In [3]:
# List the file names found under "pdfs". Note that the recorded output
# includes 'requirements.txt', so the listing is not restricted to PDFs.
files = loader.list_filenames("pdfs")
files

['Graph_Databases_for_Beginners.pdf',
 'Project_4_Sankalp_Mane.pdf',
 'requirements.txt']

In [4]:
# Load the whole "pdfs" directory in one call.
# NOTE(review): `doc` is not referenced by any later cell shown here; the
# chunking step uses `docs` from load_documents instead.
doc = loader.load_directory("pdfs")

[METRICS] load_directory: time=5.67s, count=70


In [5]:
# Load only the files named in `files` from "pdfs"; `docs` feeds the chunker.
# NOTE(review): `files` contains 'requirements.txt' (see the listing output),
# so a non-PDF is passed here too - confirm the loader handles or skips it.
docs = loader.load_documents("pdfs",file_names=files)
# print(type(docs[0].page_content))

### Chunking

In [10]:
# Configure the chunker with the tokenizer of the same model that is used for
# embedding further down ("sentence-transformers/all-mpnet-base-v2").
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# tokenizer tokens - confirm against DocumentChunker.
chunker = DocumentChunker(
    hf_tokenizer_name="sentence-transformers/all-mpnet-base-v2",
    chunk_size=300,
    chunk_overlap=80
)
# chunk_documents returns a pair: the chunks and a token count.
chunks, token_count = chunker.chunk_documents(docs)

[METRICS] chunk_documents: time=0.74s, count=122


In [11]:
# Re-run of the chunking call from the previous cell with the same input;
# the recorded metrics report the same count, only the timing differs.
chunks, token_count = chunker.chunk_documents(docs)

[METRICS] chunk_documents: time=0.43s, count=122


In [12]:
# Quick sanity check: number of chunks, then the token count returned
# alongside them by chunk_documents.
print(len(chunks))
print(token_count)

122
33989


### Embedding

In [None]:
# Same model name as the chunker's tokenizer, so chunk sizing and embedding
# refer to the same model.
embedder = HuggingFaceEmbedder("sentence-transformers/all-mpnet-base-v2")

In [None]:
# Smoke test: embed the text of the first chunk.
# Note the message is printed before the first embedding call is made.
print("initialized embedder")
# embed_query returns a pair, unpacked here as (vector, dimension).
v1, dim = embedder.embed_query(chunks[0].page_content)
print("dimension",dim)

In [55]:
from typing import List

In [73]:
from huggingface_hub import HfApi
from huggingface_hub import login
# Authenticate against the Hugging Face Hub. The token is read from project
# settings and only unwrapped here via get_secret_value(), so it never appears
# literally in the notebook.
login(token=settings.HF_TOKEN.get_secret_value())
# Module-level client used by the list_models helper defined below.
hf_api = HfApi()

def list_models(
    task: str,
    filter: str,
    sort: str = "likes",
    gated: bool = False,
    get_top: int = 10,
) -> list:
    """Return the ids of Hub models matching the given criteria.

    The arguments are forwarded to ``hf_api.list_models`` and the
    ``modelId`` of every returned entry is collected.

    Args:
        task: Forwarded as the ``task`` argument of the Hub query.
        filter: Forwarded as the ``filter`` argument. The name shadows the
            builtin, but it is part of the signature callers already use.
        sort: Forwarded as ``sort``; defaults to ``"likes"``.
        gated: Forwarded as ``gated``; defaults to ``False``.
        get_top: Forwarded as ``limit``; defaults to ``10``.

    Returns:
        A list holding the ``modelId`` of each model the query yields.
    """
    query = dict(
        sort=sort,
        task=task,
        filter=filter,
        gated=gated,
        limit=get_top,
    )
    model_ids = []
    for info in hf_api.list_models(**query):
        model_ids.append(info.modelId)
    return model_ids

In [74]:
# Query with the helper's defaults (sort="likes", get_top=10, gated=False):
# sentence-similarity models filtered by "feature-extraction".
list_models(task="sentence-similarity",filter="feature-extraction")

['sentence-transformers/all-MiniLM-L6-v2',
 'BAAI/bge-m3',
 'sentence-transformers/all-mpnet-base-v2',
 'intfloat/multilingual-e5-large',
 'jinaai/jina-embeddings-v3',
 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
 'GanymedeNil/text2vec-large-chinese',
 'jinaai/jina-embeddings-v2-base-en',
 'nomic-ai/nomic-embed-text-v1.5',
 'hkunlp/instructor-xl']

In [79]:
from src.utils.ModelLister import HuggingFaceModelLister

In [80]:
# Project helper class for listing Hub models.
lister = HuggingFaceModelLister()

# Same query as the notebook-local list_models call; the recorded output
# lists the same ten model ids.
lister.list_models(task="sentence-similarity",filter="feature-extraction")

[METRICS] list_models: time=0.51s, count=10


['sentence-transformers/all-MiniLM-L6-v2',
 'BAAI/bge-m3',
 'sentence-transformers/all-mpnet-base-v2',
 'intfloat/multilingual-e5-large',
 'jinaai/jina-embeddings-v3',
 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
 'GanymedeNil/text2vec-large-chinese',
 'jinaai/jina-embeddings-v2-base-en',
 'nomic-ai/nomic-embed-text-v1.5',
 'hkunlp/instructor-xl']