### Imports

In [27]:
from typing import List, Tuple, Dict

In [1]:
from src.ingestion.loader import DocumentLoader
from src.ingestion.chunker import DocumentChunker
from src.ingestion.HuggingFaceEmbedder import HuggingFaceEmbedder
from config.settings import settings
from src.ingestion.VectorStoreManager import VectorStoreManager

### Document Loading

In [2]:
loader = DocumentLoader()

In [7]:
folder_name = "index"

In [8]:
files = loader.list_filenames(folder_name)
files

[METRICS] list_filenames: time=0.00s, count=1


['Sankalp_Resume.pdf']

In [9]:
docs = loader.load_documents(subdir=folder_name,file_names=files)
# print(type(docs[0].page_content))

[METRICS] load_documents: time=0.28s, count=1


### Chunking

In [10]:
chunker = DocumentChunker(
    hf_embedding_model="sentence-transformers/all-mpnet-base-v2",
    chunk_size=300,
    chunk_overlap=80
)

In [11]:
chunks = chunker.chunk_documents(docs)
token_count = chunker.get_docs_token_count(chunks)

[METRICS] chunk_documents: time=0.01s, count=3
[METRICS] get_docs_token_count: time=0.00s, count=3


In [12]:
print(len(chunks))
print(token_count)

3
939


### Embedding

In [13]:
embedder = HuggingFaceEmbedder("sentence-transformers/all-mpnet-base-v2")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


In [14]:
v1  = embedder.embed_query(chunks[0].page_content)
print("dimension",len(v1))

[METRICS] embed_query: time=3.14s, count=386
dimension 768


### Vector Store Management

In [15]:
vsm = VectorStoreManager(embedding_function=embedder,index_name=folder_name)

In [16]:
vsm.create_index()

INFO:src.ingestion.VectorStoreManager:Created in‐memory FAISS index 'index' (dim=768)


[METRICS] embed_query: time=0.20s, count=4


In [17]:
vsm.add_documents(chunks)

INFO:src.ingestion.VectorStoreManager:Added 3 docs into 'index'


[METRICS] embed_documents: time=3.00s, count=939
[METRICS] add_documents: time=3.02s, count=3


['347d5864-12a0-4a9d-a086-177a355646c2',
 'e9edf48f-a286-49a0-9c40-8cd06730c1a9',
 '317beb41-f504-40e0-868f-1afef9a8ae38']

In [None]:
# vsm.save_local()

INFO:src.ingestion.VectorStoreManager:Saved index 'pdfs' to /home/ashmit/work/SEM_VIII/EnhancedRAG/context/faiss_indexes


In [18]:
retrieved = vsm.similarity_search_with_score(query="experience at dolf", k=2)

[METRICS] embed_query: time=0.23s, count=6
[METRICS] similarity_search_with_score: time=0.26s, count=2


In [30]:
retriever = vsm.retriever(search_type = "similarity", search_kwargs = {"k":2})

INFO:src.ingestion.VectorStoreManager:Created retriever for 'index' with {'search_type': 'similarity', 'search_kwargs': {'k': 2}}


In [32]:
# Query spelling corrected ("snakalp" -> "sankalp") so it matches the name that
# appears in the indexed resume text.
retrieved = retriever.invoke("projects by sankalp")

[METRICS] embed_query: time=0.46s, count=8


In [None]:
# Collect the raw text of each retrieved chunk.
# `retrieved` is the result of `retriever.invoke(...)` from the preceding cell;
# the previously used name `retrieved_chunk_docs` is not assigned in any visible
# cell, so the cell failed on a fresh kernel.
texts = [doc.page_content for doc in retrieved]
texts

['sankalp mane mlh hackhound ’ 23 winner, passionate about data science, ml and genai. c manesm2403 @ gmail. com | ħ + 91 91728 67443 | a github. com / sankadash | ] linkedin. com / in / snklpm skills programming languages python, shell, sql, html, css, r tools and technologies numpy, pandas, matplotlib, seaborn, plotly, scikit - learn, transformers, tensorflow, py - torch, keras, openai api, langchain, chromadb, mongodb, flask, linux, git, docker, databricks, microsoft autogen, smolagents experience centific october 2024 - present associate ai engineer intern rag, ai services, databricks, hugging face, openai, langchain • engineered an ai - driven rag fusion system that integrates multi - llm providers ( openai, databricks, hugging face ) via langchain and azure databricks, enabling dynamic query transformation and precise document summarization. • developed and deployed modular ai services including merge fusion, weighted fusion, and ood testing along with pipelines for advanced know

In [None]:
def get_summary(prompt: str) -> str:
    """
    Fetch the chunks most similar to ``prompt`` and return their text.

    Args:
        prompt: Free-text query that is forwarded to the vector store search.

    Returns:
        The page contents of the retrieved chunks joined by blank lines.

    NOTE(review): the original cell was an unfinished stub (stray ``]`` in the
    signature, search called without a query, no return value). Actual
    summarisation is still TODO; this version only returns the retrieved
    context so the function is callable.
    """
    # k=2 mirrors the other retrieval cells in this notebook.
    retrieved = vsm.similarity_search_with_score(query=prompt, k=2)
    # assumes each hit is a (document, score) pair — TODO confirm against
    # VectorStoreManager.similarity_search_with_score
    return "\n\n".join(doc.page_content for doc, _score in retrieved)

1.608781337738037

In [19]:
from huggingface_hub import InferenceClient
from config.settings import settings

In [23]:
from src.utils.ModelLister import HuggingFaceModelLister

In [24]:
# Build the project's model lister and run one filtered listing through it.
lister = HuggingFaceModelLister()

# NOTE(review): "tex" does not look like a complete task name (possibly a
# truncated entry) — confirm the intended second task before relying on the
# result of this call.
models = lister.list_models(
    task=["text-generation", "tex"],
    filter="inference-endpoints",
    inference="warm",
)

[METRICS] list_models: time=0.42s, count=0


In [None]:
# Obtain the API handle from the lister and query it directly.
api = lister.huggingface_api()

models = api.list_models(
    sort="likes",
    inference="warm",
    task="text-generation",
    filter=["text-generation-inference", "HF-Inference-API"],
    gated=False,
    limit=5,
)

# Materialise the returned iterable so the results can be displayed.
listed = list(models)
listed



[]

In [None]:
hf_llms=[
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
]

In [53]:
client = InferenceClient(
    provider="hf-inference",
    api_key=settings.HF_TOKEN.get_secret_value(),
)

In [45]:
# Single chat completion with a system instruction and one user question.
# The role is spelled lowercase ("system") to match the `get_answer` helper in
# this notebook; the capitalised "System" used before was inconsistent with it.
completion = client.chat.completions.create(
    model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
    messages=[
        {"role": "system", "content": "Answer the question given by the user as concisely and accurately as possible"},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    max_tokens=200,
)
# Text of the first (and only requested) choice.
response = completion.choices[0].message.content

In [56]:
from huggingface_hub import InferenceClient
from config.settings import settings

client = InferenceClient(
    provider="hf-inference",
    api_key=settings.HF_TOKEN.get_secret_value(),
)

def get_answer(
    sys_prompt: str,
    query: str,
    model: str = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
) -> str:
    """
    Run one chat completion through the notebook-level ``client``.

    Args:
        sys_prompt: Text sent as the "system" message.
        query: Text sent as the "user" message.
        model: Model identifier passed to the completion call.

    Returns:
        The ``content`` of the first choice's message.
    """
    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content


In [57]:
get_answer(sys_prompt="you are a helpful assistant who answers the users query concisely", query="what are the top houses in game of thrones")

'A concise question about Westeros! Here are the **Top Houses in Game of Thrones**, in no particular order, highlighting their **Sigil**, **Motto**, and **Notable Members**:\n\n1. **House Stark**\n\t* **Sigil**: Direwolf\n\t* **Motto**: "Winter is Coming"\n\t* **Notable Members**: Eddard (Ned), Robb, Sansa, Arya, Bran, Jon Snow\n\n2. **House Lannister**\n\t* **Sigil**: Lion\n\t* **Motto**: "Hear Me Roar!"\n\t* **Notable Members**: Cersei, Jaime, Tyrion, Tywin, Kevan\n\n3. **House Targaryen**\n\t* **Sigil**: Dragon\n\t* **Motto**: "Fire and Blood"\n\t* **Notable Members**: Daenerys, Viserys, Rhaegar, Aerys II (Mad King), Jon Snow (Aegon Targaryen)\n\n4. **House Baratheon**\n\t* **Sigil**: Stag\n\t* **Motto**: "Ours is the Fury"\n\t* **Notable Members**: Robert, Stannis, Renly, Joffrey, Myrcella, Tommen\n\n5. **House Tyrell**\n\t* **Sigil**: Rose\n\t* **Motto**: "Growing Strong"\n\t* **Notable Members**: Mace, Loras, Margaery, Olenna (Queen of Thorns)\n\n6. **House Greyjoy**\n\t* **Sigil

In [34]:
import pandas as pd
import time


In [40]:
# Time one chat completion per candidate model and tabulate reply + latency.
# Rows are collected in a plain list and turned into a DataFrame once, instead
# of growing the frame row by row inside the loop.
timing_records = []

# `hf_llms` is the model list defined earlier in this notebook; the previously
# used name `hf_models` is not assigned in any visible cell.
for model in hf_llms:
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # skewed by system clock adjustments.
    start = time.perf_counter()
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        max_tokens=200,
    )
    response = completion.choices[0].message.content

    time_taken = time.perf_counter() - start

    timing_records.append({"response": response, "time_taken": time_taken})

    print(f"{model} → {time_taken:.3f}s: {response}")

# Explicit `columns` keeps the frame's schema even when `hf_llms` is empty.
inference_x_time = pd.DataFrame(timing_records, columns=["response", "time_taken"])


meta-llama/Llama-3.1-8B-Instruct → 0.399s: The capital of France is Paris.
meta-llama/Llama-3.3-70B-Instruct → 0.791s: The capital of France is Paris.
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B → 0.368s: Okay, so I need to figure out what the capital of France is. Hmm, I remember from school that Paris is the capital, but maybe I should double-check to make sure. Let me think about other possible cities in France. There's Lyon, Marseille, and Nice, but those don't sound like capitals. Wait, could it be somewhere else? Maybe Lille or Toulouse? No, those are major cities but I don't think they're capitals. France has a long history, so maybe the capital has changed before. I think historically, Paris has always been the capital, but I'm not entirely sure. I should probably look up some historical context. Oh, and the Eiffel Tower is in Paris, which is a symbol of France, so that reinforces that Paris is the capital. Also, when I think of French government buildings and embassies, they're u

In [41]:
display(inference_x_time)

Unnamed: 0,response,time_taken
0,The capital of France is Paris.,0.399337
1,The capital of France is Paris.,0.790705
2,"Okay, so I need to figure out what the capital...",0.367647
3,The capital of France is Paris. It's one of th...,0.44322
4,The capital of France is **Paris**.,0.35356


In [44]:
inference_x_time["response"][3]

"The capital of France is Paris. It's one of the most famous cities in the world, known for its rich history, culture, art, and architecture. The French government is headquartered at the Hôtel des Invalides, the Louvre Palace, and most notably, the Eiffel Tower, located in the heart of the city. However, the official seat of the French President is at the Élysée Palace in Paris."

In [28]:
completion

ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='The capital of France is **Paris**.', tool_call_id=None, tool_calls=None), logprobs=None)], created=1745731031, id='', model='nvidia/Llama-3.1-Nemotron-70B-Instruct-HF', system_fingerprint='3.2.1-sha-4d28897', usage=ChatCompletionOutputUsage(completion_tokens=10, prompt_tokens=22, total_tokens=32), object='chat.completion')

In [32]:
completion.choices[0].message.content

'The capital of France is **Paris**.'

In [37]:
completion.usage.total_tokens

32

In [3]:
from src.generation.HuggingFaceLLM import HuggingFaceLLM

In [4]:
pg_llm = HuggingFaceLLM(model_name="meta-llama/Llama-3.1-8B-Instruct")

In [None]:
pg_llm.get_answer(sys_prompt="you are a helpful assistant that answers concisely", user_prompt="what is quantum computing ?", max_tokens = 200)

'Quantum computing is a revolutionary computing paradigm that uses the principles of quantum mechanics to process information. Unlike classical computers, which use bits (0s and 1s) to store and process data, quantum computers use quantum bits or qubits.\n\nQubits can exist in multiple states simultaneously, allowing quantum computers to:\n\n1. Process massive amounts of data in parallel\n2. Solve complex problems exponentially faster than classical computers\n\nThis leads to potential breakthroughs in:\n\n1. cryptography\n2. machine learning\n3. materials science\n4. optimization problems\n\nHowever, quantum computing is still in its early stages, and significant technical challenges need to be overcome before it becomes widely available.'

In [1]:
from src.generation.PromptAugmentor import PromptAugmentor

In [5]:
augmentor = PromptAugmentor(client=pg_llm)

In [6]:
prompts = augmentor.generate(query="what did he say to that person who is the mother of my child", synthetic_count=2)
prompts

['What did he say to the mother of my child?',
 'What did he say to the mother of my child in that conversation.',
 'what did he say to that person who is the mother of my child']