In [1]:
# Default packages
import os
from pathlib import Path
import sys

# Installed packages
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import StoppingCriteria, StoppingCriteriaList

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings

# Custome modules
sys.path.append(Path("../src").resolve().as_posix())
import settings as s
from indexeing import get_db

In [2]:
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
embedding_model = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    cache_folder=str(s.models_root_path),
)

embedding_model

HuggingFaceEmbeddings(model_name='thenlper/gte-large', cache_folder='/home/TCS_GenAI_Hackaton/models', model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [4]:
model_id = "google/gemma-3-4b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=s.models_root_path)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype="auto",
    cache_dir=s.models_root_path
)
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gemma3ForConditionalGeneration(
  (model): Gemma3Model(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(4096, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              )
              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwi

In [5]:
# class StopOnNewline(StoppingCriteria):
#     def __call__(self, input_ids, scores, **kwargs):
#         return input_ids[0][-1] == tokenizer.eos_token_id


# stopping_criteria = StoppingCriteriaList([StopOnNewline()])

# output = model.generate(
#     **input_tokens,
#     temperature=0.7,   # creativity control: 0 = deterministic, higher = more creative
#     top_p=0.9,         # nucleus sampling
#     do_sample=True,
#     stopping_criteria=stopping_criteria,
# )

# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(generated_text)

In [6]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50
)

llm_wrapper = HuggingFacePipeline(pipeline=pipe)
llm_wrapper

Device set to use cuda:0


HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7ff2014e07d0>, model_id='google/gemma-3-4b-it')

In [7]:
db = get_db(embedding_model)
db

File already embedded and indexed: GPT3_paper.pdf
File already embedded and indexed: SampathKovvaliResume.pdf
File already embedded and indexed: attention_is_all_you_need.pdf


<langchain_chroma.vectorstores.Chroma at 0x7ff2014e2b40>

In [16]:
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Answer the question using the context below.
Return only the final answer, do NOT include any context or explanations.

Context:
{context}

Question:
{question}

Answer:"""
)

# Create retrieval chain
qa = RetrievalQA.from_chain_type(
    llm=llm_wrapper,
    chain_type="stuff",  # or "map_reduce" for large docs
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": prompt}
)

# 3️⃣ Pass your query
query = "what is my aadhar number?"
result = qa.invoke(query)  # or qa.invoke(query)
print(result["result"])


Answer the question using the context below.
Return only the final answer, do NOT include any context or explanations.

Context:
Name Split Metric N Acc/F1/BLEU
Total
Count
Dirty
Acc/F1/BLEU
Dirty
Count
Clean
Acc/F1/BLEU
Clean
Count
Clean
Percentage
Relative
Difference
Clean vs All
Quac dev f1 13 44.3 7353 44.3 7315 54.1 38 1% 20%
SQuADv2 dev f1 13 69.8 11873 69.9 11136 68.4 737 6% -2%
DROP dev f1 13 36.5 9536 37.0 8898 29.5 638 7% -21%
Symbol Insertion dev acc 7 66.9 10000 66.8 8565 67.1 1435 14% 0%
CoQa dev f1 13 86.0 7983 85.3 5107 87.1 2876 36% 1%
ReCoRD dev acc 13 89.5 10000 90.3 6110 88.2 3890 39% -1%
Winograd test acc 9 88.6 273 90.2 164 86.2 109 40% -3%
BoolQ dev acc 13 76.0 3270 75.8 1955 76.3 1315 40% 0%
MultiRC dev acc 13 74.2 953 73.4 558 75.3 395 41% 1%
RACE-h test acc 13 46.8 3498 47.0 1580 46.7 1918 55% 0%
LAMBADA test acc 13 86.4 5153 86.9 2209 86.0 2944 57% 0%
LAMBADA (No Blanks) test acc 13 77.8 5153 78.5 2209 77.2 2944 57% -1%
WSC dev acc 13 76.9 104 73.8 42 79.0 62

In [17]:
print(result["result"].split("Answer:\n")[-1].strip())

1682374
