In [1]:
# Default packages
import os
from pathlib import Path
import sys

# Installed packages
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import StoppingCriteria, StoppingCriteriaList

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings

# Custom modules
sys.path.append(Path("../src").resolve().as_posix())
import settings as s
from indexeing import get_db

In [2]:
# Load variables from a local .env file into the process environment, then
# authenticate this session with the Hugging Face Hub using HF_TOKEN.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# Build the embedding model; its files are cached in the directory configured
# by settings.models_root_path (converted to str for the cache_folder argument).
embedding_cache_dir = str(s.models_root_path)
embedding_model = HuggingFaceEmbeddings(
    cache_folder=embedding_cache_dir,
    model_name="thenlper/gte-large",
)

# Trailing expression so the notebook displays the resolved configuration.
embedding_model

HuggingFaceEmbeddings(model_name='thenlper/gte-large', cache_folder='C:\\Users\\sampath\\Dev\\TCS_GenAI_Hackaton\\models', model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [4]:
model_id = "google/gemma-3-4b-it"

# Tokenizer and model are loaded from the same checkpoint id and share one
# cache directory.
hf_cache_dir = s.models_root_path
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=hf_cache_dir)

# "auto" leaves device placement and dtype selection to transformers.
model_load_options = {
    "device_map": "auto",
    "dtype": "auto",
    "cache_dir": hf_cache_dir,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

# Trailing expression so the notebook displays the loaded architecture.
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


Gemma3ForConditionalGeneration(
  (model): Gemma3Model(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(4096, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              )
              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwi

In [7]:
class StopOnNewline(StoppingCriteria):
    """Stopping criterion that fires when a sequence ends with the EOS token.

    Note: despite the class name, the check is against
    ``tokenizer.eos_token_id`` (the notebook-level tokenizer), not a newline
    token. The name is kept so existing references keep working.
    """

    def __call__(self, input_ids, scores, **kwargs):
        """Report, per sequence, whether the most recent token is EOS.

        Args:
            input_ids: Token ids generated so far, indexed as
                ``[batch, position]``.
            scores: Prediction scores supplied by ``generate`` (unused).
            **kwargs: Extra arguments supplied by ``generate`` (unused).

        Returns:
            A boolean tensor with one entry per sequence in the batch.
        """
        # Compare the last token of every row rather than only row 0, so a
        # batched call is evaluated per sequence instead of by the first one.
        return input_ids[:, -1] == tokenizer.eos_token_id


stopping_criteria = StoppingCriteriaList([StopOnNewline()])

# Build the model inputs inside this cell so it runs on a fresh kernel:
# `input_tokens` was not defined anywhere else in the notebook. The prompt is
# the one echoed at the start of the recorded output of this cell.
demo_prompt = "Explain why mangoes and apples are fruits."
input_tokens = tokenizer(demo_prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **input_tokens,
    temperature=0.7,   # creativity control: lower = more deterministic, higher = more creative
    top_p=0.9,         # nucleus sampling
    do_sample=True,
    stopping_criteria=stopping_criteria,
)

# output[0] holds the prompt tokens followed by the generated continuation,
# so the decoded text starts with the prompt itself.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Explain why mangoes and apples are fruits.

Mangoes and apples are fruits because they develop from the flower of a plant and contain seeds.


In [5]:
# Sampling configuration handed to the text-generation pipeline.
generation_options = {
    "max_new_tokens": 128,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
}
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

# Wrap the transformers pipeline so it can be passed to LangChain as an LLM.
llm_wrapper = HuggingFacePipeline(pipeline=pipe)

# Trailing expression so the notebook displays the wrapper.
llm_wrapper

Device set to use cpu


HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x0000026DF20F0E30>, model_id='google/gemma-3-4b-it')

In [6]:
# Obtain the vector store from the project's indexing helper, passing in the
# embedding model created earlier in the notebook.
# NOTE(review): the recorded output suggests get_db returns a Chroma store and
# skips files that are already indexed; confirm against the helper's source.
db = get_db(embedding_model)
# Trailing expression so the notebook displays the returned object.
db

  db = Chroma(


File already embedded and indexed: GPT3_paper.pdf
File already embedded and indexed: SampathKovvaliResume.pdf


<langchain_community.vectorstores.chroma.Chroma at 0x26df20f25d0>

In [15]:
# Prompt used by the QA chain: it exposes two placeholders, {context} and
# {question}, and ends with "Answer:" so the model continues from there.
qa_template = """
Answer the question using the context below.
Return only the final answer, do NOT include any context or explanations.

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate(
    template=qa_template,
    input_variables=["context", "question"],
)

# Assemble the retrieval QA chain from the wrapped LLM, the vector-store
# retriever and the custom prompt defined in this cell.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm_wrapper,
    retriever=retriever,
    chain_type="stuff",  # or "map_reduce" for large docs
    chain_type_kwargs={"prompt": prompt},
)

# Run the chain on a sample question and print the text stored under the
# "result" key of the returned mapping.
query = "Explain the details of model training?"
result = qa.invoke(query)
answer_text = result["result"]
print(answer_text)

KeyboardInterrupt: 

In [11]:
# Keep only the text after the last "Answer:" marker. The prompt template ends
# with "Answer:" and no trailing newline, so splitting on "Answer:\n" would
# leave the whole prompt in place whenever the model's continuation does not
# begin with a newline; strip() removes any leading newline or spaces instead.
answer_marker = "Answer:"
print(result["result"].split(answer_marker)[-1].strip())

Resume
