# RAG Components (Store → Retriever → Generator)


In [1]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [2]:
from dataclasses import dataclass
from typing import List, Dict, Any
import numpy as np


from sentence_transformers import SentenceTransformer


2026-02-16 09:33:05.576285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771234385.764334      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771234385.821167      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771234386.272308      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771234386.272354      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771234386.272357      55 computation_placer.cc:177] computation placer alr

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen3-8B"


In [4]:
# Load the tokenizer that belongs to `model_id`; the Generator uses it for the
# chat template, for encoding the prompt, and for decoding the generated ids.
tokenizer = AutoTokenizer.from_pretrained(model_id)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [5]:
# Load the causal LM with 4-bit bitsandbytes quantization.
# The quantization settings are passed through `quantization_config`: the bare
# `load_in_4bit=` keyword is deprecated and scheduled for removal in transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [18]:
# Inspect the loaded architecture; Linear4bit projection layers in the printout
# show that the 4-bit quantization was applied.
print(model)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 4096)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (down_proj): Linear4bit(in_features=12288, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((4096,), eps=1e-06

In [6]:
@dataclass
class Doc:
    text: str
    meta: Dict[str, Any] | None = None

class Store:
    """Stores docs + their sentence embeddings computed at add() time."""
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.docs: List[Doc] = []
        self.E = None  # (N, d) numpy array of normalized embeddings

    def add(self, text: str, meta: Dict[str, Any] | None = None):
        e = self.model.encode([text], normalize_embeddings=True)[0]  # (d,)
        self.docs.append(Doc(text=text, meta=meta))
        self.E = e[None, :] if self.E is None else np.vstack([self.E, e])

    def embed_query(self, query: str):
        return self.model.encode([query], normalize_embeddings=True)[0]  # (d,)

In [7]:

class Retriever:
    """Read-only top-k search over an existing Store. Does NOT build/update the index."""

    def __init__(self, store: "Store"):
        # Forward-reference annotation: the class can be defined without Store
        # being in scope; only the object's E / docs / embed_query are used.
        self.store = store

    def retrieve(self, query: str, k: int = 3) -> list:
        """Return up to ``k`` ``(score, doc)`` pairs, highest score first.

        Args:
            query: Text to search for; embedded via ``store.embed_query``.
            k: Maximum number of hits. ``k <= 0`` yields an empty list (a
                negative value would otherwise slice from the end and silently
                return all-but-the-last results).

        Returns:
            A list of ``(score, doc)`` tuples. The score is the dot product of
            the query and document embeddings, which is the cosine similarity
            because Store encodes both with ``normalize_embeddings=True``.
            Empty when the store has no documents.
        """
        if self.store.E is None or k <= 0:
            return []
        q = self.store.embed_query(query)  # query vector (no index update)
        scores = (self.store.E @ q.T).ravel()
        # Stable sort keeps insertion order among equal scores, so results are deterministic.
        top_idx = np.argsort(-scores, kind="stable")[:k]
        return [(scores[i], self.store.docs[i]) for i in top_idx]



In [8]:
class Generator:
    """Answers a query with the LLM, injecting the retrieved docs as the only context.

    By default the notebook-level ``model`` and ``tokenizer`` globals are used;
    pass ``model=`` / ``tokenizer=`` to inject different objects.
    """

    def __init__(self, model=None, tokenizer=None):
        # None means "look up the notebook global of the same name at generate() time".
        self._model = model
        self._tokenizer = tokenizer

    @staticmethod
    def _format_context(retrieved) -> str:
        """Render each doc as one '- <text> (source: <source>)' bullet line."""
        lines = []
        for doc in retrieved:
            meta = getattr(doc, "meta", None) or {}
            source = meta.get("source")
            suffix = f" (source: {source})" if source else ""
            # Use the document text itself, not the dataclass repr.
            lines.append(f"- {doc.text}{suffix}")
        return "\n".join(lines)

    def generate(self, query, retrieved: List["Doc"], max_new_tokens=256):
        """Generate an answer to ``query`` grounded in ``retrieved``.

        Args:
            query: The user question.
            retrieved: Docs whose text (and ``meta["source"]``, when present)
                is placed in the prompt as context.
            max_new_tokens: Upper bound on generated tokens, forwarded to
                ``model.generate``.

        Returns:
            The decoded completion only (prompt tokens are stripped), with
            special tokens removed.
        """
        llm = self._model if self._model is not None else model
        tok = self._tokenizer if self._tokenizer is not None else tokenizer

        # Built from explicit pieces so no source-code indentation leaks into the prompt.
        prompt = (
            "You are a helpful assistant. Use ONLY the context.\n\n"
            f"Context:\n{self._format_context(retrieved)}\n\n"
            f"Question: {query}"
        )

        messages = [{"role": "user", "content": prompt}]
        text = tok.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # Switches between thinking and non-thinking modes. Default is True.
        )
        model_inputs = tok([text], return_tensors="pt").to(llm.device)

        generated_ids = llm.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
        )
        # generate() returns prompt + completion; keep only the newly generated ids.
        prompt_len = len(model_inputs.input_ids[0])
        output_ids = generated_ids[0][prompt_len:].tolist()
        return tok.decode(output_ids, skip_special_tokens=True)

In [9]:
# --- 1) Add documents to the Store (representations are saved at add time) ---

store = Store()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Seed the store with four short docs about RAG itself. Each add() embeds the
# text immediately and appends one row to store.E; meta["source"] is the label
# printed next to each hit in the retrieval cells.
store.add("RAG retrieves relevant documents at query time and uses them to generate grounded answers.", 
          meta={"source": "intro"})

store.add("A vector store holds document embeddings so similarity search can retrieve the most relevant chunks.", 
          meta={"source": "store"})

store.add("A retriever selects top-k relevant chunks for a query using lexical, dense, or hybrid search.", 
          meta={"source": "retriever"})

store.add("The generator (LLM) reads the retrieved context and produces an answer, ideally with citations.", 
          meta={"source": "generator"})

In [11]:
# Sanity check: the number of docs should equal the number of rows in the embedding matrix.
len(store.docs), store.E.shape 

(4, (4, 384))

In [12]:

# --- 2) Retrieve the top-k stored docs for a query ---
retriever = Retriever(store)

# None of the docs stored so far cover this question, so low similarity scores are expected here.
query = "Is pre-approval needed for a ₹18k course, and when do I have to submit the claim?"
hits = retriever.retrieve(query, k=3)

# Each hit is a (score, Doc) pair; docs added without meta print source=None.
for score, doc in hits:
    print(f"score={score:.3f} | source={doc.meta.get('source') if doc.meta else None}\n  {doc.text}\n")


score=0.034 | source=generator
  The generator (LLM) reads the retrieved context and produces an answer, ideally with citations.

score=0.014 | source=intro
  RAG retrieves relevant documents at query time and uses them to generate grounded answers.

score=-0.063 | source=retriever
  A retriever selects top-k relevant chunks for a query using lexical, dense, or hybrid search.



In [13]:
# --- 3) Generate by injecting retrieved docs into context ---

gen = Generator()



In [14]:
# Pass only the Doc objects (scores dropped) as context. The store holds no
# policy docs yet, so the model is expected to say the context lacks the answer.
answer = gen.generate(query, [d for _, d in hits])
answer

'Based on the provided context, there is no information related to pre-approval requirements for a ₹18k course or the submission deadlines for claims. The context documents discuss topics related to retrieval and generation processes in a system, but they do not address financial or administrative procedures. Therefore, I cannot provide an answer to the question using the given context.'

In [15]:
# Add five internal policy/guideline docs to the same store. The first one
# (learning & certification policy) contains the pre-approval threshold and the
# claim-submission deadline that the query asks about; the rest are related
# documents the retriever has to rank against it.
store.add(
  "Learning & Certification Policy (Internal, v2026.01): "
  "Employees may claim reimbursement for job-relevant online courses and certification exam fees. "
  "Pre-approval is required if the total cost exceeds ₹10,000 (course + exam). "
  "Reimbursement caps: ₹30,000 per employee per financial year. "
  "Required proof: paid invoice/receipt, course completion certificate (or exam result), and approval email if applicable. "
  "Claims must be submitted within 14 calendar days of completion.",
  meta={"source": "policy/learning-certification"}
)

store.add(
  "Finance Claims SOP (Internal): "
  "Submit reimbursements in the Expense Portal under category 'Learning & Development'. "
  "Mandatory fields: cost center, project code (if applicable), vendor name, invoice date, and currency. "
  "Attach receipts as a single PDF. Approval flow: Manager → Finance. "
  "Common rejection reasons: missing receipt, missing completion proof, incorrect category, or missing cost center. "
  "Typical processing time: 4–6 business days after final approval.",
  meta={"source": "sop/finance-claims"}
)

store.add(
  "Procurement Guidelines (Internal): "
  "For software subscriptions or licenses, request via Procurement Form before purchase if the vendor requires a contract. "
  "Include: license count, duration, cost, business justification, and vendor quote. "
  "Approvals: Budget Owner → Procurement. "
  "For renewals, submit request at least 10 business days before expiry.",
  meta={"source": "guidelines/procurement"}
)

store.add(
  "IT Access & Tool Request Guide (Internal): "
  "Access requests are handled through the IT Service Desk. "
  "Provide: tool name, purpose, team, duration, and required access level (viewer/editor/admin). "
  "Turnaround: 1–3 business days for standard tools; longer if license purchase is needed. "
  "Admins are granted only with manager approval and justification.",
  meta={"source": "guide/it-access-tooling"}
)

store.add(
  "Information Handling Basics (Internal): "
  "Share files using approved company storage with access controls (no public links). "
  "Avoid storing secrets in documents; use the secrets manager. "
  "If sharing data externally (vendor/partner), ensure a data-sharing agreement is in place and share only the minimum necessary. "
  "Remove direct identifiers when not required for the task.",
  meta={"source": "policy/info-handling"}
)

In [16]:
# Re-run the same query: the retriever reads store.E at call time, so the newly
# added policy docs are searchable without rebuilding anything.
hits = retriever.retrieve(query, k=3)

for score, doc in hits:
    print(f"score={score:.3f} | source={doc.meta.get('source') if doc.meta else None}\n  {doc.text}\n")

score=0.634 | source=policy/learning-certification
  Learning & Certification Policy (Internal, v2026.01): Employees may claim reimbursement for job-relevant online courses and certification exam fees. Pre-approval is required if the total cost exceeds ₹10,000 (course + exam). Reimbursement caps: ₹30,000 per employee per financial year. Required proof: paid invoice/receipt, course completion certificate (or exam result), and approval email if applicable. Claims must be submitted within 14 calendar days of completion.

score=0.566 | source=sop/finance-claims
  Finance Claims SOP (Internal): Submit reimbursements in the Expense Portal under category 'Learning & Development'. Mandatory fields: cost center, project code (if applicable), vendor name, invoice date, and currency. Attach receipts as a single PDF. Approval flow: Manager → Finance. Common rejection reasons: missing receipt, missing completion proof, incorrect category, or missing cost center. Typical processing time: 4–6 busines

In [17]:
# Same question, now with the policy docs among the hits, so the answer can be
# grounded in the retrieved context.
answer = gen.generate(query, [d for _, d in hits])
answer

'Yes, pre-approval is needed for a ₹18,000 course, as the total cost exceeds ₹10,000. You must submit the claim within 14 calendar days of completing the course.'