In [11]:
# Run this cell first
!pip install -q sentence-transformers faiss-cpu transformers accelerate datasets


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cpu


In [13]:
# Embedding model (small & good general-purpose)
embed_model_name = "all-mpnet-base-v2"
embed_model = SentenceTransformer(embed_model_name, device=device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Simple chunking by sentences / tokens
import re
from typing import List

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """
    Chunk text by characters (approx tokens). Simple, robust.
    chunk_size and overlap are character counts - adjust for your data.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    chunks = []
    start = 0
    L = len(text)
    while start < L:
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk.strip())
        start = end - overlap
        if start < 0:
            start = 0
    return chunks


In [15]:
# Example small corpus
documents = [
    {"id": "doc1", "text": "Shubham Tiwari was a student at Central Institute of Tool Design in 2025. Shaik Shoeb was a student at Central Institute of Tool Design in 2025. They created a unique RAG system in 2025, all by themselves without the help of ChatGPT or Youtube. Shubham Tiwari was 21 years old and Shaik Shoeb was only 22 at the time."},
    {"id": "doc2", "text": "Python is a widely used high-level programming language for general-purpose programming. It emphasizes code readability with significant whitespace."},
    {"id": "doc3", "text": "FAISS is a library for efficient similarity search and clustering of dense vectors. It was developed by Facebook AI Research."},
    {"id": "doc4", "text": "Vamshi Sir is a faculty of Artificial Intelligence at Central Institute of Tool Design also known as CITD. He was a mastermind in the field of AI research and RAG systems. Vamshi Sir has made over 200 contributions in the field of AI including 200 patents and 5000 research paper publications."},
]

# For larger docs, chunk them:
corpus_chunks = []
for doc in documents:
    chunks = chunk_text(doc["text"], chunk_size=250, overlap=50)
    for i, c in enumerate(chunks):
        corpus_chunks.append({
            "id": f"{doc['id']}_chunk{i}",
            "text": c,
            "source": doc["id"]
        })

len(corpus_chunks), corpus_chunks[:2]


(6,
 [{'id': 'doc1_chunk0',
   'text': 'Shubham Tiwari was a student at Central Institute of Tool Design in 2025. Shaik Shoeb was a student at Central Institute of Tool Design in 2025. They created a unique RAG system in 2025, all by themselves without the help of ChatGPT or Youtube. Shubh',
   'source': 'doc1'},
  {'id': 'doc1_chunk1',
   'text': 'lves without the help of ChatGPT or Youtube. Shubham Tiwari was 21 years old and Shaik Shoeb was only 22 at the time.',
   'source': 'doc1'}])

In [16]:
# Create embeddings for each chunk
texts = [c["text"] for c in corpus_chunks]
ids = [c["id"] for c in corpus_chunks]

# Compute embeddings (numpy)
embeddings = embed_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Normalize embeddings (cosine similarity via inner product)
faiss.normalize_L2(embeddings)

d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)  # inner product -> cosine if vectors normalized
index.add(embeddings)
print("Index size:", index.ntotal)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Index size: 6


In [17]:
def retrieve(query: str, k: int = 3):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)   # D: scores, I: indices
    results = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1:
            continue
        results.append({
            "id": ids[idx],
            "text": texts[idx],
            "score": float(score)
        })
    return results

# quick test
print(retrieve("Who walked on the moon?", k=2))


[{'id': 'doc1_chunk1', 'text': 'lves without the help of ChatGPT or Youtube. Shubham Tiwari was 21 years old and Shaik Shoeb was only 22 at the time.', 'score': 0.1555672287940979}, {'id': 'doc1_chunk0', 'text': 'Shubham Tiwari was a student at Central Institute of Tool Design in 2025. Shaik Shoeb was a student at Central Institute of Tool Design in 2025. They created a unique RAG system in 2025, all by themselves without the help of ChatGPT or Youtube. Shubh', 'score': 0.10698273777961731}]


In [18]:
# Choose a seq2seq instruction model that runs on Colab resources.
# flan-t5-base is a good balance for Colab; can swap to large if you have more RAM/GPU.
gen_model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name).to(device)

def generate_answer(query: str, retrieved: list, max_new_tokens: int = 128, temperature: float = 0.0):
    # Build context prompt by concatenating top retrieved passages
    context = "\n\n".join([f"Source [{r['id']}]: {r['text']}" for r in retrieved])
    prompt = (
        "You are a helpful assistant. Use the following retrieved documents to answer the question.\n\n"
        f"{context}\n\nQuestion: {query}\nAnswer:"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        gen = model.generate(**inputs,
                             max_new_tokens=max_new_tokens,
                             do_sample=(temperature>0),
                             temperature=temperature,
                             top_p=0.95,
                             num_beams=4)
    answer = tokenizer.decode(gen[0], skip_special_tokens=True)
    return answer.strip()

# quick generate test
q = "What is FAISS?"
retr = retrieve(q, k=2)
print("Retrieved:", retr)
print("Answer:", generate_answer(q, retr))


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Retrieved: [{'id': 'doc3_chunk0', 'text': 'FAISS is a library for efficient similarity search and clustering of dense vectors. It was developed by Facebook AI Research.', 'score': 0.35827139019966125}, {'id': 'doc4_chunk0', 'text': 'Vamshi Sir is a faculty of Artificial Intelligence at Central Institute of Tool Design also known as CITD. He was a mastermind in the field of AI research and RAG systems. Vamshi Sir has made over 200 contributions in the field of AI including 200 pa', 'score': 0.24279829859733582}]
Answer: a library for efficient similarity search and clustering of dense vectors


In [19]:
def ask(query, k=3):
    retrieved = retrieve(query, k=k)
    answer = generate_answer(query, retrieved)
    return {"query": query, "retrieved": retrieved, "answer": answer}

# Example usage:
while True:
  prompt = input("Question:...")
  res = ask(prompt, k=3)
  print("Query:", res["query"])
  print("Answer:", res["answer"])
# print("\nRetrieved chunks:")
# for r in res["retrieved"]:
    # print("-", r["id"], f"(score={r['score']:.3f})", ":", r["text"])


KeyboardInterrupt: Interrupted by user