In [1]:
!pip install sentence-transformers faiss-cpu numpy


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [2]:
# Toy corpus for the retrieval demo: three short passages.
# Each entry is a triple-quoted string spanning two source lines, so it carries
# an embedded newline followed by the second line's leading indentation.
documents = [
    """Machine learning enables systems to learn from data.
    It is widely used in healthcare, finance, and automation.""",

    """Robotics integrates mechanical engineering, electronics,
    and artificial intelligence to build intelligent machines.""",

    """Retrieval Augmented Generation (RAG) improves LLM responses
    by grounding answers in external knowledge bases."""
]


In [3]:
def clean_text(text):
    """Lower-case ``text`` and normalize all of its whitespace.

    Every run of whitespace (spaces, tabs, ``\\n``, ``\\r\\n``, ...) is collapsed
    to a single space and leading/trailing whitespace is removed. Replacing
    only ``"\\n"`` would leave the indentation of continued lines behind as a
    run of spaces in the middle of the text.

    Args:
        text: The raw string to normalize.

    Returns:
        The lower-cased string with words separated by exactly one space.
    """
    # str.split() with no separator splits on any whitespace run and discards
    # empty leading/trailing fields, so join() yields the normalized text.
    return " ".join(text.lower().split())

# Normalize every raw document, keeping the original order.
cleaned_docs = list(map(clean_text, documents))


In [4]:
def chunk_text(text, chunk_size=40, overlap=10):
    """Split ``text`` into overlapping word-based chunks.

    The text is split on whitespace and a window of ``chunk_size`` words is
    slid forward by ``chunk_size - overlap`` words at a time. Windowing stops
    as soon as a chunk reaches the end of the text, so no trailing chunk is
    emitted that would be entirely contained in the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would give a zero or negative stride (range() error
    # or silently no chunks); a negative overlap would skip words between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already covers the last word; any further window would
        # only repeat the tail of this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks

# Flatten the per-document chunk lists into one corpus-wide list.
chunks = []
for doc in cleaned_docs:
    chunks += chunk_text(doc)

print("Total chunks created:", len(chunks))


Total chunks created: 3


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model used for both the corpus and the queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk; np.array guarantees a NumPy array for the index cell.
embeddings = np.array(model.encode(chunks))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
import faiss

# Width of one embedding vector (second axis of the embeddings array).
dimension = embeddings.shape[1]

# Flat L2 index over all chunk embeddings.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"FAISS index size: {index.ntotal}")


FAISS index size: 3


In [7]:
def semantic_search(query, top_k=2):
    """Return the chunks whose embeddings are nearest to ``query``.

    The query is embedded with the notebook-level ``model`` and looked up in
    the notebook-level FAISS ``index``; the returned positions are mapped back
    to the notebook-level ``chunks`` list.

    Args:
        query: The natural-language query string.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings in the order returned by the
        index search. Empty if ``top_k`` is not positive or the index is empty.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, and chunks[-1] would silently return the last
    # chunk as if it were a real hit.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # FAISS expects float32 input vectors.
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)

    _, indices = index.search(query_embedding, k)

    # Defensive filter in case the index still reports missing neighbours.
    return [chunks[i] for i in indices[0] if i >= 0]


In [8]:
# Retrieval-only smoke test: show the chunks returned for a sample question.
query = "How does RAG help large language models?"
results = semantic_search(query)

for i, res in enumerate(results, start=1):
    print("Result {}: {}".format(i, res))


Result 1: retrieval augmented generation (rag) improves llm responses by grounding answers in external knowledge bases.
Result 2: machine learning enables systems to learn from data. it is widely used in healthcare, finance, and automation.


In [9]:
!pip install transformers torch




In [10]:
from transformers import pipeline

# Text-to-text generation pipeline backed by FLAN-T5 base.
# max_length=256 is forwarded to the pipeline; presumably it caps the length of
# the generated output — confirm against the transformers version in use.
llm = pipeline("text2text-generation", model="google/flan-t5-base", max_length=256)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [11]:
def build_prompt(query, context_chunks):
    """Assemble the instruction prompt handed to the generator.

    Args:
        query: The user's question; interpolated under the ``Question:`` header.
        context_chunks: Iterable of strings; joined with newlines and placed
            under the ``Context:`` header.

    Returns:
        The full prompt string. It starts and ends with a newline and closes
        with an ``Answer:`` header for the model to complete.
    """
    context = "\n".join(context_chunks)

    # Adjacent literals are concatenated by the parser into a single string.
    return (
        "\n"
        "Use the context below to answer the question.\n"
        "If the answer is not in the context, say \"I don't know\".\n"
        "\n"
        f"Context:\n{context}\n"
        "\n"
        f"Question:\n{query}\n"
        "\n"
        "Answer:\n"
    )


In [12]:
def rag_generate_answer(query, top_k=2):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves chunks with ``semantic_search``, wraps them and the question in
    a prompt via ``build_prompt``, and runs the notebook-level ``llm``
    pipeline on that prompt.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve as context. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    retrieved_chunks = semantic_search(query, top_k=top_k)
    prompt = build_prompt(query, retrieved_chunks)

    response = llm(prompt)
    # The pipeline result is indexed as a sequence of dicts; only the first
    # entry is used.
    return response[0]["generated_text"]


In [13]:
# End-to-end smoke test: retrieve context and generate an answer.
query = "How does RAG help large language models?"
answer = rag_generate_answer(query)

print("Final Answer:", answer, sep="\n")


Final Answer:
improves llm responses by grounding answers in external knowledge bases
