In [None]:
!pip install datasets sentence-transformers faiss-cpu transformers

**Load Dataset**

In [None]:
from datasets import load_dataset

# Load only the first 500 rows of the AG News training split to keep the
# demo small.
dataset = load_dataset(path="ag_news", split="train[:500]")

# Only the "text" field of each record is kept for the rest of the pipeline.
documents = [record["text"] for record in dataset]

**Chunking**

In [None]:
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into overlapping, fixed-size character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a window reaches the end of the text, which
    avoids emitting a trailing chunk that is merely a suffix of the chunk
    before it.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of ``text`` in order of appearance. Empty input
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero step; a larger overlap would
        # give a negative step and silently produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This window already covers the tail; any further window would
            # only repeat characters that are fully contained in this one.
            break
    return chunks

# Flatten the chunks of every document into one list. Positions in this list
# are what the retrieval step later uses to map search results back to text.
all_chunks = [piece for doc_text in documents for piece in chunk_text(doc_text)]

**Create Embeddings**

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# A single embedding model is used for the chunks here and for the queries
# at retrieval time, so both are compared in the same vector space.
embed_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Encode every chunk in one call; the row order of the result is expected to
# match the order of all_chunks.
embeddings = embed_model.encode(all_chunks)

**Store in FAISS**

In [None]:
import faiss

# Width of a single embedding vector (second axis of the embedding matrix).
dimension = embeddings.shape[1]

# Flat L2 index sized to the embedding width.
index = faiss.IndexFlatL2(dimension)

# Add all chunk vectors at once; the ids returned by a search are used later
# as positions into all_chunks.
chunk_vectors = np.array(embeddings)
index.add(chunk_vectors)

**Retrieval Function**

In [None]:
def retrieve(query, top_k=3):
    """Return the text chunks whose embeddings are closest to ``query``.

    The query is embedded with ``embed_model`` and searched against the FAISS
    ``index``; the resulting ids are used as positions into ``all_chunks``.

    Args:
        query: The search string.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings in the order FAISS returned
        them. The list is empty when ``top_k`` is not positive or the index
        holds no vectors.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with the label -1, and all_chunks[-1] would silently return the
    # last chunk as if it were a real hit.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embed_model.encode([query])
    _, indices = index.search(query_embedding, k)

    # Keep the -1 filter as a safety net in case the index still pads results.
    return [all_chunks[i] for i in indices[0] if i >= 0]

**Load LLM**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint name is shared by the tokenizer and the seq2seq model so the
# two always come from the same pretrained release.
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

**RAG Answer Function**

In [None]:
def _build_rag_prompt(context, query):
    """Fill the instruction template with the retrieved context and the question."""
    return f"""
Answer the question in 3-4 clear sentences based on the context.

Context:
{context}

Question:
{query}
"""


def rag_answer(query):
    """Answer ``query`` with the seq2seq model, grounded in retrieved chunks.

    Chunks returned by ``retrieve`` are joined with spaces into a context
    block, placed in an instruction prompt ahead of the question, and passed
    to ``model.generate``.

    The question sits at the end of the prompt, so plain tokenizer truncation
    would cut it off whenever the context is too long. To prevent that,
    retrieved chunks are dropped from the end of the list (the later, and
    presumably less relevant, search results) until the prompt fits within
    ``tokenizer.model_max_length``.

    Args:
        query: The user's question.

    Returns:
        The decoded model output as a string, with special tokens removed.
    """
    retrieved_docs = list(retrieve(query))
    prompt = _build_rag_prompt(" ".join(retrieved_docs), query)

    # Shrink the context chunk by chunk while the prompt is over the limit.
    # Counting tokens without truncation may log a length warning; that is
    # expected and harmless here.
    max_input_tokens = tokenizer.model_max_length
    while retrieved_docs and len(tokenizer(prompt)["input_ids"]) > max_input_tokens:
        retrieved_docs.pop()
        prompt = _build_rag_prompt(" ".join(retrieved_docs), query)

    # truncation=True stays as a last resort for the case where the question
    # alone exceeds the limit.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    outputs = model.generate(**inputs, max_new_tokens=100)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

**Test Queries**

In [None]:
# NOTE(review): the index is built from AG News articles, so the retrieved
# context may be only loosely related to this question - confirm the corpus
# matches the intended queries.
ml_answer = rag_answer("What is machine learning?")
print(ml_answer)


In [None]:
# Second smoke test of the full retrieve-then-generate path.
nn_answer = rag_answer("Explain neural networks.")
print(nn_answer)


In [None]:
# Third smoke test, using a broader, open-ended question.
ai_challenges_answer = rag_answer("What are research challenges in AI?")
print(ai_challenges_answer)