In [1]:
!pip install sentence-transformers transformers faiss-cpu nltk --quiet


In [2]:
import re
import faiss
import nltk
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

# GPU setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


2026-02-17 12:23:23.712062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771331003.734615     309 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771331003.743659     309 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771331003.760840     309 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771331003.760871     309 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771331003.760873     309 computation_placer.cc:177] computation placer alr

Using device: cuda


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
with open("/kaggle/input/datasets/ffatty/plain-text-wikipedia-simpleenglish/AllCombined.txt",
          "r",
          encoding="utf-8") as f:
    raw_documents = f.read().split("\n\n")

print("Total documents:", len(raw_documents))


Total documents: 968274


In [4]:
editor_model_name = "google/flan-t5-small"  # 80M parameters

tokenizer = AutoTokenizer.from_pretrained(editor_model_name)

def chunk_text(text, min_tokens=300, max_tokens=600, stride=400):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []

    for i in range(0, len(tokens), stride):
        chunk_tokens = tokens[i:i+max_tokens]
        if len(chunk_tokens) == 0:
            continue
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append(chunk_text)

    return chunks

chunks = []
for doc in raw_documents:
    chunks.extend(chunk_text(doc))

print("Total chunks:", len(chunks))


Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors


Total chunks: 934404


In [5]:
retriever = SentenceTransformer("all-MiniLM-L6-v2", device=device)

chunk_embeddings = retriever.encode(
    chunks,
    convert_to_numpy=True,
    show_progress_bar=True
)

dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

faiss.normalize_L2(chunk_embeddings)
index.add(chunk_embeddings)


Batches:   0%|          | 0/29201 [00:00<?, ?it/s]

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(editor_model_name)
model = model.to(device)
model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [7]:
def retrieve(query, k=3):
    k = min(k, 3)  # enforce k ≤ 3

    query_embedding = retriever.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, k)

    return [chunks[i] for i in indices[0]], scores[0]


In [8]:
def post_process(answer):
    # Remove parentheses
    answer = re.sub(r"\(.*?\)", "", answer)

    # Remove extra whitespace
    answer = re.sub(r"\s+", " ", answer).strip()

    # Sentence limit (max 3)
    sentences = sent_tokenize(answer)
    sentences = sentences[:3]

    # Merge very short sentences
    merged = []
    for s in sentences:
        if len(s.split()) < 4 and merged:
            merged[-1] += " " + s
        else:
            merged.append(s)

    final = " ".join(merged)

    return final.strip()


In [9]:
def answer_question(question):
    retrieved_chunks, scores = retrieve(question, k=3)

    # Fallback if similarity too low
    if max(scores) < 0.2:
        return "Not enough information in the Simple Wikipedia dataset."

    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
Answer the question using only the information below.
Use simple English.
Write 2 to 3 sentences only.
Do not copy text exactly. Rewrite it clearly.

Context:
{context}

Question:
{question}
"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=128,
            do_sample=False,
            temperature=0.0,
            top_p=1.0,
            num_beams=1
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return post_process(answer)


In [31]:
question = "What is the Eiffel Tower?"
print(answer_question(question))
question1= "Is India a good country?"
print(answer_question(question1))
question2= "Are we better than Pakistan?"
print(answer_question(question2))

It was built in the style of the Eiffel Tower. It cost £45000 , is 518 ft tall and weighs 2586 tons. It is mainly made from steel and cast iron.
India is a good country.
Yes, we are better than Pakistan.


In [None]:
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
