In [1]:
# Generative AI: Simple Wikipedia Answerer Agent (RAG)
# With Confidence Scoring


In [2]:
import os
import re
import torch
import numpy as np
import kagglehub
from typing import List
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


2026-02-09 17:09:36.649053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770656976.890164      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770656976.959464      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770656977.553607      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770656977.553650      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770656977.553659      55 computation_placer.cc:177] computation placer alr

In [3]:
# Download dataset
path = kagglehub.dataset_download("ffatty/plain-text-wikipedia-simpleenglish")
print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/plain-text-wikipedia-simpleenglish


In [4]:
def load_wikipedia_texts(dataset_path: str) -> List[str]:
    texts = []
    for root, _, files in os.walk(dataset_path):
        for f in files:
            if f.endswith('.txt'):
                with open(os.path.join(root, f), 'r', encoding='utf-8') as file:
                    texts.append(file.read())
    return texts

raw_documents = load_wikipedia_texts(path)
print(f"Loaded {len(raw_documents)} documents")


Loaded 1 documents


In [5]:
def chunk_text(text, min_tokens=300, max_tokens=600):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i+max_tokens]
        if len(chunk) >= min_tokens:
            chunks.append(' '.join(chunk))
        i += max_tokens
    return chunks

chunks = []
for doc in raw_documents:
    chunks.extend(chunk_text(doc))

print(f"Total chunks: {len(chunks)}")


Total chunks: 49591


In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
chunk_embeddings = embedder.encode(chunks, batch_size=64, show_progress_bar=True, convert_to_numpy=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/775 [00:00<?, ?it/s]

In [7]:
def retrieve_chunks(query, k=3):
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    scores = cosine_similarity(query_embedding, chunk_embeddings)[0]
    top_indices = np.argsort(scores)[::-1][:k]
    return [chunks[i] for i in top_indices], scores[top_indices]


In [8]:
MODEL_NAME = 'google/flan-t5-small'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
def generate_answer(question, retrieved_chunks):
    context = ' '.join(retrieved_chunks)
    prompt = f"""
Answer the question using only the text below.
Use simple English.
Do not add new facts.

Text:
{context}

Question:
{question}

Answer:
"""
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True).to(device)
    outputs = model.generate(**inputs, max_new_tokens=120, do_sample=False, num_beams=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [10]:
def post_edit(text):
    text = re.sub(r"\([^)]*\)", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    sentences = re.split(r"(?<=[.!?])\s+", text)
    merged, buffer = [], ""
    for s in sentences:
        if len(s.split()) < 5:
            buffer += ' ' + s
        else:
            if buffer:
                merged.append(buffer.strip())
                buffer = ''
            merged.append(s)
    if buffer:
        merged.append(buffer.strip())
    return ' '.join(merged[:3])


In [11]:
def compute_confidence(scores):
    max_score = float(np.max(scores))
    confidence = max(0.0, min(1.0, max_score))
    return round(confidence, 2)


In [12]:
def answer_question(question):
    retrieved_chunks, scores = retrieve_chunks(question, k=3)
    confidence = compute_confidence(scores)

    if confidence < 0.25:
        return {"answer": "Not enough information in the Simple Wikipedia dataset.", "confidence": confidence}

    raw_answer = generate_answer(question, retrieved_chunks)
    final_answer = post_edit(raw_answer)

    sentence_count = len([s for s in final_answer.split('.') if s.strip()])
    if sentence_count < 2:
        return {"answer": "Not enough information in the Simple Wikipedia dataset.", "confidence": confidence}

    return {"answer": final_answer, "confidence": confidence}


In [13]:
# Example
result = answer_question('What is the Moon?')
print(result)


{'answer': 'Moon is a small body which moves around a larger body. The Moon is written with a capital letter, "Moon".', 'confidence': 0.53}
