In [4]:
from google.colab import files
uploaded = files.upload()

Saving sample.txt to sample (1).txt


In [5]:
import utils
import pickle, os
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np


In [6]:
text_path = list(uploaded.keys())[0]
text = utils.load_text(text_path)
chunks = utils.chunk_text(text, chunk_size=100)
print(f"Loaded and chunked {len(chunks)} chunks.")


Loaded and chunked 1 chunks.


In [7]:
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
embeddings = model.encode(chunks)
print(f"Generated {len(embeddings)} embeddings.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated 1 embeddings.


In [8]:
os.makedirs("/content/vector_store", exist_ok=True)
with open("/content/vector_store/chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)
with open("/content/vector_store/embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)
print("Embeddings saved to /content/vector_store/")


Embeddings saved to /content/vector_store/


In [9]:
with open("/content/vector_store/chunks.pkl", "rb") as f:
    chunks = pickle.load(f)
with open("/content/vector_store/embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)


In [10]:
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


In [11]:
def semantic_search(query, top_k=3):
    query_embedding = model.encode([query])
    sims = utils.cosine_similarity_manual(query_embedding, embeddings)[0]
    top_indices = sims.argsort()[::-1][:top_k]
    return [(chunks[i], sims[i]) for i in top_indices]


In [12]:
def answer_query(query):
    top_chunks = semantic_search(query)
    context = "\n".join([chunk for chunk, _ in top_chunks])
    result = qa_pipeline({"context": context, "question": query})
    return result["answer"]


In [15]:
query = input("Ask a question: ")
print("Answer:", answer_query(query))


Ask a question: What is machine learning?
Answer: a subset of AI that involves the use of algorithms and statistical models


In [19]:
# Evaluate Recall@5 and token-based F1-score (fully offline)

from tqdm import tqdm

# Sample eval_data based on sample.txt contents
eval_data = [
    {
        "query": "What is artificial intelligence?",
        "answer": "the simulation of human intelligence processes by machines"
    },
    {
        "query": "What is machine learning?",
        "answer": "a subset of AI that involves the use of algorithms and statistical models"
    },
    {
        "query": "What is NLP used for?",
        "answer": "to understand and respond to human language"
    }
]

# Token-level F1-score function
def compute_token_f1(pred, truth):
    pred_tokens = pred.lower().split()
    truth_tokens = truth.lower().split()

    common = set(pred_tokens) & set(truth_tokens)
    if not common:
        return 0.0

    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(truth_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1

retrieval_hits = []
predicted_answers = []
ground_truths = []

# Evaluate queries
for item in tqdm(eval_data):
    query = item["query"]
    ground_truth = item["answer"]

    # Run semantic search
    top_chunks = semantic_search(query, top_k=5)
    found = any(ground_truth.lower() in chunk.lower() for chunk, _ in top_chunks)
    retrieval_hits.append(int(found))

    # Run QA pipeline
    context = "\n".join([chunk for chunk, _ in top_chunks])
    result = qa_pipeline({"context": context, "question": query})
    pred = result["answer"]

    predicted_answers.append(pred)
    ground_truths.append(ground_truth)

# Final metrics
recall_at_5 = sum(retrieval_hits) / len(retrieval_hits)
f1_scores = [compute_token_f1(p, t) for p, t in zip(predicted_answers, ground_truths)]
avg_f1 = sum(f1_scores) / len(f1_scores)

print(f"\n Recall@5: {recall_at_5:.2f}")
print(f" F1-score (token overlap): {avg_f1:.2f}")


100%|██████████| 3/3 [00:02<00:00,  1.09it/s]


 Recall@5: 1.00
 F1-score (token overlap): 0.71



