In [None]:
!pip install -q transformers datasets faiss-cpu
# Install the required Python libraries:
# transformers: for working with pre-trained language models like BERT, GPT, etc.
# datasets: for loading and processing benchmark datasets used in NLP.
# faiss-cpu: Facebook AI Similarity Search, used for fast vector similarity search (CPU version).

In [None]:
# Demo document: this is the only text that gets embedded and indexed, so it is
# the sole source of context the generator sees when answering a question.
sample_text = """Albert Einstein was a theoretical physicist who developed the theory of relativity,
one of the two pillars of modern physics (alongside quantum mechanics). His work is also
known for its influence on the philosophy of science. He is best known to the general public
for his mass-energy equivalence formula E = mc2."""

In [None]:
# Import AutoTokenizer and AutoModel from the Hugging Face transformers library.
# AutoTokenizer is used to convert text into tokens (numerical format the model can understand).
# AutoModel loads a pre-trained language model (e.g., BERT, RoBERTa, etc.) for feature extraction or embeddings.
from transformers import AutoTokenizer, AutoModel

# Import PyTorch (torch), a deep learning library.
# It's used here to handle tensor operations and model inference.
import torch  # tensor library; used below to run the model without tracking gradients

# Import NumPy, a library for numerical computing.
# It’s often used for working with arrays and for mathematical operations on data.
import numpy as np


In [None]:
# Checkpoint used for embeddings: a compact sentence-transformers model whose
# token outputs are pooled into one vector per input text.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# The tokenizer and the encoder must come from the same checkpoint so that the
# token IDs produced by one are the IDs the other was trained on.
# - tokenizer: text -> token IDs (+ attention mask)
# - model:     token IDs -> per-token hidden states
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

def get_embedding(text):
  """Return a mean-pooled sentence embedding for `text` as a NumPy array.

  Args:
    text: A single string, or a list of strings that is tokenized as one
      padded batch.

  Returns:
    A NumPy array. For a single input this is 1-D with one entry per hidden
    unit; for a batch of several inputs it is 2-D, one row per input.
  """
  # Tokenize into PyTorch tensors.
  # - truncation=True: trims text that exceeds the model's maximum length
  # - padding=True: pads shorter sequences in a batch to a uniform length
  tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

  # Inference only: no gradients are needed, so skip building the autograd graph.
  with torch.no_grad():
    output = model(**tokens)

  # output.last_hidden_state has shape [batch_size, seq_len, hidden_size].
  hidden_states = output.last_hidden_state

  # Average over real tokens only. A plain mean over dim=1 would also average in
  # the padding positions whenever a batch contains texts of different lengths,
  # skewing the embeddings of the shorter texts. For a single string the mask is
  # all ones, so this equals the plain mean.
  mask = tokens['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
  summed = (hidden_states * mask).sum(dim=1)
  token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
  pooled = summed / token_counts

  # squeeze() drops the batch dimension for a single input; numpy() converts the
  # tensor to a NumPy array for use with FAISS.
  return pooled.squeeze().numpy()


In [None]:
import faiss

# The sample document is short, so it is indexed as a single chunk; longer
# documents would be split into several chunks here.
chunks = [sample_text]

# Create one embedding vector per chunk.
embeddings = [get_embedding(chunk) for chunk in chunks]

# Create a flat (exact, brute-force) L2 index sized to the embedding dimension.
dim = len(embeddings[0])
index = faiss.IndexFlatL2(dim)

# FAISS expects a 2-D float32 matrix with one row per vector; make the dtype
# explicit instead of relying on whatever dtype the embeddings happen to have.
index.add(np.asarray(embeddings, dtype=np.float32))

In [None]:
from transformers import pipeline

# Text-to-text generator that turns "context + question" prompts into answers.
# flan-t5-small is chosen because it is small enough to run comfortably on Colab.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
)

def retreive_and_answer(query, top_k=1):
  """Answer `query` using the most similar indexed chunks as context.

  Args:
    query: The question text.
    top_k: Maximum number of chunks to retrieve from the index.

  Returns:
    The answer text generated by `qa_pipeline`.
  """
  # FAISS searches over a 2-D matrix of queries, so reshape to a single row.
  query_embedding = get_embedding(query).reshape(1, -1)
  _, indices = index.search(query_embedding, top_k)    # indices of the most relevant chunks

  # When top_k exceeds the number of indexed vectors, FAISS pads the result with
  # -1. Skip those slots: chunks[-1] would silently repeat the last chunk.
  retrieved_texts = [chunks[i] for i in indices[0] if i >= 0]

  context = " ".join(retrieved_texts)                  # combine the chunks into one text block
  prompt = f"Context: {context} \n\nQuestion: {query}\nAnswer:"

  # Use max_new_tokens rather than max_length: the pipeline already sets a
  # default max_new_tokens, which takes precedence over max_length (with a
  # warning), so max_length=100 was not actually limiting the output.
  # do_sample=False gives the same output for the same input every time.
  result = qa_pipeline(prompt, max_new_tokens=100, do_sample=False)
  return result[0]['generated_text']

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# Run one end-to-end query: retrieve context for the question, then generate an answer.
question = "What is Einstein famous for?"
answer = retreive_and_answer(question)

print(f"Q: {question}")
print(f"A: {answer}")

Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Q: What is Einstein famous for?
A: his mass-energy equivalence formula E = mc2
