In [1]:
# ==== Cell 1: Unzip the dataset and list PDFs ====
import os, zipfile, glob

zip_path = "/content/fd6LsbgvTRy5XtEgpHTD_RAG Project Dataset-20241128T043631Z-001 (1).zip"
extract_dir = "/content/rag_dataset"

# Create the target folder if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract the zip
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

# Collect all PDFs
pdf_files = sorted(glob.glob(os.path.join(extract_dir, "**/*.pdf"), recursive=True))

print(f"✅ Extracted {len(pdf_files)} PDF file(s):")
for p in pdf_files:
    print(" •", p)


✅ Extracted 3 PDF file(s):
 • /content/rag_dataset/RAG Project Dataset/1706.03762v7.pdf
 • /content/rag_dataset/RAG Project Dataset/2005.11401v4.pdf
 • /content/rag_dataset/RAG Project Dataset/2005.14165v4.pdf


In [2]:
# ==== Cell 2: Install Required Libraries ====
!pip install langchain langchain-community langchain-google-genai
!pip install pypdf faiss-cpu tiktoken
!pip install -q chromadb
!pip install -U sentence-transformers


Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.5-py3-none-any.whl.metadata (5.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.

Collecting pypdf
  Downloading pypdf-5.6.0-py3-none-any.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading pypdf-5.6.0-py3-none-any.whl (304 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.2/304.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, faiss-cpu
Successfully installed faiss-cpu-1.11.0 pypdf-5.6.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
!pip install pypdf



In [7]:
# ==== Cell 3 (Updated): Extract text using pypdf ====
from pypdf import PdfReader

all_texts = []

for pdf_path in pdf_files:
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        text = page.extract_text()
        if text:
            full_text += text + "\n"
    all_texts.append({"file": os.path.basename(pdf_path), "content": full_text})

print(f"✅ Extracted text from {len(all_texts)} PDFs.")
print("Sample preview from first PDF:\n")
print(all_texts[0]["content"][:1000])  # preview first 1000 characters


✅ Extracted text from 3 PDFs.
Sample preview from first PDF:

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗ ‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechani

In [8]:
# ==== Cell 4: Chunk text and generate embeddings ====
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Chunk size and overlap settings
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

chunked_data = []

def chunk_text(text, file_name):
    words = text.split()
    for i in range(0, len(words), CHUNK_SIZE - CHUNK_OVERLAP):
        chunk = " ".join(words[i:i + CHUNK_SIZE])
        if chunk:
            chunked_data.append({
                "file": file_name,
                "content": chunk
            })

# Create chunks for all texts
for doc in all_texts:
    chunk_text(doc["content"], doc["file"])

print(f"✅ Created {len(chunked_data)} chunks.")

# Generate embeddings
embeddings = embed_model.encode([chunk["content"] for chunk in chunked_data])

print("✅ Embeddings generated.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Created 137 chunks.
✅ Embeddings generated.


In [9]:
# ==== Cell 5: Store embeddings in FAISS ====
import faiss

# Convert embeddings to float32 for FAISS
embedding_matrix = np.array(embeddings).astype("float32")

# Create FAISS index
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Keep track of metadata for each vector
metadata = chunked_data

print(f"✅ Stored {index.ntotal} vectors in FAISS index.")


✅ Stored 137 vectors in FAISS index.


In [12]:
# ==== Cell 6: RAG Question Answering with Gemini and LangChain ====
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage
import os

# Set Gemini API key
os.environ["GOOGLE_API_KEY"] = "AIzaSyCHBCIFkAzj69OfvpVmofjK5ZRafjYiCxs"

# Initialize Gemini chat model
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.3)

# Function to get top-k relevant chunks from FAISS
def retrieve_context(query, k=4):
    query_embedding = embed_model.encode([query]).astype("float32")
    distances, indices = index.search(query_embedding, k)
    results = []
    for i in indices[0]:
        results.append(metadata[i])
    return results

# Function to answer question using retrieved context
def answer_question(query):
    context_chunks = retrieve_context(query)
    context_text = "\n\n".join(
        [f"From {chunk['file']}:\n{chunk['content']}" for chunk in context_chunks]
    )

    prompt = f"""You are an AI assistant. Use the following context from research papers to answer the question.

Context:
{context_text}

Question: {query}

Answer:"""

    response = llm([HumanMessage(content=prompt)])
    return response.content, context_chunks

# Try it out!
user_question = "attention is all u..."
response, sources = answer_question(user_question)

print("📘 Answer:")
print(response)

print("\n📎 Sources:")
for src in sources:
    print(f" - {src['file']} | Snippet: {src['content'][:100]}...")


📘 Answer:
The title of the paper is "Attention is All You Need."

📎 Sources:
 - 2005.11401v4.pdf | Snippet: Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all ...
 - 1706.03762v7.pdf | Snippet: 300K 4.33 26.4 213 development set, newstest2013. We used beam search as described in the previous s...
 - 1706.03762v7.pdf | Snippet: [24] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention- base...
 - 1706.03762v7.pdf | Snippet: of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-...
