In [None]:
import os

# Folder that holds the course materials (PDFs) used by the rest of the notebook.
base_dir = "/content/materials"

# Create the folder; exist_ok=True makes this a no-op when it already exists,
# so the cell can be re-run safely.
os.makedirs(base_dir, exist_ok=True)

print("Folder created at:", base_dir)
# os.listdir returns the names of the entries currently inside base_dir.
print("Current files:", os.listdir(base_dir))

In [None]:
!pip install pypdf



In [None]:
#Ingestion part
import os
from pypdf import PdfReader

base_dir = "/content/materials"

# One record per PDF found in base_dir:
#   {"id": running index, "filename": file name, "text": whitespace-normalized text}
documents = []
for filename in os.listdir(base_dir):
  if not filename.lower().endswith(".pdf"):
    continue  # ignore anything that is not a PDF

  print("Processing:", filename)

  file_path = os.path.join(base_dir, filename)
  reader = PdfReader(file_path)

  # Gather the text page by page; pages for which extract_text() returns
  # nothing are skipped.
  page_texts = []
  for page in reader.pages:
    page_text = page.extract_text()
    if page_text:
      page_texts.append(page_text)

  # Collapse every run of whitespace (newlines included) into a single space.
  clean_text = " ".join(" ".join(page_texts).split())
  print("Length of extracted text", len(clean_text))

  documents.append({
      "id": len(documents),  # evaluated before the append, so ids start at 0
      "filename": filename,
      "text": clean_text,
  })

print("\nTotal document loaded:", len(documents))

print("Total documents loaded:", len(documents))
if documents:
  # Quick look at the first record to confirm the extraction worked.
  print("First document keys:", documents[0].keys())
  print("First filename:", documents[0]["filename"])
  print("First 500 characters of text:\n")
  print(documents[0]["text"][:500])
else:
  # Without this guard, documents[0] raises IndexError when the folder holds no PDFs.
  print("No PDF files found in", base_dir)

Processing: 1810.04805v2.pdf
Length of extracted text 64068
Processing: 1706.03762v7.pdf
Length of extracted text 39598
Processing: Optimizers-1.pdf
Length of extracted text 7849
Processing: Batch_Normalization-1.pdf
Length of extracted text 6343
Processing: RAG (1).pdf
Length of extracted text 35562
Processing: CNN (2).pdf
Length of extracted text 31000
Processing: Back-Propagation-1.pdf
Length of extracted text 5689
Processing: RNN,LSTM,GRU.pdf
Length of extracted text 27912
Processing: Week-7_Agents_AgenticAI_RAGs_Agentic_RAG.pdf
Length of extracted text 55251
Processing: Week-1 AA-5750 Contemporary Issues in Analytics (1).pdf
Length of extracted text 3782
Processing: Activation_Functions-1.pdf
Length of extracted text 5343
Processing: Dropouts-1.pdf
Length of extracted text 5727
Processing: ResNets.pdf
Length of extracted text 18087
Processing: SoftvsHardMargin-1.pdf
Length of extracted text 5599

Total document loaded: 14
Total documents loaded: 14
First document keys: dict_keys([

In [None]:
# Chunking by words with a fixed chunk size
def chunk_text_by_words(text, chunk_size=200):
  """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

  Words are obtained with ``str.split()`` (any run of whitespace separates
  words) and each chunk is re-joined with single spaces. The last chunk may
  contain fewer than ``chunk_size`` words.

  Args:
    text: The string to split.
    chunk_size: Maximum number of words per chunk; must be positive.

  Returns:
    A list of chunk strings, in order. Empty or whitespace-only text yields
    an empty list.

  Raises:
    ValueError: If ``chunk_size`` is zero or negative. A negative step would
      otherwise make ``range`` empty and silently drop all of the text.
  """
  if chunk_size <= 0:
    raise ValueError("chunk_size must be a positive integer")

  words = text.split()
  return [
      " ".join(words[start:start + chunk_size])
      for start in range(0, len(words), chunk_size)
  ]


In [None]:
# Fixed-size chunking: every document contributes consecutive 200-word chunks,
# each tagged with the document it came from and its position in that document.
chunks_fixed = []
for doc in documents:
  chunks_fixed.extend(
      {
          "doc_id": doc["id"],
          "chunk_index": position,
          "filename": doc["filename"],
          "text": piece,
      }
      for position, piece in enumerate(chunk_text_by_words(doc["text"], chunk_size=200))
  )

# Summary plus a peek at the very first chunk.
print("Total fixed-size chunks:", len(chunks_fixed))
print("Example chunk keys:", chunks_fixed[0].keys())
print("From file:", chunks_fixed[0]["filename"])
print("Chunk text (first 300 chars):")
print(chunks_fixed[0]["text"][:300])

Total fixed-size chunks: 270
Example chunk keys: dict_keys(['doc_id', 'chunk_index', 'filename', 'text'])
From file: 1810.04805v2.pdf
Chunk text (first 300 chars):
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova Google AI Language {jacobdevlin,mingweichang,kentonl,kristout}@google.com Abstract We introduce a new language representa- tion model called BERT, which stands f


In [None]:
# Sentence-based chunking
import re

def chunk_text_by_sentences(text, sentences_per_chunk=3):
    """Group the sentences of ``text`` into chunks of ``sentences_per_chunk``.

    The text is split wherever whitespace follows ``.``, ``!`` or ``?``.
    Each piece is stripped and empty pieces are ignored. Sentences are then
    collected in order; a chunk is emitted (joined with single spaces) as
    soon as it holds ``sentences_per_chunk`` sentences. Any leftover
    sentences form a final, shorter chunk.

    Args:
        text: The string to split.
        sentences_per_chunk: Number of sentences that completes a chunk.

    Returns:
        A list of chunk strings, in order.
    """
    stripped_pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))

    grouped = []
    pending = []
    for sentence in filter(None, stripped_pieces):  # drop empty pieces
        pending.append(sentence)
        if len(pending) < sentences_per_chunk:
            continue  # chunk not complete yet
        grouped.append(" ".join(pending))
        pending = []

    # Flush the trailing, incomplete chunk (if any).
    if pending:
        grouped.append(" ".join(pending))

    return grouped


In [None]:
# Sentence-based chunking: every document contributes chunks of 3 sentences,
# each tagged with the document it came from and its position in that document.
chunks_sentence = []
for doc in documents:
    chunks_sentence.extend(
        {
            "doc_id": doc["id"],
            "chunk_index": position,
            "filename": doc["filename"],
            "text": piece,
        }
        for position, piece in enumerate(
            chunk_text_by_sentences(doc["text"], sentences_per_chunk=3)
        )
    )

# Summary plus a peek at the very first chunk.
print("Total sentence-based chunks:", len(chunks_sentence))
print("Example sentence-chunk keys:", chunks_sentence[0].keys())
print("From file:", chunks_sentence[0]["filename"])
print("Chunk text (first 300 chars):")
print(chunks_sentence[0]["text"][:300])


Total sentence-based chunks: 3700
Example sentence-chunk keys: dict_keys(['doc_id', 'chunk_index', 'filename', 'text'])
From file: 1810.04805v2.pdf
Chunk text (first 300 chars):
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova Google AI Language {jacobdevlin,mingweichang,kentonl,kristout}@google.com Abstract We introduce a new language representa- tion model called BERT, which stands f


In [None]:
!pip install sentence-transformers




In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
# Open-source embedding model, loaded by name via sentence-transformers.
model_open = SentenceTransformer("all-MiniLM-L6-v2")

# Plain list of chunk texts, in the same order as chunks_fixed.
texts_fixed = [chunk["text"] for chunk in chunks_fixed]

# Encode all fixed-size chunks in batches of 32 and store the result as a NumPy array.
embeddings_open_fixed = np.array(
    model_open.encode(texts_fixed, batch_size=32, show_progress_bar=True)
)

print("Embeddings shape:", embeddings_open_fixed.shape)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Embeddings shape: (270, 384)


In [None]:
def cosine_similarity(a, b):
    """Cosine similarity between one vector and every row of a matrix.

    Args:
        a: 1-D array (the query vector).
        b: 2-D array whose rows are the vectors to compare against; norms are
            taken along axis 1.

    Returns:
        1-D array with one similarity per row of ``b``.
    """
    # 1e-8 is added to each norm so that all-zero vectors do not divide by zero.
    query_unit = a / (np.linalg.norm(a) + 1e-8)
    row_norms = np.linalg.norm(b, axis=1, keepdims=True)
    rows_unit = b / (row_norms + 1e-8)

    # The dot product of unit vectors is their cosine similarity.
    return np.dot(rows_unit, query_unit)

In [None]:
def retrieve_top_k(query_text, model, chunk_texts, chunk_meta, embeddings_matrix, k=5):
    """Return the ``k`` chunks most similar to ``query_text``.

    Args:
        query_text: The question to search for.
        model: Embedding model; ``model.encode([...])`` is used to embed the query.
        chunk_texts: Chunk texts, indexed like the rows of ``embeddings_matrix``.
        chunk_meta: Chunk metadata, indexed like the rows of ``embeddings_matrix``.
        embeddings_matrix: One embedding per chunk (rows), compared to the query
            with ``cosine_similarity``.
        k: Number of results to return.

    Returns:
        A list of dicts with keys ``"score"`` (float), ``"text"`` and ``"meta"``,
        ordered from highest to lowest similarity.
    """
    # Embed the query with the same model that produced the chunk embeddings.
    query_vector = model.encode([query_text])[0]

    # Similarity of the query to every chunk.
    scores = cosine_similarity(query_vector, embeddings_matrix)

    # Negating the scores makes argsort rank them in descending order.
    ranked = np.argsort(-scores)[:k]

    return [
        {
            "score": float(scores[i]),
            "text": chunk_texts[i],
            "meta": chunk_meta[i],
        }
        for i in ranked
    ]


In [None]:
# Metadata for each fixed-size chunk (everything except the text), built in
# the same order as chunks_fixed.
meta_fixed = [
    {field: c[field] for field in ("doc_id", "chunk_index", "filename")}
    for c in chunks_fixed
]

# Sanity check: the three counts displayed below should be equal.
len(meta_fixed), len(texts_fixed), embeddings_open_fixed.shape[0]


(270, 270, 270)

In [None]:
query = "Explain what a dropout layer does in neural networks."

# Retrieve the 5 fixed-size chunks closest to the query (open-source embeddings).
results = retrieve_top_k(
    query_text=query,
    model=model_open,
    chunk_texts=texts_fixed,
    chunk_meta=meta_fixed,
    embeddings_matrix=embeddings_open_fixed,
    k=5,
)

print("Query:", query)
print("\nTop-5 retrieved chunks:\n")
for hit in results:
    source = hit["meta"]
    print("Score:", round(hit["score"], 3))
    print("From file:", source["filename"], "| chunk:", source["chunk_index"])
    # Only the first 400 characters of each chunk are shown.
    print(hit["text"][:400], "...")
    print("-" * 80)


Query: Explain what a dropout layer does in neural networks.

Top-5 retrieved chunks:

Score: 0.541
From file: Dropouts-1.pdf | chunk: 2
powerful regularization techniques in deep learning. Intuition (Analogy to Random Forests) •In a Random Forest, each decision tree is trained on a random subset of features — introducing randomness and improving generalization. •Dropout applies the same idea inside neural networks — by randomly “dropping” neurons during training. 3 Figure 3: During training, dropout randomly disables neurons to pr ...
--------------------------------------------------------------------------------
Score: 0.516
From file: 1706.03762v7.pdf | chunk: 27
Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538, 2017. [33] Nitish Srivastava, Geoffrey E Hinton, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdi- nov. Dropout: a simple way to pre

In [None]:
!pip install --upgrade openai


Collecting openai
  Downloading openai-2.11.0-py3-none-any.whl.metadata (29 kB)
Downloading openai-2.11.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 2.9.0
    Uninstalling openai-2.9.0:
      Successfully uninstalled openai-2.9.0
Successfully installed openai-2.11.0


In [None]:
from openai import OpenAI
import numpy as np  # make sure NumPy is imported

# The API key must never be written into the notebook. With no api_key argument,
# OpenAI() reads the key from the OPENAI_API_KEY environment variable, so set that
# variable (e.g. from your platform's secret store) before running this cell.
client = OpenAI()

def get_openai_embeddings(text_list, model="text-embedding-3-small", batch_size=100):
    """Embed ``text_list`` with the OpenAI embeddings endpoint, batch by batch.

    Uses the module-level ``client``.

    Args:
        text_list: List of strings to embed.
        model: Name of the OpenAI embedding model.
        batch_size: Number of texts sent per request.

    Returns:
        A NumPy array built from the embeddings collected across all batches,
        in the order the responses returned them.
    """
    vectors = []

    for offset in range(0, len(text_list), batch_size):
        reply = client.embeddings.create(
            model=model,
            input=text_list[offset:offset + batch_size],
        )
        # One item per input text in this batch.
        vectors.extend(item.embedding for item in reply.data)

    return np.array(vectors)


In [None]:
# Embed the fixed-size chunks with the OpenAI embedding model, 100 texts per request.
embeddings_openai_fixed = get_openai_embeddings(
    texts_fixed, model="text-embedding-3-small", batch_size=100
)

print("OpenAI embeddings shape:", embeddings_openai_fixed.shape)


OpenAI embeddings shape: (270, 1536)


In [None]:
# Texts of the sentence-based chunks, in chunk order.
texts_sentence = [entry["text"] for entry in chunks_sentence]

# Embed them with the open-source model and store the result as a NumPy array.
embeddings_open_sentence = np.array(
    model_open.encode(texts_sentence, batch_size=32, show_progress_bar=True)
)

# Metadata for each sentence chunk (everything except the text), in the same order.
meta_sentence = [
    {field: entry[field] for field in ("doc_id", "chunk_index", "filename")}
    for entry in chunks_sentence
]

# The three sizes printed below should agree.
print("Sentence-based embeddings shape:", embeddings_open_sentence.shape)
print("Number of meta_sentence entries:", len(meta_sentence))
print("Number of texts_sentence entries:", len(texts_sentence))


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

Sentence-based embeddings shape: (3700, 384)
Number of meta_sentence entries: 3700
Number of texts_sentence entries: 3700


In [None]:
def build_context_from_results(results, max_chars_per_chunk=600):
    """Turn retrieved chunks into one context string for the LLM prompt.

    Args:
        results: List of result dicts as produced by ``retrieve_top_k``; each
            needs ``"text"`` and a ``"meta"`` dict with ``"filename"`` and
            ``"chunk_index"``.
        max_chars_per_chunk: Each chunk's text is cut to this many characters
            so the context does not get too long.

    Returns:
        A string with one section per result: a header line carrying the
        1-based rank, file name and chunk index, followed by the (truncated)
        text. Sections are separated by a blank line. Empty ``results`` gives
        an empty string.
    """
    return "\n".join(
        f"[Chunk {rank} | file: {hit['meta']['filename']} | chunk_index: {hit['meta']['chunk_index']}]\n"
        f"{hit['text'][:max_chars_per_chunk]}\n"
        for rank, hit in enumerate(results, start=1)
    )


In [None]:
def generate_answer_with_rag_openai(query, results, model_name="gpt-4o-mini"):
    """Answer ``query`` with an OpenAI model, using retrieved chunks as context.

    Args:
        query: The user's question (string).
        results: Retrieved chunks, as returned by ``retrieve_top_k``.
        model_name: OpenAI model used for generation (e.g. gpt-4o-mini).

    Returns:
        The model's answer text (``output_text`` of the response).
    """
    # Flatten the retrieved chunks into a single context block for the prompt.
    context = build_context_from_results(results)

    # The prompt tells the model to answer from the context only and gives it
    # an exact refusal sentence for questions the context does not cover.
    prompt = f"""
You are a teaching assistant for the course AA-5750: Contemporary Issues in Analytics.

Use ONLY the information in the CONTEXT below to answer the student's question.
If the answer is not covered in the context, say exactly: "I cannot answer this from the course materials."

QUESTION:
{query}

CONTEXT:
{context}

ANSWER (use clear, simple language, for a non-technical student):
"""

    # Uses the module-level OpenAI client.
    llm_response = client.responses.create(model=model_name, input=prompt)
    return llm_response.output_text


In [None]:
def generate_answer_without_rag_openai(query, model_name="gpt-4o-mini"):
    """Baseline: ask the OpenAI model ``query`` with no course context at all.

    Args:
        query: The user's question (string).
        model_name: OpenAI model used for generation.

    Returns:
        The model's answer text (``output_text`` of the response).
    """
    # No retrieved material is included here; the model sees the question only.
    prompt = f"""
You are a general AI assistant (not limited to course materials).

Answer the following question as best as you can:

QUESTION:
{query}

ANSWER:
"""

    # Uses the module-level OpenAI client.
    llm_response = client.responses.create(model=model_name, input=prompt)
    return llm_response.output_text


In [None]:
  query = "Where is the dropout part?"

# 1) Retrieve top-k chunks using OPEN-SOURCE embeddings + FIXED chunks
results_fixed_open = retrieve_top_k(
    query_text=query,
    model=model_open,
    chunk_texts=texts_fixed,
    chunk_meta=meta_fixed,
    embeddings_matrix=embeddings_open_fixed,
    k=5
)

print("Top-5 retrieved chunks (open-source, fixed-size):\n")
for r in results_fixed_open:
    print("Score:", round(r["score"], 3), "| File:", r["meta"]["filename"], "| Chunk:", r["meta"]["chunk_index"])
print("\n" + "="*80 + "\n")

# 2) Generate answer WITH RAG (using retrieved context)
# NOTE: requires a valid OpenAI API key
# comment these lines out if you don't have a key yet

answer_with_rag = generate_answer_with_rag_openai(query, results_fixed_open)
print("ANSWER WITH RAG:\n")
print(answer_with_rag)

# 3) Generate answer WITHOUT RAG (no course context)
answer_without_rag = generate_answer_without_rag_openai(query)
print("\n" + "="*80 + "\n")
print("ANSWER WITHOUT RAG:\n")
print(answer_without_rag)


Top-5 retrieved chunks (open-source, fixed-size):

Score: 0.312 | File: 1706.03762v7.pdf | Chunk: 27
Score: 0.281 | File: RNN,LSTM,GRU.pdf | Chunk: 21
Score: 0.236 | File: Dropouts-1.pdf | Chunk: 2
Score: 0.235 | File: 1706.03762v7.pdf | Chunk: 17
Score: 0.233 | File: ResNets.pdf | Chunk: 4


ANSWER WITH RAG:

The dropout part is a technique used during training in neural networks to prevent overfitting. It works by randomly disabling a portion of the neurons, so they don’t participate in the training for that iteration. This helps improve the model's ability to generalize to new data. You can find more details in the provided context, particularly in the third chunk, which explains the mechanism of dropout and its purpose in deep learning.


ANSWER WITHOUT RAG:

The term "dropout" can refer to a few different contexts, but it is most commonly used in relation to machine learning, specifically in neural networks. In that context, dropout is a regularization technique aimed at preventin