In [1]:
import fitz  # PyMuPDF
import json 
import os 
import io
import re
#import requests 
import dotenv 
import transformers
import pypdf
import faiss
#import sqlite3

from dotenv import load_dotenv

from openai import OpenAI
from sentence_transformers import util, SentenceTransformer
from transformers import pipeline, BertTokenizer, BertModel
from PyPDF2 import PdfReader  # For PDF text extraction

import pandas as pd
import numpy as np
from io import StringIO
from dotenv import load_dotenv
#from operator import itemgetter

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')
nltk.download('punkt')

from llama_index.core.node_parser import SentenceSplitter

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rzrizaldy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rzrizaldy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import time
from sklearn.cluster import DBSCAN
import torch

# **Setting up Python Environment**

## Instructions for setting up your .env file:

1. Create a .env file in the same directory as this notebook

2. Add the following lines to the .env file:

    OPENAI_API_KEY=<your_openai_api_key>

    HF_TOKEN=<your_huggingface_token>

3. Replace the placeholders with your actual keys.

4. Save the file.

5. Restart the kernel to ensure the keys are loaded correctly.

# Load API Keys into the Notebook Environment:

In [3]:
load_dotenv()

openai_key = os.getenv('OPENAI_API_KEY')
HF_TOKEN = os.getenv('HF_TOKEN')

In [4]:
def check_api_keys():
    """
    Checks if OpenAI and HuggingFace API keys are loaded and ready.

    Prints the status and returns a status code.
    
    Returns:
        int: Status code (0=all keys present, 1=missing OpenAI, 2=missing HF, 3=both missing)
    """
    openai_key = os.getenv('OPENAI_API_KEY')
    hf_key = os.getenv('HF_TOKEN')

    openai_ok = bool(openai_key and openai_key.strip())
    hf_ok = bool(hf_key and hf_key.strip())
    
    status_code = 0
    if not openai_ok and not hf_ok:
        print("❌ Both OpenAI and HuggingFace API keys are missing.")
        status_code = 3
    elif not openai_ok:
        print("❌ OpenAI API key is missing.")
        status_code = 1
    elif not hf_ok:
        print("❌ HuggingFace API key is missing.")
        status_code = 2
    else:
        print("✅ Both OpenAI and HuggingFace API keys are loaded successfully.")
        status_code = 0

    return status_code


In [5]:
check_api_keys()

✅ Both OpenAI and HuggingFace API keys are loaded successfully.


0

# Custom Functions for Chunking the CMU Student Handbook & Measuring Computational Cost
(Optional / not required for the homework that you use these functions)

In [6]:
# Splitting Text into Sentences
def split_text_into_sentences_v1(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

def split_text_into_sentences_v2(text):
    sentences = sent_tokenize(text, language='english')  # Default is usually 'english'
    return sentences

In [7]:
# define a function to split the resumes into sentences and assign unique identifiers:

def split_resumes_to_sentences(df, text_column):
    """
    Split the resumes into individual sentences and assign unique identifiers.
    
    Parameters:
        df (pd.DataFrame): The DataFrame containing the resumes.
        text_column (str): The name of the column containing the resume texts.
        
    Returns:
        pd.DataFrame: A DataFrame with each sentence and its corresponding unique identifier.
    """
    # Initialize an empty list to hold the resulting data
    sentences_list = []
    
    # Iterate through the DataFrame rows
    for idx, row in df.iterrows():
        # Tokenize the resume text into sentences
        sentences = sent_tokenize(row[text_column])
        
        # Append each sentence along with the original index to the list
        for sentence in sentences:
            sentences_list.append((idx, sentence))
    
    # Convert the list to a DataFrame
    sentences_df = pd.DataFrame(sentences_list, columns=['unique_identifier', 'sentence'])
    
    return sentences_df

In [8]:

def compute_embedding_costs(text, model_name='all-MiniLM-L6-v2', eps=0.6, min_samples=2):
    """
    Computes the computational cost (in terms of execution time) for creating
    sentence embeddings and paraphrase embeddings.

    Parameters:
    - text (str): The input text to be processed.
    - model_name (str): The name of the model to use for embedding.
    - eps (float): The epsilon value for DBSCAN clustering.
    - min_samples (int): The minimum sample count for DBSCAN clustering.

    Returns:
    - A tuple containing the execution times for sentence embeddings and paraphrase-level embeddings.
    """
    model = SentenceTransformer(model_name)
    
    # Sentence Embedding Timing
    start_time = time.time()
    sentences = sent_tokenize(text)
    sentence_embeddings = model.encode(sentences)
    sentence_embedding_time = time.time() - start_time

    # Paraphrase Embedding (Clustering) Timing
    start_clustering_time = time.time()
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(sentence_embeddings)
    cluster_labels = clustering.labels_
    
    paraphrase_embeddings = []
    for cluster_id in set(cluster_labels):
        if cluster_id == -1:
            continue
        cluster_sentences = np.array(sentences)[cluster_labels == cluster_id]
        paraphrase = ' '.join(cluster_sentences)
        paraphrase_embeddings.append(paraphrase)
    paraphrase_embedding_time = time.time() - start_clustering_time

    return sentence_embedding_time, paraphrase_embedding_time

# Example usage
if __name__ == "__main__":
    text = ("This is a sample text. It has several sentences, meant to showcase "
            "how embeddings are computed. Some of these sentences may be clustered "
            "together, representing paraphrases or semantically similar groups.")

    sent_time, para_time = compute_embedding_costs(text)
    print(f"Sentence Embedding Time: {sent_time:.4f} seconds")
    print(f"Paraphrase Embedding Time: {para_time:.4f} seconds")

# This function was created with GenerativeAI Assistance. 

Sentence Embedding Time: 0.6105 seconds
Paraphrase Embedding Time: 0.0028 seconds


In [9]:
def estimate_model_flops(model_name, text):
    """
    Estimate the FLOPs for generating embeddings for a given text using a specified model.

    Parameters:
    - model_name (str): Model identifier from Hugging Face Transformers.
    - text (str): Text to process.

    Returns:
    - FLOPs (int): An estimated number of floating point operations.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs['input_ids']

    # Hooks for the operations
    def hook_fn_forward(module, input, output):
        # Attempt to access the tensor shape in a safer manner
        input_shape = input[0].size()
        
        # A generalized fallback if shape isn't what's expected
        if len(input_shape) == 2:  # Assuming shape [batch, seq_len] for simplicity
            batch_size, seq_len = input_shape
            # Hypothetical FLOPs calculation: For demonstration, let's assume it's just the product
            flops = batch_size * seq_len
        elif len(input_shape) > 2:  # Assuming more dimensions (e.g., embeddings)
            flops = torch.prod(torch.tensor(input_shape))
        else:
            # In case of unsupported dimensions, set flops to 0 or some placeholder
            flops = 0

        # Storing calculated FLOPs in the module
        if hasattr(module, '__flops__'):
            module.__flops__ += flops
        else:
            module.__flops__ = flops

    def add_hooks_to_model(model, hook_fn):
        """
        Recursively add hook_fn to all the layers of the model.
        """
        total_flops = 0
        for layer in model.children():
            if list(layer.children()):  # if the layer has children, recursively add hooks
                total_flops += add_hooks_to_model(layer, hook_fn)
            else:
                if hasattr(layer, 'weight'):
                    layer.register_forward_hook(hook_fn)
                    layer.__flops__ = 0
        return total_flops

    add_hooks_to_model(model, hook_fn_forward)

    with torch.no_grad():
        _ = model(**inputs)

    total_flops = sum([mod.__flops__ for mod in model.modules() if hasattr(mod, '__flops__')])

    return total_flops

# Example usage
if __name__ == "__main__":
    model_name = "bert-base-uncased"
    text = "This is an example sentence"
    flops = estimate_model_flops(model_name, text)
    print(f"Estimated FLOPs: {flops}")

# This function was created with GenerativeAI Assistance. 

Estimated FLOPs: 715797


# Load Data into the Notebook Environment:

In [10]:
import pdfreader

In [11]:
doc = fitz.open("cmu-student-policy-handbook.pdf")
doc

Document('cmu-student-policy-handbook.pdf')

---

# **SECTION A IMPLEMENTATION - Semantic*

**Chunking Method Chosen:** Semantic chunking (splits based on semantic similarity)  
**Embedding Model:** sentence-transformers/all-mpnet-base-v2 (higher quality than MiniLM)  
**Vector Store:** FAISS IndexFlatIP (Inner Product for cosine similarity)  
**Retrieval Parameter k:** 5

**Strategy Rationale:**
- **Semantic Chunking**: Splits text at natural semantic boundaries where topic changes occur
- **all-mpnet-base-v2**: 768-dim embeddings, superior quality for retrieval (ranked #1 on MTEB)
- **Similarity threshold**: 0.7 to identify topic boundaries (when similarity drops, new chunk)
- **Max chunk size**: 1000 chars to ensure chunks don't exceed embedding limits

---


In [32]:
# ============================================================================
# SECTION A: STEP 1 - Load and Prepare PDF Text
# ============================================================================
# Load the CMU Student Policy Handbook PDF and clean text
reader = PdfReader("cmu-student-policy-handbook.pdf")
raw_text = "\n".join((p.extract_text() or "") for p in reader.pages)

# Clean text: normalize whitespace and line breaks
raw_text = re.sub(r'\r', '\n', raw_text)
raw_text = re.sub(r'[ \t]+', ' ', raw_text)
raw_text = re.sub(r'\n{3,}', '\n\n', raw_text)
raw_text = raw_text.strip()

print(f"Loaded {len(raw_text):,} characters from PDF")
print(f"Number of paragraphs (double newlines): {raw_text.count(chr(10)*2)}")


Loaded 674,632 characters from PDF
Number of paragraphs (double newlines): 2


In [27]:
# ============================================================================
# SECTION A: STEP 2 - Chunk Text Using Semantic Chunking Method
# ============================================================================
# Method: Semantic chunking - splits based on semantic similarity between sentences
# This identifies natural topic boundaries by detecting drops in semantic similarity
# More intelligent than structural chunking as it respects meaning boundaries

def semantic_chunk_text(
    text: str, 
    embedding_model: SentenceTransformer,
    max_chunk_size: int = 1000,
    similarity_threshold: float = 0.7,
    min_chunk_size: int = 100
) -> List[str]:
    """
    Semantic chunking: splits text at points where semantic similarity drops significantly.
    This creates chunks that respect topic boundaries rather than arbitrary structural breaks.
    
    Args:
        text: Input text to chunk
        embedding_model: SentenceTransformer model for computing embeddings
        max_chunk_size: Maximum characters per chunk
        similarity_threshold: Minimum similarity to keep sentences together (0-1)
        min_chunk_size: Minimum characters per chunk
    
    Returns:
        List of text chunks
    """
    # Split text into sentences
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= 1:
        return [text] if text.strip() else []
    
    # Filter out very short sentences (likely noise)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
    if not sentences:
        return [text] if text.strip() else []
    
    # Compute embeddings for all sentences
    print(f"Computing embeddings for {len(sentences)} sentences...")
    sentence_embeddings = embedding_model.encode(
        sentences, 
        batch_size=32, 
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    
    # Find semantic break points
    break_points = [0]  # Start with first sentence
    current_chunk_start = 0
    
    for i in range(1, len(sentences)):
        # Compute similarity between current and previous sentence
        similarity = np.dot(sentence_embeddings[i-1], sentence_embeddings[i])
        
        # Check if we should break here
        should_break = False
        
        # Break if similarity drops below threshold (semantic boundary)
        if similarity < similarity_threshold:
            should_break = True
        
        # Also break if adding this sentence would exceed max_chunk_size
        current_chunk_text = " ".join(sentences[current_chunk_start:i])
        next_sentence = sentences[i]
        if len(current_chunk_text) + len(next_sentence) + 1 > max_chunk_size:
            should_break = True
        
        if should_break:
            # Only create chunk if it meets minimum size
            chunk_text = " ".join(sentences[current_chunk_start:i])
            if len(chunk_text) >= min_chunk_size or current_chunk_start == 0:
                break_points.append(i)
                current_chunk_start = i
    
    # Add final break point
    break_points.append(len(sentences))
    
    # Create chunks from break points
    chunks = []
    for i in range(len(break_points) - 1):
        chunk_sentences = sentences[break_points[i]:break_points[i+1]]
        chunk_text = " ".join(chunk_sentences).strip()
        if chunk_text and len(chunk_text) >= min_chunk_size:
            chunks.append(chunk_text)
    
    # Handle any remaining text
    if not chunks:
        chunks = [text]
    
    return chunks

# Initialize embedding model for semantic chunking (use same model as for retrieval)
EMBED_MODEL_A = "sentence-transformers/all-mpnet-base-v2"
print(f"Loading embedding model for semantic chunking: {EMBED_MODEL_A}")
semantic_embed_model = SentenceTransformer(EMBED_MODEL_A)

# Create chunks using semantic chunking
chunks_A = semantic_chunk_text(
    raw_text, 
    embedding_model=semantic_embed_model,
    max_chunk_size=1000,
    similarity_threshold=0.7,
    min_chunk_size=100
)

print(f"\nCreated {len(chunks_A)} text chunks using semantic chunking")
print(f"Average chunk length: {np.mean([len(c) for c in chunks_A]):.0f} characters")
print(f"Min chunk length: {min([len(c) for c in chunks_A])} characters")
print(f"Max chunk length: {max([len(c) for c in chunks_A])} characters")


Loading embedding model for semantic chunking: sentence-transformers/all-mpnet-base-v2
Computing embeddings for 3585 sentences...


Batches: 100%|██████████| 113/113 [00:24<00:00,  4.63it/s]


Created 2655 text chunks using semantic chunking
Average chunk length: 252 characters
Min chunk length: 100 characters
Max chunk length: 7319 characters





In [28]:
# ============================================================================
# SECTION A: STEP 3 - Create Embeddings with High-Quality Model
# ============================================================================
# Embedding Model: sentence-transformers/all-mpnet-base-v2
# - 768-dimensional embeddings (higher quality than MiniLM)
# - Ranked #1 on MTEB (Massive Text Embedding Benchmark)
# - Better semantic understanding for policy document retrieval
# - Normalized for cosine similarity via Inner Product

# Use the same embedding model that was used for semantic chunking
# (Already loaded in previous cell, but we'll reference it here for clarity)
EMBED_MODEL_A = "sentence-transformers/all-mpnet-base-v2"
embed_model_A = semantic_embed_model  # Reuse the model from semantic chunking
print(f"Using embedding model for retrieval: {EMBED_MODEL_A}")

# Create embeddings with normalization
def embed_texts_A(texts: List[str]) -> np.ndarray:
    """Create normalized embeddings for Section A"""
    embs = embed_model_A.encode(
        texts, 
        batch_size=32, 
        show_progress_bar=True,
        convert_to_numpy=True, 
        normalize_embeddings=True  # Normalize for cosine similarity
    )
    return embs.astype("float32")

emb_A = embed_texts_A(chunks_A)
print(f"\nCreated embeddings: shape {emb_A.shape}")
print(f"Embedding dimension: {emb_A.shape[1]} (768-dim for better quality)")
print(f"Embedding norm (should be ~1.0 for normalized): {np.linalg.norm(emb_A[0]):.4f}")


Using embedding model for retrieval: sentence-transformers/all-mpnet-base-v2


Batches: 100%|██████████| 83/83 [00:23<00:00,  3.48it/s]


Created embeddings: shape (2655, 768)
Embedding dimension: 768 (768-dim for better quality)
Embedding norm (should be ~1.0 for normalized): 1.0000





In [29]:
# ============================================================================
# SECTION A: STEP 4 - Build FAISS Vector Store
# ============================================================================
# Index Type: IndexFlatIP (Inner Product)
# - Uses dot product for similarity (equivalent to cosine for normalized vectors)
# - Exact search (no approximation) - perfect for our dataset size
# - Suitable for normalized embeddings (cosine similarity = dot product)

def build_ip_index_A(embs: np.ndarray):
    """Build FAISS Inner Product index for Section A"""
    index = faiss.IndexFlatIP(embs.shape[1])  # Inner Product = cosine for normalized vectors
    index.add(embs)
    return index

index_A = build_ip_index_A(emb_A)
print(f"FAISS index created with {index_A.ntotal} vectors")
print(f"Index type: {type(index_A).__name__}")
print(f"Vector dimension: {index_A.d}")


FAISS index created with 2655 vectors
Index type: IndexFlatIP
Vector dimension: 768


In [30]:
# ============================================================================
# SECTION A: STEP 5 - Execute Queries (k=5)
# ============================================================================
# Query the vector store with the 5 required queries, retrieving top-5 results each
# Using the optimized embedding model for better retrieval quality

queries = [
    "What is the policy statement for the academic integrity policy?",
    "What is the policy violation definition for cheating?",
    "What is the policy statement for improper or illegal communications?",
    "What are CMU's quiet hours?",
    "Where are pets allowed on CMU?"
]

def retrieve_A(query: str, index, chunks: List[str], k: int = 5):
    """Retrieve top-k results using Section A embedding model"""
    # Embed query using the same model
    qv = embed_texts_A([query])
    # Search for top-k results
    scores, idxs = index.search(qv, k)
    out = []
    for score, idx in zip(scores[0].tolist(), idxs[0].tolist()):
        if idx == -1: continue  # Skip invalid indices
        out.append((idx, float(score), chunks[idx]))
    return out

def clean_text_for_csv(text: str) -> str:
    """
    Clean text for Excel-compatible CSV export.
    - Replace newlines with spaces (Excel-friendly)
    - Remove excessive whitespace
    - Ensure proper encoding
    """
    if not isinstance(text, str):
        text = str(text)
    # Replace newlines and carriage returns with spaces
    text = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    # Replace multiple spaces with single space
    text = re.sub(r' +', ' ', text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text

rows_A = []
print("Executing queries with optimized embedding model...\n")
for qi, q in enumerate(queries, 1):
    res = retrieve_A(q, index_A, chunks_A, k=5)
    for rank, (idx, score, text) in enumerate(res, 1):
        # Clean the response text for Excel compatibility
        cleaned_text = clean_text_for_csv(text)
        rows_A.append({
            "Section": "A",
            "Query #": qi,
            "Query Text": q,
            "k": 5,
            "Response #": rank,
            "chunk_id": idx,
            "score": round(score, 6),  # Round score for cleaner CSV
            "Response Text": cleaned_text
        })
    print(f"Query {qi} completed: {len(res)} results retrieved (top score: {res[0][1]:.4f})")

print(f"\nTotal results collected: {len(rows_A)} rows")
print(f"Expected: 25 rows (5 queries × 5 results)")


Executing queries with optimized embedding model...



Batches: 100%|██████████| 1/1 [00:00<00:00, 32.25it/s]


Query 1 completed: 5 results retrieved (top score: 0.7960)


Batches: 100%|██████████| 1/1 [00:00<00:00, 29.19it/s]


Query 2 completed: 5 results retrieved (top score: 0.8161)


Batches: 100%|██████████| 1/1 [00:00<00:00, 55.26it/s]


Query 3 completed: 5 results retrieved (top score: 0.6821)


Batches: 100%|██████████| 1/1 [00:00<00:00, 74.99it/s]


Query 4 completed: 5 results retrieved (top score: 0.6551)


Batches: 100%|██████████| 1/1 [00:00<00:00, 35.59it/s]

Query 5 completed: 5 results retrieved (top score: 0.6489)

Total results collected: 25 rows
Expected: 25 rows (5 queries × 5 results)





In [31]:
# ============================================================================
# SECTION A: STEP 6 - Create Results DataFrame and Export to CSV
# ============================================================================
# Format results for homework submission spreadsheet template
# Excel-compatible CSV with proper escaping and encoding

partA_df = pd.DataFrame(rows_A)

# Verify data structure
print("Data Verification:")
print(f"  - Total rows: {len(partA_df)}")
print(f"  - Expected: 25 rows (5 queries × 5 results)")
print(f"  - Unique queries: {partA_df['Query #'].nunique()}")
print(f"  - Results per query: {partA_df.groupby('Query #').size().tolist()}")
print(f"  - All k values: {partA_df['k'].unique()}")

# Export to Excel-compatible CSV
# Using proper settings for Excel compatibility:
# - quoting=csv.QUOTE_ALL: Quote ALL fields to handle commas in text properly
# - This ensures commas within text don't break column separation
# - doublequote=True: Double quotes for quotes within quoted fields
import csv

# Quote all fields to ensure commas in text don't break Excel import
partA_df.to_csv(
    "partA_results.csv", 
    index=False,
    quoting=csv.QUOTE_ALL,  # Quote ALL fields - safest for Excel compatibility
    doublequote=True,  # Use "" for quotes within quoted fields
    lineterminator='\n',  # Standard line terminator
    encoding='utf-8'  # UTF-8 encoding
)

print(f"\n✓ Results saved to partA_results.csv (Excel-compatible)")
print(f"✓ DataFrame shape: {partA_df.shape}")
print(f"✓ Columns: {list(partA_df.columns)}")

# Verify CSV format - check first line to ensure proper quoting
print("\n" + "="*80)
print("CSV Format Verification (first line):")
print("="*80)
with open("partA_results.csv", "r", encoding="utf-8") as f:
    first_line = f.readline()
    print(f"First line length: {len(first_line)} characters")
    print(f"First line preview: {first_line[:200]}...")
    print(f"All fields quoted: {'\"' in first_line}")

print("\n" + "="*80)
print("Sample Results (First 5 rows):")
print("="*80)
partA_df.head(5)


Data Verification:
  - Total rows: 25
  - Expected: 25 rows (5 queries × 5 results)
  - Unique queries: 5
  - Results per query: [5, 5, 5, 5, 5]
  - All k values: [5]

✓ Results saved to partA_results.csv (Excel-compatible)
✓ DataFrame shape: (25, 8)
✓ Columns: ['Section', 'Query #', 'Query Text', 'k', 'Response #', 'chunk_id', 'score', 'Response Text']

CSV Format Verification (first line):
First line length: 85 characters
First line preview: "Section","Query #","Query Text","k","Response #","chunk_id","score","Response Text"
...
All fields quoted: True

Sample Results (First 5 rows):


Unnamed: 0,Section,Query #,Query Text,k,Response #,chunk_id,score,Response Text
0,A,1,What is the policy statement for the academic ...,5,1,51,0.795959,Statement on Academic Integrity The Statement ...
1,A,1,What is the policy statement for the academic ...,5,2,1373,0.725584,Existing University policies and principles on...
2,A,1,What is the policy statement for the academic ...,5,3,120,0.724487,Academic Integrity Policy Students at Carnegie...
3,A,1,What is the policy statement for the academic ...,5,4,69,0.69872,Practice of the Mission of Academic Integrity ...
4,A,1,What is the policy statement for the academic ...,5,5,107,0.693166,Fairness and Exemplary Behavior The preservati...


# **Homework 2 Assignment**

## **Section A. Experimenting with Vector Store Query Design (50 points)**

### **Choose a method to chunk the text data:**

- [Semantic chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker)

- [Recursive chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

- [Character chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/character_text_splitter)

- [Token chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/split_by_token)

# Sentence Chunking

### Choose a type of chunker:

In [13]:
!pip3 install PyPDF2
# Additional imports for alternative chunking methods
from typing import List, Tuple
from PyPDF2 import PdfReader


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
# Configuration: PDF path and embedding model
# Update PDF_PATH to match your PDF filename
PDF_PATH = "cmu-student-policy-handbook.pdf"  # or "the-word-2023-24-12.11.23.pdf"
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Lightweight, fast embedding model


In [15]:
# Load PDF and extract raw text
# Extracts text from all pages and joins with newlines
reader = PdfReader(PDF_PATH)
raw_text = "\n".join((p.extract_text() or "") for p in reader.pages)
print("Loaded characters:", len(raw_text))


Loaded characters: 678695


In [16]:
# Text cleaning function: normalizes whitespace and line breaks
# Removes carriage returns, collapses multiple spaces, limits consecutive newlines
def clean_text(t: str) -> str:
    t = re.sub(r'\r', '\n', t)  # Convert \r to \n
    t = re.sub(r'[ \t]+', ' ', t)  # Collapse spaces/tabs to single space
    t = re.sub(r'\n{3,}', '\n\n', t)  # Limit to max 2 consecutive newlines
    return t.strip()


In [29]:
# Paragraph-based chunking: splits by double newlines, respects max_chars with overlap
# Overlap helps maintain context between chunks for better retrieval
def chunk_by_paragraph(text: str, max_chars: int = 1200, overlap: int = 150):
    text = clean_text(text)
    paras = [p.strip() for p in text.split("\n\n") if p.strip()]  # Split by paragraphs
    chunks, buf = [], ""
    
    for p in paras:
        if len(buf) + len(p) + 2 <= max_chars:  # +2 for "\n\n"
            buf = (buf + "\n\n" + p) if buf else p
        else:
            if buf: chunks.append(buf)
            # Add overlap from previous chunk for context continuity
            if chunks and overlap > 0:
                tail = chunks[-1][-overlap:]
                buf = (tail + "\n\n" + p).strip()
            else:
                buf = p
    if buf: chunks.append(buf)
    return chunks


In [30]:
# Sentence-based chunking: uses NLTK sentence tokenizer, respects max_chars with overlap
# More granular than paragraph chunking, better for precise sentence-level retrieval
def chunk_by_sentence(text: str, max_chars: int = 900, overlap: int = 120):
    text = clean_text(text)
    sents = nltk.tokenize.sent_tokenize(text)  # Split into sentences
    chunks, buf = [], ""
    
    for s in sents:
        if len(buf) + len(s) + 1 <= max_chars:  # +1 for space
            buf = (buf + " " + s).strip() if buf else s
        else:
            if buf: chunks.append(buf)
            # Add overlap from previous chunk for context continuity
            if chunks and overlap > 0:
                tail = chunks[-1][-overlap:]
                buf = (tail + " " + s).strip()
            else:
                buf = s
    if buf: chunks.append(buf)
    return chunks


In [31]:
# Initialize embedding model (load once, reuse for all embeddings)
# all-MiniLM-L6-v2: 384-dim embeddings, fast and efficient
embed_model = SentenceTransformer(EMBED_MODEL_NAME)


In [32]:
# Embed texts: converts list of text chunks to normalized embedding vectors
# Normalized embeddings enable cosine similarity via dot product (Inner Product)
def embed_texts(texts: List[str]) -> np.ndarray:
    embs = embed_model.encode(texts, batch_size=64, show_progress_bar=True,
                              convert_to_numpy=True, normalize_embeddings=True)
    return embs.astype("float32")


In [33]:
# Build FAISS Inner Product index: optimized for cosine similarity search
# IndexFlatIP uses dot product (equivalent to cosine for normalized vectors)
def build_ip_index(embs: np.ndarray):
    index = faiss.IndexFlatIP(embs.shape[1])  # Inner Product = cosine for normalized vectors
    index.add(embs)
    return index


In [34]:
# Retrieve function: searches index and returns top-k chunks with similarity scores
# Returns list of (index, score, chunk_text) tuples sorted by relevance
def retrieve(query: str, index, chunks: List[str], k: int = 5):
    qv = embed_texts([query])  # Embed query (normalized; IP acts as cosine similarity)
    scores, idxs = index.search(qv, k)  # Search for top-k results
    out = []
    for score, idx in zip(scores[0].tolist(), idxs[0].tolist()):
        if idx == -1: continue  # Skip invalid indices
        out.append((idx, float(score), chunks[idx]))
    return out


In [36]:
# Section A: Query the vector store with the 5 required queries (k=5)
# These are the official homework queries for Section A
queries = [
    "What is the policy statement for the academic integrity policy?",
    "What is the policy violation definition for cheating?",
    "What is the policy statement for improper or illegal communications?",
    "What are CMU's quiet hours?",
    "Where are pets allowed on CMU?"
]


In [37]:
# Execute queries and collect results: retrieves top-5 chunks for each query
# Stores results in structured format for easy analysis and CSV export
rows_A = []
for qi, q in enumerate(queries, 1):  # qi starts at 1 for query numbering
    res = retrieve(q, index_A, chunks_A, k=5)  # Get top-5 results
    for rank, (idx, score, text) in enumerate(res, 1):  # rank starts at 1
        rows_A.append({
            "Section": "A",
            "Query #": qi,
            "Query Text": q,
            "k": 5,
            "Response #": rank,
            "chunk_id": idx,
            "score": score,
            "Response Text": text
        })


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 86.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 83.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]


In [38]:
# Create DataFrame and save to CSV for homework submission
# This matches the format required for the homework spreadsheet template
partA_df = pd.DataFrame(rows_A)
partA_df.to_csv("partA_results.csv", index=False)
partA_df.head(10)  # Display first 10 rows to verify results


Unnamed: 0,Section,Query #,Query Text,k,Response #,chunk_id,score,Response Text
0,A,1,What is the policy statement for the academic ...,5,1,2,0.393871,must not destroy that respect by their failure...
1,A,1,What is the policy statement for the academic ...,5,2,0,0.355402,1 \n \n \n \nThe Word: Student Handbook \n2023...
2,A,1,What is the policy statement for the academic ...,5,3,1,0.30532,sful Carnegie \nMellon experience. \n \n \nAmy...
3,A,2,What is the policy violation definition for ch...,5,1,2,0.26479,must not destroy that respect by their failure...
4,A,2,What is the policy violation definition for ch...,5,2,1,0.130284,sful Carnegie \nMellon experience. \n \n \nAmy...
5,A,2,What is the policy violation definition for ch...,5,3,0,0.070751,1 \n \n \n \nThe Word: Student Handbook \n2023...
6,A,3,What is the policy statement for improper or i...,5,1,2,0.212431,must not destroy that respect by their failure...
7,A,3,What is the policy statement for improper or i...,5,2,1,0.08716,sful Carnegie \nMellon experience. \n \n \nAmy...
8,A,3,What is the policy statement for improper or i...,5,3,0,0.078769,1 \n \n \n \nThe Word: Student Handbook \n2023...
9,A,4,What are CMU's quiet hours?,5,1,2,0.19595,must not destroy that respect by their failure...


In [39]:
# Example: Chunk by paragraph, create embeddings, build index
# This demonstrates the complete pipeline for paragraph-based chunking
chunks_A = chunk_by_paragraph(raw_text, max_chars=1200, overlap=150)
emb_A = embed_texts(chunks_A)
index_A = build_ip_index(emb_A)
len(chunks_A), index_A.ntotal


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]


(3, 3)

In [72]:
# this is an example chunker. You don't have to use it. Email Sara with questions.

# parser to split up PDF resume:
text_parser = SentenceSplitter(
    chunk_size=1024
)

#### **Chunker Choices**

In [17]:
# Chunker choice #1:
# Semantic Chunker: splits text into semantically-coherent chunks using NLTK sentence tokenization,
# groups sentences to fit approximately within max_chars, and preserves context by overlapping.

import nltk
from typing import List

def semantic_chunker(text: str, max_chars:1200, overlap:150) -> List[str]:
    """
    Split text into semantically-coherent chunks by grouping sentences such that
    each chunk is approximately max_chars, with optional overlap in characters for contextual continuity.
    Returns a list of text chunks.
    """
    # Ensure sentence tokenizer is available
    nltk.download('punkt', quiet=True)
    sentences = nltk.sent_tokenize(text)
    chunks = []
    cur_chunk = ""
    i = 0
    while i < len(sentences):
        sent = sentences[i]
        # If current chunk gets too large, finalize the chunk
        if len(cur_chunk) + len(sent) > max_chars:
            chunks.append(cur_chunk.strip())
            # Add overlap by recalculating starting index such that new chunk includes trailing context
            if overlap > 0 and len(cur_chunk) > overlap:
                # Find where to start overlap from the current chunk
                chunk_end = cur_chunk[-overlap:]
                # Find the index of the sentence that starts with the overlap
                overlap_sents = []
                overlap_len = 0
                # Walk backwards and include enough previous sentences to reach overlap
                for j in range(i-1, -1, -1):
                    overlap_sents.insert(0, sentences[j])
                    overlap_len += len(sentences[j])
                    if overlap_len >= overlap:
                        break
                cur_chunk = " ".join(overlap_sents)
            else:
                cur_chunk = ""
        # Add next sentence to current chunk
        cur_chunk += (" " if cur_chunk else "") + sent
        i += 1
    if cur_chunk:
        chunks.append(cur_chunk.strip())
    return chunks

# Example usage:
# semantic_chunks = semantic_chunker(raw_text, max_chars=1200, overlap=150)


In [None]:
# Chunker choice #2:

In [18]:
# example code, feel free to use in homework. 

text_chunks = [] # create an empty list to store the text chunks.
doc_idxs = []    # create an empty list to store unique identifiers for the text chunks.

# split the CMU handbook up into chunks and assign unique identifiers to each chunk:
for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_parser.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

NameError: name 'text_parser' is not defined

In [None]:
text_chunks # glance at the text chunks to observe how the chunks look.

In [18]:
text_chunk_df = pd.DataFrame(text_chunks) # put the chunks into a pandas dataframe.
text_chunk_df

In [20]:
# Split resumes into sentences and include a unique identifier for each sentence:
sentences_df = split_resumes_to_sentences(text_chunk_df, 0) 
sentences_df

In [30]:
# get the length of the sentences dataframe:
len(sentences_df)

### **Choose an embedding model to use for creating embeddings of the text chunks and create the Embeddings**

In [74]:
# define a function to create embeddings for the sentences:

model = SentenceTransformer('bert-base-nli-mean-tokens') # here we are selecting to use a Bert model on HuggingFace to create the embeddings.

In [23]:
# create sentence embeddings:
sentence_embeddings = model.encode(sentences_df['sentence'])

# check the shape of the sentence embeddings:
sentence_embeddings.shape

## **Create a FAISS Vector Store**

In [103]:
# specify the dimensions of the sentence embeddings:
dimension = sentence_embeddings.shape[1]

# specify the number of sentences:
nb = len(set(sentences_df))

# specify the number of queries:
nq = 10000 
np.random.seed(1234)             # set a random number to make the process reproducible
xb = np.random.random((nb, d)).astype('float32')

#
nlist = 100

In [104]:
# glance at the shape of the sentence embeddings or dimension for the vector store:
dimension

In [105]:
# create an index for the vector store:
index = faiss.IndexFlatL2(d)

In [106]:
# add the sentence embeddings to the index:
index.add(sentence_embeddings)

In [107]:
# check the number of vectors in the index:
index.ntotal

In [108]:
# train the index:
index.train(sentence_embeddings)

index.is_trained  # check if index is now trained

### **Construct Query and Perform Search of the Vector Store**

In [109]:
# define a query to submit to the vector store:
question = "<INSERT QUERY FROM HOMEWORK ASSIGNMENT INSTRUCTIONS HERE>"

In [110]:
# define the number of documents to retrieve from the vector store in response to the query:
retrival_number=10

# create an embedding for the query:
query_embedding = model.encode([question])

In [112]:
%%time 
 # measure the time it takes to search the index
D, I = index.search(query_embedding, retrival_number)  # search the index for the query, using the number of documents to retrieve specified by k
print(I) # print the indices of the documents that are most similar to the query

In [113]:
# Retrieve and print the string data from 'text' column of the first index in I

first_index = I[0] # Get the first index from I

first_row_string = sentences_df['sentence'].iloc[first_index].sum()  # Use iloc to access the row by index

print(first_row_string) # Print the string data

### **Define System Prompt (e.g. context message) to send to LLM**

In [114]:
# define a function to get retrieve the results from the vector store:
def get_sys_message(user_query: str, retrieval_number: int):
    query_embedding = model.encode([user_query])
    D, I = index.search(query_embedding, retrival_number)  # search
    first_index = I[0]  # Get the first index from I
    first_row_string = sentences_df['sentence'].iloc[first_index].sum()
    return first_row_string 

In [115]:
# use the custom function to retrieve the results from the vector store:
get_sys_message(user_query="Which resume has the most software skills listed?", retrieval_number=100)

In [116]:
# custom function for using an LLM with a RAG retriever:
def rag_openAI_gpt(
    model: str, 
    query: str, 
    retrieval_number: int, 
    llm_prompt: str):
    
    import openai
    from openai import OpenAI
    
    client = OpenAI()
    
    f=get_sys_message(query, retrieval_number)
    
    response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": f"Instruction: use the information in {f} to answer the user's question."},
        {"role": "user", "content": f"{llm_prompt}"},
        {"role": "assistant", "content": f"{f}"},
        {"role": "user", "content": "What is the answer?"}
    ]
    )
    return response.choices[0].message.content

In [117]:
gpt_3_5_turbo = "gpt-3.5-turbo"
gpt_4 = "gpt-4"
gpt_4_turbo = "gpt-4-0125-preview"
gpt_4o = "gpt-4o"

## Examples for demonstration only:

In [118]:
rag_openAI_gpt(model=gpt_3_5_turbo, query="Which resume has the most software skills listed?", retrieval_number=20, llm_prompt="Classify the document and return a label based on the document type or class. Make the label specify which occupation the document pertains to")

In [119]:
rag_openAI_gpt(model=gpt_4, query="Which resume has the most software skills listed?", retrieval_number=20, llm_prompt="summarize the resume")

In [120]:
rag_openAI_gpt(model=gpt_4_turbo, query="Which resume has the most software skills listed?", retrieval_number=20, llm_prompt="summarize the resume")

# Homework requirement:

# Section A

## **Query the vector store using these queries**

**Instruction: set the 'k' parameter to 5**

Query 1: What is the policy statement for the academic integrity policy?

Query 2: What is the policy violation definition for cheating?

Query 3: What is the policy statement for improper or illegal communications?

Query 4: What are CMU’s quiet hours?

Query 5: Where are pets allowed on CMU?


### ***query the vector store with the 5 queries above (don't forget to record the responses in your homework submission spreadsheet: see instructions for a link to the spreadsheet!):***

In [121]:
# query the vector store with the 5 queries above (don't forget to record the responses in your homework submission!):

# **Homework Questions:**

**A.I.** 

(i) Describe these distance metrics: Cosine similarity; Euclidean Distance; Dot Product.

(ii) For each of the metrics you defined in (i), describe how the metric is different from the other metrics.

(iii) For each of the metrics you defined in (i), describe one advantage and one disadvantage of using the metric.

 

**A.II.** Copy and paste the results or information retrieved from the vector store in response to each of the queries you submitted to the vector store in the SPREADSHEET TEMPLATE (please see instructions for a link to the spreadsheet template you should copy and use).  


**A.III.** Qualitatively analyze the responses to your queries submitted to the vector store. Did the queries retrieve the information you were expecting to obtain. Why or why not? Why do you think the queries were successful / unsuccessful in retrieving the information you expected or needed? 

# **Section B. Experimenting with Vector Store Embeddings & Query Parameters (50 points)**

1) Choose 1 of the 5 queries provided in Section A, above, and experiment with submitting the query to the vector store by changing the QUERY and RETRIEVAL_NUMBER parameters in the following manner:


*   A) Baseline query (e.g. query), retrieval_number parameter=1.

*   B) Query, retrieval_number parameter  = 3

*   C) Query, retrieval_number parameter  = 5

*   D) Query, retrieval_number parameter  = 10

**In your written homework submission, record the UNIQUE responses/results of each query submitted to the vector store.**

2. Select a different text chunking method (e.g. word, sentence, paragraph) and:
   
- Chunk your text data using the method.
- Create embeddings for the text. 
- Load the embeddings into the vector store. 
- Submit the same query you selected in B.1, above, and submit it to the vector store 6 times (using the different ‘retrieval_number’ parameter settings defined in B.1, above), and record the responses.

**In your written homework submission, record the responses/results of each query submitted to the vector store.**

### **Homework Questions:**

**B.I.** Explain your rationale for selecting the query you choose in B.1. Why did you choose this query vs. the other 4 queries? 

**B.II.** Copy and paste the responses to the queries you submitted to the vector store in the SPREADSHEET TEMPLATE.

**B.III.** Copy and paste the responses to the queries you submitted to the vector store in the SPREADSHEET TEMPLATE. 

**B.IV.** In observing the responses from the vector store to the queries created in B.1., which ‘k’ parameter do you think retrieved the highest quality / most accurate result? Why do you think this parameter was the best to use with the query?

**B.V.** In observing the responses from the vector store to the queries created in B.2., which ‘k’ parameter do you think retrieved the highest quality / most accurate result? Why do you think this parameter was the best to use with the query?

# **BONUS TASKS / QUESTIONS: Define function to call LLM API**

## Please email Sara for the Bonus Task Python Notebook once you've completed your homework assignment