In [1]:
import numpy as np
import os
import pandas as pd
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from typing import List, Dict

  from tqdm.autonotebook import tqdm, trange


In [2]:
class DocumentChunker:
    def __init__(self, chunk_size: int = 500):
        self.chunk_size = chunk_size
    
    @staticmethod
    def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
        """
        Compute cosine similarity between two vectors.
        
        Args:
            v1 (np.ndarray): First vector
            v2 (np.ndarray): Second vector
            
        Returns:
            float: Cosine similarity score between -1 and 1
        """
        dot_product = np.dot(v1, v2)
        norm1 = np.linalg.norm(v1)
        norm2 = np.linalg.norm(v2)
        return dot_product / (norm1 * norm2)
    
    def chunk_by_semantic_similarity(self, text: str) -> List[str]:
        """
        Chunk text based on semantic similarity using sentence embeddings.
        More efficient version that computes embeddings only once per sentence.
        """
        # Split into sentences and filter empty ones
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        
        # Initialize sentence transformer
        model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # Get embeddings for all sentences at once (more efficient)
        embeddings = model.encode(sentences)
        
        chunks = []
        chunks_embeddings = []
        current_chunk = []
        current_length = 0
        
        # Zip embeddings with their corresponding sentences and next embeddings
        for sentence, next_emb in zip(sentences,embeddings[1:], strict=False):
            current_chunk.append(sentence)
            current_length += len(sentence)
            
            if current_length >= self.chunk_size:
                curr_emb = model.encode('. '.join(current_chunk) + '.')
                similarity = self.cosine_similarity(curr_emb, next_emb)
                
                if similarity < 0.5:
                    chunks.append('. '.join(current_chunk) + '.')
                    chunks_embeddings.append(curr_emb)
                    current_chunk = []
                    current_length = 0
        
        # Add remaining sentences as the last chunk
        if current_chunk:
            last_chunk = '. '.join(current_chunk) + '.'
            chunks.append(last_chunk)
            chunks_embeddings.append(model.encode(last_chunk))

        return chunks, chunks_embeddings

    def create_document_chunks(self, text: str) -> List[Dict]:
        """
        Create chunks with metadata
        """
        chunks, embeddings = self.chunk_by_semantic_similarity(text)
            
        # Add metadata to chunks
        doc_chunks = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            doc_chunks.append({
                'content': chunk,
                'embedding': embedding,
                'metadata': {
                    'chunk_id': i,
                    'chunk_size': len(chunk),
                    'chunk_method': 'semantic'
                }
            })
            
        return doc_chunks

# Usage example
def process_pdf_to_chunks(pdf_path: str) -> List[Dict]:
    """
    Process a PDF file and return chunked content
    """
    # Read PDF
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    
    # Create chunks
    chunker = DocumentChunker(chunk_size=500)
    chunks = chunker.create_document_chunks(text)
    
    return chunks

In [3]:
DATA_ROOT = os.environ["DATA_ROOT"]
DATA_FOLDER = "student-rag"
FILE = "Designing Machine Learning Systems.pdf"

In [4]:
pdf_path = os.path.join(DATA_ROOT, DATA_FOLDER, FILE)
semantic_chunks = process_pdf_to_chunks(pdf_path)

In [11]:
chunks_lenghts = [chunk['metadata']["chunk_size"] for chunk in semantic_chunks]
pd.DataFrame(chunks_lenghts).describe([0.9, 0.99])

In [27]:
from chromadb import Client
from chromadb.utils import embedding_functions

def store_chunks_in_vectordb(chunks: List[Dict], collection_name: str = "pdf_chunks"):
    """
    Store chunks in ChromaDB with explicit embedding function
    """
    # Initialize ChromaDB
    client = Client()
    
    # Initialize the default embedding function (what ChromaDB uses under the hood)
    embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    
    # Create or get collection with explicit embedding function
    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_fn
    )
    
    # Prepare documents for insertion
    documents = [chunk['content'] for chunk in chunks]
    ids = [str(chunk['metadata']['chunk_id']) for chunk in chunks]
    metadatas = [chunk['metadata'] for chunk in chunks]
    
    # Add documents to collection
    collection.add(
        documents=documents,
        ids=ids,
        metadatas=metadatas
    )
    
    return collection

In [28]:
collection = store_chunks_in_vectordb(semantic_chunks)

In [29]:
# Example query
query = "What are the key components of a machine learning system?"

# Query the collection
results = collection.query(
    query_texts=[query],
    n_results=3  # Number of chunks to return
)

# Print results
print("Query:", query)
print("\nRelevant chunks:")
for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"\nChunk {i+1}:")
    print(f"Metadata: {metadata}")
    print(f"Content: {doc[:200]}...")  # Print first 200 chars of each chunk



Query: What are the key components of a machine learning system?

Relevant chunks:

Chunk 1:
Metadata: {'chunk_id': 131, 'chunk_method': 'semantic', 'chunk_size': 555}
Content: We’ll provide detail on what this holistic
approach means in the next chapter. Summary | 23CHAPTER 2
Introduction to Machine Learning
Systems Design
Now that we’ve walked through an overview of ML sys...

Chunk 2:
Metadata: {'chunk_id': 1, 'chunk_method': 'semantic', 'chunk_size': 556}
Content: 99  CAN $74. 99
ISBN: 978-1-098-10796-3Twitter: @oreillymedia
linkedin. com/company/oreilly-media
youtube. com/oreillymedia Machine learning systems are both complex and unique. Complex because they c...

Chunk 3:
Metadata: {'chunk_id': 50, 'chunk_method': 'semantic', 'chunk_size': 546}
Content: Figure 1-1. Different  components of an ML system. “ML algorithms” is usually what
people think of when they say machine learning, but it’s only a small part of the entire
system. There are many excel...
