In [535]:
import sys
import os
import subprocess
from pathlib import Path
import logging

In [536]:


# Get your home directory automatically
HOME = Path.home()
project_name = "azure-multimodal-rag"
AZURE_RAG_PROJECT = HOME / "projects" / project_name

print(f"🎯 Azure RAG Project: ~/{project_name}")
print(f"📍 Current Working Directory: {Path.cwd().name}")

# Create and navigate
AZURE_RAG_PROJECT.mkdir(parents=True, exist_ok=True)
os.chdir(AZURE_RAG_PROJECT)

# Safe display
print(f"🎯 Azure RAG Project: ~/projects/azure-multimodal-rag")
print(f"📍 Working in: {AZURE_RAG_PROJECT.name}")

🎯 Azure RAG Project: ~/azure-multimodal-rag
📍 Current Working Directory: azure-multimodal-rag
🎯 Azure RAG Project: ~/projects/azure-multimodal-rag
📍 Working in: azure-multimodal-rag


In [537]:

# Set up logging
# Create logs directory
Path("logs").mkdir(exist_ok=True)

# Configure to save to file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/app.log'),  # Save to file
        logging.StreamHandler()               # Also show on screen
    ]
)

# SECURE Project configuration - no personal paths exposed
HOME = Path.home()  # Gets your home directory automatically
PROJECT_NAME = "azure-multimodal-rag"
AZURE_RAG_PROJECT = HOME / "projects" / PROJECT_NAME

# Safe display function
def safe_display(path):
    """Display path without exposing personal directory"""
    return str(path).replace(str(HOME), "~")

print(f"🎯 Azure RAG Project: {safe_display(AZURE_RAG_PROJECT)}")

# Create project directory and add to path
AZURE_RAG_PROJECT.mkdir(parents=True, exist_ok=True)
if str(AZURE_RAG_PROJECT) not in sys.path:
    sys.path.append(str(AZURE_RAG_PROJECT))

# Change to project directory for file creation
os.chdir(AZURE_RAG_PROJECT)
print(f"📍 Working directory: {AZURE_RAG_PROJECT.name}")

# Create complete directory structure
directories = [
    "config", "models", "data/raw/pdfs", "data/processed", "data/vector_store",
    "src/document_processor", "src/vector_store", "src/retrieval", 
    "src/generation", "src/utils", "logs", "notebooks", "scripts", "tests"
]

print("📁 Creating directory structure...")
for directory in directories:
    Path(directory).mkdir(parents=True, exist_ok=True)
    print(f"   📁 {directory}")

# Create __init__.py files
init_files = [
    "src/__init__.py", "src/document_processor/__init__.py", 
    "src/vector_store/__init__.py", "src/retrieval/__init__.py",
    "src/generation/__init__.py", "src/utils/__init__.py", "config/__init__.py"
]

for init_file in init_files:
    Path(init_file).touch()

print("✅ Project structure created")
print(f"📍 All files created in: ~/{PROJECT_NAME}/")

🎯 Azure RAG Project: ~/projects/azure-multimodal-rag
📍 Working directory: azure-multimodal-rag
📁 Creating directory structure...
   📁 config
   📁 models
   📁 data/raw/pdfs
   📁 data/processed
   📁 data/vector_store
   📁 src/document_processor
   📁 src/vector_store
   📁 src/retrieval
   📁 src/generation
   📁 src/utils
   📁 logs
   📁 notebooks
   📁 scripts
   📁 tests
✅ Project structure created
📍 All files created in: ~/azure-multimodal-rag/


In [538]:
# Cell: Configuration System
print("⚙️ Creating Configuration System...")

class AzureRAGConfig:
    """Configuration for our Azure RAG system"""
    
    # File paths
    PDF_FOLDER = "data/raw/pdfs"
    PROCESSED_FOLDER = "data/processed"
    VECTOR_STORE_FOLDER = "data/vector_store"
    LOGS_FOLDER = "logs"
    
    # Text processing settings
    CHUNK_SIZE = 1000        # characters - good balance
    CHUNK_OVERLAP = 200      # 20% overlap for context
    
    # AI model settings
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # For creating embeddings
    MAX_MEMORY_GB = 8        # Memory limit
    
    # Processing limits
    MAX_FILES_TO_PROCESS = 10  # Don't overwhelm the system
    
    # Supported file types
    SUPPORTED_FILE_TYPES = [".pdf", ".txt", ".md"]

# Create our configuration instance
config = AzureRAGConfig()

# Test and display our configuration
print("✅ Configuration System Created!")
print("\n📋 Current Settings:")
print(f"   📁 PDF Folder: {config.PDF_FOLDER}")
print(f"   📁 Processed Folder: {config.PROCESSED_FOLDER}")
print(f"   📁 Vector Store: {config.VECTOR_STORE_FOLDER}")
print(f"   📝 Chunk Size: {config.CHUNK_SIZE} characters")
print(f"   🔄 Chunk Overlap: {config.CHUNK_OVERLAP} characters ({config.CHUNK_OVERLAP/config.CHUNK_SIZE*100:.0f}% overlap)")
print(f"   🤖 Embedding Model: {config.EMBEDDING_MODEL}")
print(f"   💾 Max Memory: {config.MAX_MEMORY_GB} GB")
print(f"   📊 Max Files: {config.MAX_FILES_TO_PROCESS}")
print(f"   📄 Supported Types: {', '.join(config.SUPPORTED_FILE_TYPES)}")

# Save configuration to file for later use
config_file_content = f'''"""Azure RAG Configuration Settings"""

class AzureRAGConfig:
    """Configuration for our Azure RAG system"""
    
    # File paths
    PDF_FOLDER = "{config.PDF_FOLDER}"
    PROCESSED_FOLDER = "{config.PROCESSED_FOLDER}"
    VECTOR_STORE_FOLDER = "{config.VECTOR_STORE_FOLDER}"
    LOGS_FOLDER = "{config.LOGS_FOLDER}"
    
    # Text processing settings
    CHUNK_SIZE = {config.CHUNK_SIZE}
    CHUNK_OVERLAP = {config.CHUNK_OVERLAP}
    
    # AI model settings
    EMBEDDING_MODEL = "{config.EMBEDDING_MODEL}"
    MAX_MEMORY_GB = {config.MAX_MEMORY_GB}
    
    # Processing limits
    MAX_FILES_TO_PROCESS = {config.MAX_FILES_TO_PROCESS}
    
    # Supported file types
    SUPPORTED_FILE_TYPES = {config.SUPPORTED_FILE_TYPES}

# Global config instance
config = AzureRAGConfig()
'''

# Write to config file
with open('config/settings.py', 'w') as f:
    f.write(config_file_content)

print(f"\n💾 Configuration saved to: config/settings.py")

⚙️ Creating Configuration System...
✅ Configuration System Created!

📋 Current Settings:
   📁 PDF Folder: data/raw/pdfs
   📁 Processed Folder: data/processed
   📁 Vector Store: data/vector_store
   📝 Chunk Size: 1000 characters
   🔄 Chunk Overlap: 200 characters (20% overlap)
   🤖 Embedding Model: all-MiniLM-L6-v2
   💾 Max Memory: 8 GB
   📊 Max Files: 10
   📄 Supported Types: .pdf, .txt, .md

💾 Configuration saved to: config/settings.py


In [539]:

# Let's create a corrected version that works with our config
class SimplePDFReader:
    """Simple PDF reader using PyMuPDF"""
    
    def __init__(self, config=None):
        """Initialize with optional config"""
        self.config = config
        
    def read_pdf(self, pdf_path):
        """Read a PDF file and return text content"""
        try:
            import fitz  # PyMuPDF
            
            # Open the PDF
            doc = fitz.open(pdf_path)
            text_content = ""
            
            print(f"📄 Processing {len(doc)} pages...")
            
            # Extract text from each page
            for page_num in range(len(doc)):
                page = doc[page_num]
                page_text = page.get_text()
                text_content += page_text
                
                # Show progress for larger documents
                if (page_num + 1) % 5 == 0 or page_num == 0:
                    print(f"   📖 Processed page {page_num + 1}/{len(doc)}")
            
            doc.close()
            return text_content
            
        except ImportError:
            raise Exception("PyMuPDF (fitz) not installed. Run: pip install PyMuPDF")
        except Exception as e:
            raise Exception(f"Error reading PDF {pdf_path}: {str(e)}")

 
       
 

In [540]:

pdf_files = list(Path(config.PDF_FOLDER).glob("*.pdf"))
print(f"📄 Found {len(pdf_files)} PDF files:")

for i, pdf_file in enumerate(pdf_files, 1):
    print(f"   {i}. {pdf_file.name}")

if pdf_files:
    # Test with the first PDF
    test_pdf = pdf_files[0]
    print(f"\n🔍 Testing with: {test_pdf.name}")
    
    try:
        # Create PDF reader instance with config
        pdf_reader = SimplePDFReader(config=config)
        
        # Read the PDF
        print(f"📖 Reading {test_pdf.name}...")
        text_content = pdf_reader.read_pdf(str(test_pdf))
        
        # Analyze what we got
        print(f"\n✅ Successfully read PDF!")
        print(f"📊 Content Analysis:")
        print(f"   📝 Total characters: {len(text_content):,}")
        print(f"   📄 Total words: {len(text_content.split()):,}")  
        print(f"   📋 Total lines: {len(text_content.splitlines()):,}")
        
        # Clean and show first few meaningful lines
        lines = [line.strip() for line in text_content.splitlines() if line.strip()]
        print(f"\n📋 First 5 meaningful lines:")
        for i, line in enumerate(lines[:5], 1):
            display_line = line[:100] + "..." if len(line) > 100 else line
            print(f"   {i}. {display_line}")
        
        # Show a sample chunk
        print(f"\n📄 Sample chunk (first {config.CHUNK_SIZE} characters):")
        sample_chunk = text_content[:config.CHUNK_SIZE].strip()
        print(f"'{sample_chunk}...'")
        
        # Calculate chunking statistics
        total_chars = len(text_content)
        chunk_size = config.CHUNK_SIZE
        chunk_overlap = config.CHUNK_OVERLAP
        
        # Estimate chunks (accounting for overlap)
        estimated_chunks = max(1, (total_chars - chunk_overlap) // (chunk_size - chunk_overlap))
        
        print(f"\n🧮 Chunking Analysis:")
        print(f"   📦 Estimated chunks: {estimated_chunks}")
        print(f"   📏 Chunk size: {chunk_size:,} characters")
        print(f"   🔄 Overlap: {chunk_overlap:,} characters ({chunk_overlap/chunk_size*100:.1f}%)")
        print(f"   💾 Memory per chunk: ~{chunk_size * 4 / 1024:.1f} KB")  # Rough estimate
        
        # Content quality check
        if len(text_content.strip()) < 100:
            print(f"\n⚠️  Warning: Very short content - PDF might be image-based")
        elif "azure" in text_content.lower() or "network" in text_content.lower():
            print(f"\n✅ Content looks good - contains Azure networking terms!")
        else:
            print(f"\n🤔 Content extracted but may need review")
            
        print(f"\n🎯 Next step: Text Chunking System!")
        
    except Exception as e:
        print(f"❌ Error: {e}")
        print(f"🔧 Debug info:")
        print(f"   📁 PDF path: {test_pdf}")
        print(f"   📏 File size: {test_pdf.stat().st_size:,} bytes")
        print(f"   🔍 File exists: {test_pdf.exists()}")
        
else:
    print("❌ No PDF files found!")
    print(f"📁 Looking in: {Path(config.PDF_FOLDER).absolute()}")
    print("💡 Add some PDF files to continue testing")

📄 Found 2 PDF files:
   1. 01-study-guide-az-vnet.pdf
   2. 02-study-guide-az-load-balancer.pdf

🔍 Testing with: 01-study-guide-az-vnet.pdf
📖 Reading 01-study-guide-az-vnet.pdf...
📄 Processing 11 pages...
   📖 Processed page 1/11
   📖 Processed page 5/11
   📖 Processed page 10/11

✅ Successfully read PDF!
📊 Content Analysis:
   📝 Total characters: 25,974
   📄 Total words: 3,986
   📋 Total lines: 518

📋 First 5 meaningful lines:
   1. Tuesday, November 2, 2021
   2. 1
   3. Capabilities of Azure Virtual Networks
   4. Azure VNets enable resources in Azure to securely communicate with each other, the internet,
   5. and on-premises networks.

📄 Sample chunk (first 1000 characters):
'Tuesday, November 2, 2021 
1 
 
 
Capabilities of Azure Virtual Networks 
Azure VNets enable resources in Azure to securely communicate with each other, the internet, 
and on-premises networks. 
 
Communication with the internet. All resources in a VNet can communicate outbound to the 
internet, by default. Y

**Smart Text Chunking System Overview**

**What This Code Does:**

* **Initializes `SmartTextChunker`** with configurable chunk size and overlap, reporting setup details.
* **Cleans input text** by normalizing whitespace and paragraph breaks.
* **Finds smart split points** near ideal chunk boundaries, preferring sentence endings or paragraph breaks.
* **Generates overlapping chunks** of roughly `CHUNK_SIZE` characters with `CHUNK_OVERLAP`, enforcing a minimum forward progress to avoid tiny fragments.
* **Assigns metadata** (`chunk_id`, start/end offsets, character and word counts, source) to each chunk.
* **Analyzes chunk statistics** (total count, size range, averages) for quality control.
* **Includes safety checks** to prevent infinite loops or excessive chunk counts.
* **Test harness** demonstrates the fixed algorithm on sample Azure networking text and reports before/after metrics.


In [541]:
# Cell: Text Chunking System - UPDATED AND FIXED
print("📦 Building Text Chunking System...")

class SmartTextChunker:
    """Intelligent text chunking for Azure RAG system"""
    
    def __init__(self, config):
        """Initialize with configuration settings"""
        self.config = config
        self.chunk_size = config.CHUNK_SIZE
        self.chunk_overlap = config.CHUNK_OVERLAP
        
        print(f"🎯 Chunker initialized:")
        print(f"   📏 Chunk size: {self.chunk_size} characters")
        print(f"   🔄 Overlap: {self.chunk_overlap} characters ({self.chunk_overlap/self.chunk_size*100:.1f}%)")
    
    def clean_text(self, text):
        """Clean and normalize text before chunking"""
        # Remove excessive whitespace
        import re
        
        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)
        
        # Remove excessive line breaks (keep paragraph breaks)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        
        # Strip leading/trailing whitespace
        text = text.strip()
        
        return text
    
    def find_sentence_boundary(self, text, ideal_position):
        """Find the best place to split text (prefer sentence endings)"""
        # Don't search past the end of text
        if ideal_position >= len(text):
            return len(text)
            
        # Look for sentence endings near the ideal position
        search_range = min(100, len(text) - ideal_position)  # Don't go past text end
        
        # Search backwards from ideal position for sentence endings
        for i in range(ideal_position, max(0, ideal_position - search_range), -1):
            if i < len(text) and text[i] in '.!?':
                # Make sure it's not just an abbreviation
                if i + 1 < len(text) and text[i + 1] in ' \n':
                    return i + 1
        
        # If no sentence ending found, look for paragraph breaks
        for i in range(ideal_position, max(0, ideal_position - search_range), -1):
            if i < len(text) and text[i] == '\n':
                return i + 1
        
        # If nothing found, use the ideal position
        return ideal_position
    
    def create_chunks(self, text, source_info="Unknown"):
        """Split text into overlapping chunks with smart boundaries"""
        
        # Clean the text first
        text = self.clean_text(text)
        
        print(f"📝 Processing text: {len(text):,} characters")
        
        if len(text) <= self.chunk_size:
            # Text is small enough to be one chunk
            return [{
                'content': text,
                'chunk_id': 0,
                'source': source_info,
                'char_start': 0,
                'char_end': len(text),
                'char_count': len(text),
                'word_count': len(text.split())
            }]
        
        chunks = []
        start_pos = 0
        chunk_id = 0
        
        print(f"📦 Starting chunking process...")
        
        while start_pos < len(text):
            # Calculate ideal end position
            ideal_end = start_pos + self.chunk_size
            
            if ideal_end >= len(text):
                # Last chunk - take remaining text
                end_pos = len(text)
            else:
                # Find smart boundary
                end_pos = self.find_sentence_boundary(text, ideal_end)
            
            # Extract chunk content
            chunk_content = text[start_pos:end_pos].strip()
            
            if chunk_content:  # Only add non-empty chunks
                chunk = {
                    'content': chunk_content,
                    'chunk_id': chunk_id,
                    'source': source_info,
                    'char_start': start_pos,
                    'char_end': end_pos,
                    'char_count': len(chunk_content),
                    'word_count': len(chunk_content.split())
                }
                chunks.append(chunk)
                chunk_id += 1
                
                # Show progress for first few chunks
                if chunk_id <= 3:
                    print(f"   📦 Chunk {chunk_id}: {start_pos}-{end_pos} ({len(chunk_content)} chars)")
            
            # FIXED: Calculate next starting position with proper overlap
            next_start = end_pos - self.chunk_overlap
            
            # CRITICAL FIX: Ensure meaningful progress to prevent tiny chunks
            min_progress = max(self.chunk_size - self.chunk_overlap - 50, 200)  # Minimum meaningful step
            
            if next_start <= start_pos:
                # If overlap is too big, make reasonable progress
                next_start = start_pos + min_progress
            elif (next_start - start_pos) < min_progress:
                # If we're not making enough progress, force a bigger step
                next_start = start_pos + min_progress
            
            # Safety: if we're near the end, just finish
            if next_start >= len(text) - 50:
                break
                
            start_pos = next_start
            
            # Safety check to prevent infinite loops
            if chunk_id > 100:  # Reasonable limit for any document
                print(f"⚠️  Safety limit reached at {chunk_id} chunks")
                break
        
        print(f"✅ Chunking complete: {len(chunks)} chunks created")
        return chunks
    
    def analyze_chunks(self, chunks):
        """Analyze chunk statistics"""
        if not chunks:
            return {}
        
        char_counts = [chunk['char_count'] for chunk in chunks]
        word_counts = [chunk['word_count'] for chunk in chunks]
        
        return {
            'total_chunks': len(chunks),
            'avg_chars': sum(char_counts) / len(char_counts),
            'min_chars': min(char_counts),
            'max_chars': max(char_counts),
            'avg_words': sum(word_counts) / len(word_counts),
            'total_chars': sum(char_counts),
            'total_words': sum(word_counts)
        }

# Test the FIXED chunking system
print("\n🧪 Testing FIXED Text Chunking System...")

# Create chunker instance
chunker = SmartTextChunker(config)

# Test with sample Azure networking text
sample_text = """
Azure Virtual Networks (VNets) provide the foundation for your private network in Azure. VNets enable Azure resources, like Azure Virtual Machines (VMs), to securely communicate with each other, the internet, and on-premises networks.

A VNet is similar to a traditional network that you'd operate in your own data center. But it brings with it additional benefits of Azure's infrastructure such as scale, availability, and isolation.

Key concepts for VNets include address space, subnets, regions, and subscriptions. The address space is a set of private and public IP addresses that you can use within the VNet. You can divide the address space into multiple subnets and allocate a portion of the VNet's address space to each subnet.

Network Security Groups (NSGs) contain security rules that allow or deny inbound or outbound network traffic to several types of Azure resources. For each rule, you can specify source and destination, port, and protocol.

Azure Load Balancer operates at layer 4 of the Open Systems Interconnection (OSI) model. It's the single point of contact for clients. Load Balancer distributes inbound flows that arrive at the load balancer's front end to backend pool instances.
"""

print(f"\n📝 Sample text length: {len(sample_text):,} characters")

# Create chunks with FIXED algorithm
chunks = chunker.create_chunks(sample_text, source_info="Azure VNet Study Guide")

# Analyze results
stats = chunker.analyze_chunks(chunks)

print(f"\n📊 FIXED Chunking Results:")
print(f"   📦 Total chunks created: {stats['total_chunks']} (was 202!)")
print(f"   📏 Average chunk size: {stats['avg_chars']:.0f} characters")
print(f"   📄 Average words per chunk: {stats['avg_words']:.0f} words")
print(f"   📐 Size range: {stats['min_chars']}-{stats['max_chars']} characters")

# Show the chunks
print(f"\n🔍 Generated Chunks:")
for i, chunk in enumerate(chunks):
    print(f"\n📦 Chunk {chunk['chunk_id']}:")
    print(f"   📏 Length: {chunk['char_count']} chars, {chunk['word_count']} words")
    print(f"   📍 Position: {chunk['char_start']}-{chunk['char_end']}")
    preview = chunk['content'][:150] + "..." if len(chunk['content']) > 150 else chunk['content']
    print(f"   📝 Preview: {preview}")
    
    # Show overlap with next chunk
    if i < len(chunks) - 1:
        next_chunk = chunks[i + 1]
        overlap_start = max(chunk['char_start'], next_chunk['char_start'] - config.CHUNK_OVERLAP)
        overlap_end = min(chunk['char_end'], next_chunk['char_start'] + config.CHUNK_OVERLAP)
        if overlap_end > overlap_start:
            print(f"   🔄 Overlap with next chunk: {overlap_end - overlap_start} characters")

# Test with real PDF content if available
if 'text_content' in globals():
    print(f"\n🧪 Testing with REAL Azure PDF content...")
    print(f"📄 PDF content: {len(text_content):,} characters")
    
    # Create chunks from real PDF
    real_chunks = chunker.create_chunks(text_content, source_info="Azure VNet Study Guide PDF")
    real_stats = chunker.analyze_chunks(real_chunks)
    
    print(f"\n📊 Real PDF Chunking Results:")
    print(f"   📦 Total chunks: {real_stats['total_chunks']}")
    print(f"   📏 Average size: {real_stats['avg_chars']:.0f} characters")
    print(f"   📄 Average words: {real_stats['avg_words']:.0f} words")
    print(f"   📐 Size range: {real_stats['min_chars']}-{real_stats['max_chars']} characters")
    
    # Expected vs actual
    expected_chunks = len(text_content) // config.CHUNK_SIZE
    print(f"\n🎯 Chunking Analysis:")
    print(f"   📈 Expected chunks: ~{expected_chunks}")
    print(f"   ✅ Actual chunks: {real_stats['total_chunks']}")
    print(f"   📊 Efficiency: {expected_chunks/real_stats['total_chunks']*100:.1f}% of expected")

print(f"\n📊 BEFORE vs AFTER:")
print(f"   🔴 BROKEN: 202 tiny chunks, avg 106 chars")
print(f"   ✅ FIXED:  {stats['total_chunks']} proper chunks, avg {stats['avg_chars']:.0f} chars")

print(f"\n✅ Text Chunking System FIXED and Ready!")
print(f"🎯 Next: Test with real PDF content!")

📦 Building Text Chunking System...

🧪 Testing FIXED Text Chunking System...
🎯 Chunker initialized:
   📏 Chunk size: 1000 characters
   🔄 Overlap: 200 characters (20.0%)

📝 Sample text length: 1,208 characters
📝 Processing text: 1,202 characters
📦 Starting chunking process...
   📦 Chunk 1: 0-955 (955 chars)
   📦 Chunk 2: 755-1202 (447 chars)
✅ Chunking complete: 2 chunks created

📊 FIXED Chunking Results:
   📦 Total chunks created: 2 (was 202!)
   📏 Average chunk size: 701 characters
   📄 Average words per chunk: 110 words
   📐 Size range: 447-955 characters

🔍 Generated Chunks:

📦 Chunk 0:
   📏 Length: 955 chars, 150 words
   📍 Position: 0-955
   📝 Preview: Azure Virtual Networks (VNets) provide the foundation for your private network in Azure. VNets enable Azure resources, like Azure Virtual Machines (VM...
   🔄 Overlap with next chunk: 400 characters

📦 Chunk 1:
   📏 Length: 447 chars, 71 words
   📍 Position: 755-1202
   📝 Preview: ups (NSGs) contain security rules that allow or deny

**Basic Text Search System Overview**

* **Defines a `BasicTextSearcher` class** that ingests text “chunks” and builds an inverted index mapping keywords to their source passages.
* **Tokenizes content** using regular expressions and avoids duplicate entries in the index for each chunk.
* **Implements a relevance scoring** method combining term frequency, exact-phrase matches, and domain-specific bonuses for Azure-related terms.
* **Provides a `search()` method** that retrieves, scores, and ranks matching chunks based on the query, returning the top results.
* **Includes methods for adding new chunks** (`add_chunks`) and rebuilding the index incrementally.
* **Offers a `get_search_statistics()` method** to report total chunks, indexed terms, average/min/max chunk sizes, source distribution, and a rough memory estimate.





In [542]:


# Cell: Basic Text Search System
print("🔍 Building Basic Text Search System...")

import re
from collections import Counter
from typing import List, Dict, Tuple

class BasicTextSearcher:
    """Simple but effective text search for Azure RAG foundation"""
    
    def __init__(self, config):
        """Initialize with configuration"""
        self.config = config
        self.chunks = []  # Will store our text chunks
        self.search_index = {}  # Simple keyword index
        
        print(f"🎯 Search system initialized")
        print(f"   🔍 Ready to index chunks")
        print(f"   📊 Will track search statistics")
    
    def add_chunks(self, chunks: List[Dict]):
        """Add chunks to our search system"""
        self.chunks.extend(chunks)
        self._build_search_index(chunks)
        
        print(f"📚 Added {len(chunks)} chunks to search index")
        print(f"   📦 Total chunks in system: {len(self.chunks)}")
        print(f"   🔑 Index contains {len(self.search_index)} unique terms")
    
    def _build_search_index(self, chunks: List[Dict]):
        """Build a simple keyword index for fast searching"""
        for chunk in chunks:
            chunk_id = chunk['chunk_id']
            text = chunk['content'].lower()
            
            # Extract words (simple tokenization)
            words = re.findall(r'\b\w+\b', text)
            
            # Add to inverted index
            for word in words:
                if word not in self.search_index:
                    self.search_index[word] = []
                
                # Only add if not already there (avoid duplicates)
                if chunk_id not in [item['chunk_id'] for item in self.search_index[word]]:
                    self.search_index[word].append({
                        'chunk_id': chunk_id,
                        'source': chunk['source']
                    })
    
    def _calculate_relevance_score(self, chunk: Dict, query_terms: List[str]) -> float:
        """Calculate how relevant a chunk is to the query"""
        content = chunk['content'].lower()
        score = 0.0
        
        # Count term matches
        for term in query_terms:
            term_count = content.count(term.lower())
            if term_count > 0:
                # Term frequency score (more mentions = higher score)
                tf_score = term_count / len(content.split())
                score += tf_score
                
                # Bonus for exact phrase matches
                if len(query_terms) > 1 and ' '.join(query_terms).lower() in content:
                    score += 0.5
        
        # Bonus for Azure-specific terms (domain relevance)
        azure_terms = ['azure', 'vnet', 'subnet', 'nsg', 'load balancer', 'network']
        azure_bonus = sum(1 for term in azure_terms if term in content) * 0.1
        score += azure_bonus
        
        return score
    
    def search(self, query: str, max_results: int = 5) -> List[Dict]:
        """Search for chunks matching the query"""
        if not query.strip():
            return []
        
        print(f"\n🔍 Searching for: '{query}'")
        
        # Prepare query terms
        query_terms = re.findall(r'\b\w+\b', query.lower())
        print(f"   📝 Query terms: {query_terms}")
        
        # Find candidate chunks
        candidate_chunks = set()
        
        for term in query_terms:
            if term in self.search_index:
                for item in self.search_index[term]:
                    candidate_chunks.add(item['chunk_id'])
        
        print(f"   📦 Found {len(candidate_chunks)} candidate chunks")
        
        if not candidate_chunks:
            print(f"   ❌ No matches found")
            return []
        
        # Score and rank chunks
        scored_chunks = []
        
        for chunk_id in candidate_chunks:
            # Find the actual chunk
            chunk = next((c for c in self.chunks if c['chunk_id'] == chunk_id), None)
            if chunk:
                score = self._calculate_relevance_score(chunk, query_terms)
                if score > 0:
                    scored_chunks.append({
                        'chunk': chunk,
                        'score': score,
                        'matched_terms': [term for term in query_terms 
                                        if term in chunk['content'].lower()]
                    })
        
        # Sort by relevance score (highest first)
        scored_chunks.sort(key=lambda x: x['score'], reverse=True)
        
        # Return top results
        results = scored_chunks[:max_results]
        
        print(f"   ✅ Returning top {len(results)} results")
        for i, result in enumerate(results, 1):
            print(f"      {i}. Score: {result['score']:.3f} | Terms: {result['matched_terms']}")
        
        return results
    
    def get_search_statistics(self) -> Dict:
        """Get statistics about the search system"""
        if not self.chunks:
            return {'status': 'empty'}
        
        # Calculate statistics
        chunk_sizes = [len(chunk['content']) for chunk in self.chunks]
        sources = [chunk['source'] for chunk in self.chunks]
        source_counts = Counter(sources)
        
        return {
            'total_chunks': len(self.chunks),
            'total_terms_indexed': len(self.search_index),
            'avg_chunk_size': sum(chunk_sizes) / len(chunk_sizes),
            'min_chunk_size': min(chunk_sizes),
            'max_chunk_size': max(chunk_sizes),
            'sources': dict(source_counts),
            'memory_estimate_mb': (sum(chunk_sizes) * 4) / (1024 * 1024)  # Rough estimate
        }

# Test the search system
print("\n🧪 Testing Basic Search System...")

# Create search system
searcher = BasicTextSearcher(config)

# Create some test chunks (simulate chunked Azure content)
test_chunks = [
    {
        'content': 'Azure Virtual Networks (VNets) provide isolated networking environments in Azure. You can configure subnets, network security groups, and routing tables.',
        'chunk_id': 0,
        'source': 'Azure VNet Guide',
        'char_start': 0,
        'char_end': 150,
        'char_count': 150,
        'word_count': 25
    },
    {
        'content': 'Network Security Groups (NSGs) contain security rules that control inbound and outbound traffic to Azure resources. Each rule specifies protocol, port, and direction.',
        'chunk_id': 1,
        'source': 'Azure Security Guide', 
        'char_start': 0,
        'char_end': 160,
        'char_count': 160,
        'word_count': 27
    },
    {
        'content': 'Azure Load Balancer distributes incoming network traffic across multiple virtual machines. It operates at Layer 4 and supports both TCP and UDP protocols.',
        'chunk_id': 2,
        'source': 'Azure Load Balancer Guide',
        'char_start': 0,
        'char_end': 155,
        'char_count': 155,
        'word_count': 26
    },
    {
        'content': 'Subnets allow you to segment your Azure virtual network into smaller networks. Each subnet must be assigned a portion of the VNet address space.',
        'chunk_id': 3,
        'source': 'Azure VNet Guide',
        'char_start': 151,
        'char_end': 280,
        'char_count': 129,
        'word_count': 24
    }
]

# Add chunks to searcher
searcher.add_chunks(test_chunks)

# Test different search queries
test_queries = [
    "network security",
    "load balancer",
    "Azure subnet",
    "virtual machine traffic",
    "NSG rules"
]

print(f"\n🎯 Testing Search Queries:")
for query in test_queries:
    results = searcher.search(query, max_results=3)
    
    if results:
        print(f"\n📊 Results for '{query}':")
        for i, result in enumerate(results, 1):
            chunk = result['chunk']
            preview = chunk['content'][:100] + "..." if len(chunk['content']) > 100 else chunk['content']
            print(f"   {i}. [{result['score']:.3f}] {chunk['source']}")
            print(f"      📝 {preview}")
            print(f"      🎯 Matched: {result['matched_terms']}")
    else:
        print(f"\n❌ No results for '{query}'")

# Show system statistics
print(f"\n📊 Search System Statistics:")
stats = searcher.get_search_statistics()
for key, value in stats.items():
    print(f"   📈 {key}: {value}")

print(f"\n✅ Basic Search System Ready!")
print(f"🎯 Next: Simple Storage System for persistence!")

🔍 Building Basic Text Search System...

🧪 Testing Basic Search System...
🎯 Search system initialized
   🔍 Ready to index chunks
   📊 Will track search statistics
📚 Added 4 chunks to search index
   📦 Total chunks in system: 4
   🔑 Index contains 68 unique terms

🎯 Testing Search Queries:

🔍 Searching for: 'network security'
   📝 Query terms: ['network', 'security']
   📦 Found 4 candidate chunks
   ✅ Returning top 3 results
      1. Score: 1.600 | Terms: ['network', 'security']
      2. Score: 1.430 | Terms: ['network', 'security']
      3. Score: 0.483 | Terms: ['network']

📊 Results for 'network security':
   1. [1.600] Azure VNet Guide
      📝 Azure Virtual Networks (VNets) provide isolated networking environments in Azure. You can configure ...
      🎯 Matched: ['network', 'security']
   2. [1.430] Azure Security Guide
      📝 Network Security Groups (NSGs) contain security rules that control inbound and outbound traffic to A...
      🎯 Matched: ['network', 'security']
   3. [0.483]

**Storage Cleanup and Simple Storage System Implementation**


**Key Features:**

* **`clean_test_storage()` helper**

  * Deletes all files matching `session_*` in the configured storage folder to ensure tests start with no leftover data.
  * Prints each deleted filename and a confirmation when done.

* **`SimpleStorageManager` class**

  * **Initialization**:

    * Reads `config.PROCESSED_FOLDER`, creates that directory if missing, and logs the storage location.
  * **Session ID & Hashing**:

    * `_generate_session_id()` → timestamp-based IDs (`session_YYYYMMDD_HHMMSS`).
    * `_calculate_content_hash()` → MD5 hash prefix of content for change detection.
  * **Chunk Persistence** (`save_chunks` / `load_chunks`):

    * `save_chunks()` serializes chunks + metadata to both JSON (human-readable) and pickle (fast-load), returning the session ID.
    * `load_chunks()` prefers pickle over JSON, loads chunks and metadata, and logs counts and timestamps.
  * **Search-Index Persistence** (`save_search_index` / `load_search_index`):

    * `save_search_index()` pickles the inverted index and stats alongside chunk metadata.
    * `load_search_index()` retrieves the index pickle and returns it with its metadata.
  * **Session Management** (`list_saved_sessions` / `cleanup_old_sessions`):

    * `list_saved_sessions()` scans for `*_chunks.json`, reads metadata, and reports which sessions have pickle/index files.
    * `cleanup_old_sessions(keep_latest)` sorts sessions by creation time, deletes all but the most recent N, and logs each removal.
  * **Test Harness** at the end exercises storage of test chunks, loading them back, integration with `BasicTextSearcher`, and session listing.



In [543]:


# Add this to the beginning of your storage test
def clean_test_storage():
    """Clean up all test files before creating new ones"""
    storage_path = Path(config.PROCESSED_FOLDER)
    test_files = list(storage_path.glob("session_*"))
    
    for file in test_files:
        file.unlink()
        print(f"🗑️ Deleted: {file.name}")
    
    print(f"✅ Storage cleaned - ready for fresh test")

# Use it like this:
clean_test_storage()
# Then run your normal storage test







# Cell: Simple Storage System
print("💾 Building Simple Storage System...")

import json
import pickle
from datetime import datetime
from pathlib import Path
import hashlib

class SimpleStorageManager:
    """Simple but robust storage for Azure RAG foundation"""
    
    def __init__(self, config):
        """Initialize with configuration"""
        self.config = config
        self.storage_path = Path(config.PROCESSED_FOLDER)
        self.storage_path.mkdir(parents=True, exist_ok=True)
        
        print(f"💾 Storage system initialized")
        print(f"   📁 Storage location: {self.storage_path}")
        print(f"   🔧 Ready to save/load chunks and search indices")
    
    def _generate_session_id(self):
        """Generate unique session ID"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"session_{timestamp}"
    
    def _calculate_content_hash(self, content):
        """Calculate hash of content for change detection"""
        return hashlib.md5(content.encode('utf-8')).hexdigest()[:8]
    
    def save_chunks(self, chunks, source_name="unknown", session_id=None):
        """Save chunks to storage with metadata"""
        if not chunks:
            print("⚠️  No chunks to save")
            return None
        
        # Generate session ID if not provided
        if session_id is None:
            session_id = self._generate_session_id()
        
        # Prepare storage data
        storage_data = {
            'metadata': {
                'session_id': session_id,
                'source_name': source_name,
                'created_at': datetime.now().isoformat(),
                'total_chunks': len(chunks),
                'chunk_size_config': self.config.CHUNK_SIZE,
                'chunk_overlap_config': self.config.CHUNK_OVERLAP,
                'total_characters': sum(chunk['char_count'] for chunk in chunks),
                'total_words': sum(chunk['word_count'] for chunk in chunks)
            },
            'chunks': chunks
        }
        
        # Save as JSON (human readable)
        json_file = self.storage_path / f"{session_id}_chunks.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(storage_data, f, indent=2, ensure_ascii=False)
        
        # Save as pickle (faster loading)
        pickle_file = self.storage_path / f"{session_id}_chunks.pkl"
        with open(pickle_file, 'wb') as f:
            pickle.dump(storage_data, f)
        
        print(f"✅ Saved {len(chunks)} chunks:")
        print(f"   📄 JSON: {json_file.name}")
        print(f"   🚀 Pickle: {pickle_file.name}")
        print(f"   🔖 Session ID: {session_id}")
        
        return session_id
    
    def load_chunks(self, session_id):
        """Load chunks from storage"""
        pickle_file = self.storage_path / f"{session_id}_chunks.pkl"
        json_file = self.storage_path / f"{session_id}_chunks.json"
        
        # Try pickle first (faster)
        if pickle_file.exists():
            print(f"📚 Loading from pickle: {pickle_file.name}")
            with open(pickle_file, 'rb') as f:
                data = pickle.load(f)
        elif json_file.exists():
            print(f"📚 Loading from JSON: {json_file.name}")
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        else:
            print(f"❌ Session not found: {session_id}")
            return None, None
        
        chunks = data['chunks']
        metadata = data['metadata']
        
        print(f"✅ Loaded {len(chunks)} chunks from {metadata['source_name']}")
        print(f"   📅 Created: {metadata['created_at']}")
        print(f"   📊 Total chars: {metadata['total_characters']:,}")
        print(f"   📄 Total words: {metadata['total_words']:,}")
        
        return chunks, metadata
    
    def save_search_index(self, searcher, session_id):
        """Save search index and statistics"""
        if not hasattr(searcher, 'search_index'):
            print("⚠️  No search index to save")
            return
        
        # Prepare index data
        index_data = {
            'metadata': {
                'session_id': session_id,
                'created_at': datetime.now().isoformat(),
                'total_chunks_indexed': len(searcher.chunks),
                'total_terms': len(searcher.search_index),
                'statistics': searcher.get_search_statistics()
            },
            'search_index': searcher.search_index,
            'chunks_metadata': [{
                'chunk_id': chunk['chunk_id'],
                'source': chunk['source'],
                'char_count': chunk['char_count']
            } for chunk in searcher.chunks]
        }
        
        # Save index
        index_file = self.storage_path / f"{session_id}_search_index.pkl"
        with open(index_file, 'wb') as f:
            pickle.dump(index_data, f)
        
        print(f"✅ Saved search index:")
        print(f"   🔍 File: {index_file.name}")
        print(f"   📊 Terms indexed: {len(searcher.search_index):,}")
        print(f"   📦 Chunks indexed: {len(searcher.chunks)}")
    
    def load_search_index(self, session_id):
        """Load search index"""
        index_file = self.storage_path / f"{session_id}_search_index.pkl"
        
        if not index_file.exists():
            print(f"❌ Search index not found: {session_id}")
            return None
        
        with open(index_file, 'rb') as f:
            index_data = pickle.load(f)
        
        metadata = index_data['metadata']
        search_index = index_data['search_index']
        
        print(f"✅ Loaded search index:")
        print(f"   📅 Created: {metadata['created_at']}")
        print(f"   🔍 Terms: {len(search_index):,}")
        print(f"   📦 Chunks: {metadata['total_chunks_indexed']}")
        
        return search_index, metadata
    
    def list_saved_sessions(self):
        """List all saved sessions"""
        json_files = list(self.storage_path.glob("*_chunks.json"))
        
        if not json_files:
            print("📭 No saved sessions found")
            return []
        
        sessions = []
        print(f"📚 Found {len(json_files)} saved sessions:")
        
        for json_file in sorted(json_files):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                metadata = data['metadata']
                session_id = metadata['session_id']
                
                # Check for corresponding files
                has_pickle = (self.storage_path / f"{session_id}_chunks.pkl").exists()
                has_index = (self.storage_path / f"{session_id}_search_index.pkl").exists()
                
                session_info = {
                    'session_id': session_id,
                    'source_name': metadata['source_name'],
                    'created_at': metadata['created_at'],
                    'total_chunks': metadata['total_chunks'],
                    'has_pickle': has_pickle,
                    'has_search_index': has_index
                }
                
                sessions.append(session_info)
                
                print(f"   📄 {session_id}")
                print(f"      📚 Source: {metadata['source_name']}")
                print(f"      📅 Created: {metadata['created_at']}")
                print(f"      📦 Chunks: {metadata['total_chunks']}")
                print(f"      🚀 Pickle: {'✅' if has_pickle else '❌'}")
                print(f"      🔍 Index: {'✅' if has_index else '❌'}")
                
            except Exception as e:
                print(f"   ⚠️  Error reading {json_file.name}: {e}")
        
        return sessions
    
    def cleanup_old_sessions(self, keep_latest=5):
        """Keep only the latest N sessions"""
        sessions = self.list_saved_sessions()
        
        if len(sessions) <= keep_latest:
            print(f"📚 Only {len(sessions)} sessions found, keeping all")
            return
        
        # Sort by creation time and keep latest
        sessions.sort(key=lambda x: x['created_at'], reverse=True)
        sessions_to_delete = sessions[keep_latest:]
        
        print(f"🧹 Cleaning up {len(sessions_to_delete)} old sessions...")
        
        for session in sessions_to_delete:
            session_id = session['session_id']
            
            # Delete all files for this session
            files_to_delete = [
                f"{session_id}_chunks.json",
                f"{session_id}_chunks.pkl", 
                f"{session_id}_search_index.pkl"
            ]
            
            for filename in files_to_delete:
                file_path = self.storage_path / filename
                if file_path.exists():
                    file_path.unlink()
                    print(f"   🗑️  Deleted: {filename}")
        
        print(f"✅ Cleanup complete, kept {keep_latest} latest sessions")

# Test the storage system
print("\n🧪 Testing Simple Storage System...")

# Create storage manager
storage = SimpleStorageManager(config)

# Test with our previous chunking example - create some test data
test_chunks_for_storage = [
    {
        'content': 'Azure Virtual Networks (VNets) provide the foundation for private networking in Azure.',
        'chunk_id': 0,
        'source': 'Azure VNet Documentation',
        'char_start': 0,
        'char_end': 85,
        'char_count': 85,
        'word_count': 14
    },
    {
        'content': 'Network Security Groups (NSGs) act as virtual firewalls for your Azure resources.',
        'chunk_id': 1,
        'source': 'Azure Security Documentation',
        'char_start': 0,
        'char_end': 82,
        'char_count': 82,
        'word_count': 13
    }
]

# Test saving chunks
print(f"\n💾 Testing chunk storage...")
session_id = storage.save_chunks(test_chunks_for_storage, "Test Azure Documentation")

# Test loading chunks
print(f"\n📚 Testing chunk loading...")
loaded_chunks, metadata = storage.load_chunks(session_id)

if loaded_chunks:
    print(f"✅ Successfully loaded {len(loaded_chunks)} chunks")
    
    # Test with search system
    print(f"\n🔍 Testing integration with search system...")
    test_searcher = BasicTextSearcher(config)
    test_searcher.add_chunks(loaded_chunks)
    
    # Save search index
    storage.save_search_index(test_searcher, session_id)
    
    # Test search
    results = test_searcher.search("Azure network")
    if results:
        print(f"✅ Search works with loaded chunks!")
        print(f"   🎯 Found {len(results)} results for 'Azure network'")

# List all sessions
print(f"\n📋 Listing all saved sessions...")
storage.list_saved_sessions()

print(f"\n✅ Simple Storage System Complete!")
print(f"🎯 Foundation notebook ready - all components working together!")

🗑️ Deleted: session_20250629_124156_search_index.pkl
🗑️ Deleted: session_20250629_124156_chunks.pkl
🗑️ Deleted: session_20250629_124156_chunks.json
✅ Storage cleaned - ready for fresh test
💾 Building Simple Storage System...

🧪 Testing Simple Storage System...
💾 Storage system initialized
   📁 Storage location: data/processed
   🔧 Ready to save/load chunks and search indices

💾 Testing chunk storage...
✅ Saved 2 chunks:
   📄 JSON: session_20250629_125835_chunks.json
   🚀 Pickle: session_20250629_125835_chunks.pkl
   🔖 Session ID: session_20250629_125835

📚 Testing chunk loading...
📚 Loading from pickle: session_20250629_125835_chunks.pkl
✅ Loaded 2 chunks from Test Azure Documentation
   📅 Created: 2025-06-29T12:58:35.646276
   📊 Total chars: 167
   📄 Total words: 27
✅ Successfully loaded 2 chunks

🔍 Testing integration with search system...
🎯 Search system initialized
   🔍 Ready to index chunks
   📊 Will track search statistics
📚 Added 2 chunks to search index
   📦 Total chunks in sys