# Python Modules Installation

In [None]:
!pip install chromadb sentence-transformers PyPDF2 openai tqdm pandas ragas datasets

# Setting up the OpenAI API Key

In [None]:
from google.colab import userdata
import openai

# Read the OpenAI key from the Colab secrets store
openai.api_key = userdata.get('OPENAI_API_KEY')

# Smoke test: send one minimal chat request; any exception raised by the
# call stops the cell before the success message is printed.
response = openai.chat.completions.create(
    max_tokens=5,
    messages=[dict(role="user", content="Hello")],
    model="gpt-3.5-turbo",
)
print("✅ OpenAI API working!")

# Uploading the 5 Internal Docs files

In [None]:
from google.colab import files
import os

print("📚 Upload your mainframe migration documentation (.md files):")
uploaded = files.upload()

# Persist each uploaded markdown file under /content and remember its path.
# `uploaded` maps file name -> raw bytes.
md_files = []
for filename, content in uploaded.items():
    # Compare the extension case-insensitively so e.g. README.MD is accepted,
    # and tell the user about anything that is ignored.
    if not filename.lower().endswith('.md'):
        print(f"⏭️  Skipping non-markdown file: {filename}")
        continue

    # A single badly encoded file should not abort the whole upload.
    try:
        text_content = content.decode('utf-8')
    except UnicodeDecodeError as decode_error:
        print(f"❌ {filename} is not valid UTF-8, skipping: {decode_error}")
        continue

    file_path = os.path.join("/content", filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text_content)
    md_files.append(file_path)
    print(f"✅ {filename}")

print(f"📊 Ready to process {len(md_files)} markdown files")

# Optional: Display the files that will be processed
if md_files:
    print("\n📋 Uploaded files:")
    for file_path in md_files:
        filename = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)
        print(f"  • {filename} ({file_size} bytes)")
else:
    print("⚠️  No markdown files were uploaded. Please upload .md files.")

# Sets up a ChromaDB vector database with OpenAI embeddings

In [None]:
import chromadb
import openai
import numpy as np
from typing import List

print("🚀 Setting up ChromaDB with OpenAI Embeddings")
print("=" * 60)

# ==========================================
# CHROMADB SETUP
# ==========================================

# Simple in-memory vector database
client = chromadb.Client()

# Collection name
collection_name = "aws_docs"

# Drop any collection left over from a previous run so the notebook always
# starts from an empty store.
try:
    existing_collections = client.list_collections()
    # NOTE(review): depending on the installed Chroma release, list_collections()
    # appears to yield either Collection objects or plain name strings; accept
    # both so the stale collection is actually found and deleted.
    collection_names = [getattr(col, "name", col) for col in existing_collections]

    if collection_name in collection_names:
        print(f"🗑️ Deleting existing collection: {collection_name}")
        client.delete_collection(name=collection_name)
        print(f"✅ Collection '{collection_name}' deleted successfully")
    else:
        print(f"ℹ️ No existing collection named '{collection_name}' found")

except Exception as e:
    # Best effort: creation below still has its own fallback.
    print(f"⚠️ Error checking/deleting collection: {e}")

# Create new collection
try:
    collection = client.create_collection(name=collection_name)
    print(f"✅ Created new collection: {collection_name}")
except Exception as e:
    print(f"❌ Error creating collection: {e}")
    # Fallback: reuse the existing collection (it may still hold old data)
    try:
        collection = client.get_collection(name=collection_name)
        print(f"📁 Using existing collection: {collection_name}")
    except Exception as fallback_error:
        print(f"❌ Could not create or access collection: {fallback_error}")
        raise
# ==========================================
# OPENAI EMBEDDING MODEL SETUP
# ==========================================

class OpenAIEmbeddingModel:
    """Thin adapter exposing OpenAI embeddings through a sentence-transformers-like API.

    The constructor issues one probe request so that connection problems
    surface immediately and the embedding dimension is known up front.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        """Store the model name and probe the API once.

        Raises:
            Exception: whatever the probe request raised, re-raised after
                printing a hint about the API key.
        """
        self.model = model
        self.embedding_dimension = None

        # Probe request: validates connectivity and measures the vector size
        try:
            print(f"🔄 Testing OpenAI {model} connection...")
            probe = openai.embeddings.create(
                model=self.model,
                input=["test connection"]
            )
            first_vector = probe.data[0].embedding
            self.embedding_dimension = len(first_vector)
            print(f"✅ OpenAI {model} connected successfully!")
            print(f"📏 Embedding dimension: {self.embedding_dimension}")

        except Exception as e:
            print(f"❌ OpenAI connection failed: {e}")
            print("🔑 Make sure OPENAI_API_KEY is set in Colab secrets")
            raise

    def encode(self, texts, show_progress_bar=False, convert_to_numpy=True):
        """Embed one string or a list of strings.

        Args:
            texts: a single string (wrapped into a one-element list) or a list
                of strings.
            show_progress_bar: accepted for interface compatibility; unused.
            convert_to_numpy: return a numpy array when True, otherwise a
                plain list of vectors.

        Returns:
            The embeddings, or an empty array/list if the request failed
            (the error is printed, not raised).
        """
        batch = [texts] if isinstance(texts, str) else texts

        try:
            response = openai.embeddings.create(
                model=self.model,
                input=batch
            )

            # One vector per returned item, in response order
            vectors = [item.embedding for item in response.data]

            if not convert_to_numpy:
                return vectors
            return np.array(vectors)

        except Exception as e:
            print(f"❌ OpenAI embedding error: {e}")
            if convert_to_numpy:
                return np.array([])
            return []

    def get_sentence_embedding_dimension(self):
        """Return the dimension measured by the constructor's probe request."""
        return self.embedding_dimension

# Load OpenAI embedding model
# Prefer the OpenAI model; if it cannot be constructed, fall back to a local
# sentence-transformers model so the rest of the notebook can still run.
try:
    embedding_model = OpenAIEmbeddingModel("text-embedding-3-large")
except Exception as e:
    print(f"❌ Error loading OpenAI embedding model: {e}")
    print("🔄 Falling back to sentence-transformers model...")
    try:
        from sentence_transformers import SentenceTransformer
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as fallback_error:
        print(f"❌ Both OpenAI and sentence-transformers failed: {fallback_error}")
        raise
    else:
        print("✅ Fallback: Sentence-transformers model loaded")
else:
    print("✅ OpenAI embedding model loaded successfully")

# ==========================================
# EMBEDDING QUALITY TEST
# ==========================================

print("\n🧪 Testing embedding quality...")

# Probe pair: a question and a passage that should answer it

query = "What is AWS Transform?"
test_content = "AWS Transform automates COBOL to Java conversion with AI-powered code analysis and modernization."
try:
    # Embed both texts with the active model
    query_embedding = embedding_model.encode([query])
    content_embedding = embedding_model.encode([test_content])

    # Cosine similarity between the two single-row matrices
    from sklearn.metrics.pairwise import cosine_similarity
    similarity = cosine_similarity(query_embedding, content_embedding)[0][0]

    print("🎯 Embedding quality test:")
    print(f"   Query: '{query}'")
    print(f"   Similarity score: {similarity:.3f}")

    # First threshold the score strictly exceeds decides the verdict
    print(next(
        (
            verdict
            for threshold, verdict in (
                (0.8, "🎉 EXCELLENT! High-quality embeddings"),
                (0.7, "✅ GOOD! Strong semantic matching"),
                (0.5, "🟡 MODERATE: Acceptable but could be better"),
            )
            if similarity > threshold
        ),
        "❌ LOW: Embeddings may not work well for this use case",
    ))

except Exception as e:
    print(f"❌ Embedding test failed: {e}")

print("\n" + "=" * 60)
print("🎉 ChromaDB + OpenAI Embedding Setup Complete!")
print(f"📊 Collection: {collection_name}")
# Objects without a `model` attribute are reported as sentence-transformers
print(f"🤖 Model: {getattr(embedding_model, 'model', 'sentence-transformers')}")
print(f"📏 Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")
print("📝 Ready for improved document storage and retrieval")
print("=" * 60)

# Flag and recap of the variables later cells rely on
setup_complete = True
print("\n💾 Variables ready for next phases:")
print("   ✅ client (ChromaDB client)")
print(f"   ✅ collection ('{collection_name}' collection)")
print("   ✅ embedding_model (OpenAI or sentence-transformers)")
print(f"   ✅ setup_complete = {setup_complete}")

# Chunks markdown files into text segments for vector database storage.

In [None]:
# ==========================================
# Document Processing Pipeline
# ==========================================

import re
import tiktoken
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from tqdm import tqdm
import hashlib

print("🚀 Phase 1B: Document Processing Pipeline (Markdown)")
print("=" * 50)

# ==========================================
# DOCUMENT CHUNK DATA STRUCTURE
# ==========================================

@dataclass
class DocumentChunk:
    """One chunk of document text together with its bookkeeping metadata."""
    id: str
    content: str
    source_file: str
    page_number: int  # holds the section number for markdown sources
    chunk_index: int
    token_count: int
    char_count: int
    section_title: Optional[str] = None
    chunk_type: str = "content"  # content, header, code, etc.

    def to_dict(self) -> Dict:
        """Return the chunk as a plain dict; a missing section title becomes ""."""
        field_names = (
            'id',
            'content',
            'source_file',
            'page_number',
            'chunk_index',
            'token_count',
            'char_count',
            'section_title',
            'chunk_type',
        )
        payload = {name: getattr(self, name) for name in field_names}
        # Replace None (or any falsy title) with an empty string, keeping key order
        payload['section_title'] = self.section_title or ""
        return payload

# ==========================================
# MARKDOWN TEXT EXTRACTION
# ==========================================

class MarkdownProcessor:
    """Split Markdown files into header-delimited sections and normalise their text.

    Each section is returned as a dict with its number, header, raw text,
    cleaned text, character/token counts and header level.
    """

    # A line starting with one or more '#' is treated as a header
    _HEADER_RE = re.compile(r'^(#+)\s*(.+)$')
    # Fenced code block (group 1) or inline code span (group 2)
    _CODE_SPAN_RE = re.compile(r'```[^\n`]*\n(.*?)\n```|`([^`]+)`', re.DOTALL)

    def __init__(self):
        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # GPT tokenizer

    def extract_text_from_markdown(self, md_path: str) -> List[Dict]:
        """Read a Markdown file and return its sections.

        Args:
            md_path: path of a UTF-8 encoded Markdown file.

        Returns:
            A list of section dicts; an empty list if reading or parsing
            failed (the error is printed, not raised).
        """

        print(f"📄 Processing: {md_path}")

        try:
            with open(md_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Split markdown into sections based on headers
            sections_data = self._parse_markdown_sections(content)

            print(f"✅ Extracted {len(sections_data)} sections from {md_path}")
            return sections_data

        except Exception as e:
            print(f"❌ Error processing {md_path}: {e}")
            return []

    def _parse_markdown_sections(self, content: str) -> List[Dict]:
        """Split markdown text into sections, one per header line.

        Text before the first header forms an untitled section (header None,
        level 0). Sections whose cleaned text is empty are dropped. Lines
        inside fenced code blocks are never treated as headers, so shell or
        Python comments in examples stay with their section.
        """

        sections_data = []
        current_section = {
            'section_number': 1,
            'header': None,
            'content': '',
            'level': 0
        }
        section_counter = 1
        in_code_fence = False

        for line in content.split('\n'):
            stripped = line.lstrip()
            # An odd number of ``` markers on a fence line opens or closes a block
            if stripped.startswith('```') and stripped.count('```') % 2 == 1:
                in_code_fence = not in_code_fence

            header_match = None if in_code_fence else self._HEADER_RE.match(line)
            if header_match is None:
                current_section['content'] += line + '\n'
                continue

            finished = self._build_section(current_section)
            if finished is not None:
                sections_data.append(finished)
                # Keep numbers unique: an emitted untitled preamble owns number 1,
                # so the first header must not reuse it.
                section_counter = max(section_counter, finished['section_number'] + 1)

            current_section = {
                'section_number': section_counter,
                'header': header_match.group(2).strip(),
                'content': '',
                'level': len(header_match.group(1))
            }
            section_counter += 1

        # Don't forget the last section
        finished = self._build_section(current_section)
        if finished is not None:
            sections_data.append(finished)

        return sections_data

    def _build_section(self, section: Dict) -> Optional[Dict]:
        """Turn an accumulated section into its output dict, or None if it has no text."""
        raw_text = section['content']
        if not raw_text.strip():
            return None

        cleaned_content = self._clean_markdown_text(raw_text)
        if not cleaned_content.strip():
            return None

        return {
            'section_number': section['section_number'],
            'header': section['header'],
            'raw_text': raw_text,
            'cleaned_text': cleaned_content,
            'char_count': len(cleaned_content),
            'token_count': len(self.tokenizer.encode(cleaned_content)),
            'level': section['level']
        }

    def _clean_markdown_text(self, text: str) -> str:
        """Strip markdown syntax while keeping the readable text.

        Code (fenced blocks and inline spans) is unwrapped but otherwise left
        untouched, so operators such as ``*``, ``_`` and ``||`` inside code
        are not mistaken for markdown markup. Whitespace is normalised last.
        """

        # Remove horizontal rules first, while '***' / '---' lines are still intact
        text = re.sub(r'^[-=*]{3,}$', '', text, flags=re.MULTILINE)

        # Clean prose between code spans; pass code content through verbatim
        pieces = []
        cursor = 0
        for match in self._CODE_SPAN_RE.finditer(text):
            pieces.append(self._strip_inline_markup(text[cursor:match.start()]))
            fenced, inline = match.group(1), match.group(2)
            pieces.append(fenced if fenced is not None else inline)
            cursor = match.end()
        pieces.append(self._strip_inline_markup(text[cursor:]))
        text = ''.join(pieces)

        # Remove excessive whitespace
        text = re.sub(r'\n{3,}', '\n\n', text)  # Multiple newlines
        text = re.sub(r'[ \t]+', ' ', text)     # Multiple spaces/tabs

        return text.strip()

    @staticmethod
    def _strip_inline_markup(text: str) -> str:
        """Remove emphasis, link, image and table markup from non-code text."""

        # Emphasis markers must hug the text they wrap; this leaves bullet
        # markers and arithmetic like "2 * 3 * 4" alone.
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)                    # **bold**
        text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', text)           # *italic*
        # Underscore emphasis only counts at word boundaries, so identifiers
        # such as chunk_size or MAX_RETRY_COUNT keep their underscores.
        text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)  # __bold__
        text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', text)    # _italic_

        # Images before links: otherwise the link rule consumes "[alt](url)"
        # and leaves a stray "!" behind.
        text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)  # ![alt](url)
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)   # [text](url)

        # Make table separators more readable
        text = re.sub(r'\|', ' | ', text)

        return text

# ==========================================
# INTELLIGENT TEXT CHUNKING (Updated for Markdown)
# ==========================================

class SmartChunker:
    """Paragraph-based chunker with a token budget and sentence-level overlap.

    Args:
        chunk_size: maximum tokens per chunk when packing paragraphs. A single
            paragraph larger than this is still emitted as one chunk.
        overlap: token budget for the trailing sentences of the previous
            chunk that are repeated at the start of the next one.
        min_chunk_size: minimum length in characters for the final chunk of a
            section; a shorter tail is dropped.
    """

    def __init__(self, chunk_size: int = 800, overlap: int = 150, min_chunk_size: int = 100):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.min_chunk_size = min_chunk_size
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def chunk_documents(self, sections_data: List[Dict], source_file: str) -> List["DocumentChunk"]:
        """Chunk every section of one document.

        Args:
            sections_data: section dicts providing 'cleaned_text',
                'section_number' and 'header'.
            source_file: name recorded on every produced chunk.

        Returns:
            All chunks of the document; chunk indices run continuously across
            sections.
        """

        print(f"🔪 Chunking document: {source_file}")

        all_chunks = []
        chunk_counter = 0

        for section_data in tqdm(sections_data, desc="Processing sections"):
            section_chunks = self._create_semantic_chunks(
                text=section_data['cleaned_text'],
                section_title=section_data['header'],
                section_number=section_data['section_number'],
                source_file=source_file,
                start_chunk_idx=chunk_counter
            )

            all_chunks.extend(section_chunks)
            chunk_counter += len(section_chunks)

        print(f"✅ Created {len(all_chunks)} chunks")
        return all_chunks

    def _create_semantic_chunks(self, text: str, section_title: Optional[str],
                              section_number: int, source_file: str,
                              start_chunk_idx: int) -> List["DocumentChunk"]:
        """Pack the paragraphs of one section into chunks of at most chunk_size tokens.

        Paragraphs (split on blank lines) are appended to the current chunk
        until the next one would exceed the budget; the chunk is then emitted
        and the next chunk starts with overlap text taken from its end.
        """

        chunks = []

        # Split by paragraphs first (better semantic boundaries)
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        if not paragraphs:
            return chunks

        current_chunk = ""
        chunk_idx = start_chunk_idx

        for paragraph in paragraphs:
            # Check if adding this paragraph would exceed chunk size
            potential_chunk = current_chunk + "\n\n" + paragraph if current_chunk else paragraph
            token_count = len(self.tokenizer.encode(potential_chunk))

            if token_count <= self.chunk_size:
                current_chunk = potential_chunk
                continue

            # Current chunk is full: emit it and start a new one
            if current_chunk:
                chunks.append(self._create_chunk(
                    content=current_chunk,
                    source_file=source_file,
                    section_number=section_number,
                    chunk_index=chunk_idx,
                    section_title=section_title
                ))
                chunk_idx += 1

            # Carry the last sentences of the emitted chunk into the next one
            overlap_text = ""
            if chunks and self.overlap > 0:
                overlap_text = self._get_overlap_text(current_chunk, self.overlap)

            # No sentence may fit the overlap budget; avoid a chunk that
            # starts with a dangling blank line in that case.
            current_chunk = f"{overlap_text}\n\n{paragraph}" if overlap_text else paragraph

        # Add final chunk (tails shorter than min_chunk_size characters are dropped)
        if current_chunk and len(current_chunk.strip()) >= self.min_chunk_size:
            chunks.append(self._create_chunk(
                content=current_chunk,
                source_file=source_file,
                section_number=section_number,
                chunk_index=chunk_idx,
                section_title=section_title
            ))

        return chunks

    def _get_overlap_text(self, text: str, target_tokens: int) -> str:
        """Return the trailing whole sentences of `text` that fit in `target_tokens`.

        Sentences are split only at whitespace that follows '.', '!' or '?',
        so tokens like "Main.java" or "v1.2" stay intact and the original
        end punctuation is preserved. Returns "" if not even the last
        sentence fits.
        """

        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        selected = []
        token_count = 0

        # Walk backwards from the end, keeping sentences while they fit
        for sentence in reversed(sentences):
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_tokens = len(self.tokenizer.encode(sentence))
            if token_count + sentence_tokens > target_tokens:
                break

            selected.append(sentence)
            token_count += sentence_tokens

        return " ".join(reversed(selected))

    def _create_chunk(self, content: str, source_file: str, section_number: int,
                     chunk_index: int, section_title: Optional[str]) -> "DocumentChunk":
        """Build a DocumentChunk with an id, size metrics and a type derived from the title."""

        # 12 hex chars of an MD5 over file, index and the first 50 characters
        chunk_id = hashlib.md5(
            f"{source_file}_{chunk_index}_{content[:50]}".encode()
        ).hexdigest()[:12]

        # Calculate metrics
        token_count = len(self.tokenizer.encode(content))
        char_count = len(content)

        # Classify by keywords in the section title; first matching group wins
        chunk_type = "content"
        if section_title:
            title_lower = section_title.lower()
            if any(keyword in title_lower for keyword in ['troubleshoot', 'error', 'issue', 'problem']):
                chunk_type = "troubleshooting"
            elif any(keyword in title_lower for keyword in ['example', 'sample', 'demo']):
                chunk_type = "example"
            elif any(keyword in title_lower for keyword in ['standard', 'guideline', 'rule']):
                chunk_type = "standards"
            elif any(keyword in title_lower for keyword in ['architecture', 'decision', 'adr']):
                chunk_type = "architecture"
            elif any(keyword in title_lower for keyword in ['specification', 'spec', 'program']):
                chunk_type = "specification"

        return DocumentChunk(
            id=chunk_id,
            content=content,
            source_file=source_file,
            page_number=section_number,  # Using section number instead of page number
            chunk_index=chunk_index,
            token_count=token_count,
            char_count=char_count,
            section_title=section_title,
            chunk_type=chunk_type
        )

# ==========================================
# CHUNK QUALITY VALIDATION (Same as before)
# ==========================================

class ChunkValidator:
    """Heuristic quality checks used to drop low-value chunks."""

    @staticmethod
    def validate_chunk(chunk: "DocumentChunk") -> Tuple[bool, str]:
        """Check one chunk against the quality heuristics.

        Args:
            chunk: object exposing `content` (str) and `char_count` (int).

        Returns:
            (is_valid, reason); reason is "Valid" or the name of the first
            failed check.
        """

        content = chunk.content.strip()

        # Check minimum length. Whitespace-only content is rejected here too,
        # which also protects the ratio below from dividing by zero.
        if not content or chunk.char_count < 50:
            return False, "Too short"

        # Share of characters that are neither alphanumeric nor common punctuation
        special_char_ratio = sum(1 for c in content if not c.isalnum() and c not in ' .,!?;:-\n()[]{}') / len(content)
        if special_char_ratio > 0.4:  # Slightly higher threshold for markdown
            return False, "Too many special characters"

        # Check for reasonable word count
        words = content.split()
        if len(words) < 10:
            return False, "Too few words"

        # Check for excessive repetition (fewer than 30% distinct words)
        unique_words = set(words)
        if len(unique_words) / len(words) < 0.3:
            return False, "Too repetitive"

        # Long runs of dots, tabs or underscores look like a table of contents
        if re.search(r'\.{10,}|\t{5,}|_{10,}', content):
            return False, "Looks like table of contents"

        return True, "Valid"

    @staticmethod
    def filter_chunks(chunks: List["DocumentChunk"]) -> Tuple[List["DocumentChunk"], Dict[str, int]]:
        """Split chunks into the valid ones and a per-reason rejection count.

        Returns:
            (valid_chunks, rejection_stats) where rejection_stats maps each
            rejection reason to the number of chunks rejected for it.
        """

        valid_chunks = []
        rejection_stats = {}

        for chunk in chunks:
            is_valid, reason = ChunkValidator.validate_chunk(chunk)

            if is_valid:
                valid_chunks.append(chunk)
            else:
                rejection_stats[reason] = rejection_stats.get(reason, 0) + 1

        return valid_chunks, rejection_stats

# ==========================================
# MAIN PROCESSING PIPELINE (Updated for Markdown)
# ==========================================

def process_uploaded_markdowns(md_files: List[str]) -> List[DocumentChunk]:
    """Run extraction, chunking and validation over every Markdown file.

    Args:
        md_files: paths of the Markdown files to process.

    Returns:
        All chunks that passed validation, in file order. Progress, a
        summary and up to three sample chunks are printed along the way.
    """

    divider = "=" * 50

    print("\n🔄 Starting Markdown Document Processing Pipeline")
    print(divider)

    # Pipeline stages
    md_processor = MarkdownProcessor()
    chunker = SmartChunker(chunk_size=800, overlap=150)

    accepted_chunks = []
    per_file_stats = {}

    for md_path in md_files:
        filename = md_path.rsplit('/', 1)[-1]
        print(f"\n📚 Processing: {filename}")

        # Stage 1: sections
        sections_data = md_processor.extract_text_from_markdown(md_path)
        if not sections_data:
            print(f"⚠️ No text extracted from {filename}")
            continue

        # Stage 2: chunks, Stage 3: validation
        chunks = chunker.chunk_documents(sections_data, filename)
        valid_chunks, rejection_stats = ChunkValidator.filter_chunks(chunks)

        n_total = len(chunks)
        n_valid = len(valid_chunks)
        per_file_stats[filename] = {
            'sections': len(sections_data),
            'total_chunks': n_total,
            'valid_chunks': n_valid,
            'rejected': n_total - n_valid,
            'rejection_reasons': rejection_stats
        }

        accepted_chunks.extend(valid_chunks)

        print(f"✅ {filename}: {n_valid} valid chunks from {n_total} total")

    # Overall summary
    print("\n📊 PROCESSING SUMMARY")
    print(divider)

    total_sections = 0
    total_rejected = 0
    for file_stats in per_file_stats.values():
        total_sections += file_stats['sections']
        total_rejected += file_stats['rejected']
    total_valid_chunks = len(accepted_chunks)

    print(f"📄 Total sections processed: {total_sections}")
    print(f"✅ Valid chunks created: {total_valid_chunks}")
    print(f"❌ Chunks rejected: {total_rejected}")

    if total_rejected > 0:
        print("\n🔍 Rejection reasons:")
        # Merge the per-file reason counts, keeping first-seen order
        merged_reasons = {}
        for file_stats in per_file_stats.values():
            for reason, count in file_stats['rejection_reasons'].items():
                merged_reasons[reason] = merged_reasons.get(reason, 0) + count

        for reason, count in merged_reasons.items():
            print(f"  • {reason}: {count}")

    # Show the first few chunks for a quick manual check
    if accepted_chunks:
        print("\n📋 Sample chunks:")
        for position, chunk in enumerate(accepted_chunks[:3], start=1):
            print(f"\nChunk {position} (ID: {chunk.id}):")
            print(f"Source: {chunk.source_file}, Section: {chunk.page_number}")
            print(f"Section Title: {chunk.section_title or 'N/A'}")
            print(f"Content preview: {chunk.content[:200]}...")
            print(f"Tokens: {chunk.token_count}, Type: {chunk.chunk_type}")

    print(divider)
    print("🎉 Markdown document processing complete! Ready for Phase 1C (Embedding & Storage)")

    return accepted_chunks

# ==========================================
# EXECUTION (Updated for Markdown)
# ==========================================

# Process the uploaded Markdown files (assuming md_files variable exists from Phase 1A)
# Only a missing md_files variable should produce the "run Phase 1A" hint.
# Checking for it explicitly (instead of wrapping the whole pipeline call in
# `except NameError`) lets a NameError raised inside the pipeline surface
# as the real error it is.
if 'md_files' not in globals():
    print("❌ md_files not found. Please run Phase 1A first to upload Markdown files.")
    print("📋 Expected variable: md_files (list of Markdown file paths)")

    # For testing, you can manually set:
    # md_files = ['/content/your_migration_guide.md']  # Add your MD paths here
    # processed_chunks = process_uploaded_markdowns(md_files)
else:
    processed_chunks = process_uploaded_markdowns(md_files)
    print(f"\n✅ Phase 1B Complete: {len(processed_chunks)} chunks ready for embedding")

    # Store for next phase
    phase1b_output = {
        'chunks': processed_chunks,
        'total_chunks': len(processed_chunks),
        'ready_for_embedding': True
    }

# Generating embeddings and storing in vector database

In [None]:
import numpy as np
from typing import List, Dict, Any, Optional
import time
from tqdm import tqdm

print("🧠 Generating embeddings and storing in vector database...")

# ==========================================
# EMBEDDING GENERATION
# ==========================================

class EmbeddingGenerator:
    """Generate embeddings for document chunks in batches.

    Args:
        model: object with an `encode(texts, show_progress_bar=...,
            convert_to_numpy=...)` method returning one vector per text.
        batch_size: number of chunk texts sent to `encode` per call.
    """

    def __init__(self, model, batch_size: int = 32):
        self.model = model
        self.batch_size = batch_size
        self.embedding_cache = {}

    def generate_embeddings(self, chunks: List["DocumentChunk"]) -> Dict[str, np.ndarray]:
        """Embed all chunks and return a mapping of chunk id -> vector.

        A batch for which the model returns a different number of vectors
        than texts (for example an empty result after a failed request) is
        reported and skipped as a whole, so vectors are never paired with
        the wrong chunk id. Skipped chunks are simply absent from the result.
        """

        print(f"🔄 Generating embeddings for {len(chunks)} chunks...")

        # Embedding input is the chunk text enriched with context lines
        texts = [self._prepare_embedding_text(chunk) for chunk in chunks]
        chunk_ids = [chunk.id for chunk in chunks]

        all_embeddings = {}

        for i in tqdm(range(0, len(texts), self.batch_size), desc="Generating embeddings"):
            batch_texts = texts[i:i + self.batch_size]
            batch_ids = chunk_ids[i:i + self.batch_size]

            batch_embeddings = self.model.encode(
                batch_texts,
                show_progress_bar=False,
                convert_to_numpy=True
            )

            # zip() would silently truncate on a short result; make it visible
            if len(batch_embeddings) != len(batch_texts):
                print(
                    f"⚠️ Batch starting at chunk {i}: expected {len(batch_texts)} "
                    f"embeddings, got {len(batch_embeddings)}; skipping this batch"
                )
                continue

            for chunk_id, embedding in zip(batch_ids, batch_embeddings):
                all_embeddings[chunk_id] = embedding

        print(f"✅ Generated {len(all_embeddings)} embeddings")
        return all_embeddings

    def _prepare_embedding_text(self, chunk: "DocumentChunk") -> str:
        """Build the text that is embedded for a chunk.

        Layout: a "Source:" line (with the chunk type when it is not
        "content"), a "Domain:" line (with a category guessed from the file
        name), a blank line, an optional "Section:" line, then the content.
        """

        # Start with main content
        embedding_text = chunk.content

        # Add section context if available
        if chunk.section_title:
            embedding_text = f"Section: {chunk.section_title}\n\n{embedding_text}"

        source_context = f"Source: {chunk.source_file}"

        # Add chunk type for better retrieval
        if chunk.chunk_type != "content":
            source_context += f" (Type: {chunk.chunk_type})"

        # Add migration domain context
        migration_context = "Domain: Mainframe to Java Migration"

        # Determine document category from filename; first matching rule wins
        filename_lower = chunk.source_file.lower()
        if 'playbook' in filename_lower or 'migration' in filename_lower:
            migration_context += " - Migration Guide"
        elif 'troubleshoot' in filename_lower:
            migration_context += " - Troubleshooting"
        elif 'standard' in filename_lower or 'java' in filename_lower:
            migration_context += " - Development Standards"
        elif 'architecture' in filename_lower or 'adr' in filename_lower:
            migration_context += " - Architecture Decisions"
        elif 'spec' in filename_lower or 'cobol' in filename_lower:
            migration_context += " - Technical Specifications"

        return f"{source_context}\n{migration_context}\n\n{embedding_text}"

# ==========================================
# VECTOR DATABASE STORAGE
# ==========================================

class VectorStore:
    """Persist document chunks, their metadata and embeddings in a ChromaDB collection."""

    def __init__(self, collection):
        self.collection = collection
        self.stored_count = 0

    def store_chunks_with_embeddings(self, chunks: List[DocumentChunk],
                                   embeddings: Dict[str, np.ndarray]) -> bool:
        """Write every chunk that has an embedding to the collection in one call.

        Chunks whose id is missing from ``embeddings`` are left out.

        Args:
            chunks: Chunks to store.
            embeddings: Mapping of chunk id to its embedding vector.

        Returns:
            True when the add succeeded, False when any exception was raised
            (the error is printed, not propagated).
        """

        print(f"💾 Storing {len(chunks)} chunks in vector database...")

        try:
            # One (id, document, metadata, embedding) record per storable chunk
            records = [
                (
                    chunk.id,
                    chunk.content,
                    self._prepare_metadata(chunk),
                    embeddings[chunk.id].tolist(),
                )
                for chunk in tqdm(chunks, desc="Preparing for storage")
                if chunk.id in embeddings
            ]

            # ChromaDB expects parallel lists, so split the records column-wise
            self.collection.add(
                ids=[record[0] for record in records],
                documents=[record[1] for record in records],
                metadatas=[record[2] for record in records],
                embeddings=[record[3] for record in records]
            )

            self.stored_count = len(records)
            print(f"✅ Successfully stored {self.stored_count} chunks in vector database")
            return True

        except Exception as e:
            print(f"❌ Error storing chunks: {e}")
            return False

    def _prepare_metadata(self, chunk: DocumentChunk) -> Dict[str, Any]:
        """Build the metadata dict stored next to a chunk.

        Besides the chunk's own fields, the dict carries three derived
        values: ``doc_category`` (from filename keywords), ``has_code`` /
        ``has_error_info`` (keyword presence in the content) and
        ``migration_phase`` (first matching phase keyword group, else
        ``'unknown'``).
        """

        # Filename keyword groups -> category; first matching group wins
        category_rules = (
            (('playbook',), 'migration_guide'),
            (('troubleshoot',), 'troubleshooting'),
            (('standard', 'java'), 'development_standards'),
            (('architecture', 'adr'), 'architecture'),
            (('spec', 'cobol'), 'specification'),
        )
        # Content keyword groups -> migration phase; first matching group wins
        phase_rules = (
            (('planning', 'assessment', 'analysis'), 'planning'),
            (('conversion', 'transform', 'migrate'), 'execution'),
            (('test', 'validate', 'verify'), 'testing'),
            (('deploy', 'production', 'cutover'), 'deployment'),
        )

        def first_match(text, rules, fallback):
            """Return the value of the first rule with a keyword found in text."""
            for keywords, value in rules:
                if any(keyword in text for keyword in keywords):
                    return value
            return fallback

        metadata = {
            'source_file': chunk.source_file,
            'section_number': chunk.page_number,  # the section number is carried in page_number
            'chunk_index': chunk.chunk_index,
            'token_count': chunk.token_count,
            'char_count': chunk.char_count,
            'chunk_type': chunk.chunk_type
        }

        # The section title is optional and only stored when non-empty
        if chunk.section_title:
            metadata['section_title'] = chunk.section_title

        metadata['doc_category'] = first_match(
            chunk.source_file.lower(), category_rules, 'general'
        )

        # Keyword-based content indicators
        body = chunk.content.lower()
        metadata['has_code'] = any(
            marker in body for marker in ('java', 'cobol', 'sql', 'yaml', 'xml')
        )
        metadata['has_error_info'] = any(
            marker in body for marker in ('error', 'exception', 'failed', 'issue')
        )
        metadata['migration_phase'] = first_match(body, phase_rules, 'unknown')

        return metadata

    def get_collection_stats(self) -> Dict[str, Any]:
        """Summarise the collection: document count, name and sample metadata keys.

        Returns:
            A stats dict, or ``{'error': <message>}`` when the collection
            could not be queried.
        """

        try:
            total = self.collection.count()

            # Look at a single stored record to learn which metadata keys exist
            metadata_keys = []
            if total > 0:
                sample = self.collection.peek(limit=1)
                if sample['metadatas']:
                    first_metadata = sample['metadatas'][0]
                    if first_metadata:
                        metadata_keys = list(first_metadata.keys())

            return {
                'total_documents': total,
                'collection_name': self.collection.name,
                'sample_metadata_keys': metadata_keys,
                'storage_success': total > 0
            }

        except Exception as e:
            print(f"❌ Error getting collection stats: {e}")
            return {'error': str(e)}

# ==========================================
# RETRIEVAL SYSTEM
# ==========================================

class RetrievalSystem:
    """Semantic search over the migration collection, with query enrichment."""

    def __init__(self, collection, embedding_model, top_k: int = 5):
        self.collection = collection
        self.embedding_model = embedding_model
        self.top_k = top_k

    def search(self, query: str, top_k: Optional[int] = None, filter_metadata: Dict = None) -> List[Dict[str, Any]]:
        """Return the chunks most similar to ``query``.

        Args:
            query: Free-text question; it is enriched by ``_enhance_query``
                before being embedded.
            top_k: Number of results to request; a falsy value falls back
                to ``self.top_k``.
            filter_metadata: Optional ChromaDB ``where`` filter.

        Returns:
            One dict per hit with ``id``, ``content``, ``metadata``,
            ``similarity_score`` (``1 - distance / 2``, floored at 0) and
            ``distance``. Any error is printed and yields an empty list.
        """

        limit = top_k or self.top_k

        try:
            # Embed the context-enriched query
            query_vector = self.embedding_model.encode([self._enhance_query(query)])

            query_kwargs = dict(
                query_embeddings=query_vector.tolist(),
                n_results=limit,
                include=['documents', 'metadatas', 'distances'],
            )
            # Metadata filtering is opt-in
            if filter_metadata:
                query_kwargs['where'] = filter_metadata

            raw = self.collection.query(**query_kwargs)

            # Results come back as one list per query; only one query is sent
            documents = raw['documents']
            if not (documents and documents[0]):
                return []

            hits = []
            for position, document in enumerate(documents[0]):
                hit = {
                    'id': raw['ids'][0][position],
                    'content': document,
                    'metadata': raw['metadatas'][0][position],
                }
                distance = raw['distances'][0][position]
                hit['similarity_score'] = max(0, 1 - (distance / 2))
                hit['distance'] = distance
                hits.append(hit)

            return hits

        except Exception as e:
            print(f"❌ Search error: {e}")
            return []

    def _enhance_query(self, query: str) -> str:
        """Prefix the query with a topic phrase chosen from its keywords."""

        # (keywords, prefix) pairs in priority order; first match wins
        topic_rules = (
            (('decimal', 'precision', 'comp-3'), "COBOL to Java decimal conversion"),
            (('batch', 'jcl'), "Mainframe batch to Spring Batch"),
            (('error', 'issue', 'problem'), "Migration troubleshooting"),
            (('spring', 'java'), "Java development standards migration"),
            (('aws', 'cloud'), "AWS mainframe modernization"),
        )

        lowered = query.lower()
        for keywords, prefix in topic_rules:
            if any(keyword in lowered for keyword in keywords):
                return f"{prefix}: {query}"

        return f"Mainframe to Java migration: {query}"

    def search_by_category(self, query: str, category: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Run ``search`` restricted to chunks whose ``doc_category`` equals ``category``."""

        category_filter = {'doc_category': category}
        return self.search(query=query, top_k=top_k, filter_metadata=category_filter)

    def test_retrieval(self, test_queries: List[str] = None) -> None:
        """Print the top three hits for each query (built-in samples by default)."""

        if not test_queries:
            test_queries = [
                "How do you convert COBOL COMP-3 fields to Java BigDecimal?",
                "What are the steps for AWS Transform migration?",
                "How to troubleshoot decimal precision errors?",
                "What Spring Boot patterns for financial applications?",
                "Why choose PostgreSQL over Oracle for migration?",
                "How to handle CICS transaction context in Java?",
                "What testing strategy for COBOL to Java migration?",
                "How to convert JCL batch jobs to Spring Batch?",
                "What are common overdraft calculation errors?",
                "How to fix connection timeout issues in Java?"
            ]

        print(f"\n🔍 Testing retrieval with {len(test_queries)} migration-specific queries...")

        for i, query in enumerate(test_queries, 1):
            print(f"\n--- Test Query {i}: '{query}' ---")

            hits = self.search(query, top_k=3)
            if not hits:
                print("  No results found")
                continue

            for j, hit in enumerate(hits, 1):
                info = hit['metadata']
                similarity = hit['similarity_score']
                source = info.get('source_file', 'Unknown')
                section = info.get('section_number', 'N/A')
                section_title = info.get('section_title', 'N/A')
                doc_category = info.get('doc_category', 'N/A')

                print(f"  {j}. Score: {similarity:.3f} | {source} (Section {section})")
                print(f"     Category: {doc_category} | Section: {section_title}")
                print(f"     Preview: {hit['content'][:100]}...")

    def test_category_search(self) -> None:
        """Print the top two hits for one sample query per document category."""

        print(f"\n🏷️ Testing category-specific searches...")

        category_tests = [
            ("troubleshooting", "How to fix decimal precision errors?"),
            ("development_standards", "What BigDecimal patterns should I use?"),
            ("architecture", "Why was PostgreSQL chosen over Oracle?"),
            ("specification", "What business rules does ACCVAL01 implement?"),
            ("migration_guide", "What were the performance improvements?")
        ]

        for category, query in category_tests:
            print(f"\n--- Category: {category} | Query: '{query}' ---")

            hits = self.search_by_category(query, category, top_k=2)
            if not hits:
                print("  No results found in this category")
                continue

            for j, hit in enumerate(hits, 1):
                similarity = hit['similarity_score']
                source = hit['metadata'].get('source_file', 'Unknown')
                print(f"  {j}. Score: {similarity:.3f} | {source}")
                print(f"     Preview: {hit['content'][:80]}...")

# ==========================================
# MAIN EXECUTION PIPELINE
# ==========================================

def setup_vector_database(chunks: List[DocumentChunk], embedding_model, collection):
    """Embed the chunks, store them, and return a ready-to-use retrieval system.

    Steps: generate embeddings, write chunks and embeddings to the
    collection, build a ``RetrievalSystem``, print collection statistics,
    then run the built-in retrieval smoke tests.

    Args:
        chunks: Processed document chunks to index.
        embedding_model: Model used for both chunk and query embeddings.
        collection: ChromaDB collection that receives the chunks.

    Returns:
        The ``RetrievalSystem`` on success, or ``None`` when storing failed.
    """

    print("🚀 Setting up migration knowledge vector database...")
    print("=" * 60)

    # Embed every chunk (batches of 32)
    generator = EmbeddingGenerator(embedding_model, batch_size=32)
    chunk_embeddings = generator.generate_embeddings(chunks)

    # Persist chunks with their embeddings; give up if that fails
    store = VectorStore(collection)
    if not store.store_chunks_with_embeddings(chunks, chunk_embeddings):
        print("❌ Failed to store chunks in vector database")
        return None

    retriever = RetrievalSystem(collection, embedding_model)

    # Report what ended up in the collection
    db_stats = store.get_collection_stats()
    metadata_fields = ', '.join(db_stats.get('sample_metadata_keys', []))

    print("\n📊 MIGRATION KNOWLEDGE BASE STATISTICS")
    print("=" * 40)
    print(f"Total documents stored: {db_stats.get('total_documents', 0)}")
    print(f"Collection name: {db_stats.get('collection_name', 'N/A')}")
    print(f"Metadata fields: {metadata_fields}")

    # Smoke-test general and category-restricted retrieval
    retriever.test_retrieval()
    retriever.test_category_search()

    print("\n" + "=" * 60)
    print("✅ Migration knowledge vector database setup complete!")
    print("🔍 Retrieval system ready for mainframe migration queries")
    print("🎯 Ready for agentic RAG pipeline with Tavily integration!")

    return retriever

# Build the vector database from objects created by earlier notebook cells.
# A NameError means one of those cells has not been run yet.
try:
    retrieval_system = setup_vector_database(
        chunks=processed_chunks,
        embedding_model=embedding_model,
        collection=collection,
    )

    # setup_vector_database returns None when storing the chunks failed
    if retrieval_system:
        print(f"\n🎉 Success! Migration knowledge base contains {collection.count()} documents")

        # Bundle everything the generation phase needs
        vector_db_setup = dict(
            retrieval_system=retrieval_system,
            collection=collection,
            embedding_model=embedding_model,
            total_documents=collection.count(),
            ready_for_generation=True,
        )

except NameError as missing:
    print(f"❌ Missing variable: {missing}")
    print(
        "📋 Required variables from previous phases:",
        "  • processed_chunks (from markdown document processing)",
        "  • embedding_model (from setup)",
        "  • collection (from ChromaDB setup)",
        "\n💡 Make sure to run previous phases first!",
        sep="\n",
    )

# Enhanced query testing functions
def test_migration_query(query: str, category: Optional[str] = None):
    """Run one query against the global retrieval system and print the hits.

    Args:
        query: Question to search for.
        category: When given, restrict the search to chunks whose
            ``doc_category`` equals this value.

    Prints a notice instead of searching when the module-level
    ``retrieval_system`` is missing or ``None``.
    """
    # setup_vector_database() returns None when storage fails, so the global
    # can exist without being usable; a plain "in globals()" check is not enough.
    system = globals().get('retrieval_system')
    if system is None:
        print("❌ Retrieval system not available. Run the setup first.")
        return

    if category:
        results = system.search_by_category(query, category)
        print(f"\nCategory Search: '{category}' | Query: '{query}'")
    else:
        results = system.search(query)
        print(f"\nGeneral Search: '{query}'")

    print("-" * 50)
    if not results:
        print("  No results found")
        return

    for i, result in enumerate(results, 1):
        metadata = result['metadata']
        print(f"{i}. Score: {result['similarity_score']:.3f}")
        # .get keeps one incomplete metadata record from aborting the listing
        print(f"   Source: {metadata.get('source_file', 'Unknown')}")
        print(f"   Category: {metadata.get('doc_category', 'N/A')}")
        print(f"   Section: {metadata.get('section_title', 'N/A')}")
        print(f"   Content: {result['content'][:120]}...")
        print()


# The name matches pytest's default "test_*" pattern; opt out so this notebook
# helper is never collected as a test case if the module is imported by a test suite.
test_migration_query.__test__ = False

def show_database_categories():
    """Print every distinct ``doc_category`` stored in the global collection.

    Prints a notice instead when the module-level ``collection`` is missing
    or ``None``. Errors raised while reading the collection are printed,
    not propagated.
    """
    database = globals().get('collection')
    if database is None:
        print("❌ Database not available.")
        return

    try:
        # get() without ids returns the whole collection; peek() would only
        # sample the first records and miss categories stored later.
        stored = database.get(include=['metadatas'])

        categories = {
            metadata['doc_category']
            for metadata in (stored['metadatas'] or [])
            # a record stored without metadata comes back as None
            if metadata and 'doc_category' in metadata
        }

        print("\n📋 Available Document Categories:")
        for category in sorted(categories):
            print(f"  • {category}")

    except Exception as e:
        print(f"❌ Error retrieving categories: {e}")

# Advertise the interactive helpers defined in this cell
print(
    "\n💡 Available functions:",
    "  • test_migration_query('your question here') - Test general search",
    "  • test_migration_query('question', 'category') - Test category search",
    "  • show_database_categories() - Show available categories",
    sep="\n",
)

# RAG system for mainframe migration queries with confidence scoring

In [None]:
import openai
from typing import List, Dict, Any, Optional
import json
from datetime import datetime
import re

# Cell banner: title line followed by a 60-character rule
print("🚀 Mainframe Migration RAG System with Enhanced Confidence Scoring", "=" * 60, sep="\n")

# ==========================================
# ENHANCED RAG SYSTEM WITH CONFIDENCE SCORING
# ==========================================

class MainframeMigrationRAG:
    """Retrieval-augmented question answering with heuristic confidence scoring.

    A question is answered in three stages: relevant chunks are fetched via
    ``retrieval_system.search``, an answer is generated from them with the
    OpenAI chat completions API, and a 0-100 confidence score is computed
    from five weighted factors and printed with the answer.
    """

    # Maximum points each factor can contribute to the total; used to scale
    # the per-factor bars so that a maxed-out factor shows a full bar.
    _FACTOR_MAX_POINTS = {
        'Retrieval Quality': 25,
        'Answer Completeness': 25,
        'Context Relevance': 25,
        'Source Diversity': 15,
        'Technical Specificity': 10,
    }

    # Display label and icon per confidence level (keys match confidence_thresholds)
    _LEVEL_LABELS = {
        'excellent': "Excellent",
        'high': "High",
        'medium': "Medium",
        'low': "Low",
        'very_low': "Very Low",
    }
    _LEVEL_ICONS = {
        'excellent': "🟢",  # Green
        'high': "🔵",  # Blue
        'medium': "🟡",  # Yellow
        'low': "🟠",  # Orange
        'very_low': "🔴",  # Red
    }

    # Word tokens for overlap scoring: alphanumeric runs, optionally
    # hyphenated so terms such as "comp-3" stay in one piece.
    _WORD_PATTERN = re.compile(r"\w+(?:-\w+)*")

    def __init__(self, retrieval_system, embedding_model):
        self.retrieval_system = retrieval_system
        self.embedding_model = embedding_model

        # Minimum score for each confidence level; anything below 'low'
        # is reported as very low.
        self.confidence_thresholds = {
            'excellent': 85,
            'high': 70,
            'medium': 50,
            'low': 30,
            'very_low': 15
        }

        # Prompt template; {context} and {query} are filled in by ask()
        self.system_prompt = """You are a mainframe to Java migration assistant. You help users with COBOL to Java conversion questions based on the provided migration documentation.

INSTRUCTIONS:
- Answer based ONLY on the provided context from migration documentation
- Be specific and practical in your responses
- Include relevant code examples (COBOL and Java) when applicable
- If the context doesn't contain enough information, say so clearly
- Cite the source material when possible
- Format your response clearly with examples when helpful
- Focus on practical migration guidance and troubleshooting

CONTEXT FROM MIGRATION DOCUMENTATION:
{context}

USER QUESTION: {query}

Please provide a helpful, accurate answer based on the migration context above."""

    def ask(self, query: str, show_details: bool = False, top_k: int = 5) -> str:
        """
        Ask a migration question and get an answer with confidence scoring

        Args:
            query: Your migration question
            show_details: Set to True to see internal processing details
            top_k: Number of relevant chunks to retrieve

        Returns:
            String answer to your question. When nothing is retrieved or the
            API call fails, an explanatory message is returned instead (and
            printed with a confidence of 0).
        """

        if show_details:
            print(f"\n🔍 PROCESSING: '{query}'")
            print("=" * 50)

        # Step 1: Retrieve relevant chunks
        if show_details:
            print("📚 Step 1: Searching migration knowledge base...")

        retrieved_chunks = self.retrieval_system.search(query, top_k=top_k)

        if not retrieved_chunks:
            answer = "I couldn't find any relevant information in the migration documentation for your question."
            if show_details:
                print("❌ No relevant chunks found")
            self._show_clean_result(query, answer, 0, [], "No relevant information found")
            return answer

        # Step 2: Show retrieval analysis (if details requested)
        if show_details:
            self._show_retrieval_details(retrieved_chunks)

        # Step 3: Filter and assemble context
        context, filtered_chunks = self._assemble_context(retrieved_chunks, show_details)

        # Step 4: Generate response
        if show_details:
            print("🤖 Step 4: Generating AI response...")

        try:
            prompt = self.system_prompt.format(context=context, query=query)

            if show_details:
                print(f"   📝 Context length: {len(context)} chars")
                print(f"   📨 Full prompt length: {len(prompt)} chars")

            response = openai.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1500,
                temperature=0.1,
                top_p=0.9
            )

            # The message content can be None; treat that as an empty answer
            # rather than failing on .strip()
            answer = (response.choices[0].message.content or "").strip()

            if show_details:
                print(f"✅ Response generated ({len(answer)} characters)")

        except Exception as e:
            error_msg = f"Error calling OpenAI API: {e}"
            if show_details:
                print(f"❌ {error_msg}")
            error_answer = f"Sorry, I encountered an error: {error_msg}"
            self._show_clean_result(query, error_answer, 0, [], "API Error")
            return error_answer

        # Step 5: Calculate comprehensive confidence score
        confidence_score, confidence_breakdown = self._calculate_comprehensive_confidence(
            query, filtered_chunks, answer, context
        )

        # Step 6: Show results
        if show_details:
            self._show_detailed_confidence_analysis(confidence_score, confidence_breakdown)
            self._show_final_details(answer, filtered_chunks, confidence_score)
        else:
            self._show_clean_result(query, answer, confidence_score, filtered_chunks, confidence_breakdown['primary_reason'])

        return answer

    @classmethod
    def _tokenize(cls, text: str) -> set:
        """Return the set of lower-cased word tokens in ``text``.

        Punctuation is dropped so that "errors?" and "errors" count as the
        same word when overlaps are measured.
        """
        return set(cls._WORD_PATTERN.findall(text.lower()))

    def _calculate_comprehensive_confidence(self, query: str, chunks: List[Dict], answer: str, context: str) -> tuple[int, Dict]:
        """
        Calculate comprehensive confidence score (0-100) with detailed breakdown

        Five factors are summed: retrieval quality (max 25), answer
        completeness (25), context relevance (25), source diversity (15)
        and technical specificity (10).

        Args:
            query: The user's question.
            chunks: Retrieved chunks used for the context; each needs
                ``similarity_score`` and ``metadata``.
            answer: The generated answer text.
            context: The context text that was sent to the model.

        Returns:
            ``(score, details)`` where ``details`` holds one entry per
            factor plus ``primary_reason`` and a ``breakdown`` of the
            per-factor scores keyed by display name.
        """

        confidence_factors = {}
        total_score = 0

        # 1. RETRIEVAL QUALITY FACTOR (25 points max)
        if chunks:
            similarities = [chunk['similarity_score'] for chunk in chunks]
            max_similarity = max(similarities)
            avg_similarity = sum(similarities) / len(similarities)
            high_quality_chunks = sum(1 for value in similarities if value >= 0.7)

            retrieval_score = min(25, int(
                (max_similarity * 15) +  # Best match contributes 15 points max
                (avg_similarity * 5) +   # Average quality contributes 5 points max
                (high_quality_chunks * 2.5)  # Each high-quality chunk adds 2.5 points
            ))

            confidence_factors['retrieval_quality'] = {
                'score': retrieval_score,
                'max_similarity': max_similarity,
                'avg_similarity': avg_similarity,
                'high_quality_chunks': high_quality_chunks
            }
        else:
            retrieval_score = 0
            confidence_factors['retrieval_quality'] = {'score': 0, 'reason': 'No chunks retrieved'}

        total_score += retrieval_score

        # 2. ANSWER COMPLETENESS FACTOR (25 points max)
        answer_lower = answer.lower()

        # Phrases that signal the model could not answer from the context
        insufficient_phrases = [
            "i couldn't find", "no information", "not mentioned", "not discussed",
            "does not provide", "cannot find", "not available", "insufficient information",
            "based on the provided context, i cannot", "the document does not contain"
        ]

        has_insufficient_info = any(phrase in answer_lower for phrase in insufficient_phrases)

        # Bound up front so both branches leave them defined
        has_examples = False
        has_code = False

        if has_insufficient_info:
            completeness_score = 5  # Very low score for incomplete answers
            completeness_reason = "Answer indicates insufficient information"
        else:
            # Positive indicators
            answer_length = len(answer)
            has_examples = bool(re.search(r'(example|for instance|such as)', answer_lower))
            has_code = bool(re.search(r'(cobol|java|```|public class)', answer_lower))
            has_specific_details = bool(re.search(r'(bigdecimal|comp-3|spring|aws)', answer_lower))

            completeness_score = min(25, int(
                (min(answer_length / 50, 10)) +  # Length factor (up to 10 points)
                (7 if has_examples else 0) +     # Examples add 7 points
                (5 if has_code else 0) +         # Code examples add 5 points
                (3 if has_specific_details else 0)  # Technical details add 3 points
            ))

            completeness_reason = f"Answer is {'comprehensive' if completeness_score > 20 else 'adequate' if completeness_score > 15 else 'basic'}"

        confidence_factors['answer_completeness'] = {
            'score': completeness_score,
            'has_insufficient_info': has_insufficient_info,
            'answer_length': len(answer),
            'has_examples': has_examples,
            'has_code': has_code,
            'reason': completeness_reason
        }

        total_score += completeness_score

        # 3. CONTEXT RELEVANCE FACTOR (25 points max)
        # Compare punctuation-free word sets; splitting on whitespace alone
        # would keep "errors?" and "errors" apart and understate the overlap.
        query_words = self._tokenize(query)
        context_words = self._tokenize(context)
        answer_words = self._tokenize(answer)

        # Query-context alignment
        query_context_overlap = len(query_words.intersection(context_words)) / max(len(query_words), 1)

        # Context-answer alignment
        context_answer_overlap = len(context_words.intersection(answer_words)) / max(len(context_words), 1)

        relevance_score = min(25, int(
            (query_context_overlap * 15) +    # Query-context alignment (15 points max)
            (context_answer_overlap * 10)     # Context-answer alignment (10 points max)
        ))

        confidence_factors['context_relevance'] = {
            'score': relevance_score,
            'query_context_overlap': query_context_overlap,
            'context_answer_overlap': context_answer_overlap
        }

        total_score += relevance_score

        # 4. SOURCE DIVERSITY FACTOR (15 points max)
        if chunks:
            unique_sources = {chunk['metadata'].get('source_file', 'unknown') for chunk in chunks}
            unique_doc_types = {chunk['metadata'].get('doc_category', 'unknown') for chunk in chunks}

            diversity_score = min(15, int(
                (len(unique_sources) * 4) +      # Each unique source adds 4 points
                (len(unique_doc_types) * 3)      # Each unique doc type adds 3 points
            ))

            confidence_factors['source_diversity'] = {
                'score': diversity_score,
                'unique_sources': len(unique_sources),
                'unique_doc_types': len(unique_doc_types),
                'sources': list(unique_sources)
            }
        else:
            diversity_score = 0
            confidence_factors['source_diversity'] = {'score': 0}

        total_score += diversity_score

        # 5. TECHNICAL SPECIFICITY FACTOR (10 points max)
        technical_terms = [
            'cobol', 'java', 'bigdecimal', 'comp-3', 'spring', 'aws', 'migration',
            'transform', 'mainframe', 'jcl', 'cics', 'batch', 'postgresql', 'oracle'
        ]

        technical_matches = sum(1 for term in technical_terms if term in answer_lower)
        specificity_score = min(10, technical_matches * 2)

        confidence_factors['technical_specificity'] = {
            'score': specificity_score,
            'technical_matches': technical_matches
        }

        total_score += specificity_score

        # Default reason: the highest-scoring factor. It is overridden below
        # by the more informative special cases.
        max_factor = max(confidence_factors, key=lambda k: confidence_factors[k]['score'])
        primary_reason = f"Based on {max_factor.replace('_', ' ')}"

        if has_insufficient_info:
            primary_reason = "Answer indicates missing information"
        elif retrieval_score < 10:
            primary_reason = "Low relevance of retrieved sources"
        elif completeness_score > 20:
            primary_reason = "Comprehensive answer with good context"

        confidence_factors['primary_reason'] = primary_reason
        confidence_factors['breakdown'] = {
            'Retrieval Quality': confidence_factors['retrieval_quality']['score'],
            'Answer Completeness': confidence_factors['answer_completeness']['score'],
            'Context Relevance': confidence_factors['context_relevance']['score'],
            'Source Diversity': confidence_factors['source_diversity']['score'],
            'Technical Specificity': confidence_factors['technical_specificity']['score']
        }

        return min(100, total_score), confidence_factors

    def _confidence_level(self, score: int) -> str:
        """Map a score to its level key using ``self.confidence_thresholds``."""
        for level in ('excellent', 'high', 'medium', 'low'):
            if score >= self.confidence_thresholds[level]:
                return level
        return 'very_low'

    def _score_to_confidence_label(self, score: int) -> str:
        """Convert numerical score to confidence label"""
        return self._LEVEL_LABELS[self._confidence_level(score)]

    def _get_confidence_color(self, score: int) -> str:
        """Get emoji indicator for confidence level"""
        return self._LEVEL_ICONS[self._confidence_level(score)]

    def _show_detailed_confidence_analysis(self, score: int, breakdown: Dict) -> None:
        """Show detailed confidence score breakdown

        Args:
            score: Overall confidence score (0-100).
            breakdown: Details dict from
                ``_calculate_comprehensive_confidence``; its ``breakdown``
                and ``primary_reason`` entries are printed.
        """

        print(f"\n📊 CONFIDENCE ANALYSIS")
        print("=" * 40)
        print(f"🎯 Overall Confidence Score: {score}/100 ({self._score_to_confidence_label(score)})")
        print(f"📈 Score Breakdown:")

        for factor, factor_score in breakdown['breakdown'].items():
            # Scale each bar against that factor's own maximum so a factor
            # at its cap always shows ten filled cells.
            max_points = self._FACTOR_MAX_POINTS.get(factor, 25)
            percentage = (factor_score / max_points) * 100
            filled = max(0, min(10, int(percentage / 10)))
            bar = "█" * filled + "░" * (10 - filled)
            print(f"   {factor:20}: {factor_score:2d} [{bar}]")

        print(f"💡 Primary Factor: {breakdown['primary_reason']}")

    def _show_retrieval_details(self, chunks: List[Dict]) -> None:
        """Show retrieval analysis details

        Prints best/average/worst similarity and a preview of the top three
        chunks. ``chunks`` must not be empty.
        """

        scores = [chunk['similarity_score'] for chunk in chunks]

        print("📊 Step 2: Analyzing retrieval results...")
        print(f"   🎯 Best score: {max(scores):.3f}")
        print(f"   📊 Average: {sum(scores)/len(scores):.3f}")
        print(f"   📉 Worst score: {min(scores):.3f}")

        print(f"\n   📋 Top 3 results:")
        for i, chunk in enumerate(chunks[:3], 1):
            score = chunk['similarity_score']
            confidence = "High" if score >= 0.7 else "Medium" if score >= 0.4 else "Low"
            source = chunk['metadata'].get('source_file', 'Unknown')
            section = chunk['metadata'].get('section_number', 'N/A')

            print(f"   {i}. {source} (Section {section}) - Score: {score:.3f} ({confidence})")
            print(f"      Preview: {chunk['content'][:80]}...")

    def _assemble_context(self, chunks: List[Dict], show_details: bool = False) -> tuple[str, List[Dict]]:
        """Assemble context from retrieved chunks

        Chunks scoring below 0.15 are dropped, but the first chunk is always
        kept so the context is never empty.

        Args:
            chunks: Retrieved chunks, best first.
            show_details: Print progress information when True.

        Returns:
            ``(context, used_chunks)``: the context text, with one
            source-labelled part per chunk joined by ``---`` separators,
            and the chunks it was built from.
        """

        # Filter chunks by minimum confidence threshold
        filtered_chunks = [
            chunk for chunk in chunks
            if chunk['similarity_score'] >= 0.15  # Keep threshold low to retain context
        ]

        if not filtered_chunks:
            filtered_chunks = chunks[:1]  # Take at least one

        if show_details:
            print("🔧 Step 3: Assembling context...")
            print(f"   ✅ Using {len(filtered_chunks)} chunks for context")

        # Build context
        context_parts = []
        for i, chunk in enumerate(filtered_chunks, 1):
            source = chunk['metadata'].get('source_file', 'Migration Documentation')
            section = chunk['metadata'].get('section_number', 'N/A')
            section_title = chunk['metadata'].get('section_title', '')

            context_part = f"[Source {i}: {source}, Section {section}"
            if section_title:
                context_part += f", Title: {section_title}"
            context_part += f"]\n{chunk['content']}\n"

            context_parts.append(context_part)

        context = "\n---\n".join(context_parts)

        if show_details:
            print(f"   📝 Total context length: {len(context)} characters")

        return context, filtered_chunks

    def _show_final_details(self, answer: str, chunks: List[Dict], confidence_score: int) -> None:
        """Show detailed final results"""

        print("\n🎯 FINAL RESULT")
        print("=" * 50)
        print(f"📝 Answer: {answer}")
        print(f"🎯 Confidence: {confidence_score}/100 ({self._score_to_confidence_label(confidence_score)})")
        print(f"📚 Sources Used: {len(chunks)}")

        print(f"\n📊 Sources:")
        for i, chunk in enumerate(chunks, 1):
            score = chunk['similarity_score']
            source = chunk['metadata'].get('source_file', 'Unknown')
            section = chunk['metadata'].get('section_number', 'N/A')
            print(f"   {i}. {source} (Section {section}) - Score: {score:.3f}")

    def _show_clean_result(self, query: str, answer: str, confidence_score: int, chunks: List[Dict], reason: str) -> None:
        """Show clean result for normal usage with enhanced confidence display

        Args:
            query: The question that was asked.
            answer: The answer (or error message) to display.
            confidence_score: Overall confidence score (0-100).
            chunks: Chunks used for the answer; the first three are listed.
            reason: Short explanation shown next to the score.
        """

        confidence_icon = self._get_confidence_color(confidence_score)
        confidence_label = self._score_to_confidence_label(confidence_score)

        print(f"\n❓ Question: {query}")
        print("─" * 60)
        print(f"📖 Answer:\n{answer}")

        # Enhanced confidence display
        print(f"\n{confidence_icon} Confidence Score: {confidence_score}/100 ({confidence_label})")
        print(f"💡 Reason: {reason}")

        # Visual confidence bar: one cell per 10 points
        filled_bars = int(confidence_score / 10)
        empty_bars = 10 - filled_bars
        confidence_bar = "█" * filled_bars + "░" * empty_bars
        print(f"📊 Score: [{confidence_bar}] {confidence_score}%")

        if chunks:
            print(f"\n📚 Sources ({len(chunks)}):")
            for i, chunk in enumerate(chunks[:3], 1):  # Show top 3 sources
                source = chunk['metadata'].get('source_file', 'Unknown')
                section = chunk['metadata'].get('section_number', 'N/A')
                score = chunk['similarity_score']
                print(f"   {i}. {source} (Section {section}) - Relevance: {score:.3f}")

        print("─" * 60)

# ==========================================
# DEMO FUNCTION
# ==========================================

def run_demo_example(rag_system):
    """Run three fixed demo questions through ``rag_system`` and return the answers.

    The first question is asked with ``show_details=True``, the others with
    ``show_details=False``. Between questions the function waits for Enter.

    Args:
        rag_system: Any object exposing ``ask(query, show_details=...)``.

    Returns:
        A list with whatever ``rag_system.ask`` returned for each demo query,
        in query order. (Previously every answer was discarded and the
        function returned None even though the caller stores the result.)
    """

    print("\n🎬 DEMO EXAMPLE - Migration Question with Confidence Scoring")
    print("=" * 70)

    demo_queries = [
        "How do you convert COBOL COMP-3 fields to Java BigDecimal?",  # Should get high confidence
        "What are the daily transaction limits for different account types?",  # Should get medium confidence
        "How to deploy Spring Boot apps on Kubernetes?"  # Should get low confidence (not in migration docs)
    ]

    answers = []
    for i, query in enumerate(demo_queries, 1):
        print(f"\n{'🎯 TEST ' + str(i):-^50}")
        print(f"Query: {query}")
        print("▶️" * 10 + " PROCESSING " + "▶️" * 10)

        # Show details for first query only
        answers.append(rag_system.ask(query, show_details=(i == 1)))

        if i < len(demo_queries):
            try:
                input("\n👆 Press Enter for next test...")
            except EOFError:
                # stdin is closed/exhausted: keep going instead of aborting the demo
                pass

    print("\n🏁 Demo complete! Notice how confidence scores vary based on:")
    print("   ✅ Quality of retrieved sources")
    print("   ✅ Completeness of the answer")
    print("   ✅ Technical specificity")
    print("   ✅ Context relevance")

    return answers

# ==========================================
# EXECUTION
# ==========================================

print("🚀 Initializing Enhanced RAG System...")

try:
    # Wire the RAG facade to the objects built by earlier notebook cells;
    # a NameError here means one of those cells has not been run.
    migration_rag = MainframeMigrationRAG(
        retrieval_system=retrieval_system,
        embedding_model=embedding_model
    )

    print(
        "✅ Enhanced RAG System Ready with Confidence Scoring!",
        "\n💡 Usage Examples:",
        "   📝 migration_rag.ask('How to handle overdraft calculations?')",
        "   🔍 migration_rag.ask('Your question', show_details=True)",
        "   🎬 run_demo_example(migration_rag)",
        sep="\n",
    )

    # The demo runs automatically as part of this cell
    print("\n🎬 Running confidence scoring demo...")
    demo_result = run_demo_example(migration_rag)

except NameError as e:
    for message in (
        f"❌ Setup incomplete: {e}",
        "📋 Required from previous phases:",
        "  • retrieval_system",
        "  • embedding_model",
    ):
        print(message)
    # Later cells test this name for truthiness before using it
    migration_rag = None

# Notebook shortcut around migration_rag.ask
def ask(query: str, details: bool = False):
    """Ask the module-level ``migration_rag`` a question.

    Args:
        query: Question text forwarded to ``migration_rag.ask``.
        details: Forwarded as ``show_details``.

    Returns:
        Whatever ``migration_rag.ask`` returns, or None (after printing a
        notice) when ``migration_rag`` is falsy.
    """
    if not migration_rag:
        print("❌ RAG system not initialized")
        return None
    return migration_rag.ask(query, show_details=details)

# Usage hints for the ask() shortcut defined above
print(
    "\n🎤 READY FOR USE!",
    "💡 Quick test: ask('How to convert COBOL decimals to Java?')",
    "🔍 With details: ask('Your question', details=True)",
    sep="\n",
)

# Agentic RAG with Tavily AWS Search

In [None]:
import requests
import json
import re
from typing import List, Dict, Any, Optional
from enum import Enum
from datetime import datetime

# Cell banner
print(
    "🤖 Complete Agentic RAG with Tavily AWS Search",
    "Mainframe Migration - Internal Docs + AWS Documentation",
    "=" * 60,
    sep="\n",
)

# ==========================================
# TAVILY AWS-FOCUSED SEARCH INTEGRATION
# ==========================================

class TavilySearcher:
    """Client for the Tavily search endpoint, restricted to AWS sources.

    Every query is sent with an ``include_domains`` restriction and the
    returned hits are filtered again locally, so only URLs hosted on one of
    ``aws_domains`` reach the caller.
    """

    def __init__(self, api_key: str):
        """Store the API key and the list of accepted AWS sources.

        Args:
            api_key: Tavily key; sent in the JSON body of each request.
        """
        self.api_key = api_key
        self.base_url = "https://api.tavily.com/search"

        # AWS-specific domains only. An entry may carry a path prefix
        # (e.g. a GitHub organisation) in addition to the host name.
        self.aws_domains = [
            "docs.aws.amazon.com",
            "aws.amazon.com",
            "github.com/aws-samples",
            "github.com/awslabs",
            "repost.aws",
            "aws.amazon.com/blogs"
        ]

    def _is_aws_url(self, url: str) -> bool:
        """Return True when ``url`` is hosted on one of ``aws_domains``.

        The host must equal the configured domain or be a subdomain of it,
        and for entries with a path the URL path must start with that path
        segment. A plain substring test is not enough: it accepts any URL
        that merely mentions an AWS domain in its query string, or a
        look-alike host such as ``fakeaws.amazon.com``.
        """
        from urllib.parse import urlparse

        if not isinstance(url, str) or not url:
            return False

        # Scheme-less values ("docs.aws.amazon.com/x") would otherwise be
        # parsed as a bare path with an empty host.
        candidate = url if "://" in url else "//" + url
        try:
            parsed = urlparse(candidate)
            host = (parsed.hostname or "").lower()
        except ValueError:
            return False
        path = (parsed.path or "/").lower()

        for entry in self.aws_domains:
            domain, _, prefix = entry.lower().partition("/")
            if host != domain and not host.endswith("." + domain):
                continue
            if not prefix:
                return True
            required = "/" + prefix
            if path == required or path.startswith(required + "/"):
                return True
        return False

    def search(self, query: str, max_results: int = 5) -> Dict[str, Any]:
        """Search AWS documentation using the Tavily API.

        Args:
            query: User question; migration keywords are appended to it
                before it is sent.
            max_results: Forwarded to Tavily as ``max_results``.

        Returns:
            The decoded JSON response with ``'results'`` reduced to AWS-hosted
            URLs. When the request fails or the body is not valid JSON, a
            dict with empty ``'results'``, ``'answer': None`` and an
            ``'error'`` message is returned instead of raising.
        """

        try:
            payload = {
                "api_key": self.api_key,
                "query": f"{query} AWS mainframe migration COBOL Java",  # Add context
                "search_depth": "advanced",
                "include_answer": True,
                "include_raw_content": True,
                "max_results": max_results,
                "include_images": False,
                "include_domains": self.aws_domains  # AWS-only search
            }

            response = requests.post(self.base_url, json=payload, timeout=30)
            response.raise_for_status()

            result = response.json()

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions
            # whose JSONDecodeError is not a RequestException subclass.
            print(f"❌ Tavily AWS search error: {e}")
            return {"results": [], "answer": None, "error": str(e)}

        # Filter results to ensure they're from AWS domains
        if 'results' in result:
            result['results'] = [
                r for r in result['results']
                if self._is_aws_url(r.get('url') or '')
            ]

        return result

# ==========================================
# MIGRATION QUERY ROUTING ENGINE
# ==========================================

class MigrationQueryRouter:
    """Decide whether a question goes to internal docs, AWS search, or both.

    Routing is rule based: regex pattern groups are tried first (AWS groups
    before internal groups), then a keyword count breaks the tie, and a
    query matching neither side is routed to both sources.
    """

    def __init__(self):
        """Build the regex pattern tables and the weak-answer phrase list."""
        # AWS documentation trigger patterns
        self.aws_patterns = {
            'aws_services': [
                r'\b(aws\s+transform|amazon\s+q|aws\s+mainframe\s+modernization)\b',
                r'\b(bedrock|kendra|opensearch|rds|ec2|lambda)\b',
                r'\b(aws\s+batch|step\s+functions|api\s+gateway)\b',
                r'\baws\s+.*(service|pricing|feature|limit|quota)\b',
                r'\b(cloudformation|cloudwatch|iam|s3|vpc)\b'
            ],
            'current_aws_info': [
                r'\b(latest|newest|current|recent).*\baws\b',
                r'\baws\s+.*(2024|2025|version|update|release)\b',
                r'\b(new|recent)\s+.*(aws|amazon)\s+.*(feature|service)\b',
                r'\bwhat.*new.*aws\b'
            ],
            'aws_best_practices': [
                r'\baws\s+.*(best\s+practice|recommendation|guidance)\b',
                r'\b(official|aws)\s+.*(guide|documentation|recommendation)\b',
                r'\bhow\s+does\s+aws\s+recommend\b',
                r'\baws\s+.*(architecture|pattern|framework)\b'
            ],
            'aws_specific_migration': [
                r'\baws\s+transform\s+.*(how|steps|process|guide)\b',
                r'\bamazon\s+q\s+developer\s+.*(migration|conversion)\b',
                r'\baws\s+mainframe\s+modernization\s+service\b',
                r'\b(aws\s+transform|amazon\s+q)\s+.*(capabilities|features)\b'
            ],
            'aws_tools_comparison': [
                r'\bcompare\s+.*aws.*\b(tools|services)\b',
                r'\baws\s+transform\s+vs\s+.*(micro\s+focus|tsri|ibm)\b',
                r'\bwhich\s+aws\s+service\s+for\b'
            ]
        }

        # Internal documentation trigger patterns
        self.internal_patterns = {
            'company_experience': [
                r'\b(our|we|company|organization)\s+.*(experience|approach|decision)\b',
                r'\b(lessons\s+learned|what\s+worked|what\s+failed)\b',
                r'\bhow\s+(did\s+we|do\s+we)\s+.*(migrate|convert|handle)\b',
                r'\b(our\s+migration|our\s+project|our\s+experience)\b'
            ],
            'specific_systems': [
                r'\b(accval01|cams|customer\s+account\s+management)\b',
                r'\b(cobol\s+program|copybook|jcl\s+job)\b.*\b(specification|documentation)\b',
                r'\bmigration\s+playbook\b',
                r'\b(architecture\s+decision|adr)\b'
            ],
            'troubleshooting_internal': [
                r'\bhow\s+to\s+(fix|resolve|troubleshoot)\s+.*(error|issue|problem)\b',
                r'\b(decimal\s+precision|comp-3|overdraft\s+calculation)\s+.*(error|issue)\b',
                r'\b(connection\s+timeout|memory\s+leak|performance\s+degradation)\b',
                r'\berror.*\b(during|after)\s+migration\b'
            ],
            'internal_standards': [
                r'\b(coding\s+standard|development\s+guideline|naming\s+convention)\b',
                r'\bwhat\s+.*(pattern|framework|library)\s+should\s+(we|i)\s+use\b',
                r'\b(java\s+development\s+standard|spring\s+boot\s+pattern)\b',
                r'\btesting\s+strategy\b.*migration\b'
            ],
            'business_rules': [
                r'\b(business\s+rule|validation\s+rule|account\s+validation)\b',
                r'\b(daily\s+limit|transaction\s+limit|overdraft\s+limit)\b',
                r'\bhow\s+does.*\b(accval01|validation\s+program)\b.*work\b'
            ]
        }

        # Response quality indicators for fallback decisions
        self.poor_response_indicators = [
            "does not specifically mention",
            "refer to external documentation",
            "consult the official documentation",
            "i don't have information",
            "not available in the provided context",
            "cannot be found in the documentation",
            "not covered in detail",
            "for more information",
            "additional resources needed"
        ]

    @staticmethod
    def _count_terms(terms: List[str], text: str) -> int:
        """Count how many of ``terms`` occur in ``text`` as whole words.

        A trailing plural "s" is accepted ("service" also matches
        "services"). Whole-word matching matters because a substring test
        finds "we" in "answer", "our" in "resource" and "aws" in "laws".
        """
        return sum(
            1 for term in terms
            if re.search(rf"\b{re.escape(term)}s?\b", text)
        )

    def route_query(self, query: str) -> tuple[str, str]:
        """
        Decide routing: 'internal', 'aws', or 'hybrid'
        Returns: (route_decision, reason)

        The query is lower-cased, then checked in this order: AWS regex
        groups, internal regex groups, keyword counts. Equal keyword counts
        (including zero on both sides) yield 'hybrid'.
        """

        query_lower = query.lower()

        # Check for AWS patterns first (more specific)
        for category, patterns in self.aws_patterns.items():
            for pattern in patterns:
                if re.search(pattern, query_lower):
                    return 'aws', f"AWS pattern match: {category}"

        # Check for internal patterns
        for category, patterns in self.internal_patterns.items():
            for pattern in patterns:
                if re.search(pattern, query_lower):
                    return 'internal', f"Internal pattern match: {category}"

        # Default routing based on query content
        aws_terms = ['aws', 'amazon', 'cloud', 'service', 'official', 'documentation']
        internal_terms = ['our', 'we', 'company', 'internal', 'playbook', 'experience', 'lesson']

        aws_score = self._count_terms(aws_terms, query_lower)
        internal_score = self._count_terms(internal_terms, query_lower)

        if aws_score > internal_score:
            return 'aws', f"AWS-related terms detected (score: {aws_score})"
        elif internal_score > aws_score:
            return 'internal', f"Internal-related terms detected (score: {internal_score})"
        else:
            return 'hybrid', "Ambiguous query - trying both sources"

    def should_try_aws_fallback(self, internal_response: str) -> tuple[bool, str]:
        """Analyze if internal response needs AWS fallback.

        Returns:
            ``(True, reason)`` when the response is empty, shorter than 30
            characters after stripping, or contains one of
            ``poor_response_indicators`` (case-insensitive); otherwise
            ``(False, reason)``.
        """

        if not internal_response or len(internal_response.strip()) < 30:
            return True, "Internal response too short"

        response_lower = internal_response.lower()

        # Check for poor response indicators
        for indicator in self.poor_response_indicators:
            if indicator in response_lower:
                return True, f"Internal response insufficient: {indicator}"

        return False, "Internal response appears sufficient"

# ==========================================
# COMPLETE AGENTIC RAG SYSTEM
# ==========================================

class MigrationAgenticRAG:
    """Answer migration questions from internal docs, AWS search, or both.

    ``answer_question`` asks the router for a decision and dispatches to one
    of the ``_*_response`` builders. Every builder returns a dict carrying at
    least ``query``, ``answer``, ``routing_decision``, ``routing_reason``,
    ``source_type``, ``confidence_source``, ``aws_searched`` and
    ``internal_searched``.
    """

    # Sentinel for "no internal answer was handed in". None cannot serve as
    # the default because it could be a value returned by the internal RAG.
    _NOT_FETCHED = object()

    def __init__(self, internal_rag_system, tavily_api_key: str, show_internals: bool = True):
        """Wire up the internal RAG, the AWS searcher and the router.

        Args:
            internal_rag_system: Object exposing ``ask(query, show_details=...)``.
            tavily_api_key: Key handed to ``TavilySearcher``.
            show_internals: When True, progress lines are printed per stage.
        """
        self.internal_rag = internal_rag_system
        self.aws_searcher = TavilySearcher(tavily_api_key)
        self.router = MigrationQueryRouter()
        self.show_internals = show_internals

        # Disable internals for internal RAG to reduce noise
        # (note: this mutates the object passed in by the caller)
        if hasattr(self.internal_rag, 'show_internals'):
            self.internal_rag.show_internals = False

    def answer_question(self, query: str) -> Dict[str, Any]:
        """Main agentic pipeline with intelligent routing.

        Returns:
            The response dict produced by the builder matching the router's
            decision ('internal', 'aws', anything else is treated as hybrid).
        """

        if self.show_internals:
            print(f"\n🤖 MIGRATION AGENTIC RAG: '{query}'")
            print("=" * 60)

        # Stage 1: Routing decision
        route_decision, route_reason = self.router.route_query(query)

        if self.show_internals:
            print(f"🔍 Stage 1: Routing Analysis")
            print(f"   🎯 Decision: {route_decision.upper()}")
            print(f"   💡 Reason: {route_reason}")

        # Execute based on routing decision
        if route_decision == 'internal':
            return self._internal_only_response(query, route_reason)
        elif route_decision == 'aws':
            return self._aws_only_response(query, route_reason)
        else:  # hybrid
            return self._hybrid_response(query, route_reason)

    def _internal_only_response(self, query: str, reason: str) -> Dict[str, Any]:
        """Answer from internal documentation, escalating when it is weak.

        If the router judges the internal answer insufficient, the hybrid
        builder is used and receives the answer already obtained, so the
        internal RAG is queried only once. If the internal lookup or its
        assessment raises, the AWS fallback builder is used instead.
        """

        if self.show_internals:
            print("\n📚 Stage 2: Using INTERNAL documentation only")

        try:
            internal_result = self.internal_rag.ask(query, show_details=False)

            # Check if we should fallback to AWS anyway
            should_fallback, fallback_reason = self.router.should_try_aws_fallback(internal_result)
        except Exception as e:
            if self.show_internals:
                print(f"   ❌ Internal search failed: {e}")
            return self._aws_fallback_response(query, f"Internal search failed: {e}")

        if should_fallback:
            if self.show_internals:
                print(f"   🔄 Internal insufficient: {fallback_reason}")
                print("   🌐 Falling back to AWS search...")
            return self._hybrid_response(
                query,
                f"Internal fallback: {fallback_reason}",
                internal_result=internal_result,
            )

        if self.show_internals:
            print(f"   ✅ Internal response sufficient")

        return {
            'query': query,
            'answer': internal_result,
            'routing_decision': 'internal_only',
            'routing_reason': reason,
            'source_type': 'Internal Migration Documentation',
            'confidence_source': 'Internal Knowledge Base',
            'aws_searched': False,
            'internal_searched': True
        }

    def _aws_only_response(self, query: str, reason: str) -> Dict[str, Any]:
        """Use only AWS documentation via Tavily (up to 5 results)."""

        if self.show_internals:
            print("\n🌐 Stage 2: Using AWS documentation only")

        aws_result = self.aws_searcher.search(query, max_results=5)

        if self.show_internals:
            result_count = len(aws_result.get('results', []))
            print(f"   📊 Found {result_count} AWS results")

            if aws_result.get('answer'):
                print(f"   📝 Direct AWS answer available")

        return {
            'query': query,
            'answer': self._format_aws_answer(aws_result),
            'routing_decision': 'aws_only',
            'routing_reason': reason,
            'source_type': 'AWS Official Documentation',
            'confidence_source': 'AWS Authoritative Sources',
            'aws_results_count': len(aws_result.get('results', [])),
            'aws_sources': self._format_aws_sources(aws_result),
            'aws_searched': True,
            'internal_searched': False
        }

    def _hybrid_response(self, query: str, reason: str, internal_result: Any = _NOT_FETCHED) -> Dict[str, Any]:
        """Combine internal and AWS sources.

        Args:
            query: The user question.
            reason: Routing reason recorded in the response.
            internal_result: Internal answer obtained by the caller. When
                omitted, the internal RAG is queried here.

        Returns:
            A response dict whose answer holds both sections when the
            internal answer is a string longer than 50 stripped characters,
            and otherwise the AWS section plus a note about the internal
            lookup.
        """

        if self.show_internals:
            print("\n🔄 Stage 2: Using HYBRID approach (Internal + AWS)")

        internal_success = False

        if internal_result is self._NOT_FETCHED:
            # Try internal first
            if self.show_internals:
                print("   📚 Searching internal documentation...")

            try:
                internal_result = self.internal_rag.ask(query, show_details=False)
                internal_success = True
                if self.show_internals:
                    print(f"   ✅ Internal search completed")
            except Exception as e:
                internal_result = f"Internal documentation search failed: {e}"
                if self.show_internals:
                    print(f"   ❌ Internal search failed: {e}")
        else:
            # The caller already queried the internal RAG; do not ask twice
            internal_success = True
            if self.show_internals:
                print("   📚 Reusing internal answer from the previous step")

        # Get AWS information
        if self.show_internals:
            print("   🌐 Searching AWS documentation...")

        aws_result = self.aws_searcher.search(query, max_results=3)
        aws_answer = self._format_aws_answer(aws_result)

        if self.show_internals:
            aws_count = len(aws_result.get('results', []))
            print(f"   📊 AWS search found {aws_count} results")

        # A non-string internal result is treated as "limited" rather than
        # crashing on .strip()
        has_internal_text = (
            internal_success
            and isinstance(internal_result, str)
            and len(internal_result.strip()) > 50
        )

        # Combine responses intelligently
        if has_internal_text:
            combined_answer = f"""**From Your Internal Migration Experience:**
{internal_result}

**From AWS Official Documentation:**
{aws_answer}

---
*This response combines your organization's migration experience with current AWS guidance.*"""
        else:
            combined_answer = f"""**From AWS Official Documentation:**
{aws_answer}

**Note:** Internal documentation search was {'unsuccessful' if not internal_success else 'limited'}. This response relies primarily on AWS official sources.

---
*For complete guidance, ensure your internal migration documentation is accessible.*"""

        return {
            'query': query,
            'answer': combined_answer,
            'routing_decision': 'hybrid',
            'routing_reason': reason,
            'source_type': 'Internal + AWS Documentation',
            'confidence_source': 'Combined Sources',
            'aws_results_count': len(aws_result.get('results', [])),
            'aws_sources': self._format_aws_sources(aws_result),
            'internal_success': internal_success,
            'aws_searched': True,
            'internal_searched': True
        }

    def _aws_fallback_response(self, query: str, fallback_reason: str) -> Dict[str, Any]:
        """Fallback to AWS when the internal lookup raised.

        The response includes ``aws_sources`` like the other AWS-backed
        builders, so consumers can treat all of them uniformly.
        """

        if self.show_internals:
            print(f"\n🌐 AWS Fallback: {fallback_reason}")

        aws_result = self.aws_searcher.search(query, max_results=5)

        return {
            'query': query,
            'answer': self._format_aws_answer(aws_result),
            'routing_decision': 'aws_fallback',
            'routing_reason': fallback_reason,
            'source_type': 'AWS Documentation (Fallback)',
            'confidence_source': 'AWS Fallback',
            'aws_results_count': len(aws_result.get('results', [])),
            'aws_sources': self._format_aws_sources(aws_result),
            'aws_searched': True,
            'internal_searched': False
        }

    def _format_aws_answer(self, aws_result: Dict) -> str:
        """Format answer text from an AWS search result dict.

        Preference order: the direct ``'answer'`` when longer than 100
        stripped characters; otherwise the first three results' content
        (each cut to 400 characters); otherwise an error, "no results" or
        "nothing extractable" message.
        """

        # Use Tavily's direct answer if available and good quality
        direct_answer = aws_result.get('answer')
        if direct_answer and len(direct_answer.strip()) > 100:
            return direct_answer

        # Otherwise combine top results
        results = aws_result.get('results') or []
        combined_content = []
        for i, result in enumerate(results[:3], 1):
            title = result.get('title', f'AWS Source {i}')
            # 'content' may be present but null
            content = (result.get('content') or '').strip()

            if content:
                # Clean and truncate content
                content = content[:400] + "..." if len(content) > 400 else content
                combined_content.append(f"**{title}:**\n{content}")

        if combined_content:
            formatted_answer = "\n\n".join(combined_content)
            formatted_answer += "\n\n*Based on current AWS documentation and resources.*"
            return formatted_answer

        # Fallback if no good content
        if aws_result.get('error'):
            return f"AWS documentation search encountered an error: {aws_result['error']}"
        if not results:
            # Do not claim resources were found when the result list is empty
            return "No AWS documentation results were found for this query."
        return "I found AWS resources but couldn't extract comprehensive information. Please check the source links provided."

    def _format_aws_sources(self, aws_result: Dict) -> List[Dict[str, Any]]:
        """Format AWS search sources for display.

        Returns:
            One dict per result with ``title``, ``url``, ``domain``, ``type``
            and ``content_preview`` (first 100 characters; an ellipsis is
            appended only when the content was actually cut).
        """

        sources = []
        for result in aws_result.get('results', []):
            content = result.get('content')
            if content:
                preview = content[:100] + ("..." if len(content) > 100 else "")
            else:
                preview = 'No preview'

            sources.append({
                'title': result.get('title', 'AWS Documentation'),
                'url': result.get('url', 'No URL'),
                'domain': self._extract_domain(result.get('url', '')),
                'type': 'aws_official',
                'content_preview': preview
            })

        return sources

    def _extract_domain(self, url: str) -> str:
        """Return the network location of ``url``.

        Falls back to ``'aws.amazon.com'`` when the URL cannot be parsed.
        """
        from urllib.parse import urlparse

        try:
            return urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            return 'aws.amazon.com'

    def show_routing_summary(self, result: Dict[str, Any]) -> None:
        """Print the routing fields of a response dict.

        The AWS result count line is printed only when the count is truthy.
        """

        print(f"\n📊 ROUTING SUMMARY")
        print("=" * 40)
        print(f"🎯 Decision: {result['routing_decision']}")
        print(f"💡 Reason: {result['routing_reason']}")
        print(f"📚 Source Type: {result['source_type']}")
        print(f"🔍 Internal Searched: {'Yes' if result.get('internal_searched') else 'No'}")
        print(f"🌐 AWS Searched: {'Yes' if result.get('aws_searched') else 'No'}")

        if result.get('aws_results_count'):
            print(f"📊 AWS Results: {result['aws_results_count']}")

# ==========================================
# SETUP AND INITIALIZATION
# ==========================================

def setup_migration_agentic_rag():
    """Setup the complete migration agentic RAG system.

    Steps: obtain the Tavily API key (Colab secret first, manual prompt as
    fallback), run one test search to verify the key, then build a
    ``MigrationAgenticRAG`` around the module-level ``migration_rag``.

    Returns:
        The initialised ``MigrationAgenticRAG``, or None when no key was
        provided, the test search reported an error, or ``migration_rag``
        is not defined.
    """

    print("🚀 Setting up Migration Agentic RAG System...")
    print("=" * 50)

    # Get Tavily API key
    tavily_api_key = None
    try:
        from google.colab import userdata
        tavily_api_key = userdata.get('TAVILY_API_KEY')
    except Exception as e:
        print(f"❌ Error loading Tavily API key from secrets: {e}")
        print("🔑 Please add TAVILY_API_KEY to Colab secrets")
        print("📝 Get your free key from: https://tavily.com")
    else:
        # A secret that exists but is blank must not count as a valid key
        tavily_api_key = (tavily_api_key or "").strip()
        if tavily_api_key:
            print("✅ Tavily API key loaded from Colab secrets")
        else:
            print("❌ TAVILY_API_KEY secret is empty")

    if not tavily_api_key:
        # Allow manual input as fallback
        tavily_api_key = input("Enter Tavily API key manually (or press Enter to skip): ").strip()
        if not tavily_api_key:
            print("❌ No Tavily API key provided - AWS search will not work")
            return None

    # Test Tavily connection with AWS search
    print("🔄 Testing Tavily AWS search connection...")
    test_searcher = TavilySearcher(tavily_api_key)
    test_result = test_searcher.search("AWS Transform mainframe migration", max_results=1)

    if test_result.get('error'):
        print(f"❌ Tavily test failed: {test_result['error']}")
        return None

    result_count = len(test_result.get('results', []))
    print(f"✅ Tavily AWS search successful! Found {result_count} results")

    # Initialize agentic RAG system
    try:
        if migration_rag is None:
            # The earlier cell sets migration_rag to None when its setup
            # fails, which the NameError guard below cannot detect.
            print("⚠️ migration_rag is None - internal lookups will fail and fall back to AWS search")

        agentic_rag = MigrationAgenticRAG(
            internal_rag_system=migration_rag,  # From previous phase
            tavily_api_key=tavily_api_key,
            show_internals=True
        )

        print("✅ Migration Agentic RAG system initialized!")
        print("🎯 Ready for intelligent internal/AWS routing!")
        return agentic_rag

    except NameError as e:
        print(f"❌ Missing component: {e}")
        print("📋 Make sure previous phases are complete:")
        print("   • migration_rag (from Phase 1D enhanced)")
        return None

# ==========================================
# DEMO AND TESTING
# ==========================================

def demo_migration_routing():
    """Walk through three sample queries, one per routing outcome.

    Builds the agentic system via ``setup_migration_agentic_rag``, then for
    each sample prints the expected route, the routing summary and a
    200-character answer preview, pausing for Enter between samples.

    Returns:
        The agentic system on success; None when setup returned a falsy
        value or any exception was raised during the demo.
    """

    # Each sample query is paired with the route it is meant to illustrate
    scenarios = [
        # Demo 1: Should route to INTERNAL docs
        ("What were the lessons learned from our CAMS migration?",
         "Expected: INTERNAL (company-specific experience)"),

        # Demo 2: Should route to AWS docs
        ("What are the latest AWS Transform features for COBOL conversion?",
         "Expected: AWS (current AWS service features)"),

        # Demo 3: Should use HYBRID approach
        ("Best practices for COBOL to Java migration",
         "Expected: HYBRID (combines internal + AWS guidance)"),
    ]

    print("🎭 MIGRATION AGENTIC ROUTING DEMONSTRATION (3 Examples)")
    print("=" * 60)

    try:
        agentic_system = setup_migration_agentic_rag()
        if not agentic_system:
            return None

        total = len(scenarios)
        for step, (query, expectation) in enumerate(scenarios, 1):
            header = '🎬 DEMO ' + str(step) + ': ' + query
            print(f"\n{header:-^70}")

            # Show expected routing
            print(f"💡 {expectation}")

            result = agentic_system.answer_question(query)

            # Show routing summary
            agentic_system.show_routing_summary(result)

            # Show answer preview (first 200 characters)
            full_answer = result['answer']
            if len(full_answer) > 200:
                answer_preview = full_answer[:200] + "..."
            else:
                answer_preview = full_answer
            print(f"\n📖 Answer Preview:\n{answer_preview}")

            if step < total:
                input(f"\n👆 Press Enter to continue to demo {step+1}...")

        for line in (
            "\n🎉 Migration agentic routing demo complete!",
            "🎯 The system intelligently routes between:",
            "   📚 Internal migration experience & documentation",
            "   🌐 AWS official documentation & services",
            "   🔄 Hybrid approach combining both sources",
        ):
            print(line)

        return agentic_system

    except Exception as e:
        print(f"❌ Demo failed: {e}")
        return None

# ==========================================
# EXECUTION
# ==========================================

print("🤖 Initializing Complete Migration Agentic RAG System...")

# The demo both builds the system and exercises it; a falsy result means failure
migration_agentic_system = demo_migration_routing()

# Alias read by ask_migration() and test_routing(); None when setup failed
agentic_rag = migration_agentic_system if migration_agentic_system else None

if agentic_rag:
    print(
        "\n🎯 MIGRATION AGENTIC RAG READY!",
        "💡 Usage: migration_agentic_system.answer_question('your query')",
        "🎓 Test different query types to see intelligent routing!",
        sep="\n",
    )
else:
    print("❌ Migration Agentic RAG setup failed. Check errors above.")

# Notebook shortcuts around the module-level agentic_rag
def ask_migration(query: str):
    """Send ``query`` through the module-level ``agentic_rag``.

    Returns:
        The dict from ``agentic_rag.answer_question(query)``, or None (after
        printing a notice) when ``agentic_rag`` is falsy.
    """
    if not agentic_rag:
        print("❌ Migration Agentic RAG system not initialized")
        return None
    return agentic_rag.answer_question(query)

def test_routing(query: str):
    """Test routing decision without full processing"""
    if agentic_rag:
        route, reason = agentic_rag.router.route_query(query)
        print(f"Query: '{query}'")
        print(f"Route: {route.upper()}")
        print(f"Reason: {reason}")
        return route, reason
    else:
        print("❌ System not initialized")

# Usage hints for the shortcut functions defined above
print(
    "\n🎤 READY FOR INTELLIGENT MIGRATION ASSISTANCE!",
    "💡 ask_migration('How does AWS Transform handle COBOL conversion?')",
    "💡 ask_migration('What were our lessons learned from migration?')",
    "🔍 test_routing('your query') - See routing decision only",
    sep="\n",
)

# Testing the Agentic RAG

In [None]:
# Router outcome: 'internal' — no AWS pattern matches and 'accval01' hits the
# specific_systems group. The internal answer may still escalate to hybrid if
# should_try_aws_fallback judges it too short or flags a weak-answer phrase.
ask_migration('How does our ACCVAL01 program handle overdraft calculations?')

In [None]:
# Router outcome: 'aws' — "amazon q" matches the first aws_services pattern,
# and AWS patterns are checked before internal ones.
ask_migration("What are the latest Amazon Q Developer features?")

In [None]:
# Router outcome: 'hybrid' — no regex group matches and neither keyword list
# scores, so both internal docs and AWS search are consulted.
ask_migration("Best practices for COBOL to Java migration?")