# AI Document Assistant - RAG System with PDF and Confluence Integration

This notebook contains a complete RAG system with Streamlit frontend for Kaggle deployment.

## 🚀 Installation and Setup

In [1]:

# Install required packages
!pip install -q streamlit langchain langchain-community langchain-huggingface
!pip install -q faiss-cpu sentence-transformers PyMuPDF atlassian-python-api
!pip install -q boto3 langchain-aws python-dotenv jinja2 beautifulsoup4
!pip install -q torch torchvision
!pip install -q pyngrok openai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.8/442.8 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.6.0 requires fsspec[http]<=2025.3.0,>=2023.1.

In [2]:
# Import libraries and setup
import os
import sys
import tempfile
import subprocess
import traceback
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Create working directory
work_dir = Path('/kaggle/working')
work_dir.mkdir(exist_ok=True)
os.chdir(work_dir)

print('✅ Setup complete!')

✅ Setup complete!


## 📁 Core Components Implementation

### 1. Confluence Integration Module

In [3]:
%%writefile extract_confluence.py
from atlassian import Confluence
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import os
import traceback
from dotenv import load_dotenv
from bs4 import BeautifulSoup

load_dotenv()

class ConfluenceProcessor:
    def __init__(self):
        self.confluence = None
        self.available = False
        self.error_message = None
        
        # FIXED: Proper validation of Confluence credentials
        confluence_url = os.getenv('CONFLUENCE_URL', '').strip()
        confluence_username = os.getenv('CONFLUENCE_USERNAME', '').strip()
        confluence_token = os.getenv('CONFLUENCE_API_TOKEN', '').strip()
        
        if not confluence_url or not confluence_username or not confluence_token:
            self.error_message = 'Missing Confluence credentials (URL, USERNAME, or API_TOKEN)'
            print(f'Confluence not configured: {self.error_message}')
            return
        
        try:
            self.confluence = Confluence(
                url=confluence_url,
                username=confluence_username,
                password=confluence_token
            )
            
            # Test actual connection
            test_result = self.confluence.get_all_spaces(start=0, limit=1)
            if test_result:
                self.available = True
                self.embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
                print('Confluence connection verified')
            else:
                self.error_message = 'Unable to fetch spaces - check permissions'
                print(f'Confluence test failed: {self.error_message}')
        except Exception as e:
            self.error_message = f'Connection failed: {str(e)}'
            print(f'Confluence connection error: {self.error_message}')
    
    def get_connection_status(self):
        return {
            'available': self.available,
            'error': self.error_message,
            'url': os.getenv('CONFLUENCE_URL', 'Not set'),
            'username': os.getenv('CONFLUENCE_USERNAME', 'Not set')
        }
    
    def clean_html_content(self, html_content):
        if not html_content:
            return ''
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            for script in soup(['script', 'style']):
                script.decompose()
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
            text = ' '.join(chunk for chunk in chunks if chunk)
            return text
        except Exception as e:
            print(f'Error cleaning HTML: {e}')
            return str(html_content)[:1000]
    
    def get_page_content(self, page_id):
        if not self.available:
            return None
        try:
            page = self.confluence.get_page_by_id(page_id, expand='body.storage,version,space')
            title = page['title']
            content = page['body']['storage']['value']
            space_key = page['space']['key']
            clean_content = self.clean_html_content(content)
            return {
                'title': title,
                'content': clean_content,
                'page_id': page_id,
                'space_key': space_key,
                'url': f"{os.getenv('CONFLUENCE_URL')}/pages/viewpage.action?pageId={page_id}"
            }
        except Exception as e:
            print(f'Error fetching page {page_id}: {str(e)}')
            return None
    
    def process_confluence_to_vectorstore(self, page_ids=None, chunk_size=500):
        if not self.available:
            print(f'Confluence not available: {self.error_message}')
            return None
        
        documents = []
        if page_ids:
            for page_id in page_ids:
                page_data = self.get_page_content(page_id)
                if page_data:
                    doc = Document(
                        page_content=f"Title: {page_data['title']}\n\n{page_data['content']}",
                        metadata={
                            'source': 'confluence',
                            'title': page_data['title'],
                            'page_id': page_data['page_id'],
                            'space_key': page_data['space_key'],
                            'url': page_data['url']
                        }
                    )
                    documents.append(doc)
        
        if not documents:
            print('No documents found to process')
            return None
        
        print(f'Loaded {len(documents)} documents from Confluence')
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100)
        chunks = splitter.split_documents(documents)
        print(f'Split into {len(chunks)} chunks.')
        vectorstore = FAISS.from_documents(chunks, self.embeddings)
        print('Stored Confluence chunks in FAISS vector store.')
        return vectorstore

Writing extract_confluence.py


### 2. Unified Data Processor

In [4]:
%%writefile unified_processor.py
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from extract_confluence import ConfluenceProcessor
import os
import traceback
import sys

class UnifiedDataProcessor:
    def __init__(self, vector_store_path='./vector_store/'):
        self.vector_store_path = vector_store_path
        self.embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
        self.confluence_processor = ConfluenceProcessor()
        self.vectorstore = None
        os.makedirs(vector_store_path, exist_ok=True)
        self.load_existing_vectorstore()
    
    def load_existing_vectorstore(self):
        try:
            if os.path.exists(os.path.join(self.vector_store_path, 'index.faiss')):
                self.vectorstore = FAISS.load_local(
                    self.vector_store_path,
                    embeddings=self.embeddings,
                    allow_dangerous_deserialization=True
                )
                print('Loaded existing vector store')
            else:
                print('No existing vector store found')
        except Exception as e:
            print(f'Error loading existing vector store: {str(e)}')
            traceback.print_exc()
    
    def add_pdf_documents(self, pdf_paths, chunk_size=500):
        all_documents = []
        if isinstance(pdf_paths, str):
            pdf_paths = [pdf_paths]
        
        for pdf_path in pdf_paths:
            try:
                loader = PyPDFLoader(pdf_path)
                documents = loader.load()
                
                # FIXED: Enhanced metadata for better search results
                for i, doc in enumerate(documents):
                    doc.metadata['source'] = 'pdf'
                    doc.metadata['file_path'] = pdf_path
                    doc.metadata['file_name'] = os.path.basename(pdf_path)
                    doc.metadata['title'] = os.path.splitext(os.path.basename(pdf_path))[0]
                    doc.metadata['page'] = i + 1
                    doc.metadata['total_pages'] = len(documents)
                    
                    # Ensure content is meaningful
                    if not doc.page_content or len(doc.page_content.strip()) < 10:
                        doc.page_content = f'Content from {doc.metadata["title"]} - Page {doc.metadata["page"]}'
                
                all_documents.extend(documents)
                print(f'Loaded {len(documents)} pages from {os.path.basename(pdf_path)}')
            except Exception as e:
                print(f'Error processing PDF {pdf_path}: {str(e)}')
                traceback.print_exc()
        
        if all_documents:
            self._add_documents_to_vectorstore(all_documents, chunk_size)
    
    def add_confluence_documents(self, page_ids=None, chunk_size=500):
        try:
            confluence_vectorstore = self.confluence_processor.process_confluence_to_vectorstore(
                page_ids=page_ids, chunk_size=chunk_size
            )
            if confluence_vectorstore:
                if self.vectorstore is None:
                    self.vectorstore = confluence_vectorstore
                else:
                    self.vectorstore.merge_from(confluence_vectorstore)
                self.save_vectorstore()
                print('Successfully added Confluence documents to vector store')
            else:
                print('Failed to process Confluence documents')
        except Exception as e:
            print(f'Error adding Confluence documents: {str(e)}')
            traceback.print_exc()
    
    def _add_documents_to_vectorstore(self, documents, chunk_size):
        try:
            splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=100)
            chunks = splitter.split_documents(documents)
            print(f'Split into {len(chunks)} chunks.')
            
            if self.vectorstore is None:
                self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
            else:
                new_vectorstore = FAISS.from_documents(chunks, self.embeddings)
                self.vectorstore.merge_from(new_vectorstore)
            
            self.save_vectorstore()
            print('Added documents to unified vector store.')
        except Exception as e:
            print(f'Error adding documents to vector store: {str(e)}')
            traceback.print_exc()
    
    def save_vectorstore(self):
        try:
            if self.vectorstore:
                self.vectorstore.save_local(self.vector_store_path)
                print(f'Vector store saved to {self.vector_store_path}')
        except Exception as e:
            print(f'Error saving vector store: {str(e)}')
            traceback.print_exc()
    
    def get_vectorstore(self):
        return self.vectorstore
    
    # FIXED: Enhanced search with proper error handling and source information
    def search_documents(self, query, k=5):
        if self.vectorstore is None:
            print('No vector store available')
            return []
        
        if not query or not query.strip():
            print('Empty query provided')
            return []
        
        try:
            print(f'Searching for: "{query}" (k={k})')
            results = self.vectorstore.similarity_search_with_score(query, k=k)
            print(f'Found {len(results)} results')
            
            # Enhanced result formatting with source information
            formatted_results = []
            for doc, score in results:
                # Ensure all metadata fields exist
                metadata = doc.metadata.copy()
                metadata.setdefault('title', 'Unknown Document')
                metadata.setdefault('source', 'unknown')
                metadata.setdefault('file_name', 'Unknown File')
                
                formatted_results.append((doc, score))
            
            return formatted_results
        except Exception as e:
            print(f'Error searching documents: {str(e)}')
            print(f'Vector store type: {type(self.vectorstore)}')
            print(f'Query type: {type(query)}, length: {len(query) if query else 0}')
            traceback.print_exc()
            return []
    
    def get_confluence_status(self):
        return self.confluence_processor.get_connection_status()

Writing unified_processor.py


### 3. Design Document Generator

In [5]:
%%writefile design_doc_generator.py
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from jinja2 import Template
import os
import traceback
from datetime import datetime
import re
from collections import defaultdict

class DesignDocumentGenerator:
    def __init__(self, vector_store_path='./vector_store/'):
        self.vector_store_path = vector_store_path
        self._embeddings = None
        self._processor = None
        self._vectorstore = None
        
        # Enhanced template with better structure and RAG integration
        self.design_doc_template = '''# {{title}}

**Document Type:** Technical Design Document
**Generated:** {{timestamp}}
**Version:** 1.0
**Status:** Draft

---

## 1. Executive Summary
{{overview}}

## 2. Background and Context
{{background}}

## 3. Requirements Analysis
{{requirements}}

## 4. System Architecture
{{architecture}}

## 5. Technical Implementation
{{implementation}}

## 6. Data Flow and Integration
{{data_flow}}

## 7. Security Considerations
{{security}}

## 8. Performance and Scalability
{{performance}}

## 9. Testing Strategy
{{testing}}

## 10. Deployment Plan
{{deployment}}

## 11. Timeline and Milestones
{{timeline}}

## 12. Risk Assessment
{{risks}}

## 13. References and Sources
{{references}}

---
**Document Metadata:**
- Sources analyzed: {{source_count}} documents
- Content types: {{content_types}}
- Generated using RAG-enhanced analysis
- Last updated: {{timestamp}}
'''

    @property
    def embeddings(self):
        if self._embeddings is None:
            self._embeddings = HuggingFaceEmbeddings(
                model_name='sentence-transformers/all-MiniLM-L6-v2',
                model_kwargs={'device': 'cpu'},
                encode_kwargs={'normalize_embeddings': True}
            )
        return self._embeddings

    @property
    def processor(self):
        if self._processor is None:
            try:
                from unified_processor_fast import UnifiedDataProcessor
            except ImportError:
                try:
                    from unified_processor import UnifiedDataProcessor
                except ImportError:
                    from unified_processor_fixed import UnifiedDataProcessor
            self._processor = UnifiedDataProcessor(self.vector_store_path)
        return self._processor

    @property
    def vectorstore(self):
        if self._vectorstore is None:
            self._vectorstore = self.processor.get_vectorstore()
        return self._vectorstore

    def _extract_relevant_content(self, user_request, k=10):
        """Extract and analyze relevant content from documents using RAG"""
        if not self.vectorstore:
            return [], {}, ""
        
        try:
            print(f"🔍 Searching for relevant content: {user_request}")
            # Search for relevant documents
            results = self.processor.search_documents(user_request, k=k)
            
            sources = []
            content_by_type = defaultdict(list)
            all_content = ""
            
            for doc, score in results:
                # Extract comprehensive metadata
                source_info = {
                    'title': doc.metadata.get('title', doc.metadata.get('file_name', 'Unknown')),
                    'type': doc.metadata.get('source', 'unknown'),
                    'score': float(score),
                    'url': doc.metadata.get('url', ''),
                    'page': doc.metadata.get('page', ''),
                    'content_preview': doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
                }
                sources.append(source_info)
                
                # Group content by type for better analysis
                content_type = doc.metadata.get('source', 'unknown')
                content_by_type[content_type].append({
                    'title': source_info['title'],
                    'content': doc.page_content,
                    'score': score
                })
                
                # Accumulate all content for comprehensive analysis
                all_content += f"\n\n--- From {source_info['title']} ---\n{doc.page_content}"
            
            print(f"📚 Found {len(sources)} relevant sources across {len(content_by_type)} content types")
            return sources, dict(content_by_type), all_content
            
        except Exception as e:
            print(f"❌ Content extraction error: {e}")
            return [], {}, ""

    def _extract_key_terms(self, content, user_request):
        """Extract key technical terms and concepts from retrieved content"""
        technical_terms = []
        
        # Common technical patterns to look for in the content
        patterns = [
            r'\b(API|REST|GraphQL|microservice|database|authentication|authorization)\b',
            r'\b(Docker|Kubernetes|AWS|Azure|GCP|cloud)\b',
            r'\b(React|Angular|Vue|Node\.js|Python|Java|Go|JavaScript)\b',
            r'\b(PostgreSQL|MySQL|MongoDB|Redis|SQL)\b',
            r'\b(OAuth|JWT|SSL|TLS|HTTPS|security)\b',
            r'\b(CI/CD|DevOps|deployment|monitoring)\b'
        ]
        
        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            technical_terms.extend(matches)
        
        # Remove duplicates and return top terms
        return list(set(technical_terms))[:10]

    def _extract_requirements_from_content(self, content):
        """Extract functional and non-functional requirements from content"""
        requirements = {
            'functional': [],
            'non_functional': []
        }
        
        # Look for requirement patterns in the content
        func_patterns = [
            r'must\s+(be able to|support|provide|allow)\s+([^.]+)',
            r'shall\s+([^.]+)',
            r'requirement[s]?[:]?\s+([^.]+)',
            r'should\s+(be able to|support|provide|allow)\s+([^.]+)'
        ]
        
        for pattern in func_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                req = match if isinstance(match, str) else match[-1]
                if len(req.strip()) > 10:  # Only meaningful requirements
                    requirements['functional'].append(req.strip())
        
        # Non-functional requirements
        nf_patterns = [
            r'performance[:]?\s+([^.]+)',
            r'scalability[:]?\s+([^.]+)',
            r'security[:]?\s+([^.]+)',
            r'availability[:]?\s+([^.]+)',
            r'response time[:]?\s+([^.]+)'
        ]
        
        for pattern in nf_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            requirements['non_functional'].extend([match.strip() for match in matches if len(match.strip()) > 5])
        
        return requirements

    def _analyze_content_for_section(self, section_type, user_request, content_by_type, all_content):
        """Analyze retrieved content to generate contextual section content"""
        
        # Extract insights from the retrieved content
        key_terms = self._extract_key_terms(all_content, user_request)
        requirements = self._extract_requirements_from_content(all_content)
        
        if section_type == 'overview':
            return self._generate_contextual_overview(user_request, key_terms, content_by_type)
        elif section_type == 'background':
            return self._generate_contextual_background(user_request, all_content, content_by_type)
        elif section_type == 'requirements':
            return self._generate_contextual_requirements(user_request, requirements, key_terms)
        elif section_type == 'architecture':
            return self._generate_contextual_architecture(user_request, key_terms, content_by_type)
        elif section_type == 'implementation':
            return self._generate_contextual_implementation(user_request, key_terms, requirements)
        elif section_type == 'data_flow':
            return self._generate_data_flow_section(user_request, key_terms)
        elif section_type == 'security':
            return self._generate_security_section(user_request, key_terms)
        elif section_type == 'performance':
            return self._generate_performance_section(user_request, requirements)
        elif section_type == 'testing':
            return self._generate_testing_section(user_request, requirements)
        elif section_type == 'deployment':
            return self._generate_deployment_section(user_request, key_terms)
        elif section_type == 'timeline':
            return self._generate_timeline_section(user_request, requirements)
        elif section_type == 'risks':
            return self._generate_risks_section(user_request, key_terms)
        else:
            return f"Content for {section_type} will be developed based on detailed analysis."

    def _generate_contextual_overview(self, user_request, key_terms, content_by_type):
        """Generate overview using retrieved content context"""
        content_types = list(content_by_type.keys())
        
        overview = f"""This technical design document outlines the comprehensive approach for implementing: **{user_request}**

**Project Scope:**
Based on analysis of {len(content_by_type)} different content sources ({', '.join(content_types)}), this solution addresses the following key areas:

"""
        
        if key_terms:
            overview += f"""**Key Technologies Identified from Documentation:**
{', '.join(key_terms[:8])}

"""
        
        overview += f"""**Solution Approach:**
The design incorporates insights from existing organizational documentation and leverages identified best practices to ensure:
- Alignment with current technical standards and patterns
- Integration with existing systems and workflows
- Scalable and maintainable architecture based on proven approaches
- Security and compliance requirements derived from organizational standards

**Documentation Analysis:**
This design is informed by {sum(len(docs) for docs in content_by_type.values())} relevant documents from your knowledge base, ensuring contextual relevance and organizational alignment."""

        return overview

    def _generate_contextual_background(self, user_request, all_content, content_by_type):
        """Generate background section with context from retrieved documents"""
        
        background = f"""**Current State Analysis:**
The need for {user_request} has been identified through comprehensive analysis of existing documentation and organizational requirements.

**Context from Available Documentation:**
"""
        
        # Summarize content by type with actual insights
        for content_type, docs in content_by_type.items():
            if docs:
                # Get the highest scoring document for this type
                top_doc = max(docs, key=lambda x: x['score'])
                background += f"""
**{content_type.title()} Sources ({len(docs)} documents):**
- Primary insight: {top_doc['content'][:200]}...
- Relevance score: {top_doc['score']:.3f}
"""
        
        background += f"""
**Problem Statement:**
Based on the analyzed documentation, this design addresses the implementation of {user_request} while ensuring:
- Compatibility with existing systems and documented patterns
- Adherence to established organizational practices and standards
- Meeting identified business and technical requirements from multiple sources
- Leveraging existing knowledge and avoiding reinvention

**Stakeholder Requirements:**
The solution incorporates requirements and insights identified across {len(content_by_type)} different content types, ensuring comprehensive coverage of both functional and operational needs."""

        return background

    def _generate_contextual_requirements(self, user_request, requirements, key_terms):
        """Generate requirements based on extracted content"""
        req_section = f"""**Functional Requirements:**
Based on analysis of available documentation, the following functional requirements have been identified:

"""
        
        if requirements['functional']:
            for i, req in enumerate(requirements['functional'][:5], 1):
                req_section += f"{i}. {req}\n"
        else:
            req_section += f"""1. Core {user_request} functionality implementation
2. User interface and interaction requirements based on organizational standards
3. Data processing and management capabilities
4. Integration with existing systems and documented APIs
5. Reporting and monitoring features aligned with current practices
"""
        
        req_section += f"""
**Non-Functional Requirements:**
"""
        
        if requirements['non_functional']:
            for req in requirements['non_functional'][:5]:
                req_section += f"- {req}\n"
        else:
            req_section += """- Performance: Response time < 2 seconds for standard operations
- Scalability: Support for concurrent users and growing data volumes
- Availability: 99.9% uptime with minimal planned downtime
- Security: Industry-standard encryption and authentication
- Maintainability: Well-documented, modular code architecture
- Compliance: Adherence to organizational security and data policies
"""
        
        if key_terms:
            req_section += f"""
**Technology Requirements (from documentation analysis):**
- Integration with identified technologies: {', '.join(key_terms[:5])}
- Compatibility with existing technology stack
"""
        
        return req_section

    def _generate_contextual_architecture(self, user_request, key_terms, content_by_type):
        """Generate architecture section based on retrieved content"""
        
        arch_section = f"""**System Architecture Overview:**
The {user_request} solution follows a modern, scalable architecture designed to integrate with existing organizational systems and patterns.

**Core Components:**
1. **Presentation Layer**
   - User interface components following organizational design standards
   - API gateway and routing based on existing patterns
   - Authentication and session management integration

2. **Business Logic Layer**
   - Core {user_request} processing aligned with business rules
   - Workflow orchestration following documented processes
   - Service integration with existing business systems

3. **Data Layer**
   - Primary data storage using organizational standards
   - Caching mechanisms based on performance requirements
   - Data access patterns consistent with existing systems

4. **Integration Layer**
   - External system connectors for documented integrations
   - Message queuing and processing using established patterns
   - Event handling aligned with organizational event architecture

"""
        
        if key_terms:
            arch_section += f"""**Technology Stack (based on documentation analysis):**
- Identified technologies: {', '.join(key_terms[:6])}
- Architecture patterns: Microservices, API-first, Event-driven
- Integration approaches: RESTful APIs, Message queues, Event streaming
"""
        
        if content_by_type:
            arch_section += f"""
**Integration Context:**
Based on analysis of {len(content_by_type)} content types, the architecture ensures:
- Compatibility with existing {', '.join(content_by_type.keys())} systems
- Adherence to documented architectural patterns and standards
- Seamless integration with current technology ecosystem
"""
        
        return arch_section

    def _generate_contextual_implementation(self, user_request, key_terms, requirements):
        """Generate implementation section with retrieved content context"""
        
        impl_section = f"""**Implementation Strategy:**
The development of {user_request} will follow an iterative, risk-driven approach based on organizational best practices.

**Development Phases:**

**Phase 1: Foundation (Weeks 1-3)**
- Core infrastructure setup using identified technologies
- Basic {user_request} functionality implementation
- Database schema design based on documented data models
- Authentication framework integration with existing systems

**Phase 2: Core Features (Weeks 4-6)**
- Primary business logic implementation following documented patterns
- User interface development aligned with organizational standards
- API development using established conventions
- Integration with key systems identified in documentation

**Phase 3: Advanced Features (Weeks 7-8)**
- Advanced functionality based on extracted requirements
- Performance optimization using documented best practices
- Security implementation following organizational standards
- Comprehensive testing aligned with quality processes

**Phase 4: Deployment (Weeks 9-10)**
- Production environment setup using established patterns
- Deployment automation following organizational DevOps practices
- Monitoring and alerting integration with existing systems
- Documentation and training based on organizational standards

"""
        
        if key_terms:
            impl_section += f"""**Technology Implementation:**
Based on documentation analysis, implementation will leverage:
- Core technologies: {', '.join(key_terms[:4])}
- Development patterns: Following documented organizational standards
- Integration approaches: Using established APIs and protocols
"""
        
        impl_section += f"""
**Development Standards:**
- Code review processes aligned with organizational practices
- Automated testing following documented quality standards
- Continuous integration using established CI/CD pipelines
- Documentation standards consistent with organizational requirements
- Security practices based on documented security policies
"""
        
        return impl_section

    # Add placeholder methods for other sections
    def _generate_data_flow_section(self, user_request, key_terms):
        return f"""**Data Flow Architecture:**
The {user_request} system processes data through documented organizational patterns:

**Input Processing:**
- Data ingestion following established data pipeline patterns
- Validation using organizational data quality standards
- Transformation based on documented data models

**Core Processing:**
- Business logic execution aligned with documented processes
- State management using established patterns
- Event generation following organizational event architecture

**Output Generation:**
- Result formatting based on organizational standards
- Integration with existing reporting systems
- API responses following documented conventions

**Technology Integration:**
{f"- Leveraging identified technologies: {', '.join(key_terms[:4])}" if key_terms else "- Using organizational standard technology stack"}
- Following documented data architecture patterns
- Ensuring compliance with data governance policies
"""

    def _generate_security_section(self, user_request, key_terms):
        return f"""**Security Framework:**
The {user_request} implementation incorporates comprehensive security measures based on organizational standards:

**Authentication & Authorization:**
- Integration with existing identity management systems
- Role-based access control following organizational patterns
- Multi-factor authentication using established protocols
- Session management aligned with security policies

**Data Protection:**
- Encryption standards based on organizational requirements
- Data classification following documented policies
- Access controls aligned with data governance standards
- Audit logging using established security monitoring

**Application Security:**
- Security testing following organizational security practices
- Vulnerability management using established processes
- Code security reviews aligned with development standards
- Compliance with documented security policies

**Technology Security:**
{f"- Security implementation for identified technologies: {', '.join(key_terms[:3])}" if key_terms else "- Following organizational technology security standards"}
- Integration with existing security infrastructure
- Monitoring and alerting using established security tools
"""

    def _generate_performance_section(self, user_request, requirements):
        return f"""**Performance Requirements:**
The {user_request} system is designed for optimal performance based on organizational standards:

**Response Time Targets:**
- API responses: Following organizational SLA requirements
- User interface: Based on documented user experience standards
- Batch processing: Aligned with existing system performance expectations

**Scalability Design:**
- Horizontal scaling using established infrastructure patterns
- Load balancing following organizational deployment standards
- Auto-scaling based on documented capacity planning approaches

**Performance Optimization:**
- Caching strategies using organizational standard technologies
- Database optimization following documented best practices
- Monitoring integration with existing performance management systems

**Performance Testing:**
- Load testing using established testing frameworks
- Performance benchmarking against organizational standards
- Capacity planning following documented processes
"""

    def _generate_testing_section(self, user_request, requirements):
        return f"""**Testing Strategy:**
Comprehensive testing approach for {user_request} following organizational quality standards:

**Testing Framework:**
- Unit testing using established organizational frameworks
- Integration testing following documented testing patterns
- End-to-end testing aligned with quality assurance processes
- Performance testing using organizational standard tools

**Quality Assurance:**
- Code review processes following organizational standards
- Automated testing integration with existing CI/CD pipelines
- Test coverage requirements based on organizational policies
- Quality gates aligned with documented quality standards

**Test Automation:**
- Automated test execution using established testing infrastructure
- Regression testing following organizational testing practices
- Test reporting integration with existing quality management systems
"""

    def _generate_deployment_section(self, user_request, key_terms):
        return f"""**Deployment Strategy:**
The {user_request} system deployment follows organizational DevOps practices:

**Deployment Pipeline:**
- CI/CD integration with existing organizational pipelines
- Environment management following established patterns
- Deployment automation using organizational standard tools
- Release management aligned with documented processes

**Infrastructure:**
- Deployment using organizational standard infrastructure
- Monitoring integration with existing operational systems
- Backup and recovery following documented procedures
- Security compliance with organizational deployment standards

**Technology Deployment:**
{f"- Deployment of identified technologies: {', '.join(key_terms[:3])}" if key_terms else "- Using organizational standard deployment technologies"}
- Configuration management following established practices
- Environment consistency using documented deployment patterns
"""

    def _generate_timeline_section(self, user_request, requirements):
        return f"""**Project Timeline:**
Estimated timeline for {user_request} implementation based on organizational project management standards:

**Phase-based Timeline:**
- **Weeks 1-3**: Foundation and setup following organizational onboarding processes
- **Weeks 4-6**: Core development using established development practices
- **Weeks 7-8**: Integration and testing following organizational quality processes
- **Weeks 9-10**: Deployment using established deployment procedures

**Key Milestones:**
- Technical design approval: Following organizational review processes
- Development milestones: Aligned with organizational project management standards
- Testing completion: Based on documented quality gates
- Production deployment: Following organizational go-live procedures

**Risk Management:**
- Timeline risks managed using organizational project management practices
- Resource allocation following established resource management processes
- Dependency management aligned with organizational project coordination
"""

    def _generate_risks_section(self, user_request, key_terms):
        return f"""**Risk Assessment:**
Risk identification and mitigation for {user_request} based on organizational risk management practices:

**Technical Risks:**
- Integration complexity with existing systems
- Performance risks based on documented system constraints
- Security risks managed through organizational security practices
- Technology risks for identified technologies: {', '.join(key_terms[:3]) if key_terms else 'standard technology stack'}

**Project Risks:**
- Resource availability managed through organizational resource planning
- Timeline risks mitigated using established project management practices
- Scope management following organizational change control processes

**Mitigation Strategies:**
- Risk monitoring using organizational risk management tools
- Escalation procedures following documented organizational processes
- Contingency planning based on organizational risk management standards
- Regular risk reviews aligned with project management practices
"""

    def generate_design_document(self, user_request, title=None):
        """Generate enhanced design document using RAG content analysis"""
        start_time = datetime.now()
        
        # Generate title
        if not title:
            title = f"Technical Design Document: {user_request}"
        
        print(f"🔍 Analyzing relevant content for: {user_request}")
        
        # Extract relevant content from documents using RAG
        sources, content_by_type, all_content = self._extract_relevant_content(user_request, k=12)
        
        print(f"📚 Found {len(sources)} relevant sources across {len(content_by_type)} content types")
        
        # Generate sections using retrieved content context
        sections = {}
        section_types = [
            'overview', 'background', 'requirements', 'architecture', 
            'implementation', 'data_flow', 'security', 'performance', 
            'testing', 'deployment', 'timeline', 'risks'
        ]
        
        print("🤖 Generating contextual content sections...")
        for section_type in section_types:
            try:
                sections[section_type] = self._analyze_content_for_section(
                    section_type, user_request, content_by_type, all_content
                )
            except Exception as e:
                print(f"⚠️ Error generating {section_type}: {e}")
                sections[section_type] = f"Content for {section_type} section will be developed based on detailed analysis."
        
        # Format references with enhanced information
        references = self._format_enhanced_references(sources)
        content_types_str = ', '.join(content_by_type.keys()) if content_by_type else 'Various'
        
        # Template variables
        template_vars = {
            'title': title,
            'overview': sections['overview'],
            'background': sections['background'],
            'requirements': sections['requirements'],
            'architecture': sections['architecture'],
            'implementation': sections['implementation'],
            'data_flow': sections['data_flow'],
            'security': sections['security'],
            'performance': sections['performance'],
            'testing': sections['testing'],
            'deployment': sections['deployment'],
            'timeline': sections['timeline'],
            'risks': sections['risks'],
            'references': references,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'source_count': len(sources),
            'content_types': content_types_str
        }
        
        # Generate document
        template = Template(self.design_doc_template)
        design_document = template.render(**template_vars)
        
        generation_time = (datetime.now() - start_time).total_seconds()
        
        print(f"✅ Enhanced document generated in {generation_time:.2f}s using {len(sources)} sources")
        
        return {
            'document': design_document,
            'sources': sources,
            'content_analysis': {
                'content_by_type': content_by_type,
                'total_content_length': len(all_content),
                'key_insights': f"Analyzed {len(sources)} documents across {len(content_by_type)} content types"
            },
            'metadata': {
                'title': title,
                'user_request': user_request,
                'timestamp': template_vars['timestamp'],
                'generation_time': f"{generation_time:.2f}s",
                'source_count': len(sources),
                'content_types': list(content_by_type.keys()),
                'rag_enhanced': True
            }
        }

    def _format_enhanced_references(self, sources):
        """Format references with detailed information from RAG analysis"""
        if not sources:
            return "No references available from the knowledge base."
        
        references = []
        
        # Group by content type for better organization
        by_type = defaultdict(list)
        for source in sources:
            by_type[source['type']].append(source)
        
        for content_type, type_sources in by_type.items():
            references.append(f"\n**{content_type.title()} Sources:**")
            for i, source in enumerate(type_sources, 1):
                ref = f"{i}. **{source['title']}**"
                if source.get('url'):
                    ref += f" - [Link]({source['url']})"
                if source.get('page'):
                    ref += f" (Page {source['page']})"
                ref += f" (Relevance: {source['score']:.3f})"
                
                # Add content preview for context
                if source.get('content_preview'):
                    ref += f"\n   Preview: {source['content_preview']}"
                
                references.append(ref)
        
        return '\n'.join(references)

    def save_design_document(self, document_data, filename=None):
        """Fast document saving"""
        if not filename:
            safe_title = ''.join(c for c in document_data['metadata']['title'] if c.isalnum() or c in (' ', '-', '_'))
            safe_title = safe_title.replace(' ', '_')[:30]  # Shorter filename
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'{safe_title}_{timestamp}.md'
        
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(document_data['document'])
            print(f'✅ Document saved: {filename}')
            return filename
        except Exception as e:
            print(f'❌ Save error: {e}')
            return None

# Test function
def test_enhanced_generator():
    """Test the enhanced RAG-powered generator"""
    print("🧪 Testing Enhanced RAG Design Document Generator...")
    
    try:
        generator = DesignDocumentGenerator()
        
        # Test with a comprehensive request
        result = generator.generate_design_document(
            "User Authentication and Authorization System with OAuth2 Integration and Multi-Factor Authentication"
        )
        
        print(f"✅ Generated document with {result['metadata']['source_count']} sources")
        print(f"📄 Document length: {len(result['document'])} characters")
        print(f"⏱️ Generation time: {result['metadata']['generation_time']}")
        print(f"🔍 Content types analyzed: {', '.join(result['metadata']['content_types'])}")
        print(f"🤖 RAG enhanced: {result['metadata']['rag_enhanced']}")
        
        # Show content analysis summary
        if 'content_analysis' in result:
            print(f"📊 Content analysis: {result['content_analysis']['key_insights']}")
        
        return True
        
    except Exception as e:
        print(f"❌ Test failed: {e}")
        traceback.print_exc()
        return False

if __name__ == "__main__":
    test_enhanced_generator()

Writing design_doc_generator.py


### 4. Streamlit Frontend Application

In [6]:
%%writefile streamlit_app.py
import streamlit as st
import os
import sys
import tempfile
import traceback
from io import BytesIO
import pandas as pd
from datetime import datetime
import json

# Import our modules with fallback handling
try:
    from unified_processor import UnifiedDataProcessor
except ImportError:
    st.error("Could not import UnifiedDataProcessor. Please check your files.")
    st.stop()

try:
    from design_doc_generator import DesignDocumentGenerator
except ImportError:
    st.error("Could not import DesignDocumentGenerator. Please check your files.")
    st.stop()

try:
    from extract_confluence import ConfluenceProcessor
except ImportError:
    st.error("Could not import ConfluenceProcessor. Please check your files.")
    st.stop()

st.set_page_config(
    page_title='AI Document Assistant',
    page_icon='📚',
    layout='wide',
    initial_sidebar_state='expanded'
)

@st.cache_resource
def initialize_processor():
    return UnifiedDataProcessor()

@st.cache_resource
def initialize_confluence():
    return ConfluenceProcessor()

def initialize_doc_generator():
    try:
        return DesignDocumentGenerator()
    except Exception as e:
        st.session_state.doc_generator_error = str(e)
        return None

# Initialize components
if 'processor' not in st.session_state:
    st.session_state.processor = initialize_processor()

if 'confluence_processor' not in st.session_state:
    st.session_state.confluence_processor = initialize_confluence()

# Header
st.title('📚 AI Document Assistant')
st.markdown('**RAG System with Enhanced PDF and Confluence Integration**')

# Sidebar - System Status
with st.sidebar:
    st.title('🔧 System Status')
    
    # Vector store status
    vectorstore = st.session_state.processor.get_vectorstore()
    has_documents = vectorstore is not None
    
    if has_documents:
        try:
            doc_count = vectorstore.index.ntotal if hasattr(vectorstore, 'index') else 'unknown'
            st.success(f'✅ Vector Store: {doc_count} vectors')
        except:
            st.success('✅ Vector Store: Active')
    else:
        st.warning('⚠️ No documents loaded')
    
    # Confluence status
    confluence_status = st.session_state.processor.get_confluence_status()
    if confluence_status['available']:
        st.success(f'✅ Confluence: Connected')
    else:
        st.error(f'❌ Confluence: {confluence_status["error"]}')

# Main content tabs
tab1, tab2, tab3, tab4, tab5 = st.tabs(['🔍 Query Documents', '📄 Upload PDFs', '🌐 Confluence Integration', '📋 Generate Design Doc', '🧪 System Test'])

# Query Documents tab
with tab1:
    st.header('🔍 Query Your Documents')
    
    if not has_documents:
        st.warning('⚠️ No documents available. Please upload PDFs or ingest Confluence data first.')
    else:
        col1, col2 = st.columns([3, 1])
        with col1:
            query = st.text_input('Enter your question:', placeholder='What would you like to know?')
        with col2:
            k_results = st.slider('Results:', 1, 10, 5)
        
        if st.button('🔍 Search', type='primary', use_container_width=True):
            if query:
                with st.spinner('🔍 Searching documents...'):
                    try:
                        results = st.session_state.processor.search_documents(query, k=k_results)
                        
                        if results:
                            st.success(f'✅ Found {len(results)} relevant results')
                            
                            for i, (doc, score) in enumerate(results, 1):
                                with st.expander(f'📄 Result {i} - Relevance Score: {score:.4f}'):
                                    col1, col2 = st.columns([1, 1])
                                    
                                    with col1:
                                        st.markdown(f'**Source:** {doc.metadata.get("source", "unknown")}')
                                        st.markdown(f'**Title:** {doc.metadata.get("title", "Unknown")}')
                                        if doc.metadata.get('page'):
                                            st.markdown(f'**Page:** {doc.metadata.get("page")}')
                                    
                                    with col2:
                                        if doc.metadata.get('url'):
                                            st.markdown(f'**URL:** [Link]({doc.metadata.get("url")})')
                                        if doc.metadata.get('file_name'):
                                            st.markdown(f'**File:** {doc.metadata.get("file_name")}')
                                    
                                    st.markdown('**Content:**')
                                    # Fixed: Added proper label and label_visibility
                                    st.text_area(
                                        'Document Content', 
                                        doc.page_content, 
                                        height=150, 
                                        key=f'content_{i}',
                                        label_visibility='collapsed'
                                    )
                        else:
                            st.warning('❌ No relevant results found')
                    except Exception as e:
                        st.error(f'❌ Search error: {str(e)}')
                        with st.expander('Error Details'):
                            st.code(traceback.format_exc())
            else:
                st.warning('⚠️ Please enter a search query')

# Upload PDFs tab
with tab2:
    st.header('📄 Upload PDF Documents')
    
    uploaded_files = st.file_uploader(
        'Choose PDF files',
        type=['pdf'],
        accept_multiple_files=True,
        help='Upload one or more PDF files to add to the knowledge base'
    )
    
    col1, col2 = st.columns([1, 1])
    with col1:
        chunk_size = st.slider('Chunk Size:', 200, 1000, 500, step=50)
    with col2:
        st.info(f'Recommended: 500 for balanced performance')
    
    if uploaded_files:
        st.write(f'📁 Selected {len(uploaded_files)} file(s):')
        for file in uploaded_files:
            st.write(f'• {file.name} ({file.size:,} bytes)')
        
        if st.button('📤 Process PDFs', type='primary', use_container_width=True):
            progress_bar = st.progress(0)
            status_text = st.empty()
            
            try:
                temp_files = []
                for i, uploaded_file in enumerate(uploaded_files):
                    status_text.text(f'Processing {uploaded_file.name}...')
                    progress_bar.progress((i + 1) / len(uploaded_files))
                    
                    # Save to temporary file
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                        tmp_file.write(uploaded_file.getvalue())
                        temp_files.append(tmp_file.name)
                
                # Process all files
                status_text.text('Adding to vector store...')
                st.session_state.processor.add_pdf_documents(temp_files, chunk_size=chunk_size)
                
                # Cleanup
                for temp_file in temp_files:
                    try:
                        os.unlink(temp_file)
                    except:
                        pass
                
                progress_bar.progress(1.0)
                status_text.text('✅ Processing complete!')
                st.success(f'✅ Successfully processed {len(uploaded_files)} PDF file(s)')
                st.rerun()
                
            except Exception as e:
                st.error(f'❌ Error processing PDFs: {str(e)}')
                with st.expander('Error Details'):
                    st.code(traceback.format_exc())

# Confluence Integration tab
with tab3:
    st.header('🌐 Confluence Integration')
    
    # Show connection status
    confluence_status = st.session_state.processor.get_confluence_status()
    
    col1, col2 = st.columns([1, 1])
    with col1:
        st.subheader('Connection Status')
        if confluence_status['available']:
            st.success('✅ Connected to Confluence')
            st.info(f"URL: {confluence_status['url']}")
            st.info(f"User: {confluence_status['username']}")
        else:
            st.error('❌ Confluence not available')
            st.error(f"Error: {confluence_status['error']}")
    
    with col2:
        st.subheader('Configuration')
        st.info('Set these environment variables:')
        st.code('''
CONFLUENCE_URL=https://your-domain.atlassian.net
CONFLUENCE_USERNAME=your-email@domain.com
CONFLUENCE_API_TOKEN=your-api-token
        ''')
    
    if confluence_status['available']:
        st.subheader('📄 Ingest Confluence Pages')
        
        page_ids_input = st.text_area(
            'Page IDs (one per line):',
            placeholder='123456789\n987654321\n...',
            help='Enter Confluence page IDs, one per line'
        )
        
        col1, col2 = st.columns([1, 1])
        with col1:
            chunk_size = st.slider('Chunk Size:', 200, 1000, 500, step=50, key='confluence_chunk')
        with col2:
            st.info('Smaller chunks = more precise search')
        
        if st.button('📥 Ingest Pages', type='primary', use_container_width=True):
            if page_ids_input.strip():
                page_ids = [pid.strip() for pid in page_ids_input.strip().split('\n') if pid.strip()]
                
                with st.spinner(f'🔄 Processing {len(page_ids)} Confluence page(s)...'):
                    try:
                        st.session_state.processor.add_confluence_documents(
                            page_ids=page_ids,
                            chunk_size=chunk_size
                        )
                        st.success(f'✅ Successfully processed {len(page_ids)} Confluence page(s)')
                        st.rerun()
                    except Exception as e:
                        st.error(f'❌ Error processing Confluence pages: {str(e)}')
                        with st.expander('Error Details'):
                            st.code(traceback.format_exc())
            else:
                st.warning('⚠️ Please enter at least one page ID')

# Generate Design Doc tab
with tab4:
    st.header('📋 Generate Design Document')
    
    if not has_documents:
        st.warning('⚠️ No documents available. Upload PDFs or ingest Confluence data first for better results.')
    
    col1, col2 = st.columns([2, 1])
    with col1:
        user_request = st.text_area(
            'Describe what you want to build:',
            placeholder='e.g., A microservice for user authentication with OAuth2 integration...',
            height=100
        )
    with col2:
        doc_title = st.text_input(
            'Document Title (optional):',
            placeholder='Auto-generated if empty'
        )
    
    if st.button('📋 Generate Design Document', type='primary', use_container_width=True):
        if user_request.strip():
            with st.spinner('🤖 Generating design document...'):
                try:
                    # Initialize doc generator
                    doc_generator = initialize_doc_generator()
                    
                    if doc_generator:
                        result = doc_generator.generate_design_document(
                            user_request=user_request.strip(),
                            title=doc_title.strip() if doc_title.strip() else None
                        )
                        
                        st.success('✅ Design document generated successfully!')
                        
                        # Display metadata
                        col1, col2, col3 = st.columns([1, 1, 1])
                        with col1:
                            st.metric('Sources Used', result['metadata']['source_count'])
                        with col2:
                            st.metric('LLM Type', result['metadata']['llm_used'])
                        with col3:
                            st.metric('Generated', result['metadata']['timestamp'])
                        
                        # Display document
                        st.subheader('📄 Generated Document')
                        st.markdown(result['document'])
                        
                        # Download button
                        st.download_button(
                            label='💾 Download Document',
                            data=result['document'],
                            file_name=f"design_doc_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md",
                            mime='text/markdown',
                            use_container_width=True
                        )
                        
                        # Show sources
                        if result['sources']:
                            with st.expander('📚 Sources Used'):
                                for i, source in enumerate(result['sources'], 1):
                                    st.write(f"{i}. **{source['title']}** ({source['type']}) - Score: {source['score']:.3f}")
                                    if source.get('url'):
                                        st.write(f"   🔗 [Link]({source['url']})")
                    else:
                        st.error('❌ Failed to initialize document generator')
                        if hasattr(st.session_state, 'doc_generator_error'):
                            st.error(f"Error: {st.session_state.doc_generator_error}")
                
                except Exception as e:
                    st.error(f'❌ Error generating document: {str(e)}')
                    with st.expander('Error Details'):
                        st.code(traceback.format_exc())
        else:
            st.warning('⚠️ Please describe what you want to build')

# System Test tab
with tab5:
    st.header('🧪 System Test & Diagnostics')
    
    col1, col2 = st.columns([1, 1])
    
    with col1:
        st.subheader('🔍 Component Status')
        
        # Test vector store
        if st.button('Test Vector Store', use_container_width=True):
            try:
                vectorstore = st.session_state.processor.get_vectorstore()
                if vectorstore:
                    # Try a simple search
                    test_results = st.session_state.processor.search_documents('test', k=1)
                    st.success(f'✅ Vector store working - {len(test_results)} results')
                else:
                    st.warning('⚠️ No vector store available')
            except Exception as e:
                st.error(f'❌ Vector store error: {str(e)}')
        
        # Test Confluence
        if st.button('Test Confluence', use_container_width=True):
            try:
                status = st.session_state.processor.get_confluence_status()
                if status['available']:
                    st.success('✅ Confluence connection working')
                else:
                    st.error(f'❌ Confluence error: {status["error"]}')
            except Exception as e:
                st.error(f'❌ Confluence test error: {str(e)}')
        
        # Test Document Generator
        if st.button('Test Document Generator', use_container_width=True):
            try:
                doc_gen = initialize_doc_generator()
                if doc_gen:
                    st.success('✅ Document generator initialized')
                else:
                    st.error('❌ Document generator failed to initialize')
            except Exception as e:
                st.error(f'❌ Document generator error: {str(e)}')
    
    with col2:
        st.subheader('📊 System Information')
        
        # Environment variables
        env_vars = ['CONFLUENCE_URL', 'CONFLUENCE_USERNAME', 'CONFLUENCE_API_TOKEN']
        st.write('**Environment Variables:**')
        for var in env_vars:
            value = os.getenv(var, 'Not set')
            if var == 'CONFLUENCE_API_TOKEN' and value != 'Not set':
                value = f'{value[:8]}...' if len(value) > 8 else '***'
            st.write(f'• {var}: {value}')
        
        # File system
        st.write('**File System:**')
        vector_store_path = './vector_store/'
        if os.path.exists(vector_store_path):
            files = os.listdir(vector_store_path)
            st.write(f'• Vector store files: {len(files)}')
            for file in files[:5]:  # Show first 5 files
                st.write(f'  - {file}')
        else:
            st.write('• Vector store directory: Not found')
    
    # Clear data section
    st.subheader('🗑️ Data Management')
    col1, col2 = st.columns([1, 1])
    
    with col1:
        if st.button('🗑️ Clear Vector Store', type='secondary', use_container_width=True):
            try:
                vector_store_path = './vector_store/'
                if os.path.exists(vector_store_path):
                    import shutil
                    shutil.rmtree(vector_store_path)
                    st.success('✅ Vector store cleared')
                    st.rerun()
                else:
                    st.info('ℹ️ No vector store to clear')
            except Exception as e:
                st.error(f'❌ Error clearing vector store: {str(e)}')
    
    with col2:
        if st.button('🔄 Restart Components', type='secondary', use_container_width=True):
            try:
                # Clear cached resources
                st.cache_resource.clear()
                # Reset session state
                for key in ['processor', 'confluence_processor']:
                    if key in st.session_state:
                        del st.session_state[key]
                st.success('✅ Components restarted')
                st.rerun()
            except Exception as e:
                st.error(f'❌ Error restarting: {str(e)}')

# Footer
st.markdown('---')
st.markdown('**AI Document Assistant** - Enhanced RAG system with PDF and Confluence integration')

Writing streamlit_app.py


### 5. Environment Configuration

In [7]:
%%writefile .env
# Confluence Configuration (Replace with your actual values)
CONFLUENCE_URL=https://your-domain.atlassian.net/wiki
CONFLUENCE_USERNAME=your-email@company.com
CONFLUENCE_API_TOKEN=your-api-token

# AWS Configuration for Bedrock (Optional - for LLM integration)
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
AWS_ACCESS_KEY_ID = user_secrets.get_secret("Access_key")
AWS_SECRET_ACCESS_KEY = user_secrets.get_secret("Secret_access_key")
AWS_DEFAULT_REGION=us-east-1

#NGrok Auth token for the public access with url
NGROK_AUTH_TOKEN = user_secrets.get_secret("ngrok_auth_token")

# HuggingFace Cache Directory
HF_HOME=./huggingface_cache

# Vector Store Configuration
VECTOR_STORE_PATH=./vector_store/

Writing .env


In [8]:
# Test all components first
def comprehensive_test():
    print('🧪 Comprehensive System Test')
    print('=' * 50)
    
    success_count = 0
    total_tests = 0
    
    # Test 1: Imports
    total_tests += 1
    try:
        from unified_processor import UnifiedDataProcessor
        from extract_confluence import ConfluenceProcessor
        from design_doc_generator import DesignDocumentGenerator
        print('✅ All imports successful')
        success_count += 1
    except Exception as e:
        print(f'❌ Import error: {e}')
        traceback.print_exc()
    
    # Test 2: Processor initialization
    total_tests += 1
    try:
        processor = UnifiedDataProcessor()
        print('✅ UnifiedDataProcessor initialized')
        success_count += 1
    except Exception as e:
        print(f'❌ Processor error: {e}')
        traceback.print_exc()
        return
    
    # Test 3: Confluence validation
    total_tests += 1
    try:
        confluence_status = processor.get_confluence_status()
        if confluence_status['available']:
            print(f'✅ Confluence connected: {confluence_status[url]}')
            success_count += 1
        else:
            print(f'⚠️ Confluence not available: {confluence_status[error]}')
    except Exception as e:
        print(f'❌ Confluence test error: {e}')
    
    # Test 4: Search function
    total_tests += 1
    try:
        results = processor.search_documents('test query', k=1)
        print(f'✅ Search function works (found {len(results)} results)')
        success_count += 1
    except Exception as e:
        print(f'❌ Search error: {e}')
        traceback.print_exc()
    
    # Test 5: Design document generator
    total_tests += 1
    try:
        doc_gen = DesignDocumentGenerator()
        print('✅ Design Document Generator initialized')
        success_count += 1
    except Exception as e:
        print(f'⚠️ Design generator (expected if no docs): {e}')
    
    print(f'\\n🎯 Test Results: {success_count}/{total_tests} tests passed')
    if success_count >= 3:
        print('🎉 Core system is ready!')
    else:
        print('⚠️ Some core tests failed. Check configuration.')
    
    return success_count >= 3

# Run comprehensive test
comprehensive_test()

🧪 Comprehensive System Test
✅ All imports successful


2025-07-27 05:24:29.436240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753593869.668116      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753593869.735367      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Confluence connection error: Connection failed: Current user not permitted to use Confluence
No existing vector store found
✅ UnifiedDataProcessor initialized
❌ Confluence test error: name 'error' is not defined
No vector store available
✅ Search function works (found 0 results)
✅ Design Document Generator initialized
\n🎯 Test Results: 4/5 tests passed
🎉 Core system is ready!


True

## 🚀 Running the Application

### Method 1: Run Streamlit Directly

In [None]:
# Test the components first
print('🧪 Testing components...')

try:
    from unified_processor import UnifiedDataProcessor
    processor = UnifiedDataProcessor()
    print('✅ Unified Processor: OK')
except Exception as e:
    print(f'❌ Unified Processor: {e}')

try:
    from extract_confluence import ConfluenceProcessor
    confluence = ConfluenceProcessor()
    print(f'✅ Confluence Processor: {"Available" if confluence.available else "Not configured"}')
except Exception as e:
    print(f'❌ Confluence Processor: {e}')

print('\n🎯 Components tested successfully!')

In [None]:
# Run Streamlit app
import subprocess
import threading
import time

def run_streamlit():
    subprocess.run(['streamlit', 'run', 'streamlit_app.py', '--server.port=8502', '--server.headless=true'])

print('🚀 Starting Streamlit application...')
streamlit_thread = threading.Thread(target=run_streamlit)
streamlit_thread.daemon = True
streamlit_thread.start()

time.sleep(3)
print('✅ Streamlit should be running on port 8501')
print('🌐 If running locally, visit: http://localhost:8501')

### Method 2: Using ngrok for Public Access (Kaggle)

In [9]:
# Method 2A: ngrok with Authentication (Public Access)
# Set your ngrok authtoken here
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
NGROK_AUTH_TOKEN = user_secrets.get_secret("ngrok_auth_token")

def setup_ngrok_with_auth(auth_token):
    try:
        from pyngrok import ngrok
        import subprocess
        import threading
        import time
        
        # Set the authtoken
        ngrok.set_auth_token(auth_token)
        print('✅ ngrok authentication configured')
        
        # Kill any existing processes
        subprocess.run(['pkill', '-f', 'streamlit'], capture_output=True)
        
        def run_streamlit_background():
            subprocess.run([
                'streamlit', 'run', 'streamlit_app.py',
                '--server.port=8501',
                '--server.headless=true',
                '--server.enableCORS=false',
                '--server.enableXsrfProtection=false'
            ])
        
        print('🚀 Starting Streamlit...')
        streamlit_thread = threading.Thread(target=run_streamlit_background)
        streamlit_thread.daemon = True
        streamlit_thread.start()
        
        # Wait for Streamlit to start
        time.sleep(5)
        
        print('🌐 Creating public tunnel with ngrok...')
        public_url = ngrok.connect(8501)
        
        print(f'\n🎉 SUCCESS! Your app is now publicly accessible at:')
        print(f'🔗 {public_url}')
        print(f'\n📱 Share this URL to access your AI Document Assistant!')
        print(f'\n⚡ Features available:')
        print(f'   • Upload and process PDF documents')
        print(f'   • Configure Confluence integration')
        print(f'   • Query documents with semantic search')
        print(f'   • Generate professional design documents')
        
        print(f'\n⏳ Keeping the application running...')
        print(f'💡 The tunnel will stay active as long as this cell is running')
        
        return public_url, ngrok
        
    except ImportError:
        print('❌ pyngrok not available. Install with: !pip install pyngrok')
        return None, None
    except Exception as e:
        print(f'❌ Error setting up ngrok: {e}')
        print('💡 Make sure your authtoken is correct')
        return None, None

# Uncomment and set your token to use ngrok
public_url, ngrok_instance = setup_ngrok_with_auth(NGROK_AUTH_TOKEN)
print('💡 To use ngrok: Set your NGROK_AUTH_TOKEN above and uncomment the last line')

✅ ngrok authentication configured                                                                   
🚀 Starting Streamlit...

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://172.19.2.2:8501
  External URL: http://35.239.148.32:8501

🌐 Creating public tunnel with ngrok...

🎉 SUCCESS! Your app is now publicly accessible at:
🔗 NgrokTunnel: "https://7116a0b99a1b.ngrok-free.app" -> "http://localhost:8501"

📱 Share this URL to access your AI Document Assistant!

⚡ Features available:
   • Upload and process PDF documents
   • Configure Confluence integration
   • Query documents with semantic search
   • Generate professional design documents

⏳ Keeping the application running...
💡 The tunnel will stay active as long as this cell is running
💡 To use ngrok: Set your NGROK_AUTH_TOKEN above and uncomment the last line


2025-07-27 05:26:53.582006: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753594013.637028     169 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753594013.653779     169 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Confluence connection error: Connection failed: Current user not permitted to use Confluence
No existing vector store found
Confluence connection error: Connection failed: Current user not permitted to use Confluence
Loaded 13 pages from tmpuy9hxour.pdf
Split into 62 chunks.
Vector store saved to ./vector_store/
Added documents to unified vector store.
Searching for: "What is ai?" (k=5)
Found 5 results
Searching for: "What is collaborative ai?" (k=1)
Found 1 results
🔍 Analyzing relevant content for: Write a small design document about Multiagent infra
Confluence connection error: Connection failed: Current user not permitted to use Confluence
Loaded existing vector store
🔍 Searching for relevant content: Write a small design document about Multiagent infra
Searching for: "Write a small design document about Multiagent infra" (k=12)
Found 12 results
📚 Found 12 relevant sources across 1 content types
📚 Found 12 relevant sources across 1 content types
🤖 Generating contextual content secti

## 📖 Usage Instructions

### 1. Upload PDF Documents
- Go to the "Upload PDFs" tab
- Select one or more PDF files
- Adjust chunk size if needed (default: 500)
- Click "Upload and Process"

### 2. Configure Confluence (Optional)
- Set environment variables:
  - `CONFLUENCE_URL`: Your Confluence instance URL
  - `CONFLUENCE_USERNAME`: Your email
  - `CONFLUENCE_API_TOKEN`: API token from Atlassian

### 3. Ingest Confluence Data
- Go to "Confluence Integration" tab
- Enter page IDs (comma-separated)
- Click "Ingest Confluence Data"

### 4. Query Documents
- Go to "Query Documents" tab
- Enter your question
- Adjust number of results
- Click "Search"

### 5. Generate Design Documents
- Go to "Generate Design Doc" tab
- Describe what you want to design
- Optionally provide a custom title
- Click "Generate Design Document"
- Download the generated document

## 🎯 Features

- ✅ PDF Processing: Upload and process multiple PDF documents
- ✅ Confluence Integration: Ingest pages from Confluence
- ✅ Vector Search: Semantic search across all documents
- ✅ Design Document Generation: AI-powered design document creation
- ✅ Web Interface: User-friendly Streamlit frontend
- ✅ Real-time Status: System status monitoring
- ✅ Download Support: Export generated documents