# In [None]:
import json
import time

import docx2txt
import openai
import pandas as pd
import streamlit as st
import tiktoken
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from st_diff_viewer import diff_viewer
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)

# Shared splitter used by process_document(): chunk lengths are measured with
# len(), targeting 2000 characters per chunk with a 400-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=400,
    chunk_size=2000,
)

def num_tokens(text):
    """Return how many tokens the module-level ``enc`` tokenizer yields for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)

# Tokenizer used by num_tokens(); tiktoken picks the encoding registered for "gpt-4".
# NOTE(review): the chat chains in this file are built with model="gpt-4o", so
# counts from this encoding are presumably only an estimate for those calls — confirm.
enc = tiktoken.encoding_for_model("gpt-4")

# Page layout and custom CSS theme (session state itself is initialised in main()).
# NOTE(review): Streamlit has historically required set_page_config() to run
# before any other Streamlit command — keep it ahead of st.markdown().
st.set_page_config(layout="wide")
# Injects a raw <style> block, hence unsafe_allow_html=True; the string is a
# static literal, not user input.
st.markdown("""
<style>
    [data-testid=stSidebar] {
        background-color: #f0f2f6;
    }
    .stProgress > div > div > div > div {
        background-color: #4B8BF5;
    }
    .st-b7 {
        color: #262730;
    }
    .report-section { 
        border-left: 4px solid #4B8BF5;
        padding-left: 1rem;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Retry configuration for rate limits
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(openai.RateLimitError),
    reraise=True
)
def retriable_chain_invoke(chain, inputs):
    """Invoke ``chain`` with ``inputs``, retrying when OpenAI reports a rate limit.

    Only ``openai.RateLimitError`` triggers a retry; any other exception
    propagates immediately. At most five attempts are made, with an
    exponential wait bounded to 4-60 seconds between them. Because
    ``reraise=True``, the original rate-limit error is raised after the final
    attempt instead of tenacity's ``RetryError`` wrapper.

    Args:
        chain: Any object exposing ``invoke(inputs)``.
        inputs: Value passed through unchanged to ``chain.invoke``.

    Returns:
        Whatever ``chain.invoke(inputs)`` returns.
    """
    return chain.invoke(inputs)

# NOTE: this rebinds the identical num_tokens() defined earlier in the file.
def num_tokens(text):
    """Count the tokens in ``text`` according to the shared ``enc`` tokenizer."""
    encoded = enc.encode(text)
    token_total = len(encoded)
    return token_total

def extract_text(file):
    """Extract plain text from an uploaded DOCX or PDF file.

    Args:
        file: File-like object exposing a ``name`` attribute; the extension of
            ``name`` selects the parser.

    Returns:
        The extracted text. For PDFs, the text of each page joined by newlines.

    Raises:
        ValueError: If the file name ends in neither ``.docx`` nor ``.pdf``.
    """
    # Compare extensions case-insensitively so names such as "REPORT.PDF"
    # are routed to the right parser instead of being rejected.
    filename = file.name.lower()
    if filename.endswith('.docx'):
        return docx2txt.process(file)
    if filename.endswith('.pdf'):
        reader = PdfReader(file)
        # A page may yield no text; substitute "" so join() never sees None.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    raise ValueError("Unsupported file format")

def process_document(file, doc_type, year):
    """Split a document's text into chunk records tagged with metadata.

    Each record is ``{"text": <chunk>, "metadata": {...}}`` where the metadata
    carries the file name, ``doc_type``, ``year`` and a ``page`` number.
    """
    pieces = text_splitter.split_text(extract_text(file))

    records = []
    for position, piece in enumerate(pieces):
        meta = {
            "source": file.name,
            "doc_type": doc_type,
            "year": year,
            # Derived from the chunk index (ten chunks share one number),
            # not read from the document itself.
            "page": position // 10 + 1,
        }
        records.append({"text": piece, "metadata": meta})
    return records

def _nearest_match(store, text):
    """Return ``(document, score)`` for the closest entry in ``store``, or ``(None, None)``."""
    hits = store.similarity_search_with_score(text, k=1)
    return hits[0] if hits else (None, None)


def analyze_changes(doc1_chunks, doc2_chunks, distance_threshold=0.3):
    """Classify chunks of two documents as added, removed or modified.

    Every chunk is matched against its nearest neighbour in the *other*
    document's FAISS index. The previous implementation compared the
    ``source`` metadata (the file name) of the two chunks, which differs for
    any two distinct files, so every chunk was reported as both added and
    removed and nothing was ever reported as modified. Classification is now
    based on the similarity score instead.

    Args:
        doc1_chunks: Baseline chunk records (``{"text", "metadata"}`` dicts).
        doc2_chunks: Comparison chunk records in the same format.
        distance_threshold: Largest score, as returned by
            ``similarity_search_with_score``, at which two chunks still count
            as the same passage (lower score = closer match).
            NOTE(review): 0.3 is a starting point, not a calibrated value;
            tune it for the embedding model and index type in use.

    Returns:
        Dict with keys ``"added"``, ``"removed"`` and ``"modified"``. Added and
        removed entries carry ``content`` and ``metadata``; modified entries
        carry ``original``, ``updated``, ``metadata`` and also ``content``
        (the updated text) so consumers reading ``item["content"]`` work for
        every category.
    """
    changes = {"added": [], "removed": [], "modified": []}

    # With one side empty there is nothing to embed or compare: everything in
    # the other document is new (or gone). This also avoids building a vector
    # store from an empty document list.
    if not doc1_chunks or not doc2_chunks:
        changes["added"] = [
            {"content": c["text"], "metadata": c["metadata"]} for c in doc2_chunks
        ]
        changes["removed"] = [
            {"content": c["text"], "metadata": c["metadata"]} for c in doc1_chunks
        ]
        return changes

    embeddings = OpenAIEmbeddings(chunk_size=10)  # Smaller chunk size for rate limiting

    doc1_docs = [Document(page_content=c["text"], metadata=c["metadata"])
                 for c in doc1_chunks]
    doc2_docs = [Document(page_content=c["text"], metadata=c["metadata"])
                 for c in doc2_chunks]

    db1 = FAISS.from_documents(doc1_docs, embeddings)
    time.sleep(1)  # Rate limit buffer
    db2 = FAISS.from_documents(doc2_docs, embeddings)

    # Batch processing for rate limiting: pause after every batch of searches.
    batch_size = 5

    # One search per comparison chunk decides added / modified / unchanged,
    # instead of searching the baseline index twice per chunk.
    for start in range(0, len(doc2_docs), batch_size):
        for doc in doc2_docs[start:start + batch_size]:
            match, distance = _nearest_match(db1, doc.page_content)
            if match is None or distance > distance_threshold:
                changes["added"].append({
                    "content": doc.page_content,
                    "metadata": doc.metadata
                })
            elif match.page_content != doc.page_content:
                changes["modified"].append({
                    "original": match.page_content,
                    "updated": doc.page_content,
                    "content": doc.page_content,
                    "metadata": doc.metadata
                })
        time.sleep(1)

    # Baseline chunks with no sufficiently close counterpart were removed.
    for start in range(0, len(doc1_docs), batch_size):
        for doc in doc1_docs[start:start + batch_size]:
            match, distance = _nearest_match(db2, doc.page_content)
            if match is None or distance > distance_threshold:
                changes["removed"].append({
                    "content": doc.page_content,
                    "metadata": doc.metadata
                })
        time.sleep(1)

    return changes

def _change_text(item):
    """Return the display text of a change entry (modified entries may only carry ``updated``)."""
    return item.get("content") or item.get("updated", "")


def _parse_cluster_json(raw):
    """Parse the model's cluster reply, tolerating a Markdown code fence around the JSON."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0]
    parsed = json.loads(text)
    if isinstance(parsed, dict):
        # A single cluster object instead of an array.
        parsed = [parsed]
    return [cluster for cluster in parsed if isinstance(cluster, dict)]


def generate_executive_summary(changes, doc1_meta, doc2_meta):
    """Generate a human-readable executive summary of the detected changes.

    The changes of all categories are flattened into one list, sent to the
    model in token-bounded batches for clustering, summarised per cluster and
    finally consolidated into a single report.

    Args:
        changes: Dict mapping a category name to a list of change entries.
        doc1_meta: Dict with ``name`` and ``year`` of the baseline document.
        doc2_meta: Dict with ``name`` and ``year`` of the comparison document.

    Returns:
        Tuple ``(summary, clusters)``. ``summary`` is the report text.
        ``clusters`` is the list of cluster dicts returned by the model whose
        ``change_ids`` have been reduced to valid indices into the changes
        flattened in the dict's category order; clusters without any valid
        index are dropped.

    Raises:
        json.JSONDecodeError: If the model's cluster reply is not valid JSON.
    """
    # Cluster ids refer to positions in this flat list, not to the category
    # dict itself (indexing the dict with an integer raised KeyError).
    flat_changes = [c for cat in changes.values() for c in cat]
    if not flat_changes:
        # Nothing to cluster or summarise; avoid asking the model to invent a report.
        return "No changes were detected between the two documents.", []

    # 1. Cluster related changes
    cluster_prompt = ChatPromptTemplate.from_template("""
    Cluster these document changes into logical groups based on semantic similarity:
    {changes}
    
    Return ONLY a JSON array of cluster objects with:
    - "theme": Short descriptive title
    - "change_ids": Array of original change indices
    - "key_phrases": 3-5 key phrases per cluster
    """)

    # 2. Hierarchical summarization chain
    summary_prompt = ChatPromptTemplate.from_template("""
    As a professional analyst, create an executive summary comparing:
    {doc1} ({year1}) vs {doc2} ({year2}).
    
    Key clustered changes:
    {clustered_changes}

    Structure:
    1. **Major Structural Changes** (sections added/removed)
    2. **Content Evolution** (modified themes and concepts)
    3. **Strategic Implications** (business impact analysis)
    4. **Recommendations** (next steps based on changes)
    
    Include specific examples with citations like: 
    "The address changed from [X][p3] to [Y][p12]" 
    Use markdown with section headers and bold key terms.
    """)

    cluster_chain = cluster_prompt | ChatOpenAI(model="gpt-4o", temperature=0.1)
    summary_chain = summary_prompt | ChatOpenAI(model="gpt-4o", temperature=0.3)

    def chunk_changes(items, max_tokens=6000):
        """Yield lists of ``(index, item)`` whose combined token count fits ``max_tokens``."""
        current_chunk = []
        current_count = 0
        for idx, item in enumerate(items):
            item_tokens = num_tokens(_change_text(item))
            # Only flush a non-empty batch, so an oversized first item does
            # not produce an empty request.
            if current_chunk and current_count + item_tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0
            current_chunk.append((idx, item))
            current_count += item_tokens
        if current_chunk:
            yield current_chunk

    # Cluster the changes batch by batch.
    raw_clusters = []
    for chunk in chunk_changes(flat_changes, max_tokens=6000):
        cluster_result = retriable_chain_invoke(cluster_chain, {
            "changes": "\n".join(
                f"{idx}: {_change_text(item)[:500]}..."
                for idx, item in chunk
            )
        })
        raw_clusters.extend(_parse_cluster_json(cluster_result.content))

    # Summarise each cluster, keeping only ids that point at a real change
    # (the ids come from the model and cannot be trusted blindly).
    all_clusters = []
    cluster_summaries = []
    for cluster in raw_clusters:
        valid_ids = [
            i for i in cluster.get("change_ids", [])
            if isinstance(i, int) and 0 <= i < len(flat_changes)
        ]
        if not valid_ids:
            continue
        cluster["change_ids"] = valid_ids
        all_clusters.append(cluster)

        cluster_text = "\n".join(
            f"Change {i}: {_change_text(flat_changes[i])[:1000]} "
            f"[Source: {flat_changes[i]['metadata']['source']}, "
            f"Page {flat_changes[i]['metadata']['page']}]"
            for i in valid_ids
        )

        cluster_summary = retriable_chain_invoke(summary_chain, {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "clustered_changes": cluster_text
        })
        cluster_summaries.append(cluster_summary.content)
        time.sleep(1)  # Rate limit buffer

    # Final consolidation. The citation example uses doubled braces so the
    # template treats it as literal text rather than as required variables.
    final_prompt = ChatPromptTemplate.from_template("""
    Synthesize these cluster summaries into an executive report:
    {cluster_summaries}

    Maintain this structure:
    1. **Document Evolution Overview**
    2. **Strategic Direction Analysis**
    3. **Operational Impact Assessment**
    4. **Recommendations for Future Versions**

    Include 3-5 key visualizable trends using **bold** terms.
    Cite sources like [Source: {{source}}, Page {{page}}].
    """)

    final_chain = final_prompt | ChatOpenAI(model="gpt-4o", temperature=0.2)
    final_summary = retriable_chain_invoke(final_chain, {
        "cluster_summaries": "\n\n".join(cluster_summaries)
    })
    return final_summary.content, all_clusters

def chunked_report_generator(changes, max_tokens=12000):
    """Yield lists of rendered change strings that each fit a token budget.

    Changes are rendered in the order added, removed, modified. Entries that
    carry ``original`` and ``updated`` text (the modified category) are
    rendered with both versions; all other entries are rendered from their
    ``content``. Previously modified entries raised ``KeyError`` because they
    have no guaranteed ``content`` key.

    Args:
        changes: Dict mapping ``"added"``/``"removed"``/``"modified"`` to lists
            of change entries; a missing category is treated as empty.
        max_tokens: Token budget per yielded chunk, measured with
            ``num_tokens``. A single entry larger than the budget is still
            yielded on its own.

    Yields:
        Non-empty lists of strings of the form ``"<CATEGORY>:\\n<text>\\n"``.
    """
    current_chunk = []
    current_count = 0

    for cat in ['added', 'removed', 'modified']:
        for item in changes.get(cat, ()):
            if "original" in item and "updated" in item:
                body = f"ORIGINAL:\n{item['original']}\nUPDATED:\n{item['updated']}"
            else:
                body = item["content"]
            content = f"{cat.upper()}:\n{body}\n"
            tokens = num_tokens(content)

            # Flush only a non-empty chunk, so an oversized first entry does
            # not emit an empty list.
            if current_chunk and current_count + tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0

            current_chunk.append(content)
            current_count += tokens

    if current_chunk:
        yield current_chunk

def generate_detailed_report(changes, doc1_meta, doc2_meta):
    """Generate the detailed change report, one model call per token-sized chunk.

    Args:
        changes: Dict of change lists, as consumed by ``chunked_report_generator``.
        doc1_meta: Dict with ``name`` and ``year`` of the baseline document.
        doc2_meta: Dict with ``name`` and ``year`` of the comparison document.

    Returns:
        The report parts joined by blank lines; an empty string when there are
        no changes to report.
    """
    # The citation example uses doubled braces so the template treats it as
    # literal text. With single braces, "source" and "page" became required
    # input variables that invoke() never received.
    report_prompt = ChatPromptTemplate.from_template("""
    Analyze document changes between:
    {doc1} ({year1}) and {doc2} ({year2})

    Changes:
    {changes}

    Format requirements:
    - Group by ADDED/REMOVED/MODIFIED
    - Include citations like [Source: {{source}}, Page {{page}}]
    - Highlight significant changes with **bold**
    - Use markdown headers ##
    - Maintain academic tone
    """)

    full_report = []
    chain = report_prompt | ChatOpenAI(
        model="gpt-4o",
        temperature=0.2,
        max_tokens=4000
    )

    for chunk in chunked_report_generator(changes):
        part = retriable_chain_invoke(chain, {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "changes": "\n".join(chunk)
        }).content
        full_report.append(part)
        time.sleep(2)  # Rate limit buffer

    return "\n\n".join(full_report)

def main():
    """Render the Streamlit page: collect two documents, analyse them, show results.

    The sidebar holds the highlight colours. After both files are uploaded and
    the button is pressed, the documents are chunked, compared and summarised;
    the results are shown in four tabs and offered as a Markdown download.
    Any exception during processing is reported with ``st.error`` and ends the
    run.
    """
    st.title("Professional Document Comparison Suite")

    # Session state initialization
    if 'colors' not in st.session_state:
        st.session_state.colors = {
            'added': '#d4f7d4',
            'removed': '#f7d4d4',
            'modified': '#fff3d4'
        }

    with st.sidebar:
        st.header("Configuration")
        st.session_state.colors['added'] = st.color_picker("Added Color", '#d4f7d4')
        st.session_state.colors['removed'] = st.color_picker("Removed Color", '#f7d4d4')
        st.session_state.colors['modified'] = st.color_picker("Modified Color", '#fff3d4')

    col1, col2 = st.columns(2)
    with col1:
        doc1 = st.file_uploader("Upload Baseline Document", type=["pdf", "docx"])
        year1 = st.number_input("Baseline Year", min_value=1900, max_value=2025, value=2023)
    with col2:
        doc2 = st.file_uploader("Upload Comparison Document", type=["pdf", "docx"])
        year2 = st.number_input("Comparison Year", min_value=1900, max_value=2025, value=2024)

    if not st.button("Analyze Documents"):
        return
    if not (doc1 and doc2):
        # Previously a click without both files did nothing, silently.
        st.warning("Please upload both documents before running the analysis.")
        return

    doc1_meta = {"name": doc1.name, "year": year1}
    doc2_meta = {"name": doc2.name, "year": year2}

    with st.status("Processing documents...", expanded=True) as status:
        try:
            # Process documents
            st.write("📄 Processing Document 1...")
            doc1_chunks = process_document(doc1, doc1.name.split('.')[-1], year1)

            st.write("📄 Processing Document 2...")
            doc2_chunks = process_document(doc2, doc2.name.split('.')[-1], year2)

            # Analyze changes
            st.write("🔍 Analyzing differences...")
            changes = analyze_changes(doc1_chunks, doc2_chunks)

            # Generate reports
            st.write("📊 Generating summary...")
            summary, all_clusters = generate_executive_summary(
                changes, doc1_meta, doc2_meta
            )

            st.write("📝 Compiling detailed report...")
            detailed_report = generate_detailed_report(changes, doc1_meta, doc2_meta)

            # Combine reports
            full_report = f"# Document Comparison Report\n{summary}\n{detailed_report}"

            status.update(label="Analysis complete! ✅", state="complete")

        except Exception as e:
            # Top-level UI boundary: report the failure instead of crashing the page.
            status.update(label="Processing failed", state="error")
            st.error(f"❌ Processing failed: {str(e)}")
            return

    # Cluster change_ids are positions in the changes flattened in category
    # order; the category dict itself cannot be indexed with an integer.
    flat_changes = [c for cat in changes.values() for c in cat]

    # Visualization. A plain section header is used instead of an outer
    # expander because the cluster expanders below would otherwise be nested
    # inside it. NOTE(review): Streamlit releases that forbid nested expanders
    # raise on that layout — confirm against the deployed version.
    st.subheader("Detailed Analysis")
    tab1, tab2, tab3, tab4 = st.tabs(["Diff View", "Statistics", "Full Report", "Executive Summary"])

    with tab1:
        diff_viewer(
            "\n".join([c["content"] for c in changes["removed"]]),
            "\n".join([c["content"] for c in changes["added"]]),
            split_view=True,
            added_style=f"background: {st.session_state.colors['added']}",
            removed_style=f"background: {st.session_state.colors['removed']}",
            modified_style=f"background: {st.session_state.colors['modified']}"
        )

    with tab2:
        stats = pd.DataFrame({
            'Change Type': ['Added', 'Removed', 'Modified'],
            'Count': [
                len(changes["added"]),
                len(changes["removed"]),
                len(changes["modified"])
            ]
        })
        styler = stats.style
        # Styler.applymap was renamed to Styler.map in newer pandas; use
        # whichever the installed version provides.
        style_cells = getattr(styler, "map", None) or styler.applymap
        st.dataframe(
            style_cells(
                lambda x: f"background-color: {st.session_state.colors[x.lower()]};",
                subset=['Change Type']
            ),
            use_container_width=True
        )

    # The reports are Markdown produced by the model from uploaded document
    # text, so they are rendered without unsafe_allow_html to keep any HTML
    # embedded in the documents from being injected into the page.
    with tab3:
        st.markdown(full_report)

    with tab4:
        st.markdown(summary)
        st.write("### Change Clusters")

        if all_clusters:
            for cluster in all_clusters:
                with st.expander(f"{cluster.get('theme', 'Unnamed Cluster')}"):
                    change_ids = cluster.get('change_ids', [])
                    st.write(f"**Key Phrases**: {', '.join(cluster.get('key_phrases', []))}")
                    st.write(f"Associated Changes: {len(change_ids)}")
                    # First id that points at a real change; ids come from
                    # the model and may be missing or out of range.
                    example = next(
                        (
                            flat_changes[i] for i in change_ids
                            if isinstance(i, int) and 0 <= i < len(flat_changes)
                        ),
                        None,
                    )
                    if example is not None:
                        example_text = example.get('content') or example.get('updated', '')
                        st.write(f"Example Change: {example_text[:200]}...")
        else:
            st.warning("No clusters identified in document changes")

    # Download button
    st.download_button(
        label="📥 Download Full Report",
        data=full_report,
        file_name="document_comparison.md",
        mime="text/markdown"
    )

if __name__ == "__main__":
    main()
