In [None]:
import streamlit as st
from st_diff_viewer import diff_viewer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
import pandas as pd
import docx2txt
from PyPDF2 import PdfReader
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)
import tiktoken
import openai

# Configuration
# Splitter used by process_document. Sizes are measured with len(), i.e. in
# characters rather than tokens: 2000-character chunks, 400 characters of
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
    length_function=len,
    # NOTE(review): process_document calls split_text(), which returns plain
    # strings; presumably a start index is only recorded when Document objects
    # are created — confirm whether this flag has any effect here.
    add_start_index=True
)

def num_tokens(text):
    """Return how many tokens the module-level encoder ``enc`` produces for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)

# Tokenizer for GPT-4
# NOTE(review): the chat chains in this file are created with model="gpt-4o";
# confirm the GPT-4 encoding is close enough for the token budgets that
# num_tokens() feeds into the chunking helpers.
enc = tiktoken.encoding_for_model("gpt-4")

# Custom theme (session state itself is initialised inside main())
# layout="wide" selects Streamlit's wide page layout.
st.set_page_config(layout="wide")
# Inject a small stylesheet: sidebar background, progress-bar colour, text
# colour for the `.st-b7` class and a left-bordered `.report-section` block.
# unsafe_allow_html=True is required for the raw <style> tag to be emitted.
st.markdown("""
<style>
    [data-testid=stSidebar] {
        background-color: #f0f2f6;
    }
    .stProgress > div > div > div > div {
        background-color: #4B8BF5;
    }
    .st-b7 {
        color: #262730;
    }
    .report-section { 
        border-left: 4px solid #4B8BF5;
        padding-left: 1rem;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Retry configuration for rate limits
def retriable_chain_invoke(chain, inputs):
    """Invoke ``chain`` once with ``inputs`` and return whatever it produces.

    This plain version is superseded by the retry-decorated definition of the
    same name that follows it in the module.
    """
    outcome = chain.invoke(inputs)
    return outcome
@retry(
    retry=retry_if_exception_type(openai.RateLimitError),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    reraise=True,
)
def retriable_chain_invoke(chain, inputs):
    """Call ``chain.invoke(inputs)``, retrying when OpenAI signals a rate limit.

    Only ``openai.RateLimitError`` triggers another attempt. The policy stops
    after five attempts, waits with exponential back-off configured with
    ``min=4`` and ``max=60``, and re-raises the last error (``reraise=True``).
    """
    response = chain.invoke(inputs)
    return response

def num_tokens(text):
    """Count the tokens of ``text`` using the module-level encoder ``enc``."""
    encoded = enc.encode(text)
    return len(encoded)

def extract_text(file):
    """Extract plain text from an uploaded DOCX or PDF file.

    Args:
        file: File-like object with a ``name`` attribute (for example a
            Streamlit upload). The extension of ``name`` selects the parser.

    Returns:
        The extracted text. For PDFs the pages are joined with newlines and a
        page without extractable text contributes an empty string.

    Raises:
        ValueError: If ``name`` ends in neither ``.docx`` nor ``.pdf``.
    """
    # Compare case-insensitively so names such as "REPORT.PDF" are accepted.
    filename = file.name.lower()
    if filename.endswith('.docx'):
        return docx2txt.process(file)
    if filename.endswith('.pdf'):
        reader = PdfReader(file)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    raise ValueError("Unsupported file format")

def process_document(file, doc_type, year):
    """Split one uploaded document into chunk records that carry metadata.

    Each record has the form ``{"text": ..., "metadata": {...}}`` where the
    metadata holds the file name, ``doc_type``, ``year`` and a page number.
    The page number is not read from the document: it is derived from the
    chunk position and advances by one for every ten chunks.
    """
    pieces = text_splitter.split_text(extract_text(file))
    records = []
    for position, piece in enumerate(pieces):
        meta = {
            "source": file.name,
            "doc_type": doc_type,
            "year": year,
            "page": (position // 10) + 1,
        }
        records.append({"text": piece, "metadata": meta})
    return records

def analyze_changes(doc1_chunks, doc2_chunks, match_threshold=0.35):
    """Classify the chunks of two documents as added, removed or modified.

    Both documents are embedded into separate FAISS indexes. Each chunk of one
    document is matched against its nearest neighbour in the other document,
    and the distance of that neighbour decides the category.

    Args:
        doc1_chunks: Baseline chunks as produced by ``process_document``
            (dicts with ``"text"`` and ``"metadata"``).
        doc2_chunks: Comparison chunks in the same format.
        match_threshold: Largest FAISS distance (lower means closer) at which
            two chunks still count as the same passage. The default is a
            heuristic starting point — NOTE(review): tune it for the embedding
            model in use.

    Returns:
        Dict with three lists. ``"added"`` and ``"removed"`` entries are
        ``{"content", "metadata"}``; ``"modified"`` entries are
        ``{"original", "updated", "metadata"}``.
    """
    changes = {"added": [], "removed": [], "modified": []}

    # An empty document cannot be indexed; everything on the other side is
    # then trivially new (or gone), and no embedding call is needed.
    if not doc1_chunks or not doc2_chunks:
        changes["added"] = [
            {"content": c["text"], "metadata": c["metadata"]} for c in doc2_chunks
        ]
        changes["removed"] = [
            {"content": c["text"], "metadata": c["metadata"]} for c in doc1_chunks
        ]
        return changes

    embeddings = OpenAIEmbeddings(chunk_size=10)  # Smaller chunk size for rate limiting

    doc1_docs = [Document(page_content=c["text"], metadata=c["metadata"])
                 for c in doc1_chunks]
    doc2_docs = [Document(page_content=c["text"], metadata=c["metadata"])
                 for c in doc2_chunks]

    db1 = FAISS.from_documents(doc1_docs, embeddings)
    time.sleep(1)  # Rate limit buffer
    db2 = FAISS.from_documents(doc2_docs, embeddings)

    def nearest_match(index, doc):
        """Return the closest document in ``index`` within the threshold, else None."""
        hits = index.similarity_search_with_score(doc.page_content, k=1)
        if not hits:
            return None
        candidate, distance = hits[0]
        return candidate if distance <= match_threshold else None

    # Matching is decided by embedding distance, not by the "source" file
    # name: each index holds a different file, so comparing names cannot tell
    # whether a passage survived into the other version.
    batch_size = 5  # pause after every few queries to stay under rate limits

    # One pass over the comparison document yields both "added" and
    # "modified", so each chunk is embedded for querying only once.
    for start in range(0, len(doc2_docs), batch_size):
        for doc in doc2_docs[start:start + batch_size]:
            counterpart = nearest_match(db1, doc)
            if counterpart is None:
                changes["added"].append({
                    "content": doc.page_content,
                    "metadata": doc.metadata
                })
            elif counterpart.page_content != doc.page_content:
                changes["modified"].append({
                    "original": counterpart.page_content,
                    "updated": doc.page_content,
                    "metadata": doc.metadata
                })
        time.sleep(1)

    # Baseline chunks with no close counterpart in the comparison document.
    for start in range(0, len(doc1_docs), batch_size):
        for doc in doc1_docs[start:start + batch_size]:
            if nearest_match(db2, doc) is None:
                changes["removed"].append({
                    "content": doc.page_content,
                    "metadata": doc.metadata
                })
        time.sleep(1)

    return changes

def generate_executive_summary(changes, doc1_meta, doc2_meta):
    """Build an executive summary of the detected changes with an LLM.

    The changes are flattened into one indexed list, clustered by the model in
    token-bounded batches, summarised cluster by cluster, and finally merged
    into a single report.

    Args:
        changes: Dict with ``"added"``, ``"removed"`` and ``"modified"`` lists
            as returned by ``analyze_changes``.
        doc1_meta: ``{"name": ..., "year": ...}`` for the baseline document.
        doc2_meta: The same for the comparison document.

    Returns:
        ``(summary_markdown, clusters)``. Each cluster is a dict from the
        model (``theme``, ``change_ids``, ``key_phrases``); ``change_ids``
        index the flattened list
        ``[c for group in changes.values() for c in group]``.

    Raises:
        ValueError: If a clustering response cannot be parsed as a JSON array.
    """
    import json  # local import so the function does not rely on the module's import block

    def change_text(item):
        """Full text of a change; "modified" entries have no "content" key."""
        if "content" in item:
            return item["content"]
        return f"{item.get('original', '')}\n{item.get('updated', '')}"

    def snippet(item, limit):
        """Text of a change truncated to ``limit`` characters for a prompt."""
        if "content" in item:
            return item["content"][:limit]
        half = limit // 2  # give both versions of a modified passage equal room
        return (
            f"BEFORE: {item.get('original', '')[:half]} "
            f"AFTER: {item.get('updated', '')[:half]}"
        )

    def batches(items, max_tokens=6000):
        """Yield lists of ``(index, item)`` whose texts fit the token budget."""
        current, used = [], 0
        for idx, item in enumerate(items):
            cost = num_tokens(change_text(item))
            # Flush only non-empty batches so an oversized item never yields [].
            if current and used + cost > max_tokens:
                yield current
                current, used = [], 0
            current.append((idx, item))
            used += cost
        if current:
            yield current

    def parse_clusters(raw):
        """Extract the JSON array of cluster objects from a model reply."""
        text = (raw or "").strip()
        # Models often wrap JSON in a Markdown fence or add a lead-in
        # sentence; keep only the outermost [...] span before decoding.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end < start:
            raise ValueError(
                f"Clustering response is not a JSON array: {text[:200]!r}"
            )
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError as err:
            raise ValueError(
                f"Clustering response is not valid JSON: {text[:200]!r}"
            ) from err
        return [entry for entry in parsed if isinstance(entry, dict)]

    # Flat, stable ordering (added, removed, modified): the indices sent to
    # the model are positions in this list.
    flat_changes = [item for group in changes.values() for item in group]
    if not flat_changes:
        return "No differences were detected between the two documents.", []

    # 1. Cluster related changes
    cluster_prompt = ChatPromptTemplate.from_template("""
    Cluster these document changes into logical groups based on semantic similarity:
    {changes}
    
    Return ONLY a JSON array of cluster objects with:
    - "theme": Short descriptive title
    - "change_ids": Array of original change indices
    - "key_phrases": 3-5 key phrases per cluster
    """)

    # 2. Per-cluster summarization
    summary_prompt = ChatPromptTemplate.from_template("""
    As a professional analyst, create an executive summary comparing:
    {doc1} ({year1}) vs {doc2} ({year2}).
    
    Key clustered changes:
    {clustered_changes}

    Structure:
    1. **Major Structural Changes** (sections added/removed)
    2. **Content Evolution** (modified themes and concepts)
    3. **Strategic Implications** (business impact analysis)
    4. **Recommendations** (next steps based on changes)
    
    Include specific examples with citations like: 
    "The address changed from [X][p3] to [Y][p12]" 
    Use markdown with section headers and bold key terms.
    """)

    # 3. Final consolidation. The citation example uses angle brackets: bare
    # {source}/{page} would be treated as template variables that are never
    # supplied, which makes the prompt fail at invoke time.
    final_prompt = ChatPromptTemplate.from_template("""
    Synthesize these cluster summaries into an executive report:
    {cluster_summaries}

    Maintain this structure:
    1. **Document Evolution Overview**
    2. **Strategic Direction Analysis**
    3. **Operational Impact Assessment**
    4. **Recommendations for Future Versions**

    Include 3-5 key visualizable trends using **bold** terms.
    Cite sources like [Source: <file name>, Page <number>].
    """)

    cluster_chain = cluster_prompt | ChatOpenAI(model="gpt-4o", temperature=0.1)
    summary_chain = summary_prompt | ChatOpenAI(model="gpt-4o", temperature=0.3)
    final_chain = final_prompt | ChatOpenAI(model="gpt-4o", temperature=0.2)

    all_clusters = []
    for batch in batches(flat_changes, max_tokens=6000):
        reply = retriable_chain_invoke(cluster_chain, {
            "changes": "\n".join(
                f"{idx}: {snippet(item, 500)}..." for idx, item in batch
            )
        })
        for cluster in parse_clusters(reply.content):
            # Keep only indices that really address an entry of flat_changes.
            valid_ids = [
                i for i in (cluster.get("change_ids") or [])
                if isinstance(i, int) and not isinstance(i, bool)
                and 0 <= i < len(flat_changes)
            ]
            if valid_ids:
                cluster["change_ids"] = valid_ids
                all_clusters.append(cluster)

    if not all_clusters:
        return "The model did not identify any change clusters to summarise.", []

    cluster_summaries = []
    for cluster in all_clusters:
        lines = []
        for i in cluster["change_ids"]:
            meta = flat_changes[i].get("metadata") or {}
            lines.append(
                f"Change {i}: {snippet(flat_changes[i], 1000)} "
                f"[Source: {meta.get('source')}, Page {meta.get('page')}]"
            )

        cluster_summary = retriable_chain_invoke(summary_chain, {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "clustered_changes": "\n".join(lines)
        })
        cluster_summaries.append(cluster_summary.content)
        time.sleep(1)  # Rate limit buffer

    final_summary = retriable_chain_invoke(final_chain, {
        "cluster_summaries": "\n\n".join(cluster_summaries)
    })
    return final_summary.content, all_clusters

def chunked_report_generator(changes, max_tokens=12000):
    """Yield lists of formatted change strings that fit a token budget.

    Args:
        changes: Dict with optional ``"added"``, ``"removed"`` and
            ``"modified"`` lists. Added/removed entries provide ``"content"``;
            modified entries provide ``"original"`` and ``"updated"``.
        max_tokens: Token budget per yielded chunk, measured with
            ``num_tokens``. A single item larger than the budget is still
            yielded, alone in its own chunk.

    Yields:
        Non-empty lists of strings of the form ``"<CATEGORY>:\\n<text>\\n"``.
    """
    current_chunk = []
    current_count = 0

    for cat in ('added', 'removed', 'modified'):
        for item in changes.get(cat, []):
            if 'content' in item:
                body = item['content']
            else:
                # "modified" entries carry both versions instead of "content".
                body = (
                    f"ORIGINAL:\n{item.get('original', '')}\n"
                    f"UPDATED:\n{item.get('updated', '')}"
                )
            content = f"{cat.upper()}:\n{body}\n"
            tokens = num_tokens(content)

            # Flush only a non-empty chunk, so an oversized first item never
            # produces an empty list.
            if current_chunk and current_count + tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0

            current_chunk.append(content)
            current_count += tokens

    if current_chunk:
        yield current_chunk

def generate_detailed_report(changes, doc1_meta, doc2_meta):
    """Generate the detailed Markdown report, one LLM call per chunk of changes.

    Args:
        changes: Dict of change lists as consumed by
            ``chunked_report_generator``.
        doc1_meta: ``{"name": ..., "year": ...}`` for the baseline document.
        doc2_meta: The same for the comparison document.

    Returns:
        The report parts joined by blank lines; an empty string when there
        are no changes to report.
    """
    # The citation example uses angle brackets: bare {source}/{page} would be
    # parsed as template variables that are never supplied, so every invoke
    # would fail with a missing-variable error.
    report_prompt = ChatPromptTemplate.from_template("""
    Analyze document changes between:
    {doc1} ({year1}) and {doc2} ({year2})

    Changes:
    {changes}

    Format requirements:
    - Group by ADDED/REMOVED/MODIFIED
    - Include citations like [Source: <file name>, Page <number>]
    - Highlight significant changes with **bold**
    - Use markdown headers ##
    - Maintain academic tone
    """)

    chain = report_prompt | ChatOpenAI(
        model="gpt-4o",
        temperature=0.2,
        max_tokens=4000
    )

    full_report = []
    for chunk in chunked_report_generator(changes):
        part = retriable_chain_invoke(chain, {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "changes": "\n".join(chunk)
        }).content
        full_report.append(part)
        time.sleep(2)  # Rate limit buffer

    return "\n\n".join(full_report)

def main():
    """Run the Streamlit app: collect two documents, analyse them, show results.

    The page offers colour pickers in the sidebar and two uploaders with a
    year each. Once "Analyze Documents" is pressed with both files present it
    shows a diff view, change statistics, the full report, the executive
    summary with its clusters, and a download button for the Markdown report.
    """
    from datetime import date  # local import: only the year bound below needs it

    st.title("Professional Document Comparison Suite")

    # Session state initialization
    if 'colors' not in st.session_state:
        st.session_state.colors = {
            'added': '#d4f7d4',
            'removed': '#f7d4d4',
            'modified': '#fff3d4'
        }

    with st.sidebar:
        st.header("Configuration")
        st.session_state.colors['added'] = st.color_picker("Added Color", '#d4f7d4')
        st.session_state.colors['removed'] = st.color_picker("Removed Color", '#f7d4d4')
        st.session_state.colors['modified'] = st.color_picker("Modified Color", '#fff3d4')

    # Track the calendar instead of a fixed year; never drop below 2025 so
    # the default values below always stay inside the allowed range.
    latest_year = max(date.today().year, 2025)

    col1, col2 = st.columns(2)
    with col1:
        doc1 = st.file_uploader("Upload Baseline Document", type=["pdf", "docx"])
        year1 = st.number_input("Baseline Year", min_value=1900, max_value=latest_year, value=2023)
    with col2:
        doc2 = st.file_uploader("Upload Comparison Document", type=["pdf", "docx"])
        year2 = st.number_input("Comparison Year", min_value=1900, max_value=latest_year, value=2024)

    if not st.button("Analyze Documents"):
        return
    if not (doc1 and doc2):
        # Tell the user why nothing happens instead of silently ignoring the click.
        st.warning("Please upload both documents before running the analysis.")
        return

    with st.status("Processing documents...", expanded=True) as status:
        try:
            # Process documents
            st.write("📄 Processing Document 1...")
            doc1_chunks = process_document(doc1, doc1.name.split('.')[-1], year1)

            st.write("📄 Processing Document 2...")
            doc2_chunks = process_document(doc2, doc2.name.split('.')[-1], year2)

            # Analyze changes
            st.write("🔍 Analyzing differences...")
            changes = analyze_changes(doc1_chunks, doc2_chunks)

            # Generate reports
            st.write("📊 Generating summary...")
            summary, all_clusters = generate_executive_summary(
                changes,
                {"name": doc1.name, "year": year1},
                {"name": doc2.name, "year": year2}
            )

            st.write("📝 Compiling detailed report...")
            detailed_report = generate_detailed_report(
                changes,
                {"name": doc1.name, "year": year1},
                {"name": doc2.name, "year": year2}
            )

            # Combine reports
            full_report = f"# Document Comparison Report\n{summary}\n{detailed_report}"

            status.update(label="Analysis complete! ✅", state="complete")

        except Exception as e:
            # Top-level UI boundary: report any failure to the user and stop.
            st.error(f"❌ Processing failed: {str(e)}")
            return

    # Cluster "change_ids" are positions in this flattened list (added,
    # removed, modified); `changes` itself is a dict and cannot be indexed
    # with an integer.
    flat_changes = [item for group in changes.values() for item in group]

    # Visualization
    with st.expander("Detailed Analysis", expanded=True):
        tab1, tab2, tab3, tab4 = st.tabs(["Diff View", "Statistics", "Full Report", "Executive Summary"])

        with tab1:
            diff_viewer(
                "\n".join([c["content"] for c in changes["removed"]]),
                "\n".join([c["content"] for c in changes["added"]]),
                split_view=True,
                added_style=f"background: {st.session_state.colors['added']}",
                removed_style=f"background: {st.session_state.colors['removed']}",
                modified_style=f"background: {st.session_state.colors['modified']}"
            )

        with tab2:
            stats = pd.DataFrame({
                'Change Type': ['Added', 'Removed', 'Modified'],
                'Count': [
                    len(changes["added"]),
                    len(changes["removed"]),
                    len(changes["modified"])
                ]
            })
            styler = stats.style
            # Newer pandas names this method Styler.map and deprecates
            # Styler.applymap; use whichever the installed version offers.
            colour_cells = getattr(styler, "map", None) or styler.applymap
            st.dataframe(
                colour_cells(
                    lambda x: f"background-color: {st.session_state.colors[x.lower()]};",
                    subset=['Change Type']
                ),
                use_container_width=True
            )

        with tab3:
            st.markdown(full_report, unsafe_allow_html=True)

        with tab4:
            st.markdown(summary, unsafe_allow_html=True)
            st.write("### Change Clusters")

            if all_clusters:
                for cluster in all_clusters:
                    with st.expander(f"{cluster.get('theme', 'Unnamed Cluster')}"):
                        st.write(f"**Key Phrases**: {', '.join(cluster.get('key_phrases', []))}")
                        st.write(f"Associated Changes: {len(cluster.get('change_ids', []))}")
                        # Only show an example when the cluster points at a real change.
                        usable_ids = [
                            i for i in cluster.get('change_ids', [])
                            if isinstance(i, int) and 0 <= i < len(flat_changes)
                        ]
                        if usable_ids:
                            example = flat_changes[usable_ids[0]]
                            # "modified" entries have "updated" instead of "content".
                            example_text = example.get('content') or example.get('updated', '')
                            st.write(f"Example Change: {example_text[:200]}...")
            else:
                st.warning("No clusters identified in document changes")

    # Download button
    st.download_button(
        label="📥 Download Full Report",
        data=full_report,
        file_name="document_comparison.md",
        mime="text/markdown"
    )

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


### Key Points
- It seems likely that the error "processing failed: expecting value: line 1 column 1 (char 0)" occurs due to incorrect JSON parsing in your code, specifically using `json.load` instead of `json.loads` for string parsing.
- Research suggests adding error handling and refining the prompt for the AI model can help resolve this issue, ensuring proper JSON formatting.
- The evidence leans toward modifying the clustering step in your `generate_executive_summary` function to use `json.loads` and include try-except blocks for robustness.

### Why This Error Occurs
The error message indicates a JSON parsing failure, likely because your code tries to parse the AI model's response as JSON using `json.load`, which is meant for file objects, not strings. Instead, you should use `json.loads` for parsing strings, which is the correct method for handling the response from the ChatOpenAI model.

### How to Fix It
To resolve this, modify the clustering part of your `generate_executive_summary` function:
- Replace `json.load` with `json.loads` to correctly parse the string response.
- Add error handling to catch and display any JSON decoding errors, helping you debug if the AI model returns unexpected output.

This approach should prevent the error and make your code more robust against potential issues with the AI model's responses.

---

### Detailed Analysis and Implementation

This section provides a comprehensive examination of the issue, including the root cause, proposed solutions, and additional considerations for enhancing your document comparison pipeline. The analysis is grounded in the code provided and aims to address the error "processing failed: expecting value: line 1 column 1 (char 0)" during the summary generation phase.

#### Understanding the Error
The error message "expecting value: line 1 column 1 (char 0)" is a standard JSON decoding error in Python, indicating that the `json` module expected a valid JSON value but encountered an empty string or invalid input at the start. This typically occurs when attempting to parse a string that is not properly formatted as JSON. In your code, this error manifests during the `generate_executive_summary` function, specifically in the clustering step where the response from the ChatOpenAI model is parsed.

The root cause appears to be the use of `json.load` instead of `json.loads` for parsing the model's response. In Python's `json` module:
- `json.load(fp)` is designed for reading JSON from a file-like object.
- `json.loads(s)` is intended for parsing JSON from a string.

Given that `cluster_result.content` is a string (the response from the ChatOpenAI model), using `json.load` is incorrect and likely causes the parsing failure, especially if the string is empty or malformed.

#### Code Analysis
Let's examine the relevant part of your code in the `generate_executive_summary` function:

```python
for chunk in chunk_changes(
    [c for cat in changes.values() for c in cat], 
    max_tokens=6000
):
    cluster_result = retriable_chain_invoke(cluster_chain, {
        "changes": "\n".join(
            f"{idx}: {item['content'][:500]}..." 
            for idx, item in chunk
        )
    })
    cluster_result_content = cluster_result.content.strip()
    try:
        parsed_result = json.load(cluster_result.content)  # Incorrect: should be json.loads
        all_clusters.extend(parsed_result)
    except json.decoder.JSONDecodeError as e:
        st.error(f"Failed to parse clustering result: {e}")
        st.error(f"Response from model: {cluster_result_content}")
        return
```

The line `json.load(cluster_result.content)` is problematic because:
- `cluster_result.content` is a string, not a file-like object, making `json.load` inappropriate.
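- Passing a string to `json.load` actually fails with `AttributeError` (a `str` has no `.read()` method). The "expecting value" message itself comes from the JSON decoder when the text it receives is empty or does not start with a JSON value — for example when the model wraps its answer in a Markdown code fence or adds introductory text — so the model's raw reply must be checked as well.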

Additionally, the code snippet in the thinking trace suggests a potential syntax error (`all_clusters.extend json.load(cluster_result.content)` without parentheses), but assuming it's a formatting issue, the core problem is the use of `json.load`.

#### Proposed Solution
To fix this, we recommend the following modifications:

1. **Correct JSON Parsing**:
   - Replace `json.load(cluster_result.content)` with `json.loads(cluster_result.content)` to properly parse the string response.
   - Ensure the `json` module is imported at the top of your file: `import json`.

2. **Enhance Error Handling**:
   - Keep the try-except block to catch `json.decoder.JsonDecodeError`, which will help identify if the AI model returns unexpected output.
   - Strip any leading or trailing whitespace from `cluster_result.content` before parsing, as done in your code (`cluster_result.content.strip()`), to handle potential formatting issues.

3. **Refine the Prompt for Clarity**:
   - Modify the `cluster_prompt` to explicitly instruct the model to return only a valid JSON array, reducing the chance of extraneous text. For example:

```python
cluster_prompt = ChatPromptTemplate.from_template("""
You are an expert in data clustering and JSON formatting. Your task is to cluster the following document changes into logical groups based on their semantic similarity. Each change is labeled with an index and a snippet of its content.

Changes:
{changes}

Your response should be a JSON array of cluster objects, where each cluster object has:
- "theme": a short descriptive title for the cluster
- "change_ids": an array of the original change indices that belong to this cluster
- "key_phrases": an array of 3-5 key phrases that summarize the cluster

Do not include any additional text or explanations in your response. The output should be a valid JSON array.

Example of expected output:
[
    {{
        "theme": "Example Cluster",
        "change_ids": [0, 2, 3],
        "key_phrases": ["phrase1", "phrase2", "phrase3"]
    }},
    {{
        "theme": "Another Cluster",
        "change_ids": [1, 4],
        "key_phrases": ["phrase4", "phrase5"]
    }}
]
""")
```

This enhanced prompt includes an example, which can help the model generate the correct JSON format, reducing parsing errors.

#### Implementation Details
Here’s how the modified code should look in the `generate_executive_summary` function:

```python
import json

# ... (rest of the function remains the same)

for chunk in chunk_changes(
    [c for cat in changes.values() for c in cat], 
    max_tokens=6000
):
    cluster_result = retriable_chain_invoke(cluster_chain, {
        "changes": "\n".join(
            f"{idx}: {item['content'][:500]}..." 
            for idx, item in chunk
        )
    })
    cluster_result_content = cluster_result.content.strip()
    try:
        parsed_result = json.loads(cluster_result_content)  # Corrected to json.loads
        all_clusters.extend(parsed_result)
    except json.decoder.JSONDecodeError as e:
        st.error(f"Failed to parse clustering result: {e}")
        st.error(f"Response from model: {cluster_result_content}")
        return

# ... (rest of the function remains the same)
```

#### Additional Considerations
- **Model Response Validation**: The error could also occur if the ChatOpenAI model does not return a valid JSON string, possibly due to the input being too complex or the model's temperature setting. Your current setting (`temperature=0.1`) is low, which helps, but ensuring the input (`changes`) is within token limits (chunked at 6000 tokens) is crucial.
- **Debugging Tip**: If the error persists, the error handling will display the model's response, allowing you to inspect it for any unexpected text (e.g., introductory remarks before the JSON) and refine the prompt further.
- **Performance Impact**: Adding error handling and refining the prompt should not significantly impact performance, as the changes are localized to the parsing step.

#### Comparative Analysis of JSON Parsing Methods

To illustrate the difference, here’s a table comparing `json.load` and `json.loads`:

| Method       | Input Type          | Use Case                              | Example Usage                          |
|--------------|---------------------|---------------------------------------|----------------------------------------|
| `json.load`  | File-like object    | Reading JSON from a file              | `json.load(open('file.json', 'r'))`    |
| `json.loads` | String              | Parsing JSON from a string            | `json.loads('{"key": "value"}')`       |

This table highlights why `json.loads` is the appropriate choice for parsing `cluster_result.content`, which is a string.

#### Unexpected Detail: Prompt Engineering Impact
An unexpected aspect is how refining the prompt with an example can significantly improve the model's output, potentially reducing parsing errors. This is not immediately obvious but can be critical for ensuring the AI model adheres to the expected JSON format, especially for complex tasks like clustering document changes.

#### Conclusion
By correcting the JSON parsing method to `json.loads`, adding robust error handling, and refining the prompt for clarity, you should resolve the "expecting value" error and enhance the reliability of your document comparison pipeline. This approach ensures your code can handle potential issues with the AI model's responses and provides better debugging capabilities.

### Key Citations
- [Python JSON Module Documentation](https://docs.python.org/3/library/json.html)
- [LangChain ChatOpenAI Documentation](https://python.langchain.com/docs/integrations/chat/openai)

In [None]:
import streamlit as st
from st_diff_viewer import diff_viewer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
import pandas as pd
import docx2txt
from PyPDF2 import PdfReader
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)
import tiktoken
import openai
import json

# Configuration
# Splitter used by process_document. Sizes are measured with len(), i.e. in
# characters rather than tokens: 2000-character chunks, 400 characters of
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
    length_function=len,
    # NOTE(review): process_document calls split_text(), which returns plain
    # strings; presumably a start index is only recorded when Document objects
    # are created — confirm whether this flag has any effect here.
    add_start_index=True
)

# Encoder behind num_tokens().
# NOTE(review): the chat chains in this file use model="gpt-4o"; confirm the
# GPT-4 encoding is close enough for the token budgets derived from it.
enc = tiktoken.encoding_for_model("gpt-4")

# Custom theme
# layout="wide" selects Streamlit's wide page layout.
st.set_page_config(layout="wide")
# Inject a small stylesheet: sidebar background, progress-bar colour, text
# colour for the `.st-b7` class and a left-bordered `.report-section` block.
# unsafe_allow_html=True is required for the raw <style> tag to be emitted.
st.markdown("""
<style>
    [data-testid=stSidebar] { background-color: #f0f2f6; }
    .stProgress > div > div > div > div { background-color: #4B8BF5; }
    .st-b7 { color: #262730; }
    .report-section { border-left: 4px solid #4B8BF5; padding-left: 1rem; margin: 1rem 0; }
</style>
""", unsafe_allow_html=True)

# Retry configuration
@retry(
    retry=retry_if_exception_type(openai.RateLimitError),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    reraise=True,
)
def retriable_chain_invoke(chain, inputs):
    """Call ``chain.invoke(inputs)``, retrying when OpenAI signals a rate limit.

    Only ``openai.RateLimitError`` triggers another attempt. The policy stops
    after five attempts, waits with exponential back-off configured with
    ``min=4`` and ``max=60``, and re-raises the last error (``reraise=True``).
    """
    result = chain.invoke(inputs)
    return result

def num_tokens(text):
    """Return the number of tokens the module-level encoder ``enc`` yields for ``text``."""
    ids = enc.encode(text)
    return len(ids)

def extract_text(file):
    """Extract plain text from an uploaded DOCX or PDF file.

    Args:
        file: File-like object with a ``name`` attribute (for example a
            Streamlit upload). The extension of ``name`` selects the parser.

    Returns:
        The extracted text. For PDFs the pages are joined with newlines and a
        page without extractable text contributes an empty string.

    Raises:
        ValueError: If ``name`` ends in neither ``.docx`` nor ``.pdf``.
    """
    # Compare case-insensitively so names such as "REPORT.PDF" are accepted.
    filename = file.name.lower()
    if filename.endswith('.docx'):
        return docx2txt.process(file)
    if filename.endswith('.pdf'):
        reader = PdfReader(file)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    raise ValueError("Unsupported file format")

def process_document(file, doc_type, year):
    """Turn one uploaded document into a list of chunk records with metadata.

    Every record is ``{"text": ..., "metadata": {...}}``; the metadata stores
    the file name, ``doc_type``, ``year`` and a page number. That page number
    is computed from the chunk position (one step per ten chunks), not read
    from the document.
    """
    raw_text = extract_text(file)
    records = []
    for position, piece in enumerate(text_splitter.split_text(raw_text)):
        meta = {
            "source": file.name,
            "doc_type": doc_type,
            "year": year,
            "page": (position // 10) + 1,
        }
        records.append({"text": piece, "metadata": meta})
    return records

def analyze_changes(doc1_chunks, doc2_chunks, match_threshold=0.35):
    """Classify the chunks of two documents as added, removed or modified.

    Both documents are embedded into separate FAISS indexes. Each chunk of one
    document is matched against its nearest neighbour in the other document,
    and the distance of that neighbour decides the category.

    Args:
        doc1_chunks: Baseline chunks as produced by ``process_document``
            (dicts with ``"text"`` and ``"metadata"``).
        doc2_chunks: Comparison chunks in the same format.
        match_threshold: Largest FAISS distance (lower means closer) at which
            two chunks still count as the same passage. The default is a
            heuristic starting point — NOTE(review): tune it for the embedding
            model in use.

    Returns:
        Dict with three lists. ``"added"`` and ``"removed"`` entries are
        ``{"content", "metadata"}``; ``"modified"`` entries are
        ``{"original", "updated", "metadata"}``.
    """
    changes = {"added": [], "removed": [], "modified": []}

    # An empty document cannot be indexed; everything on the other side is
    # then trivially new (or gone), and no embedding call is needed.
    if not doc1_chunks or not doc2_chunks:
        changes["added"] = [
            {"content": c["text"], "metadata": c["metadata"]} for c in doc2_chunks
        ]
        changes["removed"] = [
            {"content": c["text"], "metadata": c["metadata"]} for c in doc1_chunks
        ]
        return changes

    embeddings = OpenAIEmbeddings(chunk_size=10)
    doc1_docs = [Document(page_content=c["text"], metadata=c["metadata"]) for c in doc1_chunks]
    doc2_docs = [Document(page_content=c["text"], metadata=c["metadata"]) for c in doc2_chunks]
    db1 = FAISS.from_documents(doc1_docs, embeddings)
    time.sleep(1)  # rate limit buffer between the two indexing runs
    db2 = FAISS.from_documents(doc2_docs, embeddings)

    def nearest_match(index, doc):
        """Return the closest document in ``index`` within the threshold, else None."""
        hits = index.similarity_search_with_score(doc.page_content, k=1)
        if not hits:
            return None
        candidate, distance = hits[0]
        return candidate if distance <= match_threshold else None

    # Matching is decided by embedding distance, not by the "source" file
    # name: each index holds a different file, so comparing names cannot tell
    # whether a passage survived into the other version.
    batch_size = 5  # pause after every few queries to stay under rate limits

    # One pass over the comparison document yields both "added" and
    # "modified", so each chunk is embedded for querying only once.
    for start in range(0, len(doc2_docs), batch_size):
        for doc in doc2_docs[start:start + batch_size]:
            counterpart = nearest_match(db1, doc)
            if counterpart is None:
                changes["added"].append({"content": doc.page_content, "metadata": doc.metadata})
            elif counterpart.page_content != doc.page_content:
                changes["modified"].append({
                    "original": counterpart.page_content,
                    "updated": doc.page_content,
                    "metadata": doc.metadata
                })
        time.sleep(1)

    # Baseline chunks with no close counterpart in the comparison document.
    for start in range(0, len(doc1_docs), batch_size):
        for doc in doc1_docs[start:start + batch_size]:
            if nearest_match(db2, doc) is None:
                changes["removed"].append({"content": doc.page_content, "metadata": doc.metadata})
        time.sleep(1)

    return changes

def generate_executive_summary(changes, doc1_meta, doc2_meta):
    """Cluster the detected changes and condense them into an executive report.

    The changes are sent to the model in token-bounded batches to be grouped
    into themed clusters, each cluster is summarised separately, and the
    cluster summaries are then synthesised into one report.

    Args:
        changes: dict mapping a category name to a list of change entries.
            Entries are read through "content", or through "original" and
            "updated" when "content" is absent.
        doc1_meta: dict with "name" and "year" of the baseline document.
        doc2_meta: dict with "name" and "year" of the comparison document.

    Returns:
        Tuple ``(summary_text, clusters, flat_changes)``.  ``clusters`` holds
        the cluster dicts returned by the model; the "change_ids" of every
        cluster that was summarised are reduced to valid indices into
        ``flat_changes``.
    """
    # Imported locally so the function does not depend on the module header
    # providing json.
    import json

    cluster_prompt = ChatPromptTemplate.from_template("""
    You are an expert in data clustering and JSON formatting. Your task is to cluster the following document changes into logical groups based on their semantic similarity. Each change is labeled with an index and a snippet of its content.

    Changes:
    {changes}

    Your response MUST be a valid JSON array of cluster objects, where each cluster object has:
    - "theme": a short descriptive title for the cluster (string)
    - "change_ids": an array of the original change indices that belong to this cluster (array of integers)
    - "key_phrases": an array of 3-5 key phrases that summarize the cluster (array of strings)

    If there are no changes to cluster or if clustering cannot be performed, return an empty JSON array: [].
    Do not include any additional text, explanations, or comments outside the JSON array. The output must be valid JSON.

    Example of expected output:
    [
        {{
            "theme": "Policy Updates",
            "change_ids": [0, 2, 3],
            "key_phrases": ["new regulation", "compliance", "deadline"]
        }},
        {{
            "theme": "Formatting Changes",
            "change_ids": [1, 4],
            "key_phrases": ["font size", "layout", "spacing"]
        }}
    ]
    """)
    summary_prompt = ChatPromptTemplate.from_template("""
    As a professional analyst, create an executive summary comparing:
    {doc1} ({year1}) vs {doc2} ({year2}).
    
    Key clustered changes:
    {clustered_changes}

    Structure:
    1. *Major Structural Changes* (sections added/removed)
    2. *Content Evolution* (modified themes and concepts)
    3. *Strategic Implications* (business impact analysis)
    4. *Recommendations* (next steps based on changes)
    
    Include specific examples with citations like: 
    "The address changed from [X][p3] to [Y][p12]" 
    Use markdown with section headers and bold key terms.
    """)
    cluster_chain = cluster_prompt | ChatOpenAI(model="gpt-4", temperature=0.1)
    summary_chain = summary_prompt | ChatOpenAI(model="gpt-4", temperature=0.3)

    def change_text(item):
        """Return the text of a change entry, whatever its category."""
        if "content" in item:
            return item["content"]
        # "modified" entries may only carry the before/after pair.
        return f"ORIGINAL: {item.get('original', '')}\nUPDATED: {item.get('updated', '')}"

    def chunk_changes(items, max_tokens=6000):
        """Yield lists of (index, item) whose texts stay within max_tokens."""
        current_chunk = []
        current_count = 0
        for idx, item in enumerate(items):
            item_tokens = num_tokens(change_text(item))
            # Only flush a non-empty batch, so an oversized first item does
            # not produce an empty request.
            if current_chunk and current_count + item_tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0
            current_chunk.append((idx, item))
            current_count += item_tokens
        if current_chunk:
            yield current_chunk

    def parse_clusters(raw):
        """Parse the model reply into a list of cluster dicts."""
        text = raw.strip()
        # Tolerate a reply wrapped in a Markdown code fence.
        if text.startswith("```"):
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0].strip()
        parsed = json.loads(text)
        if not isinstance(parsed, list):
            raise ValueError("Model response is not a JSON array")
        return [entry for entry in parsed if isinstance(entry, dict)]

    flat_changes = [c for cat in changes.values() for c in cat]
    if not flat_changes:
        st.warning("No changes detected to cluster.")
        return "No significant changes detected between documents.", [], flat_changes

    all_clusters = []
    for chunk in chunk_changes(flat_changes, max_tokens=6000):
        changes_input = "\n".join(f"{idx}: {change_text(item)[:500]}..." for idx, item in chunk)
        cluster_result = retriable_chain_invoke(cluster_chain, {"changes": changes_input})
        cluster_result_content = cluster_result.content.strip()
        try:
            all_clusters.extend(parse_clusters(cluster_result_content))
        except (json.JSONDecodeError, ValueError) as e:
            # A failed batch contributes no clusters; the others still count.
            st.error(f"Failed to parse clustering result: {e}")
            st.error(f"Raw response from model: '{cluster_result_content}'")

    cluster_summaries = []
    for cluster in all_clusters:
        raw_ids = cluster.get("change_ids")
        if not isinstance(raw_ids, list):
            continue
        # Keep each id paired with its own change; ids invented by the model
        # are dropped instead of shifting the pairing.
        valid_ids = [
            i for i in raw_ids
            if isinstance(i, int) and not isinstance(i, bool) and 0 <= i < len(flat_changes)
        ]
        cluster["change_ids"] = valid_ids
        if not valid_ids:
            continue
        lines = []
        for i in valid_ids:
            entry = flat_changes[i]
            meta = entry.get("metadata") or {}
            lines.append(
                f"Change {i}: {change_text(entry)[:1000]} "
                f"[Source: {meta.get('source', 'unknown')}, Page {meta.get('page', '?')}]"
            )
        cluster_summary = retriable_chain_invoke(summary_chain, {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "clustered_changes": "\n".join(lines)
        })
        cluster_summaries.append(cluster_summary.content)
        time.sleep(1)

    final_prompt = ChatPromptTemplate.from_template("""
    Synthesize these cluster summaries into an executive report:
    {cluster_summaries}

    Maintain this structure:
    1. *Document Evolution Overview*
    2. *Strategic Direction Analysis*
    3. *Operational Impact Assessment*
    4. *Recommendations for Future Versions*

    Include 3-5 key visualizable trends using *bold* terms.
    Cite sources like [Source: {{source}}, Page {{page}}].
    """)
    final_chain = final_prompt | ChatOpenAI(model="gpt-4", temperature=0.2)
    final_summary = retriable_chain_invoke(final_chain, {
        "cluster_summaries": "\n\n".join(cluster_summaries) if cluster_summaries else "No significant changes detected."
    })
    return final_summary.content, all_clusters, flat_changes

def generate_detailed_report(changes, doc1_meta, doc2_meta):
    """Build the detailed change report, one model request per chunk.

    The change lines come from ``chunked_report_generator``; each chunk is
    sent through the report prompt and the replies are joined with blank
    lines.  Returns an empty string when there is nothing to report.
    """
    template = """
    Analyze document changes between:
    {doc1} ({year1}) and {doc2} ({year2})

    Changes:
    {changes}

    Format requirements:
    - Group by ADDED/REMOVED/MODIFIED
    - Include citations like [Source: {{source}}, Page {{page}}]
    - Highlight significant changes with *bold*
    - Use markdown headers ##
    - Maintain academic tone
    """
    report_prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model="gpt-4", temperature=0.2, max_tokens=4000)
    chain = report_prompt | llm

    def build_inputs(lines):
        # Assembled per request, so the metadata is only read when a chunk
        # actually exists.
        return {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "changes": "\n".join(lines)
        }

    sections = []
    for lines in chunked_report_generator(changes):
        reply = retriable_chain_invoke(chain, build_inputs(lines))
        sections.append(reply.content)
        # Fixed pause after every request (presumably to stay under rate limits).
        time.sleep(2)
    return "\n\n".join(sections)

def chunked_report_generator(changes, max_tokens=12000, token_counter=None):
    """Yield lists of formatted change lines that fit a token budget.

    Changes are walked in the order added, removed, modified.  Each entry is
    rendered as ``"<CATEGORY>:\\n<text>\\n"``; when the entry's metadata has
    both "source" and "page", a ``[Source: ..., Page ...]`` line is included
    so the report prompt has real data to cite.

    Args:
        changes: dict mapping "added"/"removed"/"modified" to lists of change
            entries.  Missing categories are treated as empty.  Entries with
            "original" and "updated" are rendered as a before/after pair; all
            others must provide "content".
        max_tokens: Token budget per yielded chunk.  A single entry larger
            than the budget is yielded alone.
        token_counter: Callable returning the token count of a string.
            Defaults to the module-level ``num_tokens``.

    Yields:
        Non-empty lists of rendered change strings.
    """
    count_tokens = token_counter if token_counter is not None else num_tokens

    def render(cat, item):
        if "original" in item and "updated" in item:
            body = f"ORIGINAL: {item['original']}\nUPDATED: {item['updated']}"
        else:
            body = item["content"]
        meta = item.get("metadata") or {}
        if "source" in meta and "page" in meta:
            body = f"{body}\n[Source: {meta['source']}, Page {meta['page']}]"
        return f"{cat.upper()}:\n{body}\n"

    current_chunk = []
    current_count = 0
    for cat in ('added', 'removed', 'modified'):
        for item in changes.get(cat, []):
            content = render(cat, item)
            tokens = count_tokens(content)
            # Never flush an empty chunk: an oversized first entry would
            # otherwise trigger a request with no changes in it.
            if current_chunk and current_count + tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0
            current_chunk.append(content)
            current_count += tokens
    if current_chunk:
        yield current_chunk

def main():
    """Render the Streamlit page: inputs, analysis run and result views.

    The results of a run are stored in ``st.session_state`` and rendered from
    there.  Streamlit re-executes the script on every interaction and the
    "Analyze Documents" button is only true for the run in which it was
    clicked, so results kept in local variables would disappear as soon as
    the user touches any other widget (for example the download button).
    """
    from datetime import date

    st.title("Professional Document Comparison Suite")
    if 'colors' not in st.session_state:
        st.session_state.colors = {'added': '#d4f7d4', 'removed': '#f7d4d4', 'modified': '#fff3d4'}

    with st.sidebar:
        st.header("Configuration")
        st.session_state.colors['added'] = st.color_picker("Added Color", '#d4f7d4')
        st.session_state.colors['removed'] = st.color_picker("Removed Color", '#f7d4d4')
        st.session_state.colors['modified'] = st.color_picker("Modified Color", '#fff3d4')

    # Keep the year range usable after 2025 without lowering the old bound.
    latest_year = max(2025, date.today().year)

    col1, col2 = st.columns(2)
    with col1:
        doc1 = st.file_uploader("Upload Baseline Document", type=["pdf", "docx"])
        year1 = st.number_input("Baseline Year", min_value=1900, max_value=latest_year, value=2023)
    with col2:
        doc2 = st.file_uploader("Upload Comparison Document", type=["pdf", "docx"])
        year2 = st.number_input("Comparison Year", min_value=1900, max_value=latest_year, value=2024)

    if st.button("Analyze Documents"):
        if not (doc1 and doc2):
            st.warning("Please upload both documents before running the analysis.")
        else:
            with st.status("Processing documents...", expanded=True) as status:
                try:
                    doc1_meta = {"name": doc1.name, "year": year1}
                    doc2_meta = {"name": doc2.name, "year": year2}
                    st.write("📄 Processing Document 1...")
                    doc1_chunks = process_document(doc1, doc1.name.split('.')[-1], year1)
                    st.write("📄 Processing Document 2...")
                    doc2_chunks = process_document(doc2, doc2.name.split('.')[-1], year2)
                    st.write("🔍 Analyzing differences...")
                    changes = analyze_changes(doc1_chunks, doc2_chunks)
                    st.write("📊 Generating summary...")
                    summary, all_clusters, flat_changes = generate_executive_summary(
                        changes, doc1_meta, doc2_meta
                    )
                    st.write("📝 Compiling detailed report...")
                    detailed_report = generate_detailed_report(changes, doc1_meta, doc2_meta)
                    st.session_state["analysis_results"] = {
                        "changes": changes,
                        "summary": summary,
                        "all_clusters": all_clusters,
                        "flat_changes": flat_changes,
                        "full_report": f"# Document Comparison Report\n{summary}\n{detailed_report}",
                    }
                    status.update(label="Analysis complete! ✅", state="complete")
                except Exception as e:
                    # Top-level UI boundary: report the failure and stop.
                    status.update(label="Processing failed", state="error")
                    st.error(f"❌ Processing failed: {str(e)}")
                    return

    results = st.session_state.get("analysis_results")
    if not results:
        return

    changes = results["changes"]
    summary = results["summary"]
    all_clusters = results["all_clusters"]
    flat_changes = results["flat_changes"]
    full_report = results["full_report"]

    with st.expander("Detailed Analysis", expanded=True):
        tab1, tab2, tab3, tab4 = st.tabs(["Diff View", "Statistics", "Full Report", "Executive Summary"])
        with tab1:
            diff_viewer(
                "\n".join([c["content"] for c in changes["removed"]]),
                "\n".join([c["content"] for c in changes["added"]]),
                split_view=True,
                added_style=f"background: {st.session_state.colors['added']}",
                removed_style=f"background: {st.session_state.colors['removed']}",
                modified_style=f"background: {st.session_state.colors['modified']}"
            )
        with tab2:
            stats = pd.DataFrame({
                'Change Type': ['Added', 'Removed', 'Modified'],
                'Count': [len(changes["added"]), len(changes["removed"]), len(changes["modified"])]
            })
            styler = stats.style
            # Styler.applymap was renamed to Styler.map in pandas 2.1; use
            # whichever the installed version offers.
            colorize = getattr(styler, "map", None) or styler.applymap
            st.dataframe(
                colorize(
                    lambda x: f"background-color: {st.session_state.colors[x.lower()]};",
                    subset=['Change Type']
                ),
                use_container_width=True
            )
        with tab3:
            # The report is model output derived from uploaded files, so it
            # is rendered as Markdown without enabling raw HTML.
            st.markdown(full_report)
        with tab4:
            st.markdown(summary)
            st.write("### Change Clusters")
            if all_clusters:
                for cluster in all_clusters:
                    if not isinstance(cluster, dict) or "theme" not in cluster:
                        continue
                    # A sub-header instead of an expander: this section
                    # already sits inside the "Detailed Analysis" expander,
                    # and Streamlit has historically rejected nested expanders.
                    st.subheader(f"{cluster.get('theme', 'Unnamed Cluster')}")
                    st.write(f"*Key Phrases*: {', '.join(map(str, cluster.get('key_phrases', [])))}")
                    change_ids = cluster.get('change_ids') or []
                    st.write(f"Associated Changes: {len(change_ids)}")
                    # The ids come from the model; only use one that really
                    # points into flat_changes.
                    first_id = next(
                        (i for i in change_ids if isinstance(i, int) and 0 <= i < len(flat_changes)),
                        None
                    )
                    if first_id is not None:
                        example = flat_changes[first_id]
                        example_text = example.get('content') or example.get('updated', '')
                        st.write(f"Example Change: {example_text[:200]}...")
            else:
                st.warning("No clusters identified in document changes")

    st.download_button(
        label="📥 Download Full Report",
        data=full_report,
        file_name="document_comparison.md",
        mime="text/markdown"
    )

# Script entry point: build the page only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

    


ModuleNotFoundError: No module named 'st_diff_viewer'

In [None]:
import streamlit as st
from st_diff_viewer import diff_viewer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
import pandas as pd
import docx2txt
from PyPDF2 import PdfReader
import time
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type
)
import tiktoken
import openai
import json

# Configuration
# Shared splitter used by process_document.  length_function=len means
# chunk_size and chunk_overlap are measured in characters, not tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
    length_function=len,
    add_start_index=True
)

# Tokenizer for the "gpt-4" model; num_tokens uses it to size prompt batches.
enc = tiktoken.encoding_for_model("gpt-4")

# Custom theme
# NOTE(review): set_page_config is kept ahead of every other Streamlit call;
# Streamlit versions have required it to be the first command - confirm
# before reordering.
st.set_page_config(layout="wide")
# CSS overrides are injected as raw HTML, hence unsafe_allow_html=True.  The
# markup is a fixed literal, not user input.
st.markdown("""
<style>
    [data-testid=stSidebar] { background-color: #f0f2f6; }
    .stProgress > div > div > div > div { background-color: #4B8BF5; }
    .st-b7 { color: #262730; }
    .report-section { border-left: 4px solid #4B8BF5; padding-left: 1rem; margin: 1rem 0; }
</style>
""", unsafe_allow_html=True)

# Retry configuration
@retry(
    retry=retry_if_exception_type(openai.RateLimitError),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    reraise=True,
)
def retriable_chain_invoke(chain, inputs):
    """Invoke ``chain`` with ``inputs``, retrying on OpenAI rate-limit errors.

    Up to five attempts are made with exponential back-off between 4 and 60
    seconds.  Only ``openai.RateLimitError`` triggers a retry; once the
    attempts are used up the last error is re-raised to the caller.
    """
    response = chain.invoke(inputs)
    return response

def num_tokens(text):
    """Return the number of tokens the module-level encoder yields for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)

def extract_text(file):
    """Return the plain text of an uploaded DOCX or PDF file.

    Args:
        file: File-like object with a ``name`` attribute; the extension of
            the name selects the parser.

    Returns:
        The extracted text.  For PDFs the page texts are joined with
        newlines, and a page without extractable text contributes an empty
        string.

    Raises:
        ValueError: If the name ends in neither ``.docx`` nor ``.pdf``.
    """
    # Compare case-insensitively so "REPORT.PDF" is handled like "report.pdf".
    lowered_name = file.name.lower()
    if lowered_name.endswith('.docx'):
        return docx2txt.process(file)
    if lowered_name.endswith('.pdf'):
        reader = PdfReader(file)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    raise ValueError("Unsupported file format")

def process_document(file, doc_type, year):
    """Split an uploaded document into chunk records with metadata.

    The text is extracted, cut by the module-level splitter, and every chunk
    is returned as ``{"text": ..., "metadata": {...}}``.  The metadata holds
    the file name as "source", the given ``doc_type`` and ``year``, and a
    "page" value.

    NOTE(review): "page" is derived from the chunk position (ten chunks per
    page), not read from the document - confirm before relying on it for
    citations.
    """
    pieces = text_splitter.split_text(extract_text(file))
    records = []
    for position, piece in enumerate(pieces):
        details = {
            "source": file.name,
            "doc_type": doc_type,
            "year": year,
            "page": position // 10 + 1,
        }
        records.append({"text": piece, "metadata": details})
    return records

def analyze_changes(doc1_chunks, doc2_chunks, match_threshold=0.3, batch_size=5):
    """Classify chunks of two documents as added, removed or modified.

    Every chunk that does not appear verbatim in the other document is looked
    up in a FAISS index built from that other document.  The distance of the
    nearest neighbour decides the category: a close neighbour means the
    passage was modified, no close neighbour means it was added (comparison
    document) or removed (baseline document).

    The chunk metadata "source" is not used for the decision:
    ``process_document`` fills it with the file name, so it only tells the
    two documents apart and says nothing about whether two passages
    correspond.

    Args:
        doc1_chunks: Baseline chunks, each a dict with "text" and "metadata".
        doc2_chunks: Comparison chunks in the same shape.
        match_threshold: Largest FAISS distance (lower means more similar) at
            which two chunks still count as versions of the same passage.
            NOTE(review): 0.3 is a starting point, not a calibrated value;
            tune it for the embedding model in use.
        batch_size: Number of chunks handled between one-second pauses.

    Returns:
        dict with the keys "added", "removed" and "modified".  Added/removed
        entries carry "content" and "metadata"; modified entries carry
        "original", "updated", "metadata" and also "content" (the updated
        text) so consumers can read every entry through the same key.
    """
    def to_docs(chunks):
        return [Document(page_content=c["text"], metadata=c["metadata"]) for c in chunks]

    def as_change(doc):
        return {"content": doc.page_content, "metadata": doc.metadata}

    doc1_docs = to_docs(doc1_chunks)
    doc2_docs = to_docs(doc2_chunks)
    changes = {"added": [], "removed": [], "modified": []}

    # An index cannot be built from zero documents; with one side empty the
    # answer is known without any embedding call.
    if not doc1_docs or not doc2_docs:
        changes["added"] = [as_change(d) for d in doc2_docs]
        changes["removed"] = [as_change(d) for d in doc1_docs]
        return changes

    embeddings = OpenAIEmbeddings(chunk_size=10)
    db1 = FAISS.from_documents(doc1_docs, embeddings)
    time.sleep(1)
    db2 = FAISS.from_documents(doc2_docs, embeddings)

    # Verbatim matches are unchanged and need no similarity query.
    doc1_texts = {d.page_content for d in doc1_docs}
    doc2_texts = {d.page_content for d in doc2_docs}

    for start in range(0, len(doc2_docs), batch_size):
        queried = False
        for doc in doc2_docs[start:start + batch_size]:
            if doc.page_content in doc1_texts:
                continue
            hits = db1.similarity_search_with_score(doc.page_content, k=1)
            queried = True
            if hits and hits[0][1] <= match_threshold:
                changes["modified"].append({
                    "original": hits[0][0].page_content,
                    "updated": doc.page_content,
                    "content": doc.page_content,
                    "metadata": doc.metadata
                })
            else:
                changes["added"].append(as_change(doc))
        if queried:
            # Pause between batches that actually hit the embedding API.
            time.sleep(1)

    for start in range(0, len(doc1_docs), batch_size):
        queried = False
        for doc in doc1_docs[start:start + batch_size]:
            if doc.page_content in doc2_texts:
                continue
            hits = db2.similarity_search_with_score(doc.page_content, k=1)
            queried = True
            # A close neighbour was already reported as "modified" above.
            if not hits or hits[0][1] > match_threshold:
                changes["removed"].append(as_change(doc))
        if queried:
            time.sleep(1)

    return changes

def generate_executive_summary(changes, doc1_meta, doc2_meta):
    """Cluster the detected changes and condense them into an executive report.

    The changes are sent to the model in token-bounded batches to be grouped
    into themed clusters, each cluster is summarised separately, and the
    cluster summaries are then synthesised into one report.

    Args:
        changes: dict mapping a category name to a list of change entries.
            Entries are read through "content", or through "original" and
            "updated" when "content" is absent.
        doc1_meta: dict with "name" and "year" of the baseline document.
        doc2_meta: dict with "name" and "year" of the comparison document.

    Returns:
        Tuple ``(summary_text, clusters, flat_changes)``.  ``clusters`` holds
        the cluster dicts returned by the model; the "change_ids" of every
        cluster that was summarised are reduced to valid indices into
        ``flat_changes``.
    """
    cluster_prompt = ChatPromptTemplate.from_template("""
    You are an expert in data clustering and JSON formatting. Your task is to cluster the following document changes into logical groups based on their semantic similarity. Each change is labeled with an index and a snippet of its content.

    Changes:
    {changes}

    Your response MUST be a valid JSON array of cluster objects, where each cluster object has:
    - "theme": a short descriptive title for the cluster (string)
    - "change_ids": an array of the original change indices that belong to this cluster (array of integers)
    - "key_phrases": an array of 3-5 key phrases that summarize the cluster (array of strings)

    If there are no changes to cluster or if clustering cannot be performed, return an empty JSON array: [].
    Do not include any additional text, explanations, or comments outside the JSON array. The output must be valid JSON.

    Example of expected output:
    [
        {{
            "theme": "Policy Updates",
            "change_ids": [0, 2, 3],
            "key_phrases": ["new regulation", "compliance", "deadline"]
        }},
        {{
            "theme": "Formatting Changes",
            "change_ids": [1, 4],
            "key_phrases": ["font size", "layout", "spacing"]
        }}
    ]
    """)

    summary_prompt = ChatPromptTemplate.from_template("""
    As a professional analyst, create an executive summary comparing:
    {doc1} ({year1}) vs {doc2} ({year2}).
    
    Key clustered changes:
    {clustered_changes}

    Structure:
    1. Major Structural Changes (sections added/removed)
    2. Content Evolution (modified themes and concepts)
    3. Strategic Implications (business impact analysis)
    4. Recommendations (next steps based on changes)
    
    Include specific examples with citations like: 
    "The address changed from [X][p3] to [Y][p12]" 
    Use markdown with section headers and bold key terms.
    """)

    cluster_chain = cluster_prompt | ChatOpenAI(model="gpt-4", temperature=0.1)
    summary_chain = summary_prompt | ChatOpenAI(model="gpt-4", temperature=0.1)

    def change_text(item):
        """Return the text of a change entry, whatever its category."""
        if "content" in item:
            return item["content"]
        # "modified" entries may only carry the before/after pair.
        return f"ORIGINAL: {item.get('original', '')}\nUPDATED: {item.get('updated', '')}"

    def chunk_changes(items, max_tokens=6000):
        """Yield lists of (index, item) whose texts stay within max_tokens."""
        current_chunk = []
        current_count = 0
        for idx, item in enumerate(items):
            item_tokens = num_tokens(change_text(item))
            # Only flush a non-empty batch, so an oversized first item does
            # not produce an empty request.
            if current_chunk and current_count + item_tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0
            current_chunk.append((idx, item))
            current_count += item_tokens
        if current_chunk:
            yield current_chunk

    def parse_clusters(raw):
        """Parse the model reply into a list of cluster dicts."""
        text = raw.strip()
        # Tolerate a reply wrapped in a Markdown code fence.
        if text.startswith("```"):
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0].strip()
        parsed = json.loads(text)
        if not isinstance(parsed, list):
            raise ValueError("Model response is not a JSON array")
        return [entry for entry in parsed if isinstance(entry, dict)]

    flat_changes = [c for cat in changes.values() for c in cat]

    if not flat_changes:
        st.warning("No changes detected to cluster.")
        return "No significant changes detected between documents.", [], flat_changes

    all_clusters = []
    for chunk in chunk_changes(flat_changes, max_tokens=6000):
        changes_input = "\n".join(f"{idx}: {change_text(item)[:500]}..." for idx, item in chunk)
        cluster_result = retriable_chain_invoke(cluster_chain, {"changes": changes_input})
        cluster_result_content = cluster_result.content.strip()
        try:
            all_clusters.extend(parse_clusters(cluster_result_content))
        except (json.JSONDecodeError, ValueError) as e:
            # A failed batch contributes no clusters; the others still count.
            st.error(f"Failed to parse clustering result: {e}")
            st.error(f"Raw response from model: '{cluster_result_content}'")

    cluster_summaries = []
    for cluster in all_clusters:
        raw_ids = cluster.get("change_ids")
        if not isinstance(raw_ids, list):
            continue
        # Keep each id paired with its own change; ids invented by the model
        # are dropped instead of shifting the pairing.
        valid_ids = [
            i for i in raw_ids
            if isinstance(i, int) and not isinstance(i, bool) and 0 <= i < len(flat_changes)
        ]
        cluster["change_ids"] = valid_ids
        if not valid_ids:
            continue
        lines = []
        for i in valid_ids:
            entry = flat_changes[i]
            meta = entry.get("metadata") or {}
            lines.append(
                f"Change {i}: {change_text(entry)[:1000]} "
                f"[Source: {meta.get('source', 'unknown')}, Page {meta.get('page', '?')}]"
            )

        cluster_summary = retriable_chain_invoke(summary_chain, {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "clustered_changes": "\n".join(lines)
        })

        cluster_summaries.append(cluster_summary.content)
        time.sleep(1)

    final_prompt = ChatPromptTemplate.from_template("""
    Synthesize these cluster summaries into an executive report:
    {cluster_summaries}

    Maintain this structure:
    1. Document Evolution Overview
    2. Strategic Direction Analysis
    3. Operational Impact Assessment
    4. Recommendations for Future Versions

    Include 3-5 key visualizable trends using bold terms.
    Cite sources like [Source: {{source}}, Page {{page}}].
    """)

    final_chain = final_prompt | ChatOpenAI(model="gpt-4", temperature=0.2)

    final_summary = retriable_chain_invoke(final_chain, {
        "cluster_summaries": "\n\n".join(cluster_summaries) if cluster_summaries else "No significant changes detected."
    })

    return final_summary.content, all_clusters, flat_changes

def generate_detailed_report(changes, doc1_meta, doc2_meta):
    """Build the detailed change report, one model request per chunk.

    The change lines come from ``chunked_report_generator``; each chunk is
    sent through the report prompt and the replies are joined with blank
    lines.  Returns an empty string when there is nothing to report.
    """
    # The prompt literal keeps its original three-space indentation so the
    # text sent to the model is unchanged.
    template = """
   Analyze document changes between:
   {doc1} ({year1}) and {doc2} ({year2})

   Changes:
   {changes}

   Format requirements:
   - Group by ADDED/REMOVED/MODIFIED
   - Include citations like [Source: {{source}}, Page {{page}}]
   - Highlight significant changes with bold
   - Use markdown headers ##
   - Maintain academic tone
   """
    report_prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model="gpt-4", temperature=0.2, max_tokens=4000)
    chain = report_prompt | llm

    def build_inputs(lines):
        # Assembled per request, so the metadata is only read when a chunk
        # actually exists.
        return {
            "doc1": doc1_meta["name"],
            "year1": doc1_meta["year"],
            "doc2": doc2_meta["name"],
            "year2": doc2_meta["year"],
            "changes": "\n".join(lines)
        }

    sections = []
    for lines in chunked_report_generator(changes):
        reply = retriable_chain_invoke(chain, build_inputs(lines))
        sections.append(reply.content)
        # Fixed pause after every request (presumably to stay under rate limits).
        time.sleep(2)

    return "\n\n".join(sections)

def chunked_report_generator(changes, max_tokens=12000, token_counter=None):
    """Yield lists of formatted change lines that fit a token budget.

    Changes are walked in the order added, removed, modified.  Each entry is
    rendered as ``"<CATEGORY>:\\n<text>\\n"``; when the entry's metadata has
    both "source" and "page", a ``[Source: ..., Page ...]`` line is included
    so the report prompt has real data to cite.

    Args:
        changes: dict mapping "added"/"removed"/"modified" to lists of change
            entries.  Missing categories are treated as empty.  Entries with
            "original" and "updated" are rendered as a before/after pair; all
            others must provide "content".
        max_tokens: Token budget per yielded chunk.  A single entry larger
            than the budget is yielded alone.
        token_counter: Callable returning the token count of a string.
            Defaults to the module-level ``num_tokens``.

    Yields:
        Non-empty lists of rendered change strings.
    """
    count_tokens = token_counter if token_counter is not None else num_tokens

    def render(cat, item):
        if "original" in item and "updated" in item:
            body = f"ORIGINAL: {item['original']}\nUPDATED: {item['updated']}"
        else:
            body = item["content"]
        meta = item.get("metadata") or {}
        if "source" in meta and "page" in meta:
            body = f"{body}\n[Source: {meta['source']}, Page {meta['page']}]"
        return f"{cat.upper()}:\n{body}\n"

    current_chunk = []
    current_count = 0

    for cat in ('added', 'removed', 'modified'):
        for item in changes.get(cat, []):
            content = render(cat, item)
            tokens = count_tokens(content)

            # Never flush an empty chunk: an oversized first entry would
            # otherwise trigger a request with no changes in it.
            if current_chunk and current_count + tokens > max_tokens:
                yield current_chunk
                current_chunk = []
                current_count = 0

            current_chunk.append(content)
            current_count += tokens

    if current_chunk:
        yield current_chunk
def main():
    """Render the Streamlit page: inputs, analysis run and result views.

    The results of a run are stored in ``st.session_state`` and rendered from
    there.  Streamlit re-executes the script on every interaction and the
    "Analyze Documents" button is only true for the run in which it was
    clicked, so results kept in local variables would disappear as soon as
    the user ticks an example checkbox or presses the download button.
    """
    from datetime import date

    st.title("Professional Document Comparison Suite")

    if 'colors' not in st.session_state:
        st.session_state.colors = {'added': '#d4f7d4', 'removed': '#f7d4d4', 'modified': '#fff3d4'}

    with st.sidebar:
        st.header("Configuration")
        st.session_state.colors['added'] = st.color_picker("Added Color", '#d4f7d4')
        st.session_state.colors['removed'] = st.color_picker("Removed Color", '#f7d4d4')
        st.session_state.colors['modified'] = st.color_picker("Modified Color", '#fff3d4')

    # Keep the year range usable after 2025 without lowering the old bound.
    latest_year = max(2025, date.today().year)

    col1, col2 = st.columns(2)

    with col1:
        doc1 = st.file_uploader("Upload Baseline Document", type=["pdf", "docx"])
        year1 = st.number_input("Baseline Year", min_value=1900, max_value=latest_year, value=2023)

    with col2:
        doc2 = st.file_uploader("Upload Comparison Document", type=["pdf", "docx"])
        year2 = st.number_input("Comparison Year", min_value=1900, max_value=latest_year, value=2024)

    if st.button("Analyze Documents"):
        if not (doc1 and doc2):
            st.warning("Please upload both documents before running the analysis.")
        else:
            with st.status("Processing documents...", expanded=True) as status:
                try:
                    doc1_meta = {"name": doc1.name, "year": year1}
                    doc2_meta = {"name": doc2.name, "year": year2}
                    st.write("📄 Processing Document 1...")
                    doc1_chunks = process_document(doc1, doc1.name.split('.')[-1], year1)
                    st.write("📄 Processing Document 2...")
                    doc2_chunks = process_document(doc2, doc2.name.split('.')[-1], year2)
                    st.write("🔍 Analyzing differences...")
                    changes = analyze_changes(doc1_chunks, doc2_chunks)
                    st.write("📊 Generating summary...")
                    summary, all_clusters, flat_changes = generate_executive_summary(
                        changes, doc1_meta, doc2_meta
                    )
                    st.write("📝 Compiling detailed report...")
                    detailed_report = generate_detailed_report(changes, doc1_meta, doc2_meta)
                    st.session_state["analysis_results"] = {
                        "changes": changes,
                        "summary": summary,
                        "all_clusters": all_clusters,
                        "flat_changes": flat_changes,
                        "full_report": f"# Document Comparison Report\n{summary}\n{detailed_report}",
                    }
                    status.update(label="Analysis complete! ✅", state="complete")
                except Exception as e:
                    # Top-level UI boundary: report the failure and stop.
                    status.update(label="Processing failed", state="error")
                    st.error(f"❌ Processing failed: {str(e)}")
                    return

    results = st.session_state.get("analysis_results")
    if not results:
        return

    changes = results["changes"]
    summary = results["summary"]
    all_clusters = results["all_clusters"]
    flat_changes = results["flat_changes"]
    full_report = results["full_report"]

    # Display analysis results
    tabs = st.tabs([
        "Diff View",
        "Statistics",
        "Full Report",
        "Executive Summary",
        "Change Clusters"
    ])

    with tabs[0]:  # Diff View
        diff_viewer(
            "\n".join([c["content"] for c in changes["removed"]]),
            "\n".join([c["content"] for c in changes["added"]]),
            split_view=True,
            added_style=f"background: {st.session_state.colors['added']}",
            removed_style=f"background: {st.session_state.colors['removed']}",
            modified_style=f"background: {st.session_state.colors['modified']}"
        )

    with tabs[1]:  # Statistics
        stats_df = pd.DataFrame({
            'Change Type': ['Added', 'Removed', 'Modified'],
            'Count': [
                len(changes["added"]),
                len(changes["removed"]),
                len(changes["modified"])
            ]
        })
        styler = stats_df.style
        # Styler.applymap was renamed to Styler.map in pandas 2.1; use
        # whichever the installed version offers.
        colorize = getattr(styler, "map", None) or styler.applymap
        st.dataframe(
            colorize(
                lambda x: f"background-color: {st.session_state.colors[x.lower()]};",
                subset=['Change Type']
            ),
            use_container_width=True
        )

    with tabs[2]:  # Full Report
        # The report is model output derived from uploaded files, so it is
        # rendered as Markdown without enabling raw HTML.
        st.markdown(full_report)

    with tabs[3]:  # Executive Summary
        # Rendered as Markdown directly: replacing newlines with <br> would
        # collapse the headers and lists the prompt asks the model to write.
        st.markdown(summary)

    with tabs[4]:  # Change Clusters
        if all_clusters:
            for position, cluster in enumerate(all_clusters):
                if not isinstance(cluster, dict) or "theme" not in cluster:
                    continue

                theme = cluster.get('theme', 'Unnamed Cluster')
                st.subheader(theme)
                st.write(f"Key Phrases: {', '.join(map(str, cluster.get('key_phrases', [])))}")
                change_ids = cluster.get('change_ids') or []
                st.write(f"Associated Changes: {len(change_ids)}")

                # The ids come from the model; only use one that really
                # points into flat_changes.
                example_change_index = next(
                    (i for i in change_ids if isinstance(i, int) and 0 <= i < len(flat_changes)),
                    None
                )
                if example_change_index is None:
                    continue
                # An explicit key keeps two clusters with the same theme from
                # producing identical widgets.
                if st.checkbox(
                    f"Show Example Change from '{theme}'",
                    key=f"cluster_example_{position}"
                ):
                    example_change = flat_changes[example_change_index]
                    example_content = example_change.get('content') or example_change.get('updated', '')
                    metadata = example_change.get('metadata') or {}
                    source_info = f"[Source: {metadata.get('source', 'unknown')}, Page {metadata.get('page', '?')}]"
                    st.write(f"{example_content[:200]}... {source_info}")
        else:
            st.warning("No clusters identified in document changes")

    # Download button (outside of tabs)
    st.download_button(
        label="📥 Download Full Report",
        data=full_report,
        file_name="document_comparison.md",
        mime="text/markdown"
    )

# Script entry point: build the page only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()