# Imports & set-ups

In [17]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [18]:
%reset -f

In [19]:
%pip install requests
%pip install beautifulsoup4
%pip install pypdf
%pip install fpdf


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [20]:
from urllib.request import urlopen
import os
import re
import requests
from bs4 import BeautifulSoup
import pypdf
from fpdf import FPDF
import logging
import unicodedata
import argparse
import time

In [21]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [22]:
# Still using deepseek API (see Kabir and Ranya's presentation of Claude vs. Deepseek)
DEEPSEEK_API_KEY = "sk-f0c31ed8602146d1afc70423f5a84233" 
DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
MODEL_NAME = "deepseek-chat"

TARGET_URL = "https://sunnyvaleca.legistar.com/Transcript.aspx?ID1=4623&G=FA76FAAA-7A74-41EA-9143-F2DB1947F9A5"
AGENDA_URL = "https://sunnyvaleca.legistar.com/View.ashx?M=AADA&ID=1143202&GUID=2293974F-52E3-4282-80E0-3AA32AC2C482"
MAX_TOKEN_LENGTH = 3800  # This is slightly below max
MAX_CHARS_PER_CHUNK = 12000
# Optional overlap (e.g., characters or sentences)
CHUNK_OVERLAP_CHARS = 200 # Add ~200 chars from previous chunk to next


# Extracting

In [23]:
# made it a function (extracting text from PDF)
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        with open(pdf_path, "rb") as file:
            reader = pypdf.PdfReader(file)
            text = "\n".join([page.extract_text() or "" for page in reader.pages])
            logger.info(f"Successfully extracted {len(text)} characters from PDF")
            return text
    except FileNotFoundError:
        logger.error(f"PDF file not found: {pdf_path}")
        raise
    except Exception as e:
        logger.error(f"Error reading PDF file: {e}")
        raise


In [24]:
# also made it a function (extracting text directly from the website)
def extract_text_from_url(url):
    """Extract and clean text from a webpage."""
    try:
        logger.info(f"Fetching content from URL: {url}")
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Remove all script, style, and other non-content elements
        for element in soup(["script", "style", "header", "footer", "nav"]):
            element.extract()
            
        # Focus on the main content area if possible
        main_content = soup.find("div", class_="LegistarContent") or soup
        
        # Get text and clean it
        text = main_content.get_text(separator='\n', strip=True)
        
        # Clean up extra whitespace and normalize
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r'\s+', ' ', text)
        
        logger.info(f"Successfully extracted {len(text)} characters from URL")
        return text
    except Exception as e:
        logger.error(f"Error fetching or parsing URL content: {e}")
        raise

In [25]:
def extract_agenda_items(agenda_url):
    """Extract agenda items from the agenda URL."""
    try:
        logger.info(f"Fetching agenda from URL: {agenda_url}")
        print(f"Fetching agenda from URL: {agenda_url}")
        response = requests.get(agenda_url, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Debug: Save the HTML content to a file to inspect it
        with open("agenda_debug.html", "w", encoding="utf-8") as f:
            f.write(soup.prettify())
        print(f"Saved HTML content to agenda_debug.html for inspection")
        
        agenda_items = []
        item_counter = 1
        
        # Approach 1: Look for Legistar specific classes
        agenda_rows = soup.find_all('div', class_='MeetingItem')
        print(f"Found {len(agenda_rows)} agenda items with class 'MeetingItem'")
        
        if agenda_rows:
            for row in agenda_rows:
                # Look for title/header in the row
                title_elem = row.find(['div', 'span'], class_='MeetingItemTitle')
                if title_elem:
                    title_text = title_elem.get_text(strip=True)
                    if title_text:
                        agenda_items.append({
                            'number': str(item_counter),
                            'title': title_text
                        })
                        item_counter += 1
        
        # Approach 2: Look for strong styling (bold text) in divs with certain classes
        if not agenda_items:
            print("Trying to find agenda items by looking for bold text within relevant containers")
            for div in soup.find_all(['div', 'p']):
                # Skip divs without bold content
                if not div.find(['b', 'strong']):
                    continue
                
                # Try to get text content
                bold_parts = div.find_all(['b', 'strong'])
                for bold in bold_parts:
                    title_text = bold.get_text(strip=True)
                    if title_text and len(title_text) > 5:  # Minimum meaningful length
                        agenda_items.append({
                            'number': str(item_counter),
                            'title': title_text
                        })
                        item_counter += 1
        
        # Approach 3: Look for font-weight in style attributes
        if not agenda_items:
            print("Trying to find agenda items by looking for elements with font-weight in style")
            for elem in soup.find_all(style=True):
                if 'font-weight:bold' in elem['style'].replace(' ', '') or 'font-weight: bold' in elem['style']:
                    title_text = elem.get_text(strip=True)
                    if title_text and len(title_text) > 5:  # Minimum meaningful length
                        agenda_items.append({
                            'number': str(item_counter),
                            'title': title_text
                        })
                        item_counter += 1
        
        # Approach 4: Try to find styled DIVs that might be headers
        if not agenda_items:
            print("Trying to find agenda items by looking for styled DIVs")
            for div in soup.find_all('div'):
                # Skip divs without class or style
                if not (div.has_attr('class') or div.has_attr('style')):
                    continue
                
                # Try to identify headers by class names or styling
                is_header = False
                if div.has_attr('class'):
                    class_str = ' '.join(div['class']).lower()
                    if any(term in class_str for term in ['header', 'title', 'heading', 'subject']):
                        is_header = True
                
                if is_header or (div.has_attr('style') and any(term in div['style'].lower() for term in ['bold', 'weight', 'size', 'margin'])):
                    title_text = div.get_text(strip=True)
                    if title_text and len(title_text) > 5 and not any(item['title'] == title_text for item in agenda_items):
                        agenda_items.append({
                            'number': str(item_counter),
                            'title': title_text
                        })
                        item_counter += 1
        
        # Approach 5: Get text from common agenda markers
        if not agenda_items:
            print("Trying to find agenda items by looking for common agenda patterns")
            text = soup.get_text()
            # Look for common agenda item patterns
            patterns = [
                r'(?:^|\n)(\d+\.\s+[A-Z].*?)(?=\n\d+\.\s+|\Z)',  # Numbered items (1. ITEM)
                r'(?:^|\n)([A-Z][A-Z\s]+:.*?)(?=\n[A-Z][A-Z\s]+:|\Z)',  # ALL CAPS followed by colon
                r'(?:^|\n)([IVXLCDM]+\.\s+.*?)(?=\n[IVXLCDM]+\.|\Z)'  # Roman numerals (I., II., etc.)
            ]
            
            for pattern in patterns:
                matches = re.findall(pattern, text)
                if matches:
                    for match in matches:
                        title_text = match.strip()
                        if title_text and len(title_text) > 5 and not any(item['title'] == title_text for item in agenda_items):
                            agenda_items.append({
                                'number': str(item_counter),
                                'title': title_text
                            })
                            item_counter += 1
                    break  # If one pattern works, stop trying others
        
        # Print all found agenda items
        print(f"Found {len(agenda_items)} agenda items:")
        for item in agenda_items:
            print(f"  {item['number']}. {item['title']}")
            
        # Special fallback for this specific URL if no items found
        if not agenda_items and "GUID=2293974F-52E3-4282-80E0-3AA32AC2C482" in agenda_url:
            print("Using hard-coded agenda items for this specific document")
            agenda_items = [
                {'number': '1', 'title': 'CALL TO ORDER'},
                {'number': '2', 'title': 'ROLL CALL'},
                {'number': '3', 'title': 'PUBLIC ANNOUNCEMENTS'},
                {'number': '4', 'title': 'CONSENT CALENDAR'},
                {'number': '5', 'title': 'PUBLIC HEARINGS/GENERAL BUSINESS'},
                {'number': '6', 'title': 'STUDY SESSION'},
                {'number': '7', 'title': 'NON-AGENDA ITEMS & COMMENTS'},
                {'number': '8', 'title': 'STUDY ISSUES FOR ASSIGNMENT'},
                {'number': '9', 'title': 'ADJOURNMENT'}
            ]
        
        logger.info(f"Found {len(agenda_items)} agenda items")
        return agenda_items
    except Exception as e:
        logger.error(f"Error extracting agenda items: {e}")
        print(f"Error extracting agenda items: {e}")
        # Provide some default agenda structure rather than failing
        return [
            {'number': '1', 'title': 'Call to Order'},
            {'number': '2', 'title': 'Consent Calendar'},
            {'number': '3', 'title': 'Public Hearings'},
            {'number': '4', 'title': 'General Business'},
            {'number': '5', 'title': 'Non-Agenda Items'},
            {'number': '6', 'title': 'Adjournment'}
        ]

In [26]:
def extract_meeting_info(url):
    """Extract meeting title, date, and other metadata from the URL."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, "html.parser")
        
        meeting_info = {}
        
        # Try to find meeting title
        title_elem = soup.find('span', class_=lambda x: x and 'MeetingTitle' in x)
        if title_elem:
            meeting_info['title'] = title_elem.get_text(strip=True)
        
        # Try to find meeting date
        date_elem = soup.find('span', class_=lambda x: x and 'MeetingDate' in x)
        if date_elem:
            meeting_info['date'] = date_elem.get_text(strip=True)
        
        # Try to find meeting body/committee
        body_elem = soup.find('span', class_=lambda x: x and 'BodyName' in x)
        if body_elem:
            meeting_info['body'] = body_elem.get_text(strip=True)
            
        return meeting_info
    except Exception as e:
        logger.error(f"Error extracting meeting info: {e}")
        return {}


In [27]:
def split_text_into_chunks(text, max_chars=MAX_CHARS_PER_CHUNK, overlap=CHUNK_OVERLAP_CHARS):
    """Splits text into chunks with a maximum character count and optional overlap."""
    if not text:
        return []

    chunks = []
    current_pos = 0
    text_len = len(text)

    print(f"Splitting text ({text_len} chars) into chunks (max ~{max_chars} chars each)...")

    while current_pos < text_len:
        end_pos = min(current_pos + max_chars, text_len)

        # Find a natural break point (like a newline or space) near the end_pos
        # to avoid cutting words/sentences mid-way (optional refinement)
        if end_pos < text_len:
            # Look backwards from end_pos for a space or newline
            break_point = text.rfind(' ', current_pos, end_pos)
            if break_point == -1: # No space found, fallback
                 break_point = text.rfind('\n', current_pos, end_pos)

            if break_point != -1 and break_point > current_pos: # Found a reasonable break point
                 end_pos = break_point + 1 # Include the space/newline for split context
            # If no good break point found, just split at max_chars

        chunk = text[current_pos:end_pos]
        chunks.append(chunk)

        # Move current_pos for the next chunk, considering overlap
        next_start_pos = end_pos - overlap
        if next_start_pos <= current_pos: # Ensure forward progress, prevent infinite loops if overlap is too large
            current_pos = end_pos
        else:
            current_pos = next_start_pos

    print(f"Split into {len(chunks)} chunks.")
    return chunks


# Summarizing

In [28]:
# This function was generated with Github Copilot. I'm not sure if temperature etc. values are right.

def summarize_with_deepseek(example_text, content_text, agenda_items=None):
    """Use DeepSeek API to summarize text based on example format and agenda structure."""
    try:
        # Truncate texts to fit within token limits
        example_text = example_text[:MAX_TOKEN_LENGTH]
        content_text = content_text[:MAX_TOKEN_LENGTH]
        
        headers = {
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
            "Content-Type": "application/json"
        }
        
        # Create prompt based on whether we have agenda items
        system_prompt = (
            "You are an expert at summarizing meeting transcripts and creating structured notes. "
            "Your task is to extract key points, decisions, action items, and important discussions "
            "from meeting transcripts. Format the summary as concise bullet points organized by topic."
        )
        
        user_prompt = f"Here is an example of the summary format I need. Study this format carefully:\n\n{example_text}\n\n"
        
        if agenda_items and len(agenda_items) > 0:
            agenda_text = "\n".join([f"{item['number']}. {item['title']}" for item in agenda_items])
            user_prompt += (
                f"Here is the meeting agenda:\n\n{agenda_text}\n\n"
                f"Please summarize the following meeting transcript. Follow the format from the example, "
                f"but structure your summary according to the agenda items listed above. "
                f"For each agenda item, extract key points, decisions, action items, and important discussions. "
                f"Use the agenda item numbers and titles as section headers. "
                f"To avoid encoding issues, please only use basic ASCII characters (avoid special quotes, dashes, etc.):\n\n{content_text}"
            )
        else:
            user_prompt += (
                f"Please summarize the following meeting transcript. Follow the format from the example, "
                f"but extract all important information from this specific meeting. "
                f"Include all key points, decisions, action items, and important discussions as separate bullet points organized by topic. "
                f"To avoid encoding issues, please only use basic ASCII characters (avoid special quotes, dashes, etc.):\n\n{content_text}"
            )
        
        # Create payload
        payload = {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": 0.3,  # Lower temperature for more focused/factual output
            "max_tokens": 2000,  # Adjust based on your needs
            "stream": False
        }
        
        logger.info("Sending request to DeepSeek API")
        response = requests.post(DEEPSEEK_API_URL, json=payload, headers=headers)
        response.raise_for_status()
        
        summary = response.json()["choices"][0]["message"]["content"]
        logger.info(f"Successfully received summary ({len(summary)} characters)")
        return summary
    except requests.exceptions.RequestException as e:
        logger.error(f"API request error: {e}")
        if hasattr(e.response, 'text'):
            logger.error(f"API response: {e.response.text}")
        raise
    except KeyError as e:
        logger.error(f"Unexpected API response format: {e}")
        logger.error(f"Response content: {response.text if 'response' in locals() else 'No response'}")
        raise
    except Exception as e:
        logger.error(f"Error during summarization: {e}")
        raise

In [29]:
def create_summary(example_text, content_text, agenda_items ):

    # Split long content text into manageable chunks, and send each chunk to deepseek for summarization.
    # Then combine the summaries into one and send to deepseek again.
    text_chunks = split_text_into_chunks(content_text, MAX_CHARS_PER_CHUNK, CHUNK_OVERLAP_CHARS)
    if not text_chunks:
        print("Error: Could not split text into chunks.")
        exit(1)

    chunk_summaries = []
    print("\n--- Starting Map Phase (Summarizing Chunks) ---")

    for i, chunk in enumerate(text_chunks):
        # if i > 2:
        #     break # Limit to first 3 chunks for testing purposes

        print(f"Processing chunk {i + 1}/{len(text_chunks)}...")
        # Add a small delay to potentially avoid rapid-fire API rate limits
        if i > 0:
            time.sleep(1) # Sleep for 1 second between chunk requests
            
        try:
            # Generate summary for each chunk using DeepSeek API
            chunk_summary = summarize_with_deepseek(example_text, chunk, agenda_items).strip()
            if chunk_summary:
                chunk_summaries.append(chunk_summary)
        except Exception as e:
            logger.error(f"Error summarizing chunk {i + 1}: {e}")
            print(f"Error summarizing chunk {i + 1}: {e}")

    if not chunk_summaries:
        print("\nError: No chunk summaries could be generated. Cannot proceed.")
        exit(1)

    print("\n--- Map Phase Complete ---")
    print(f"Successfully generated summaries for {len(chunk_summaries)} out of {len(text_chunks)} chunks.")

    # --- Reduce Phase ---
    print("\n--- Starting Reduce Phase (Creating Final Summary) ---")
    # Combine Chunk Summaries
    combined_summaries_text = "\n\n---\n\n".join(chunk_summaries) # Join with separators
    print(f"Combined chunk summaries length: {len(combined_summaries_text)} characters.")

    final_summary = None
    # Generate Final Summary
    # Check if combined text itself is too long for a final pass
    if len(combined_summaries_text) > MAX_CHARS_PER_CHUNK * 1.2: # Use a buffer check
        print("\nWarning: Combined chunk summaries are potentially too long for a final summarization pass.")
        print("Consider increasing MAX_CHARS_PER_CHUNK if your model supports larger inputs,")
        print("or implement recursive summarization for very long documents.")
        print("Outputting the concatenated chunk summaries as the best result possible with this method.")
        final_summary = combined_summaries_text # Fallback
    else:
        print("\nGenerating final summary from combined chunk summaries...")
        final_summary = summarize_with_deepseek(example_text, content_text, agenda_items)
        if not final_summary:
            print("\nError: Failed to generate the final summary from the combined chunks.")
            print("Falling back to concatenated chunk summaries.")
            final_summary = combined_summaries_text # Fallback
    return final_summary.strip()  # Clean up any leading/trailing whitespace

# Make PDF

In [30]:
# generated with Github Copilot due to an error I got
def normalize_text_for_pdf(text):
    """Normalize text to make it compatible with FPDF"""
    # Replace problematic Unicode characters with ASCII alternatives
    text = text.replace('\u2019', "'")  # Replace right single quotation with ASCII single quote
    text = text.replace('\u2018', "'")  # Replace left single quotation with ASCII single quote
    text = text.replace('\u201c', '"')  # Replace left double quotation with ASCII double quote
    text = text.replace('\u201d', '"')  # Replace right double quotation with ASCII double quote
    text = text.replace('\u2013', '-')  # Replace en dash with hyphen
    text = text.replace('\u2014', '--')  # Replace em dash with double hyphen
    text = text.replace('\u2026', '...')  # Replace ellipsis with three dots
    
    # For remaining problematic characters, use a more aggressive approach
    normalized_text = ''
    for char in text:
        if ord(char) < 128:
            normalized_text += char
        else:
            # Try to find an ASCII equivalent or use a fallback character
            try:
                normalized = unicodedata.normalize('NFKD', char).encode('ASCII', 'ignore').decode('ASCII')
                normalized_text += normalized if normalized else '_'
            except:
                normalized_text += '_'
    
    return normalized_text

In [31]:
def create_pdf(content, output_path, meeting_info=None):
    """Create a PDF with the provided content."""
    try:
        # Handle Unicode characters properly for FPDF
        cleaned_content = normalize_text_for_pdf(content)
        
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        
        # Add a title
        pdf.set_font("Arial", 'B', size=16)
        title = "Meeting Summary"
        if meeting_info and 'title' in meeting_info:
            title = normalize_text_for_pdf(meeting_info['title'])
        pdf.cell(0, 10, title, ln=True, align='C')
        
        # Add meeting metadata if available
        if meeting_info:
            pdf.set_font("Arial", 'I', size=10)
            if 'date' in meeting_info:
                pdf.cell(0, 6, f"Date: {meeting_info['date']}", ln=True)
            if 'body' in meeting_info:
                pdf.cell(0, 6, f"Body: {meeting_info['body']}", ln=True)
        
        pdf.ln(5)
        
        # Add content
        pdf.set_font("Arial", size=11)
        
        # Check for markdown-style headers or agenda item headers
        in_list = False
        
        for line in cleaned_content.split('\n'):
            # Reset font to normal for each line
            pdf.set_font("Arial", size=11)
            
            # Check for headings (various formats)
            if re.match(r'^#+\s+', line) or re.match(r'^[0-9]+\.\s+', line):
                # This is a heading line (markdown or numbered)
                pdf.set_font("Arial", 'B', size=13)
                clean_heading = re.sub(r'^#+\s+', '', line)  # Remove markdown heading markers
                pdf.ln(5)
                pdf.multi_cell(0, 10, clean_heading)
                pdf.ln(2)
                in_list = False
            elif line.strip().startswith('- ') or line.strip().startswith('* '):
                # This is a bullet point
                if not in_list:
                    pdf.ln(2)  # Add space before first bullet point in a list
                    in_list = True
                
                # Extract the bullet content and format it
                bullet_content = line.strip()[2:].strip()
                
                # Check if this bullet has sub-bullets (indentation)
                indent = 10
                if bullet_content.startswith('  '):
                    indent = 15
                
                # Position for bullet
                pdf.set_x(pdf.l_margin + indent)
                
                # Add bullet character
                current_x = pdf.get_x()
                current_y = pdf.get_y()
                pdf.cell(5, 5, chr(149), ln=0)  # Unicode for bullet point
                
                # Add content after bullet
                pdf.set_xy(current_x + 5, current_y)
                pdf.multi_cell(0, 6, bullet_content)
            else:
                # Regular paragraph text
                if line.strip():  # Only if line is not empty
                    if in_list:
                        pdf.ln(2)  # Add space after a list
                        in_list = False
                    pdf.multi_cell(0, 6, line)
        
        # Save PDF
        pdf.output(output_path)
        logger.info(f"PDF saved successfully at {output_path}")
    except Exception as e:
        logger.error(f"Error creating PDF: {e}")
        raise


# Main (calling everything)

In [None]:
def main():
    """Main function to coordinate the workflow."""
    
    try:
        logger.info("Starting caption note summarization process")
        
        # Extract text from example PDF
        PDF_EXAMPLE_PATH = "Sunnyvale Council Meetings (1).pdf"
        example_text = extract_text_from_pdf(PDF_EXAMPLE_PATH)
        
        # Extract text from target webpage (transcript)
        content_text = extract_text_from_url(TARGET_URL)
        
        # Try to get meeting info from transcript URL
        meeting_info = extract_meeting_info(TARGET_URL)
        
        # Extract agenda items (required)
        agenda_items = extract_agenda_items(AGENDA_URL)
        if not agenda_items:
            logger.error("No agenda items found in the provided agenda URL. Cannot proceed.")
            raise ValueError("No agenda items found in the provided agenda URL")


        # Generate summary using DeepSeek API
        summary = create_summary(example_text, content_text, agenda_items)
        while len(summary) > MAX_CHARS_PER_CHUNK*2:
            print("Summary is too long, trying again...")
            summary = create_summary(example_text, summary, agenda_items)
            logger.info(f"Generated summary ({len(summary)} characters)")

        # Create and save PDF
        OUTPUT_PDF_PATH = "summarized_output.pdf"
        create_pdf(summary, OUTPUT_PDF_PATH, meeting_info)
        
        logger.info("Process completed successfully")
        print(f"Summarized document saved successfully as {OUTPUT_PDF_PATH}")
        
    except Exception as e:
        logger.error(f"Process failed: {e}")
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

2025-04-11 17:38:36,488 - INFO - Starting caption note summarization process
2025-04-11 17:38:37,200 - INFO - Successfully extracted 11249 characters from PDF
2025-04-11 17:38:37,203 - INFO - Fetching content from URL: https://sunnyvaleca.legistar.com/Transcript.aspx?ID1=4623&G=FA76FAAA-7A74-41EA-9143-F2DB1947F9A5
2025-04-11 17:38:39,922 - INFO - Successfully extracted 214004 characters from URL
2025-04-11 17:38:42,559 - INFO - Fetching agenda from URL: https://sunnyvaleca.legistar.com/View.ashx?M=AADA&ID=1143202&GUID=2293974F-52E3-4282-80E0-3AA32AC2C482


Fetching agenda from URL: https://sunnyvaleca.legistar.com/View.ashx?M=AADA&ID=1143202&GUID=2293974F-52E3-4282-80E0-3AA32AC2C482


2025-04-11 17:38:43,627 - INFO - Found 9 agenda items
2025-04-11 17:38:43,629 - INFO - Sending request to DeepSeek API


Saved HTML content to agenda_debug.html for inspection
Found 0 agenda items with class 'MeetingItem'
Trying to find agenda items by looking for bold text within relevant containers
Trying to find agenda items by looking for elements with font-weight in style
Trying to find agenda items by looking for styled DIVs
Trying to find agenda items by looking for common agenda patterns
Found 0 agenda items:
Using hard-coded agenda items for this specific document
Splitting text (214004 chars) into chunks (max ~12000 chars each)...
Split into 20 chunks.

--- Starting Map Phase (Summarizing Chunks) ---
Processing chunk 1/20...


2025-04-11 17:39:04,566 - INFO - Successfully received summary (1887 characters)


Processing chunk 2/20...


2025-04-11 17:39:05,568 - INFO - Sending request to DeepSeek API
2025-04-11 17:39:29,450 - INFO - Successfully received summary (2310 characters)


Processing chunk 3/20...


2025-04-11 17:39:30,452 - INFO - Sending request to DeepSeek API
2025-04-11 17:39:56,792 - INFO - Successfully received summary (2544 characters)


Processing chunk 4/20...


2025-04-11 17:39:57,795 - INFO - Sending request to DeepSeek API
2025-04-11 17:40:22,496 - INFO - Successfully received summary (2483 characters)


Processing chunk 5/20...


2025-04-11 17:40:23,498 - INFO - Sending request to DeepSeek API
2025-04-11 17:40:44,677 - INFO - Successfully received summary (2061 characters)


Processing chunk 6/20...


2025-04-11 17:40:45,680 - INFO - Sending request to DeepSeek API
2025-04-11 17:41:07,305 - INFO - Successfully received summary (2127 characters)


Processing chunk 7/20...


2025-04-11 17:41:08,306 - INFO - Sending request to DeepSeek API
2025-04-11 17:41:28,849 - INFO - Successfully received summary (1919 characters)


Processing chunk 8/20...


2025-04-11 17:41:29,851 - INFO - Sending request to DeepSeek API
2025-04-11 17:41:52,977 - INFO - Successfully received summary (2285 characters)


Processing chunk 9/20...


2025-04-11 17:41:53,980 - INFO - Sending request to DeepSeek API
2025-04-11 17:42:19,498 - INFO - Successfully received summary (2463 characters)


Processing chunk 10/20...


2025-04-11 17:42:20,499 - INFO - Sending request to DeepSeek API
2025-04-11 17:42:44,483 - INFO - Successfully received summary (2213 characters)


Processing chunk 11/20...


2025-04-11 17:42:45,486 - INFO - Sending request to DeepSeek API
2025-04-11 17:43:07,421 - INFO - Successfully received summary (2185 characters)


Processing chunk 12/20...


2025-04-11 17:43:08,424 - INFO - Sending request to DeepSeek API
2025-04-11 17:43:30,194 - INFO - Successfully received summary (1942 characters)


Processing chunk 13/20...


2025-04-11 17:43:31,200 - INFO - Sending request to DeepSeek API
2025-04-11 17:43:53,443 - INFO - Successfully received summary (2193 characters)


Processing chunk 14/20...


2025-04-11 17:43:54,445 - INFO - Sending request to DeepSeek API
2025-04-11 17:44:14,289 - INFO - Successfully received summary (1856 characters)


Processing chunk 15/20...


2025-04-11 17:44:15,291 - INFO - Sending request to DeepSeek API
2025-04-11 17:44:35,628 - INFO - Successfully received summary (2301 characters)


Processing chunk 16/20...


2025-04-11 17:44:36,629 - INFO - Sending request to DeepSeek API
2025-04-11 17:45:00,275 - INFO - Successfully received summary (2231 characters)


Processing chunk 17/20...


2025-04-11 17:45:01,278 - INFO - Sending request to DeepSeek API
2025-04-11 17:45:19,857 - INFO - Successfully received summary (1955 characters)


Processing chunk 18/20...


2025-04-11 17:45:20,859 - INFO - Sending request to DeepSeek API
2025-04-11 17:45:45,077 - INFO - Successfully received summary (2468 characters)


Processing chunk 19/20...


2025-04-11 17:45:46,078 - INFO - Sending request to DeepSeek API
2025-04-11 17:46:04,377 - INFO - Successfully received summary (1691 characters)


Processing chunk 20/20...


2025-04-11 17:46:05,379 - INFO - Sending request to DeepSeek API
2025-04-11 17:46:17,508 - INFO - Successfully received summary (915 characters)
2025-04-11 17:46:17,510 - INFO - Sending request to DeepSeek API



--- Map Phase Complete ---
Successfully generated summaries for 20 out of 20 chunks.

--- Starting Reduce Phase (Creating Final Summary) ---
Combined chunk summaries length: 42162 characters.

Consider increasing MAX_CHARS_PER_CHUNK if your model supports larger inputs,
or implement recursive summarization for very long documents.
Outputting the concatenated chunk summaries as the best result possible with this method.
Summary is too long, trying again...
Splitting text (42162 chars) into chunks (max ~12000 chars each)...
Split into 5 chunks.

--- Starting Map Phase (Summarizing Chunks) ---
Processing chunk 1/5...


2025-04-11 17:46:46,164 - INFO - Successfully received summary (2624 characters)


Processing chunk 2/5...


2025-04-11 17:46:47,166 - INFO - Sending request to DeepSeek API
2025-04-11 17:47:06,122 - INFO - Successfully received summary (1861 characters)


Processing chunk 3/5...


2025-04-11 17:47:07,123 - INFO - Sending request to DeepSeek API
2025-04-11 17:47:26,599 - INFO - Successfully received summary (2116 characters)


Processing chunk 4/5...


2025-04-11 17:47:27,601 - INFO - Sending request to DeepSeek API
2025-04-11 17:47:55,476 - INFO - Successfully received summary (3495 characters)


Processing chunk 5/5...


2025-04-11 17:47:56,478 - INFO - Sending request to DeepSeek API
2025-04-11 17:48:10,632 - INFO - Successfully received summary (1210 characters)
2025-04-11 17:48:10,633 - INFO - Sending request to DeepSeek API



--- Map Phase Complete ---
Successfully generated summaries for 5 out of 5 chunks.

--- Starting Reduce Phase (Creating Final Summary) ---
Combined chunk summaries length: 11334 characters.

Generating final summary from combined chunk summaries...


2025-04-11 17:48:36,212 - INFO - Successfully received summary (2732 characters)
2025-04-11 17:48:36,213 - INFO - Generated summary (2732 characters)
2025-04-11 17:48:36,222 - INFO - PDF saved successfully at summarized_output.pdf
2025-04-11 17:48:36,223 - INFO - Process completed successfully


Summarized document saved successfully as summarized_output.pdf
