In [1]:
import os
import json

from src.database.base import init_db
from src.database.companies import get_company_by_ticker
from src.database.filings import get_filings_by_company
from src.database.documents import get_documents_by_filing, DocumentType
from src.database.aggregates import create_aggregate

from src.database.prompts import get_prompt_by_name
from src.database.completions import create_completion, get_completion_ids
from src.database.aggregates import create_aggregate

from src.llm.prompts import format_document_messages, format_aggregate_messages, PromptRole
from src.llm.client import init_client, get_chat_response
from src.llm.models import MODEL_CONFIG

from src.utils.config import settings
from src.utils.logging import configure_logging, get_logger

configure_logging()
logger = get_logger(name="notebook")
_, _ = init_db(settings.database.url)

client = init_client(settings.openai_api.url)
_ = client.ps()
logger.info("ollama_initialzied", status="success")

2025-06-28T22:42:07.227415Z [info     ] database_initialized           status=success
HTTP Request: GET http://10.0.0.4:11434/api/ps "HTTP/1.1 200 OK"
2025-06-28T22:42:07.241334Z [info     ] ollama_initialzied             status=success
HTTP Request: GET http://10.0.0.4:11434/api/ps "HTTP/1.1 200 OK"
2025-06-28T22:42:07.241334Z [info     ] ollama_initialzied             status=success


In [2]:
TICKER = "MSFT"
DOC_MODEL = "qwen3:4b" # smaller model w/ larger context
AGGRAGATE_MODEL = "qwen3:14b"
SUMMARY_MODEL = "qwen3:14b"

In [3]:
company = get_company_by_ticker(TICKER)
filings = get_filings_by_company(company_id=company.id)
for filing in filings:
    documents = get_documents_by_filing(filing.id)
    for document in documents:
        pass

2025-06-28T21:56:47.486041Z [info     ] retrieved_company_by_ticker    company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 ticker=MSFT
2025-06-28T21:56:47.487881Z [info     ] retrieved_filings_by_company   company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 count=4
2025-06-28T21:56:47.490053Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e2b4-2e1d-7993-8000-1ba4dd3c05d1
2025-06-28T21:56:47.491769Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e2b5-7502-720a-8000-7206aae86d0c
2025-06-28T21:56:47.493153Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e2b5-ff23-739b-8000-30f51ca878b7
2025-06-28T21:56:47.494687Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e2b4-ce4b-7c6e-8000-b95ed376b305
2025-06-28T21:56:47.487881Z [info     ] retrieved_filings_by_company   company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 count=4
2025-06-28T21:56:47.490053Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e2b4-2e1d-7993-800

In [4]:
descriptions = {}
risk_factors = {}
mdas = {}

# Track completion IDs by document type for aggregation
description_completion_ids = []
risk_factor_completion_ids = []
mda_completion_ids = []

for filing in filings:
    documents = get_documents_by_filing(filing.id)
    for document in documents:

        messages = format_document_messages(document)

        logger.info(f"Starting Chat for {TICKER} {filing.filing_date} {document.document_type.value}")
        response = get_chat_response(client, DOC_MODEL, messages)

        # Extract necessary details from the response
        created_at = response.created_at
        total_duration = response.total_duration / 1e9
        content = response.message.content
        prompt = get_prompt_by_name(document.document_type.value)
        completion_data = {
            'model': DOC_MODEL,
            'document_ids': [document.id],  # Associate with the current document
            'system_prompt_id': prompt.id,  # Link to the prompt used
            'total_duration': total_duration,
            'created_at': created_at,
            'num_ctx': MODEL_CONFIG[DOC_MODEL]["ctx"],
            'content': content,  # Add the actual response content
        }
        completion = create_completion(completion_data)
        logger.info("created_completion", id = completion.id, document = document.document_name, time = completion.total_duration, length = len(completion.content))

        # Store results and track completion IDs for aggregation
        if document.document_type == DocumentType.DESCRIPTION:
            descriptions[f"{filing.filing_date}"] = response.message.content
            description_completion_ids.append(completion.id)
        elif document.document_type == DocumentType.RISK_FACTORS:
            risk_factors[f"{filing.filing_date}"] = response.message.content
            risk_factor_completion_ids.append(completion.id)
        elif document.document_type == DocumentType.MDA:
            mdas[f"{filing.filing_date}"] = response.message.content
            mda_completion_ids.append(completion.id)
        else:
            logger.warning("wtf")



logger.info("doc_completions_generated.")

os.makedirs(f'outputs/{TICKER}', exist_ok=True)
with open(f'outputs/{TICKER}/mdas.json', 'w') as file:
    json.dump(mdas, file, indent=4)
with open(f'outputs/{TICKER}/risk_factors.json', 'w') as file:
    json.dump(risk_factors, file, indent=4)
with open(f'outputs/{TICKER}/descriptions.json', 'w') as file:
    json.dump(descriptions, file, indent=4)

2025-06-28T21:56:47.502845Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e2b4-2e1d-7993-8000-1ba4dd3c05d1
2025-06-28T21:56:47.503572Z [info     ] Starting Chat for MSFT 2021-07-29 business_description
2025-06-28T21:56:47.503572Z [info     ] Starting Chat for MSFT 2021-07-29 business_description
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T21:58:12.843576Z [info     ] retrieved_prompt_by_name       name=business_description prompt_id=06860055-5fa7-7c96-8000-8afefd533328
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T21:58:12.843576Z [info     ] retrieved_prompt_by_name       name=business_description prompt_id=06860055-5fa7-7c96-8000-8afefd533328
2025-06-28T21:58:12.852740Z [info     ] created_completion             completion_id=06860657-4d94-76fe-8000-4afc2cb4c284 model=qwen3:4b
2025-06-28T21:58:12.854087Z [info     ] created_completion             document=MICROSOFT CORP 10-K 2021-07-29 - Business D

In [5]:
# Create MDA Aggregate
messages = format_aggregate_messages(mdas)
AGGresponse = get_chat_response(client, AGGRAGATE_MODEL, messages)
logger.info(f"MDA Aggregate Done", time=f"{response.total_duration / 1e9}s")

# Save the aggregate content to file
with open(f'outputs/{TICKER}/mda.md', 'w') as file:
    file.write(response.message.content)

# Create aggregate record in database
created_at = response.created_at
total_duration = response.total_duration / 1e9
content = response.message.content
prompt = get_prompt_by_name("aggregate_prompt")

mda_aggregate_data = {
    'model': AGGRAGATE_MODEL,
    'company_id': company.id,
    'document_type': DocumentType.MDA,
    'system_prompt_id': prompt.id,
    'total_duration': total_duration,
    'created_at': created_at,
    'num_ctx': MODEL_CONFIG[AGGRAGATE_MODEL]["ctx"],
    'content': content,
    'completion_ids': mda_completion_ids  # Associate with source completions
}

mda_aggregate = create_aggregate(mda_aggregate_data)
print(f"Created MDA aggregate record with ID: {mda_aggregate.id}")
print(f"Associated with {len(mda_completion_ids)} source completions")
print(f"Model: {mda_aggregate.model}, Duration: {mda_aggregate.total_duration:.2f} seconds")
print(f"Content length: {len(mda_aggregate.content)} characters")


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:11:20.437258Z [info     ] MDA Aggregate Done             time=67.695829903s
2025-06-28T22:11:20.438837Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T22:11:20.437258Z [info     ] MDA Aggregate Done             time=67.695829903s
2025-06-28T22:11:20.438837Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T22:11:20.484865Z [info     ] created_aggregate              aggregate_id=06860688-8712-7c9c-8000-20c9f05dccbd model=qwen3:14b
2025-06-28T22:11:20.484865Z [info     ] created_aggregate              aggregate_id=06860688-8712-7c9c-8000-20c9f05dccbd model=qwen3:14b


Created MDA aggregate record with ID: 06860688-8712-7c9c-8000-20c9f05dccbd
Associated with 4 source completions
Model: qwen3:14b, Duration: 67.70 seconds
Content length: 8618 characters


In [6]:
# mda_aggregate.source_completions
# for completion in mda_aggregate.source_completions:
#     for document in completion.source_documents:
#         print(document.filing.filing_url)

In [7]:
# Create Risk Factors Aggregate
messages = format_aggregate_messages(risk_factors)
response = get_chat_response(client, AGGRAGATE_MODEL, messages)
logger.info(f"Risk Factors Aggregate Done", time=f"{response.total_duration / 1e9}s")

# Save the aggregate content to file
with open(f'outputs/{TICKER}/risk_factors.md', 'w') as file:
    file.write(response.message.content)

# Create aggregate record in database
created_at = response.created_at
total_duration = response.total_duration / 1e9
content = response.message.content
prompt = get_prompt_by_name("aggregate_prompt")

risk_aggregate_data = {
    'model': AGGRAGATE_MODEL,
    'company_id': company.id,
    'document_type': DocumentType.RISK_FACTORS,
    'system_prompt_id': prompt.id,
    'total_duration': total_duration,
    'created_at': created_at,
    'num_ctx': MODEL_CONFIG[AGGRAGATE_MODEL]["ctx"],
    'content': content,
    'completion_ids': risk_factor_completion_ids  # Associate with source completions
}

risk_aggregate = create_aggregate(risk_aggregate_data)
print(f"Created Risk Factors aggregate record with ID: {risk_aggregate.id}")
print(f"Associated with {len(risk_factor_completion_ids)} source completions")
print(f"Model: {risk_aggregate.model}, Duration: {risk_aggregate.total_duration:.2f} seconds")
print(f"Content length: {len(risk_aggregate.content)} characters")


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:12:13.934058Z [info     ] Risk Factors Aggregate Done    time=53.436591118s
2025-06-28T22:12:13.935665Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T22:12:13.934058Z [info     ] Risk Factors Aggregate Done    time=53.436591118s
2025-06-28T22:12:13.935665Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T22:12:13.945078Z [info     ] created_aggregate              aggregate_id=0686068b-df00-7ff5-8000-facc77dadab1 model=qwen3:14b
2025-06-28T22:12:13.945078Z [info     ] created_aggregate              aggregate_id=0686068b-df00-7ff5-8000-facc77dadab1 model=qwen3:14b


Created Risk Factors aggregate record with ID: 0686068b-df00-7ff5-8000-facc77dadab1
Associated with 4 source completions
Model: qwen3:14b, Duration: 53.44 seconds
Content length: 5106 characters


In [8]:
# # Create Descriptions Aggregate
messages = format_aggregate_messages(descriptions)
response = get_chat_response(client, AGGRAGATE_MODEL, messages)
logger.info(f"Descriptions Aggregate Done", time=f"{response.total_duration / 1e9}s")

# Save the aggregate content to file
with open(f'outputs/{TICKER}/descriptions.md', 'w') as file:
    file.write(response.message.content)

# Create aggregate record in database
created_at = response.created_at
total_duration = response.total_duration / 1e9
content = response.message.content
prompt = get_prompt_by_name("aggregate_prompt")

desc_aggregate_data = {
    'model': AGGRAGATE_MODEL,
    'company_id': company.id,
    'document_type': DocumentType.DESCRIPTION,
    'system_prompt_id': prompt.id,
    'total_duration': total_duration,
    'created_at': created_at,
    'num_ctx': MODEL_CONFIG[AGGRAGATE_MODEL]["ctx"],
    'content': content,
    'completion_ids': description_completion_ids  # Associate with source completions
}

desc_aggregate = create_aggregate(desc_aggregate_data)
print(f"Created Business Description aggregate record with ID: {desc_aggregate.id}")
print(f"Associated with {len(description_completion_ids)} source completions")
print(f"Model: {desc_aggregate.model}, Duration: {desc_aggregate.total_duration:.2f} seconds")
print(f"Content length: {len(desc_aggregate.content)} characters")


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:13:38.056463Z [info     ] Descriptions Aggregate Done    time=84.104125119s
2025-06-28T22:13:38.058084Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T22:13:38.056463Z [info     ] Descriptions Aggregate Done    time=84.104125119s
2025-06-28T22:13:38.058084Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T22:13:38.067217Z [info     ] created_aggregate              aggregate_id=06860691-20f5-7a0a-8000-99c9accca3d2 model=qwen3:14b
2025-06-28T22:13:38.067217Z [info     ] created_aggregate              aggregate_id=06860691-20f5-7a0a-8000-99c9accca3d2 model=qwen3:14b


Created Business Description aggregate record with ID: 06860691-20f5-7a0a-8000-99c9accca3d2
Associated with 4 source completions
Model: qwen3:14b, Duration: 84.10 seconds
Content length: 8430 characters


## Generate Company Summary

Now let's use the recent aggregates to generate a summary and save it to the company's summary column.

In [4]:
from src.database.aggregates import get_recent_aggregates_by_ticker
from src.database.companies import update_company
from src.llm.prompts import AGGREGATE_SUMMARY_PROMPT

TICKER="MSFT"
company = get_company_by_ticker(TICKER)

# Get the most recent aggregates for the ticker
recent_aggregates = get_recent_aggregates_by_ticker(TICKER)
logger.info(f"Found {len(recent_aggregates)} recent aggregates for {TICKER}")

# Prepare content for summary generation
aggregate_contents = {}
for aggregate in recent_aggregates:
    doc_type = aggregate.document_type.value if aggregate.document_type else 'unknown'
    aggregate_contents[doc_type] = aggregate.content

logger.info(f"Aggregate types available: {list(aggregate_contents.keys())}")

2025-06-28T22:42:37.338539Z [info     ] retrieved_company_by_ticker    company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 ticker=MSFT
2025-06-28T22:42:37.339516Z [info     ] retrieved_company_by_ticker    company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 ticker=MSFT
2025-06-28T22:42:37.342880Z [info     ] retrieved_recent_aggregates_by_company company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 count=3 document_types=['management_discussion', 'risk_factors', 'business_description']
2025-06-28T22:42:37.343202Z [info     ] Found 3 recent aggregates for MSFT
2025-06-28T22:42:37.343681Z [info     ] Aggregate types available: ['management_discussion', 'risk_factors', 'business_description']
2025-06-28T22:42:37.339516Z [info     ] retrieved_company_by_ticker    company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 ticker=MSFT
2025-06-28T22:42:37.342880Z [info     ] retrieved_recent_aggregates_by_company company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 count=3 document_types=['management_discussion', 'risk_f

In [32]:
# Format the aggregates content for the summary prompt
formatted_content = ""
for doc_type, content in aggregate_contents.items():
    formatted_content += f"\n\n## {doc_type.upper()} Analysis\n\n{content}"

# Create messages for summary generation
summary_messages = [{
    'role': PromptRole.SYSTEM.value,
    'content': AGGREGATE_SUMMARY_PROMPT
}, {
    'role': PromptRole.USER.value,
    'content': formatted_content
}]

logger.info(f"Generating summary for {TICKER} using {len(aggregate_contents)} aggregate reports")

2025-06-28T22:30:02.007260Z [info     ] Generating summary for ACHR using 0 aggregate reports


In [33]:
# Generate the summary using the aggregates
summary_response = get_chat_response(client, AGGRAGATE_MODEL, summary_messages)
logger.info(f"Summary generation completed in {summary_response.total_duration / 1e9:.2f} seconds")

# Extract the summary content
summary_content = summary_response.message.content
logger.info(f"Generated summary length: {len(summary_content)} characters")

# Display first part of the summary
print(f"Summary for {TICKER}:")
print("=" * 50)
print(summary_content[:500] + "..." if len(summary_content) > 500 else summary_content)

HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:30:12.088541Z [info     ] Summary generation completed in 8.81 seconds
2025-06-28T22:30:12.089038Z [info     ] Generated summary length: 1105 characters
2025-06-28T22:30:12.088541Z [info     ] Summary generation completed in 8.81 seconds
2025-06-28T22:30:12.089038Z [info     ] Generated summary length: 1105 characters


Summary for ACHR:
<think>
Okay, the user wants an overview of a company based on historical reports. But they didn't provide any specific reports or company details. Hmm, maybe they forgot to include the reports. I should check the history again.

Wait, looking back, the user's initial message just says "Consider these historical reports. Write 100 words providing an overview of the company." But there are no reports attached. That's a problem. Without the reports, I can't generate an accurate overview. 

I need ...


In [34]:
# Save the summary to the company's summary column
update_data = {
    'summary': summary_content
}

updated_company = update_company(company.id, update_data)

if updated_company:
    logger.info(f"Successfully updated company summary for {TICKER}",
               company_id=str(company.id),
               summary_length=len(summary_content))
    print(f"\n✅ Summary saved to company record for {TICKER}")
    print(f"Summary length: {len(summary_content)} characters")
else:
    logger.error(f"Failed to update company summary for {TICKER}")
    print(f"\n❌ Failed to save summary to company record for {TICKER}")

2025-06-28T22:30:12.101539Z [info     ] updated_company                company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 name=Archer Aviation Inc.
2025-06-28T22:30:12.101919Z [info     ] Successfully updated company summary for ACHR company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 summary_length=1105
2025-06-28T22:30:12.101919Z [info     ] Successfully updated company summary for ACHR company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 summary_length=1105



✅ Summary saved to company record for ACHR
Summary length: 1105 characters


In [35]:
# Verify the summary was saved by retrieving the company again
from src.database.companies import get_company

refreshed_company = get_company(company.id)
if refreshed_company and refreshed_company.summary:
    print(f"\n✅ Verification: Company summary successfully retrieved")
    print(f"Summary preview: {refreshed_company.summary[:200]}...")
else:
    print(f"\n❌ Verification failed: No summary found in company record")

2025-06-28T22:30:12.106218Z [info     ] retrieved_company              company_id=0685e3e7-c4c5-7522-8000-7417d23b7385



✅ Verification: Company summary successfully retrieved
Summary preview: <think>
Okay, the user wants an overview of a company based on historical reports. But they didn't provide any specific reports or company details. Hmm, maybe they forgot to include the reports. I sho...


## Database Migration: Add Summary Column

First, let's add the missing `summary` column to the `companies` table.

In [36]:
from sqlalchemy import text, inspect
from src.database.base import get_db_session, engine

# Get a fresh session to avoid any transaction issues
session = get_db_session()

# First, ensure we start with a clean transaction
try:
    session.rollback()
except:
    pass  # Ignore any rollback errors

# Check if the summary column already exists
inspector = inspect(engine)
columns = [col['name'] for col in inspector.get_columns('companies')]
print("Current columns in companies table:", columns)

# Add the summary column if it doesn't exist
if 'summary' not in columns:
    try:
        sql = text("ALTER TABLE companies ADD COLUMN summary TEXT;")
        session.execute(sql)
        session.commit()
        print("Successfully added 'summary' column to the companies table.")
    except Exception as e:
        session.rollback()
        print(f"Error adding summary column: {e}")
        # Try to get a completely fresh session and retry
        try:
            session.close()
            session = get_db_session()
            sql = text("ALTER TABLE companies ADD COLUMN summary TEXT;")
            session.execute(sql)
            session.commit()
            print("Successfully added 'summary' column on retry.")
        except Exception as retry_error:
            session.rollback()
            print(f"Failed on retry as well: {retry_error}")
else:
    print("The 'summary' column already exists in the companies table.")

# Now check and add summary column to aggregates table
aggregates_columns = [col['name'] for col in inspector.get_columns('aggregates')]
print("\nCurrent columns in aggregates table:", aggregates_columns)

if 'summary' not in aggregates_columns:
    try:
        sql = text("ALTER TABLE aggregates ADD COLUMN summary TEXT;")
        session.execute(sql)
        session.commit()
        print("Successfully added 'summary' column to the aggregates table.")
    except Exception as e:
        session.rollback()
        print(f"Error adding summary column to aggregates: {e}")
        # Try with fresh session
        try:
            session.close()
            session = get_db_session()
            sql = text("ALTER TABLE aggregates ADD COLUMN summary TEXT;")
            session.execute(sql)
            session.commit()
            print("Successfully added 'summary' column to aggregates table on retry.")
        except Exception as retry_error:
            session.rollback()
            print(f"Failed to add aggregates summary column on retry: {retry_error}")
else:
    print("The 'summary' column already exists in the aggregates table.")

Current columns in companies table: ['id', 'cik', 'name', 'display_name', 'is_company', 'tickers', 'exchanges', 'sic', 'sic_description', 'fiscal_year_end', 'entity_type', 'ein', 'former_names', 'summary']
The 'summary' column already exists in the companies table.

Current columns in aggregates table: ['id', 'created_at', 'total_duration', 'content', 'system_prompt_id', 'model', 'temperature', 'top_p', 'num_ctx', 'company_id', 'document_type']
Successfully added 'summary' column to the aggregates table.


## Database Migration: Aggregate Table Schema

Let's ensure the aggregates table has all the necessary columns for the current model.

In [37]:
from sqlalchemy import text, inspect
from src.database.base import get_db_session, engine

# Get a fresh session to avoid any transaction issues
session = get_db_session()

# First, ensure we start with a clean transaction
try:
    session.rollback()
except:
    pass  # Ignore any rollback errors

# Check current aggregates table schema
inspector = inspect(engine)
aggregates_columns = [col['name'] for col in inspector.get_columns('aggregates')]
print("Current columns in aggregates table:", aggregates_columns)

# Define the columns that should exist in the aggregates table
required_columns = {
    'id': 'UUID PRIMARY KEY',
    'company_id': 'UUID REFERENCES companies(id) ON DELETE CASCADE',
    'document_type': 'document_type_enum',
    'created_at': 'TIMESTAMP DEFAULT NOW()',
    'total_duration': 'FLOAT',
    'content': 'TEXT',
    'summary': 'TEXT',
    'system_prompt_id': 'UUID REFERENCES prompts(id) ON DELETE SET NULL',
    'model': 'VARCHAR(50)',
    'temperature': 'FLOAT DEFAULT 0.7',
    'top_p': 'FLOAT DEFAULT 1.0',
    'num_ctx': 'INTEGER DEFAULT 4096'
}

# Add missing columns
missing_columns = []
for column_name, column_def in required_columns.items():
    if column_name not in aggregates_columns:
        missing_columns.append((column_name, column_def))

if missing_columns:
    print(f"\nMissing columns: {[col[0] for col in missing_columns]}")

    for column_name, column_def in missing_columns:
        try:
            # Special handling for document_type enum
            if column_name == 'document_type':
                # Check if enum exists first
                enum_check = text("""
                    SELECT EXISTS (
                        SELECT 1 FROM pg_type
                        WHERE typname = 'document_type_enum'
                    );
                """)
                enum_exists = session.execute(enum_check).scalar()

                if not enum_exists:
                    # Create the enum type
                    create_enum = text("""
                        CREATE TYPE document_type_enum AS ENUM (
                            'management_discussion',
                            'risk_factors',
                            'business_description'
                        );
                    """)
                    session.execute(create_enum)
                    print(f"Created document_type_enum type")

                sql = text(f"ALTER TABLE aggregates ADD COLUMN {column_name} {column_def};")
            else:
                sql = text(f"ALTER TABLE aggregates ADD COLUMN {column_name} {column_def};")

            session.execute(sql)
            session.commit()
            print(f"Successfully added '{column_name}' column")

        except Exception as e:
            session.rollback()
            print(f"Error adding {column_name} column: {e}")
            # Try with fresh session for next column
            try:
                session.close()
                session = get_db_session()
            except:
                pass
else:
    print("\n✅ All required columns already exist in aggregates table")

Current columns in aggregates table: ['id', 'created_at', 'total_duration', 'content', 'system_prompt_id', 'model', 'temperature', 'top_p', 'num_ctx', 'company_id', 'document_type', 'summary']

✅ All required columns already exist in aggregates table


## Generate Individual Aggregate Summaries

Let's loop through each recent aggregate and generate concise summaries for them.

In [5]:
from src.database.aggregates import update_aggregate

# Loop through each recent aggregate and generate summaries
for i, aggregate in enumerate(recent_aggregates):
    doc_type = aggregate.document_type.value if aggregate.document_type else 'unknown'
    print(f"\n{'='*60}")
    print(f"Processing Aggregate {i+1}/{len(recent_aggregates)}: {doc_type.upper()}")
    print(f"Aggregate ID: {aggregate.id}")
    print(f"Content length: {len(aggregate.content) if aggregate.content else 0} characters")
    print(f"Model: {aggregate.model}")
    print(f"Created: {aggregate.created_at}")

    # Skip if aggregate already has a summary
    if aggregate.summary:
        print(f"⏭️  Aggregate already has summary ({len(aggregate.summary)} chars). Skipping.")
        continue

    # Skip if no content to summarize
    if not aggregate.content or len(aggregate.content.strip()) < 50:
        print(f"⚠️  Aggregate has insufficient content. Skipping.")
        continue

    # Create summary prompt based on document type
    if doc_type == 'management_discussion':
        summary_prompt = "Summarize this management discussion and analysis in 2-3 sentences, highlighting key business developments, financial performance insights, and management's strategic outlook."
    elif doc_type == 'risk_factors':
        summary_prompt = "Summarize this risk factors analysis in 2-3 sentences, identifying the most significant risks and any notable changes in the company's risk profile."
    elif doc_type == 'business_description':
        summary_prompt = "Summarize this business description analysis in 2-3 sentences, capturing the company's core business model, market position, and key value propositions."
    else:
        summary_prompt = "Summarize this analysis in 2-3 sentences, highlighting the key findings and insights."

    # Create messages for individual aggregate summary
    aggregate_summary_messages = [{
        'role': PromptRole.SYSTEM.value,
        'content': summary_prompt
    }, {
        'role': PromptRole.USER.value,
        'content': aggregate.content
    }]

    print(f"🔄 Generating summary for {doc_type}...")

    try:
        # Generate the summary
        aggregate_summary_response = get_chat_response(client, AGGRAGATE_MODEL, aggregate_summary_messages)
        aggregate_summary_content = aggregate_summary_response.message.content.strip()

        print(f"✅ Summary generated ({len(aggregate_summary_content)} characters)")
        print(f"Preview: {aggregate_summary_content[:150]}...")

        # Update the aggregate with the summary
        update_data = {'summary': aggregate_summary_content}
        updated_aggregate = update_aggregate(aggregate.id, update_data)

        if updated_aggregate:
            print(f"💾 Summary saved to aggregate {aggregate.id}")
            logger.info(f"Updated aggregate summary",
                       aggregate_id=str(aggregate.id),
                       document_type=doc_type,
                       summary_length=len(aggregate_summary_content))
        else:
            print(f"❌ Failed to save summary to aggregate {aggregate.id}")
            logger.error(f"Failed to update aggregate summary", aggregate_id=str(aggregate.id))

    except Exception as e:
        print(f"❌ Error generating summary for {doc_type}: {str(e)}")
        logger.error(f"Error generating aggregate summary",
                    aggregate_id=str(aggregate.id),
                    document_type=doc_type,
                    error=str(e))

print(f"\n{'='*60}")
print(f"🎉 Completed processing {len(recent_aggregates)} aggregates for {TICKER}")


Processing Aggregate 1/3: MANAGEMENT_DISCUSSION
Aggregate ID: 06860688-8712-7c9c-8000-20c9f05dccbd
Content length: 8618 characters
Model: qwen3:14b
Created: 2025-06-28 22:10:00.695195
🔄 Generating summary for management_discussion...


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:42:56.074082Z [info     ] updated_aggregate              aggregate_id=06860688-8712-7c9c-8000-20c9f05dccbd
2025-06-28T22:42:56.074471Z [info     ] Updated aggregate summary      aggregate_id=06860688-8712-7c9c-8000-20c9f05dccbd document_type=management_discussion summary_length=1670


✅ Summary generated (1670 characters)
Preview: <think>
Okay, I need to summarize the management discussion and analysis (MD&A) from the 10-K filing in 2-3 sentences. The user wants key business dev...
💾 Summary saved to aggregate 06860688-8712-7c9c-8000-20c9f05dccbd

Processing Aggregate 2/3: RISK_FACTORS
Aggregate ID: 0686068b-df00-7ff5-8000-facc77dadab1
Content length: 5106 characters
Model: qwen3:14b
Created: 2025-06-28 22:12:13.929911
🔄 Generating summary for risk_factors...


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:43:07.604784Z [info     ] updated_aggregate              aggregate_id=0686068b-df00-7ff5-8000-facc77dadab1
2025-06-28T22:43:07.605145Z [info     ] Updated aggregate summary      aggregate_id=0686068b-df00-7ff5-8000-facc77dadab1 document_type=risk_factors summary_length=1722


✅ Summary generated (1722 characters)
Preview: <think>
Okay, the user wants a summary of the risk factors analysis in 2-3 sentences, highlighting the most significant risks and any notable changes ...
💾 Summary saved to aggregate 0686068b-df00-7ff5-8000-facc77dadab1

Processing Aggregate 3/3: BUSINESS_DESCRIPTION
Aggregate ID: 06860691-20f5-7a0a-8000-99c9accca3d2
Content length: 8430 characters
Model: qwen3:14b
Created: 2025-06-28 22:13:38.052474
🔄 Generating summary for business_description...


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T22:43:31.067033Z [info     ] updated_aggregate              aggregate_id=06860691-20f5-7a0a-8000-99c9accca3d2
2025-06-28T22:43:31.067565Z [info     ] Updated aggregate summary      aggregate_id=06860691-20f5-7a0a-8000-99c9accca3d2 document_type=business_description summary_length=3462


✅ Summary generated (3462 characters)
Preview: <think>
Okay, let's see. The user wants a summary of the business description analysis in 2-3 sentences. The core elements to capture are the company'...
💾 Summary saved to aggregate 06860691-20f5-7a0a-8000-99c9accca3d2

🎉 Completed processing 3 aggregates for MSFT


In [6]:
# Verify the summaries were saved by retrieving the aggregates again
print(f"\n📋 Summary Report for {TICKER}:")
print("=" * 50)

refreshed_aggregates = get_recent_aggregates_by_ticker(TICKER)
for aggregate in refreshed_aggregates:
    doc_type = aggregate.document_type.value if aggregate.document_type else 'unknown'
    summary_status = "✅ Has Summary" if aggregate.summary else "❌ No Summary"
    summary_length = len(aggregate.summary) if aggregate.summary else 0
    content_length = len(aggregate.content) if aggregate.content else 0

    print(f"\n{doc_type.upper()}:")
    print(f"  Status: {summary_status} ({summary_length} chars)")
    print(f"  Content: {content_length} characters")
    print(f"  Model: {aggregate.model}")
    print(f"  Created: {aggregate.created_at}")

    if aggregate.summary:
        print(f"  Summary Preview: {aggregate.summary[:100]}...")

2025-06-28T22:43:31.073866Z [info     ] retrieved_company_by_ticker    company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 ticker=MSFT
2025-06-28T22:43:31.075822Z [info     ] retrieved_recent_aggregates_by_company company_id=0685e2b2-ba5f-73a8-8000-6d5063ab6d43 count=3 document_types=['management_discussion', 'risk_factors', 'business_description']



📋 Summary Report for MSFT:

MANAGEMENT_DISCUSSION:
  Status: ✅ Has Summary (1670 chars)
  Content: 8618 characters
  Model: qwen3:14b
  Created: 2025-06-28 22:10:00.695195
  Summary Preview: <think>
Okay, I need to summarize the management discussion and analysis (MD&A) from the 10-K filing...

RISK_FACTORS:
  Status: ✅ Has Summary (1722 chars)
  Content: 5106 characters
  Model: qwen3:14b
  Created: 2025-06-28 22:12:13.929911
  Summary Preview: <think>
Okay, the user wants a summary of the risk factors analysis in 2-3 sentences, highlighting t...

BUSINESS_DESCRIPTION:
  Status: ✅ Has Summary (3462 chars)
  Content: 8430 characters
  Model: qwen3:14b
  Created: 2025-06-28 22:13:38.052474
  Summary Preview: <think>
Okay, let's see. The user wants a summary of the business description analysis in 2-3 senten...
