# Refactored Analysis Pipeline

This notebook demonstrates the simplified API for processing company documents using the refactored functions.

In [1]:
# Import the refactored modules
from src.database.base import init_db
from src.llm.client import init_client
from src.llm.completions import (
    process_company_document_completions,
    get_completion_results_by_type
)
from src.llm.aggregations import (
    process_all_aggregates,
    generate_company_summary,
    generate_aggregate_summaries,
    verify_company_summary,
    get_aggregates_summary_report
)
from src.database.documents import DocumentType
from src.utils.config import settings
from src.utils.logging import configure_logging, get_logger

# Initialize components
configure_logging()
logger = get_logger(name="refactored_notebook")
_, _ = init_db(settings.database.url)
client = init_client(settings.openai_api.url)
_ = client.ps()
logger.info("System initialized successfully")

2025-06-28T23:18:01.164712Z [info     ] database_initialized           status=success
HTTP Request: GET http://10.0.0.4:11434/api/ps "HTTP/1.1 200 OK"
2025-06-28T23:18:01.178382Z [info     ] System initialized successfully
HTTP Request: GET http://10.0.0.4:11434/api/ps "HTTP/1.1 200 OK"
2025-06-28T23:18:01.178382Z [info     ] System initialized successfully


In [2]:
# Configuration
TICKER = "ACHR"
DOC_MODEL = "qwen3:4b"  # smaller model w/ larger context
AGGREGATE_MODEL = "qwen3:14b"
OUTPUT_DIR = f"outputs/{TICKER}"

## Step 1: Process Document Completions

Generate completions for all documents using the simplified API.

In [3]:
# Process all document completions for the company
completion_ids_by_type = process_company_document_completions(
    client=client,
    ticker=TICKER,
    model=DOC_MODEL,
    output_dir=OUTPUT_DIR
)

print(f"Completion IDs by type: {completion_ids_by_type}")
logger.info("Document completions processing completed")

2025-06-28T23:18:01.187040Z [info     ] Starting document completion processing for ACHR
2025-06-28T23:18:01.201056Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
2025-06-28T23:18:01.201056Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
2025-06-28T23:18:01.202631Z [info     ] retrieved_filings_by_company   company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 count=5
2025-06-28T23:18:01.205318Z [info     ] retrieved_documents_by_filing  count=2 filing_id=0685e3e8-1f56-74e4-8000-f041f0dee00e
2025-06-28T23:18:01.205560Z [info     ] Starting Chat for ACHR 2021-05-24 risk_factors
2025-06-28T23:18:01.202631Z [info     ] retrieved_filings_by_company   company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 count=5
2025-06-28T23:18:01.205318Z [info     ] retrieved_documents_by_filing  count=2 filing_id=0685e3e8-1f56-74e4-8000-f041f0dee00e
2025-06-28T23:18:01.205560Z [info     ] Starting Chat fo

Completion IDs by type: {'description': [UUID('0686078e-b0bf-79f5-8000-93b392ff8f51'), UUID('06860799-c620-759f-8000-05e8bf4a4e9a'), UUID('068607a4-2bf7-7ce2-8000-0f7230d8ae79'), UUID('068607af-f202-7446-8000-46758f250133')], 'risk_factors': [UUID('0686078a-4d46-783f-8000-4862921e4109'), UUID('06860794-f7ef-711f-8000-966d0c2feeef'), UUID('0686079f-962e-7cee-8000-e3f9c363480d'), UUID('068607ab-08f6-7c04-8000-5c131ba0150f'), UUID('068607b6-209c-78e4-8000-dad6d31f6038')], 'mda': [UUID('0686078c-ba2f-7042-8000-c55993e9f258'), UUID('06860797-a341-70fa-8000-dd83fb10c12c'), UUID('068607a2-7417-7cef-8000-3a3e2409d8f5'), UUID('068607ae-2098-73a8-8000-dcb9db7a0182'), UUID('068607b9-518f-78c8-8000-d6c1ebbb1178')]}


## Step 2: Load Completion Data and Process Aggregates

Load the completion results and create aggregates for each document type.

In [4]:
# Load completion data for aggregation
completion_data_by_type = {
    "mda": get_completion_results_by_type(TICKER, DocumentType.MDA),
    "risk_factors": get_completion_results_by_type(TICKER, DocumentType.RISK_FACTORS),
    "description": get_completion_results_by_type(TICKER, DocumentType.DESCRIPTION)
}

print(f"Loaded completion data for: {list(completion_data_by_type.keys())}")

# Process all aggregates
aggregates = process_all_aggregates(
    client=client,
    ticker=TICKER,
    completion_ids_by_type=completion_ids_by_type,
    completion_data_by_type=completion_data_by_type,
    model=AGGREGATE_MODEL,
    output_dir=OUTPUT_DIR
)

print(f"Created aggregates: {list(aggregates.keys())}")
logger.info("Aggregate processing completed")

2025-06-28T23:32:37.109219Z [info     ] Starting aggregate processing for ACHR
2025-06-28T23:32:37.110194Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR


Loaded completion data for: ['mda', 'risk_factors', 'description']


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T23:34:12.960193Z [info     ] management_discussion Aggregate Done time=95.847081227s
2025-06-28T23:34:12.960727Z [info     ] Saved management_discussion aggregate to outputs/ACHR/mda.md
2025-06-28T23:34:12.961835Z [info     ] retrieved_prompt_by_name       name=aggregate_prompt prompt_id=0686004a-ce9f-7159-8000-2e2abd3dbaa8
2025-06-28T23:34:12.972563Z [info     ] created_aggregate              aggregate_id=068607bf-4f6c-7a80-8000-0f27dd2af339 model=qwen3:14b
2025-06-28T23:34:12.972898Z [info     ] Created management_discussion aggregate aggregate_id=UUID('068607bf-4f6c-7a80-8000-0f27dd2af339') completion_count=5 content_length=9817 duration=95.85s model=qwen3:14b
2025-06-28T23:34:12.973823Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T23:35:06.360728Z [info     ] risk_facto

Created aggregates: ['mda', 'risk_factors', 'description']


## Step 3: Generate Company Summary

Create a comprehensive company summary from the aggregates.

In [5]:
# Generate comprehensive company summary
company_summary = generate_company_summary(
    client=client,
    ticker=TICKER,
    model=AGGREGATE_MODEL
)

if company_summary:
    print(f"Company summary generated ({len(company_summary)} characters)")
    print(f"Preview: {company_summary[:300]}...")

    # Verify the summary was saved
    success, preview = verify_company_summary(TICKER)
    if success:
        print(f"✅ Summary verification successful")
    else:
        print(f"❌ Summary verification failed")
else:
    print("Failed to generate company summary")

2025-06-28T23:36:05.458746Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
2025-06-28T23:36:05.460674Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
2025-06-28T23:36:05.463744Z [info     ] retrieved_recent_aggregates_by_company company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 count=3 document_types=['management_discussion', 'risk_factors', 'business_description']
2025-06-28T23:36:05.464061Z [info     ] Found 3 recent aggregates for ACHR
2025-06-28T23:36:05.464416Z [info     ] Aggregate types available: ['management_discussion', 'risk_factors', 'business_description']
2025-06-28T23:36:05.464699Z [info     ] Generating summary for ACHR using 3 aggregate reports
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T23:36:28.069172Z [info     ] Summary generation completed in 22.60 seconds
2025-06-28T23:36:28.069577Z [info     ] Generated summary length: 

Company summary generated (2237 characters)
Preview: <think>
Okay, the user has provided a detailed analysis of Archer Aviation's historical filings, including the evolution of their business from 2022 to 2025. Now, they want a 100-word overview of the company based on these reports. Let me start by identifying the key points from each year.

In 2022,...
✅ Summary verification successful


## Step 4: Generate Individual Aggregate Summaries

Create concise summaries for each aggregate.

In [6]:
# Generate summaries for individual aggregates
summary_results = generate_aggregate_summaries(
    client=client,
    ticker=TICKER,
    model=AGGREGATE_MODEL
)

successful_summaries = sum(summary_results.values())
total_aggregates = len(summary_results)

print(f"Generated summaries for {successful_summaries}/{total_aggregates} aggregates")
logger.info("Individual aggregate summaries completed")

2025-06-28T23:36:28.126911Z [info     ] Starting individual aggregate summary generation for ACHR
2025-06-28T23:36:28.128523Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
2025-06-28T23:36:28.130045Z [info     ] retrieved_recent_aggregates_by_company company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 count=3 document_types=['management_discussion', 'risk_factors', 'business_description']
2025-06-28T23:36:28.130439Z [info     ] Processing Aggregate 1/3: MANAGEMENT_DISCUSSION aggregate_id=UUID('068607bf-4f6c-7a80-8000-0f27dd2af339') content_length=9817
2025-06-28T23:36:28.130713Z [info     ] Generating summary for management_discussion aggregate 068607bf-4f6c-7a80-8000-0f27dd2af339
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T23:37:01.147857Z [info     ] Summary generated for aggregate 068607bf-4f6c-7a80-8000-0f27dd2af339 summary_length=4629
2025-06-28T23:37:01.151951Z [info     ] updated_aggregate   

Generated summaries for 3/3 aggregates


## Step 5: View Summary Report

Display a comprehensive report of all aggregates and summaries.

In [7]:
# Generate and display summary report
report = get_aggregates_summary_report(TICKER)

print(f"\n📋 Summary Report for {TICKER}:")
print("=" * 60)

for doc_type, info in report.items():
    print(f"\n{doc_type.upper()}:")
    print(f"  ID: {info['id']}")
    print(f"  Has Summary: {'✅' if info['has_summary'] else '❌'} ({info['summary_length']} chars)")
    print(f"  Content: {info['content_length']} characters")
    print(f"  Model: {info['model']}")
    print(f"  Created: {info['created_at']}")

    if info['summary_preview']:
        print(f"  Summary Preview: {info['summary_preview']}")

print(f"\n🎉 Processing complete for {TICKER}!")

2025-06-28T23:37:29.365014Z [info     ] retrieved_company_by_ticker    company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 ticker=ACHR
2025-06-28T23:37:29.366225Z [info     ] retrieved_recent_aggregates_by_company company_id=0685e3e7-c4c5-7522-8000-7417d23b7385 count=3 document_types=['management_discussion', 'risk_factors', 'business_description']
2025-06-28T23:37:29.366631Z [info     ] Generated summary report for ACHR aggregates_count=3



📋 Summary Report for ACHR:

MANAGEMENT_DISCUSSION:
  ID: 068607bf-4f6c-7a80-8000-0f27dd2af339
  Has Summary: ✅ (4629 chars)
  Content: 9817 characters
  Model: qwen3:14b
  Created: 2025-06-28 23:34:12.958275
  Summary Preview: <think>
Okay, let's tackle this. The user wants a summary of the Management Discussion and Analysis ...

RISK_FACTORS:
  ID: 068607c2-a5d3-7809-8000-b7a7848a76c3
  Has Summary: ✅ (1620 chars)
  Content: 5028 characters
  Model: qwen3:14b
  Created: 2025-06-28 23:35:06.358852
  Summary Preview: <think>
Okay, I need to summarize this risk factors analysis in 2-3 sentences, identifying the most ...

BUSINESS_DESCRIPTION:
  ID: 068607c6-5732-786e-8000-2d6a4b1ce63e
  Has Summary: ✅ (2744 chars)
  Content: 5702 characters
  Model: qwen3:14b
  Created: 2025-06-28 23:36:05.444492
  Summary Preview: <think>
Okay, I need to summarize the business description analysis in 2-3 sentences. Let me start b...

🎉 Processing complete for ACHR!
