# Refactored Analysis Pipeline

This notebook demonstrates the simplified API for processing company documents using the refactored functions.
It processes all companies in the database automatically.

In [1]:
# Import the refactored modules
from src.database.base import init_db
from src.database.companies import get_all_company_tickers
from src.llm.client import init_client
from src.llm.completions import (
    process_company_document_completions,
    get_completion_results_by_type
)
from src.llm.aggregations import (
    process_all_aggregates,
    generate_company_summary,
    generate_aggregate_summaries,
    verify_company_summary,
    get_aggregates_summary_report
)
from src.database.documents import DocumentType
from src.utils.config import settings
from src.utils.logging import configure_logging, get_logger

# Initialize components
# Logging is configured first so everything below can emit log records.
configure_logging()
logger = get_logger(name="refactored_notebook")

# The return values of init_db() and client.ps() are not needed in this notebook.
# They are deliberately NOT bound to `_`: in IPython `_` holds the previous cell
# output, and assigning to it stops IPython from updating it for the session.
# Neither call is the last expression of the cell, so nothing is displayed.
init_db(settings.database.url)
client = init_client(settings.openai_api.url)

# Presumably a cheap connectivity check against the LLM server, so an unreachable
# server fails here rather than midway through the pipeline. TODO confirm.
client.ps()
logger.info("System initialized successfully")

2025-06-29T20:07:22.317393Z [info     ] database_initialized           status=success
HTTP Request: GET http://10.0.0.4:11434/api/ps "HTTP/1.1 200 OK"
2025-06-29T20:07:22.332373Z [info     ] System initialized successfully


In [2]:
# Configuration: the model used for each pipeline stage
DOC_MODEL = "qwen3:4b"  # smaller model w/ larger context
AGGREGATE_MODEL = "qwen3:14b"

# Fetch every ticker stored in the database and report how many were found,
# showing at most the first ten (with an ellipsis when the list is longer).
ALL_TICKERS = get_all_company_tickers()
ticker_count = len(ALL_TICKERS)
preview_suffix = '...' if ticker_count > 10 else ''
print(f"Found {ticker_count} tickers in database: {ALL_TICKERS[:10]}{preview_suffix}")

# Without any companies there is nothing to process: stop the run here.
if not ALL_TICKERS:
    print("No tickers found in database. Please ingest some companies first.")
    raise SystemExit("No companies to process")

2025-06-29T20:07:22.352027Z [info     ] retrieved_all_company_tickers  count=121


Found 121 tickers in database: ['AAL', 'AAPL', 'ACHR', 'ACHR-WT', 'AES', 'AGNC', 'AGNCL', 'AGNCM', 'AGNCN', 'AGNCO']...


## Processing All Companies

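The next cell runs each company in the database through the complete pipeline. If a ticker fails, the error is logged, the ticker is recorded as failed, and processing moves on to the next one: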
1. Process document completions
2. Create aggregates
3. Generate company summary
4. Generate individual aggregate summaries
5. Generate the summary report (printed by the final cell)

In [None]:
# Process all companies
# Every ticker goes through the five pipeline steps inside its own try block, so
# an exception for one company is printed, logged and recorded in
# `failed_tickers` without stopping the loop. Successful runs are collected in
# `results_summary`, keyed by ticker.
results_summary = {}
failed_tickers = []

total_tickers = len(ALL_TICKERS)
divider = '=' * 60

for position, ticker in enumerate(ALL_TICKERS, 1):
    print(f"\n{divider}")
    print(f"Processing {position}/{total_tickers}: {ticker}")
    print(divider)

    try:
        # Per-ticker output location handed to the completion and aggregate steps
        OUTPUT_DIR = f"outputs/{ticker}"

        # Step 1: Process Document Completions
        print(f"\n📄 Step 1: Processing document completions for {ticker}...")
        completion_ids_by_type = process_company_document_completions(
            client=client, ticker=ticker, model=DOC_MODEL, output_dir=OUTPUT_DIR
        )
        print(f"✅ Completion IDs by type: {completion_ids_by_type}")

        # Step 2: Load Completion Data and Process Aggregates
        print(f"\n🔄 Step 2: Loading completion data and processing aggregates for {ticker}...")
        document_types = (
            ("mda", DocumentType.MDA),
            ("risk_factors", DocumentType.RISK_FACTORS),
            ("description", DocumentType.DESCRIPTION),
        )
        completion_data_by_type = {
            label: get_completion_results_by_type(ticker, document_type)
            for label, document_type in document_types
        }

        aggregates = process_all_aggregates(
            client=client,
            ticker=ticker,
            completion_ids_by_type=completion_ids_by_type,
            completion_data_by_type=completion_data_by_type,
            model=AGGREGATE_MODEL,
            output_dir=OUTPUT_DIR,
        )
        aggregate_names = list(aggregates.keys())
        print(f"✅ Created aggregates: {aggregate_names}")

        # Step 3: Generate Company Summary
        print(f"\n📋 Step 3: Generating company summary for {ticker}...")
        company_summary = generate_company_summary(
            client=client, ticker=ticker, model=AGGREGATE_MODEL
        )

        if not company_summary:
            print("❌ Failed to generate company summary")
        else:
            print(f"✅ Company summary generated ({len(company_summary)} characters)")
            # The second returned value (a preview) is not used here
            is_verified, _summary_preview = verify_company_summary(ticker)
            print(
                "✅ Summary verification successful"
                if is_verified
                else "❌ Summary verification failed"
            )

        # Step 4: Generate Individual Aggregate Summaries
        print(f"\n📊 Step 4: Generating aggregate summaries for {ticker}...")
        summary_results = generate_aggregate_summaries(
            client=client, ticker=ticker, model=AGGREGATE_MODEL
        )

        # Summing the per-aggregate values gives the number of successes
        total_aggregates = len(summary_results)
        successful_summaries = sum(summary_results.values())
        print(f"✅ Generated summaries for {successful_summaries}/{total_aggregates} aggregates")

        # Step 5: Generate Summary Report
        print(f"\n📋 Step 5: Generating summary report for {ticker}...")
        report = get_aggregates_summary_report(ticker)

        # Store results (only reached when every step above ran without raising)
        results_summary[ticker] = {
            "completion_ids": completion_ids_by_type,
            "aggregates": aggregate_names,
            "has_company_summary": bool(company_summary),
            "aggregate_summaries": f"{successful_summaries}/{total_aggregates}",
            "report": report,
        }

        print(f"\n🎉 Successfully completed processing for {ticker}!")

    except Exception as exc:
        # Report the failure, keep the traceback in the log, and carry on with
        # the next ticker.
        print(f"\n❌ Error processing {ticker}: {str(exc)}")
        logger.error(f"Failed to process ticker {ticker}", error=str(exc), exc_info=True)
        failed_tickers.append(ticker)

# Final tally across all tickers
print("\n\n🏁 PROCESSING COMPLETE")
print(divider)
print(f"Successfully processed: {len(results_summary)} companies")
print(f"Failed: {len(failed_tickers)} companies")
if failed_tickers:
    print(f"Failed tickers: {failed_tickers}")

2025-06-29T20:07:22.358811Z [info     ] Starting document completion processing for AAL
2025-06-29T20:07:22.360522Z [info     ] retrieved_company_by_ticker    company_id=0685e254-6216-7004-8000-4640e4671663 ticker=AAL
2025-06-29T20:07:22.362100Z [info     ] retrieved_filings_by_company   company_id=0685e254-6216-7004-8000-4640e4671663 count=5
2025-06-29T20:07:22.364886Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e254-bb4f-72cd-8000-6533e481df8d
2025-06-29T20:07:22.365247Z [info     ] Starting Chat for AAL 2021-02-17 business_description



Processing 1/121: AAL

📄 Step 1: Processing document completions for AAL...


HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-29T20:09:00.698532Z [info     ] retrieved_prompt_by_name       name=business_description prompt_id=06860055-5fa7-7c96-8000-8afefd533328
2025-06-29T20:09:00.705584Z [info     ] created_completion             completion_id=068619d5-cb38-7882-8000-eeee09bb9a7a model=qwen3:4b
2025-06-29T20:09:00.707460Z [info     ] created_completion             document=American Airlines Group Inc. 10-K 2021-02-17 - Business Description id=UUID('068619d5-cb38-7882-8000-eeee09bb9a7a') length=8448 time=98.0
2025-06-29T20:09:00.712271Z [info     ] Starting Chat for AAL 2021-02-17 risk_factors
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-29T20:10:54.512079Z [info     ] retrieved_prompt_by_name       name=risk_factors prompt_id=06860055-5f8d-73c6-8000-acf740a64053
2025-06-29T20:10:54.521535Z [info     ] created_completion             completion_id=068619dc-e83a-7106-8000-a20fe7ce3245 model=qwen3:4b
2025-0

In [None]:
# Display detailed results summary
# Prints one section per successfully processed ticker from `results_summary`,
# followed by the list of tickers that failed (if any).
print("\n📊 DETAILED RESULTS SUMMARY")
print('=' * 80)

for ticker, entry in results_summary.items():
    company_summary_mark = '✅' if entry['has_company_summary'] else '❌'

    print(f"\n🏢 {ticker}:")
    print(f"  Completions: {entry['completion_ids']}")
    print(f"  Aggregates: {entry['aggregates']}")
    print(f"  Company Summary: {company_summary_mark}")
    print(f"  Aggregate Summaries: {entry['aggregate_summaries']}")

    # Show report details; an empty or missing report prints nothing
    ticker_report = entry['report']
    if not ticker_report:
        continue
    for section, details in ticker_report.items():
        section_mark = '✅' if details['has_summary'] else '❌'
        print(f"    {section.upper()}: {details['content_length']} chars, Summary: {section_mark}")

if failed_tickers:
    print(f"\n❌ FAILED TICKERS: {failed_tickers}")