# Document Analysis Pipeline

In [None]:
from src.database.base import init_db
from src.database.companies import get_all_company_tickers
from src.llm.client import init_client
from src.llm.completions import (
    process_company_document_completions,
    get_completion_results_by_type
)
from src.llm.aggregations import (
    process_all_aggregates,
    generate_company_summary,
    generate_aggregate_summaries,
    verify_company_summary,
    get_aggregates_summary_report
)
from src.database.documents import DocumentType
from src.utils.config import settings
from src.utils.logging import configure_logging, get_logger

# Initialize components
# Configure logging before creating this notebook's named logger.
configure_logging()
logger = get_logger(name="refactored_notebook")
# init_db is unpacked into exactly two values; neither is used in this notebook.
_, _ = init_db(settings.database.url)
# LLM client built from the configured API URL; later cells pass it to every
# completion/aggregation call.
client = init_client(settings.openai_api.url)
# The result of ps() is discarded — presumably a connectivity check against the
# LLM server (TODO confirm). If it raises, the success message below is never logged.
_ = client.ps()
logger.info("System initialized successfully")

In [None]:
# Configuration: model used per document, and model used for aggregation/summaries
DOC_MODEL = "qwen3:4b"  # smaller model w/ larger context
AGGREGATE_MODEL = "qwen3:14b"

# Load every ticker from the database and report how many there are,
# showing at most the first ten followed by an ellipsis when truncated.
ALL_TICKERS = get_all_company_tickers()
ticker_count = len(ALL_TICKERS)
ticker_preview = ALL_TICKERS[:10]
more_marker = '...' if ticker_count > 10 else ''
print(f"Found {ticker_count} tickers in database: {ticker_preview}{more_marker}")

# Nothing to do without tickers: tell the user and stop the cell.
if not ALL_TICKERS:
    print("No tickers found in database. Please ingest some companies first.")
    raise SystemExit("No companies to process")

In [None]:
# Run the five-step pipeline for every ticker.
# Successful runs are recorded in `results_summary` (keyed by ticker); any ticker
# whose run raises is appended to `failed_tickers` and the loop moves on.
results_summary = {}
failed_tickers = []

for position, ticker in enumerate(ALL_TICKERS, start=1):
    print(f"\n{'='*60}")
    print(f"Processing {position}/{len(ALL_TICKERS)}: {ticker}")
    print(f"{'='*60}")

    try:
        # Each ticker passes its own output directory to the pipeline calls.
        OUTPUT_DIR = f"outputs/{ticker}"

        # Step 1: Process Document Completions
        print(f"\n📄 Step 1: Processing document completions for {ticker}...")
        completion_ids_by_type = process_company_document_completions(
            client=client, ticker=ticker, model=DOC_MODEL, output_dir=OUTPUT_DIR
        )
        print(f"✅ Completion IDs by type: {completion_ids_by_type}")

        # Step 2: Load Completion Data and Process Aggregates
        print(f"\n🔄 Step 2: Loading completion data and processing aggregates for {ticker}...")
        completion_data_by_type = {
            section: get_completion_results_by_type(ticker, document_type)
            for section, document_type in (
                ("mda", DocumentType.MDA),
                ("risk_factors", DocumentType.RISK_FACTORS),
                ("description", DocumentType.DESCRIPTION),
            )
        }

        aggregates = process_all_aggregates(
            client=client,
            ticker=ticker,
            completion_ids_by_type=completion_ids_by_type,
            completion_data_by_type=completion_data_by_type,
            model=AGGREGATE_MODEL,
            output_dir=OUTPUT_DIR,
        )
        print(f"✅ Created aggregates: {list(aggregates.keys())}")

        # Steps 3 and 4 take the same client/ticker/model arguments.
        summary_kwargs = {"client": client, "ticker": ticker, "model": AGGREGATE_MODEL}

        # Step 3: Generate Company Summary
        print(f"\n📋 Step 3: Generating company summary for {ticker}...")
        company_summary = generate_company_summary(**summary_kwargs)

        if not company_summary:
            print(f"❌ Failed to generate company summary")
        else:
            print(f"✅ Company summary generated ({len(company_summary)} characters)")
            success, preview = verify_company_summary(ticker)
            print(
                f"✅ Summary verification successful"
                if success
                else f"❌ Summary verification failed"
            )

        # Step 4: Generate Individual Aggregate Summaries
        print(f"\n📊 Step 4: Generating aggregate summaries for {ticker}...")
        summary_results = generate_aggregate_summaries(**summary_kwargs)

        # Summing the values counts the truthy entries (assuming boolean flags).
        successful_summaries = sum(summary_results.values())
        total_aggregates = len(summary_results)
        print(f"✅ Generated summaries for {successful_summaries}/{total_aggregates} aggregates")

        # Step 5: Generate Summary Report
        print(f"\n📋 Step 5: Generating summary report for {ticker}...")
        report = get_aggregates_summary_report(ticker)

        # Record everything the results cell needs for this ticker.
        results_summary[ticker] = {
            'completion_ids': completion_ids_by_type,
            'aggregates': list(aggregates.keys()),
            'has_company_summary': bool(company_summary),
            'aggregate_summaries': f"{successful_summaries}/{total_aggregates}",
            'report': report
        }

        print(f"\n🎉 Successfully completed processing for {ticker}!")

    except Exception as exc:
        # One failing ticker must not abort the batch: report it, log it with
        # the traceback, remember it, and carry on with the next ticker.
        print(f"\n❌ Error processing {ticker}: {str(exc)}")
        logger.error(f"Failed to process ticker {ticker}", error=str(exc), exc_info=True)
        failed_tickers.append(ticker)

# Batch-level totals
print(f"\n\n🏁 PROCESSING COMPLETE")
print(f"{'='*60}")
print(f"Successfully processed: {len(results_summary)} companies")
print(f"Failed: {len(failed_tickers)} companies")
if failed_tickers:
    print(f"Failed tickers: {failed_tickers}")

In [None]:
# Print a per-ticker breakdown of what the processing cell stored in
# `results_summary`, followed by the list of tickers that failed.
print(f"\n📊 DETAILED RESULTS SUMMARY")
print(f"{'='*80}")

for ticker, entry in results_summary.items():
    print(f"\n🏢 {ticker}:")
    print(f"  Completions: {entry['completion_ids']}")
    print(f"  Aggregates: {entry['aggregates']}")
    company_mark = '✅' if entry['has_company_summary'] else '❌'
    print(f"  Company Summary: {company_mark}")
    print(f"  Aggregate Summaries: {entry['aggregate_summaries']}")

    # Per-document-type details are only shown when a report was stored.
    doc_reports = entry['report']
    if not doc_reports:
        continue

    for doc_type, details in doc_reports.items():
        label = doc_type.upper()
        length = details['content_length']
        summary_mark = '✅' if details['has_summary'] else '❌'
        print(f"    {label}: {length} chars, Summary: {summary_mark}")

if failed_tickers:
    print(f"\n❌ FAILED TICKERS: {failed_tickers}")