In [1]:
import os
import json

from src.database.base import init_db
from src.database.companies import get_company_by_ticker
from src.database.filings import get_filings_by_company
from src.database.documents import get_documents_by_filing, DocumentType
from src.database.aggregates import create_aggregate

from src.database.prompts import get_prompt_by_name
from src.database.completions import create_completion, get_completion_ids
from src.database.aggregates import create_aggregate

from src.llm.prompts import format_document_messages, format_aggregate_messages, PromptRole
from src.llm.client import init_client, get_chat_response
from src.llm.models import MODEL_CONFIG

from src.utils.config import settings
from src.utils.logging import configure_logging, get_logger

# Configure logging before anything else so the initialisation steps below
# are reported through the notebook logger.
configure_logging()
logger = get_logger(name="notebook")

# init_db returns two values; neither is used in this notebook, so both are discarded.
_, _ = init_db(settings.database.url)

client = init_client(settings.openai_api.url)
# The result of ps() is discarded; the call is presumably only here to confirm
# the server is reachable before any long-running chat requests are made.
_ = client.ps()
logger.info("ollama_initialized", status="success")

2025-06-28T20:44:11.764483Z [info     ] database_initialized           status=success
HTTP Request: GET http://10.0.0.4:11434/api/ps "HTTP/1.1 200 OK"
2025-06-28T20:44:11.778678Z [info     ] ollama_initialzied             status=success


In [2]:
# Ticker of the company whose filings are summarised in this notebook.
TICKER = "GOOGL"
# Model used for the per-document completions.
DOC_MODEL = "qwen3:4b"  # smaller model w/ larger context
# Model used for the per-document-type aggregate completions.
AGGREGATE_MODEL = "qwen3:14b"
# Original (misspelled) name, kept as an alias because other cells reference it.
AGGRAGATE_MODEL = AGGREGATE_MODEL

In [3]:
# Resolve the company for TICKER and load its filings; `company` and `filings`
# are reused by the cells further down.
company = get_company_by_ticker(TICKER)
if company is None:
    # Fail with a readable message instead of an AttributeError on `company.id`.
    raise ValueError(f"No company found for ticker {TICKER!r}")
filings = get_filings_by_company(company_id=company.id)

# Sanity check: fetch the documents of every filing once so the per-filing
# document counts show up in the log before any model calls are made.
for filing in filings:
    documents = get_documents_by_filing(filing.id)

2025-06-28T20:44:14.104049Z [info     ] retrieved_company_by_ticker    company_id=0685e290-2202-7cb1-8000-dbf004637dd0 ticker=GOOGL
2025-06-28T20:44:14.106050Z [info     ] retrieved_filings_by_company   company_id=0685e290-2202-7cb1-8000-dbf004637dd0 count=5
2025-06-28T20:44:14.108539Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e291-b033-75b5-8000-741d7b85e508
2025-06-28T20:44:14.109925Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e291-ef54-7915-8000-bf4d6d3136ff
2025-06-28T20:44:14.110990Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e291-43c6-7645-8000-385638c9009b
2025-06-28T20:44:14.112059Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e290-e16f-7a50-8000-0078d04b9998
2025-06-28T20:44:14.113193Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e290-78e9-76a0-8000-0f8670ce3a0e


In [None]:
# Per-document summaries keyed by filing date (as text), one dict per document
# type. The aggregate cells further down read these dicts.
descriptions = {}
risk_factors = {}
mdas = {}

# Track completion IDs by document type for aggregation
description_completion_ids = []
risk_factor_completion_ids = []
mda_completion_ids = []

# Routes each handled document type to its (summaries dict, completion-id list) pair.
results_by_type = {
    DocumentType.DESCRIPTION: (descriptions, description_completion_ids),
    DocumentType.RISK_FACTORS: (risk_factors, risk_factor_completion_ids),
    DocumentType.MDA: (mdas, mda_completion_ids),
}

for filing in filings:
    documents = get_documents_by_filing(filing.id)
    for document in documents:
        # Resolve the system prompt before the slow chat call, so a missing
        # prompt fails immediately rather than after the model has answered.
        prompt = get_prompt_by_name(document.document_type.value)
        system_prompt_id = prompt.id

        messages = format_document_messages(document)

        logger.info(f"Starting Chat for {TICKER} {filing.filing_date} {document.document_type.value}")
        response = get_chat_response(client, DOC_MODEL, messages)

        # Extract necessary details from the response
        content = response.message.content
        completion_data = {
            'model': DOC_MODEL,
            'document_ids': [document.id],  # Associate with the current document
            'system_prompt_id': system_prompt_id,  # Link to the prompt used
            'total_duration': response.total_duration / 1e9,  # scaled down by 1e9 before storing
            'created_at': response.created_at,
            'num_ctx': MODEL_CONFIG[DOC_MODEL]["ctx"],
            'content': content,  # Add the actual response content
        }
        completion = create_completion(completion_data)
        logger.info("created_completion", id = completion.id, document = document.document_name, time = completion.total_duration, length = len(completion.content))

        # Store results and track completion IDs for aggregation
        target = results_by_type.get(document.document_type)
        if target is None:
            # The completion has already been stored; it just is not fed into
            # any aggregate. Log enough detail to find the document later.
            logger.warning(
                "unhandled_document_type",
                document_type=document.document_type.value,
                document_id=document.id,
            )
            continue
        summaries, completion_ids = target
        summaries[f"{filing.filing_date}"] = content
        completion_ids.append(completion.id)

logger.info("doc_completions_generated.")

# Persist the per-document summaries, one JSON file per document type.
output_dir = f'outputs/{TICKER}'
os.makedirs(output_dir, exist_ok=True)
for filename, summaries in (
    ('mdas.json', mdas),
    ('risk_factors.json', risk_factors),
    ('descriptions.json', descriptions),
):
    # Explicit encoding so the files do not depend on the platform's locale.
    with open(f'{output_dir}/{filename}', 'w', encoding='utf-8') as file:
        json.dump(summaries, file, indent=4)

2025-06-28T20:44:16.216883Z [info     ] retrieved_documents_by_filing  count=3 filing_id=0685e291-b033-75b5-8000-741d7b85e508
2025-06-28T20:44:16.217541Z [info     ] Starting Chat for GOOGL 2024-01-31 business_description
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T20:44:41.746634Z [info     ] retrieved_prompt_by_name       name=business_description prompt_id=06860055-5fa7-7c96-8000-8afefd533328
2025-06-28T20:44:41.757789Z [info     ] created_completion             completion_id=06860543-9bfc-710b-8000-a55686257274 model=qwen3:4b
2025-06-28T20:44:41.758932Z [info     ] created_completion             document=Alphabet Inc. 10-K 2024-01-31 - Business Description id=UUID('06860543-9bfc-710b-8000-a55686257274') length=5601 time=26.0
2025-06-28T20:44:41.763379Z [info     ] Starting Chat for GOOGL 2024-01-31 risk_factors
HTTP Request: POST http://10.0.0.4:11434/api/chat "HTTP/1.1 200 OK"
2025-06-28T20:45:37.361269Z [info     ] retrieved_prompt_by_name      

In [None]:
# Create MDA Aggregate
messages = format_aggregate_messages(mdas)
# The reply must be bound to `response`: every line below reads `response`, so
# storing it under any other name would silently reuse the reply left over from
# the last per-document chat call.
response = get_chat_response(client, AGGRAGATE_MODEL, messages)

# Pull out the fields needed for the file and the database record
created_at = response.created_at
total_duration = response.total_duration / 1e9
content = response.message.content
logger.info("MDA Aggregate Done", time=f"{total_duration}s")

# Save the aggregate content to file
os.makedirs(f'outputs/{TICKER}', exist_ok=True)
# Explicit encoding: the text is model output and may contain non-ASCII characters.
with open(f'outputs/{TICKER}/mda.md', 'w', encoding='utf-8') as file:
    file.write(content)

# Create aggregate record in database
prompt = get_prompt_by_name("aggregate_prompt")

mda_aggregate_data = {
    'model': AGGRAGATE_MODEL,
    'company_id': company.id,
    'document_type': DocumentType.MDA,
    'system_prompt_id': prompt.id,
    'total_duration': total_duration,
    'created_at': created_at,
    'num_ctx': MODEL_CONFIG[AGGRAGATE_MODEL]["ctx"],
    'content': content,
    'completion_ids': mda_completion_ids  # Associate with source completions
}

mda_aggregate = create_aggregate(mda_aggregate_data)
print(f"Created MDA aggregate record with ID: {mda_aggregate.id}")
print(f"Associated with {len(mda_completion_ids)} source completions")
print(f"Model: {mda_aggregate.model}, Duration: {mda_aggregate.total_duration:.2f} seconds")
print(f"Content length: {len(mda_aggregate.content)} characters")


In [None]:
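# Optional check (disabled): uncomment to print the filing URL of every
# document behind the MDA aggregate's source completions.
# mda_aggregate.source_completions
# for completion in mda_aggregate.source_completions:
#     for document in completion.source_documents:
#         print(document.filing.filing_url)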

In [None]:
# Create Risk Factors Aggregate
messages = format_aggregate_messages(risk_factors)
response = get_chat_response(client, AGGRAGATE_MODEL, messages)

# Pull out the fields needed for the file and the database record
created_at = response.created_at
total_duration = response.total_duration / 1e9
content = response.message.content
logger.info("Risk Factors Aggregate Done", time=f"{total_duration}s")

# Save the aggregate content to file
os.makedirs(f'outputs/{TICKER}', exist_ok=True)
# Explicit encoding: the text is model output and may contain non-ASCII characters.
with open(f'outputs/{TICKER}/risk_factors.md', 'w', encoding='utf-8') as file:
    file.write(content)

# Create aggregate record in database
prompt = get_prompt_by_name("aggregate_prompt")

risk_aggregate_data = {
    'model': AGGRAGATE_MODEL,
    'company_id': company.id,
    'document_type': DocumentType.RISK_FACTORS,
    'system_prompt_id': prompt.id,
    'total_duration': total_duration,
    'created_at': created_at,
    'num_ctx': MODEL_CONFIG[AGGRAGATE_MODEL]["ctx"],
    'content': content,
    'completion_ids': risk_factor_completion_ids  # Associate with source completions
}

risk_aggregate = create_aggregate(risk_aggregate_data)
print(f"Created Risk Factors aggregate record with ID: {risk_aggregate.id}")
print(f"Associated with {len(risk_factor_completion_ids)} source completions")
print(f"Model: {risk_aggregate.model}, Duration: {risk_aggregate.total_duration:.2f} seconds")
print(f"Content length: {len(risk_aggregate.content)} characters")


In [None]:
# Create Descriptions Aggregate
messages = format_aggregate_messages(descriptions)
response = get_chat_response(client, AGGRAGATE_MODEL, messages)

# Pull out the fields needed for the file and the database record
created_at = response.created_at
total_duration = response.total_duration / 1e9
content = response.message.content
logger.info("Descriptions Aggregate Done", time=f"{total_duration}s")

# Save the aggregate content to file
os.makedirs(f'outputs/{TICKER}', exist_ok=True)
# Explicit encoding: the text is model output and may contain non-ASCII characters.
with open(f'outputs/{TICKER}/descriptions.md', 'w', encoding='utf-8') as file:
    file.write(content)

# Create aggregate record in database
prompt = get_prompt_by_name("aggregate_prompt")

desc_aggregate_data = {
    'model': AGGRAGATE_MODEL,
    'company_id': company.id,
    'document_type': DocumentType.DESCRIPTION,
    'system_prompt_id': prompt.id,
    'total_duration': total_duration,
    'created_at': created_at,
    'num_ctx': MODEL_CONFIG[AGGRAGATE_MODEL]["ctx"],
    'content': content,
    'completion_ids': description_completion_ids  # Associate with source completions
}

desc_aggregate = create_aggregate(desc_aggregate_data)
print(f"Created Business Description aggregate record with ID: {desc_aggregate.id}")
print(f"Associated with {len(description_completion_ids)} source completions")
print(f"Model: {desc_aggregate.model}, Duration: {desc_aggregate.total_duration:.2f} seconds")
print(f"Content length: {len(desc_aggregate.content)} characters")
