In [1]:
# Imports and Config
from collections import defaultdict
import time

from src.database.base import init_db
from src.database.filings import get_filings_by_company
from src.database.companies import get_company_by_ticker, update_company
from src.database.completions import Completion
from src.database.documents import get_documents_by_filing, DocumentType
from src.database.prompts import *

from src.llm.client import ModelConfig, init_client, remove_thinking_tags, get_generate_response
from src.llm.aggregates import create_document_type_aggregate
from src.llm.completions import process_document_completion

from src.utils.config import settings
from src.utils.logging import configure_logging




# --- Notebook bootstrap: logging, database session, LLM client ---
configure_logging()

# NOTE(review): `get_logger` is not imported by name in the import block above;
# presumably it arrives through `from src.database.prompts import *` — confirm
# and replace with an explicit import.
logger = get_logger(__name__)

init_db(settings.database.url)
# NOTE(review): `get_db_session` likewise has no explicit import in this cell —
# confirm where it comes from.
session = get_db_session()

client = init_client(settings.openai_api.url)

# COMPANY seeds the initial `company` / `filings` lookups below;
# TICKERS is the list iterated by the per-ticker processing loop later on.
COMPANY = "AMD"
TICKERS = ["AMD", "AAPL", "GOOGL", ]
company = get_company_by_ticker(COMPANY)
filings = get_filings_by_company(company.id)
# Optional year filter, currently disabled — so the "filtered_list_of_filings"
# event below actually logs the full, unfiltered list.
# filings = [filing for filing in filings if filing.period_of_report.year > 2023]
logger.info("filtered_list_of_filings", filings=filings)

# NOTE(review): mid-cell import; `src.llm.client` is already imported from in
# the import block at the top of this cell, so this name belongs there.
from src.llm.client import retry_backoff

# Smoke test 1: chat endpoint. The first positional argument is presumably the
# timeout (the call below passes `timeout=` by keyword) — TODO confirm against
# retry_backoff's signature.
response = retry_backoff(3600, client.chat, 'gemma3:12b', options={"temperature": 1.0}, messages=[{"role": "user", "content": "go while-d"}])
logger.info(response.message)

# Smoke test 2: generate endpoint with a system prompt and empty options.
# The saved output of this cell shows retry_backoff logging ConnectionError
# with backoff values 1, 2, 4, 8 when the Ollama server is unreachable.
response = retry_backoff(
    timeout=3,
    func=client.generate,
    model = "gemma3:12b",
    system = "you're an intelligent comedian who finds humor in witty dry comedy. Your target audience is young professionals and engineers.",
    prompt = "Write a one line joke for the front page of /r/unixporn",
    options = {
    },
)
logger.info(response.response)


2025-07-29T08:17:13.746271Z [error    ] retry_backoff                  backoff=1 caller="src/llm/client.py:38" error=ConnectionError('Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download') function=retry_backoff
2025-07-29T08:17:15.747500Z [error    ] retry_backoff                  backoff=2 caller="src/llm/client.py:38" error=ConnectionError('Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download') function=retry_backoff
2025-07-29T08:17:19.748575Z [error    ] retry_backoff                  backoff=4 caller="src/llm/client.py:38" error=ConnectionError('Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download') function=retry_backoff
2025-07-29T08:17:27.749841Z [error    ] retry_backoff                  backoff=8 caller="src/llm/client.py:38" error=ConnectionError('Failed to connect to Ollam

In [2]:
# create new prompts
# i = len(get_prompt_ids()) + 1
# data = {
#     "name": f"Prompt #{i}",
#     "role": PromptRole.SYSTEM,
#     "description": "focused_summary",
#     "content": """
#         We're archivists! We need to transform this input into output that is concise, factual, sometimes said with a hint of personality.  Our purpose is to accurately represent the information contained in the input text.  We prioritize completeness; ensure every relevant point is captured, even if seemingly minor. We look for literal and implied meanings.
#         """,
# }
# document_prompt = create_prompt(data)

# i = len(get_prompt_ids()) + 1
# data = {
#     "name": f"Prompt #{i}",
#     "role": PromptRole.SYSTEM,
#     "description": "aggregate_summary",
#     "content": """
#     You are a content writer with a worldwide audience of talented young professionals.
#     You will receive a large series of research notes, sometimes spanning months or years.
#     Take these notes and write a summary of the topics discussed therein.

#     Use markdown to format your response:
#     - list items
#     - **bold text**
#     - _italics_
#     - Headings require one or more '#' characters
# """,
# }
# aggregate_prompt = create_prompt(data)

# i = len(get_prompt_ids()) + 1
# data = {
#     "name": f"Prompt #{i}",
#     "role": PromptRole.SYSTEM,
#     "description": "aggregate_summary",
#     "content": """
#     You're a change analyst! You will be presented with a series of documents. Each document has a meta section with important context (including a date).

#     Compare each of the documents.

#     Summarize the progression of each document.

#     Use markdown to format your response:
#     - list items
#     - **bold text**
#     - _italics_
#     - Headings require one or more '#' characters
# """,
# }
# risk_aggregate_prompt = create_prompt(data)


# create markdown prompt
# i = len(get_prompt_ids()) + 1
# data = {
#     "name": f"Prompt #{i}",
#     "role": PromptRole.SYSTEM,
#     "description": "markdown_sanitation",
#     "content": """
# You are a formatting wizard whose job it is to enforce proper markdown page structure!

# You MUST NOT change any words or phrases in the provided document.

# You also MUST NOT add any form of wrapping or text around the document (backticks, quotes, etc)

# One common formatting error is the use of **bolded text** in place of a subheading (#'s).
# Here is an example of this error:
# <error>
# **Palantir: A Story of Innovation, Ethics, and Global Impact**

# In 2003, a group of technologists, entrepreneurs, and former intelligence officers founded Palantir with a bold mission: to build software that could transform how the world handles data, decisions, and operations. What began as a tool for counterterrorism in the U.S. intelligence community evolved into a global leader in data integration, AI, and ethical technology. Today, Palantir’s platforms empower governments, corporations, and institutions to solve complex problems—from combating crime to advancing public health—while upholding rigorous privacy and ethical standards.

# **The Power of Platforms**
# At the heart of Palantir’s success are its four core platforms, each designed to tackle unique challenges:
# - **Gotham** serves as a central nervous system for intelligence and defense agencies, synthesizing vast datasets (e.g., signals intelligence, financial records) to uncover patterns and guide high-stakes decisions.
# - **Foundry** acts as a universal operating system for data, enabling organizations to build, manage, and optimize data pipelines across departments, from data engineers to executives.
# - **Apollo** ensures seamless, secure deployment of software across any cloud or on-premise environment, minimizing downtime and maximizing security.
# - **AIP (Artificial Intelligence Platform)**, a recent innovation, integrates cutting-edge machine learning and generative AI (e.g., large language models) with Palantir’s existing systems, allowing enterprises to operationalize AI in real-time, from fraud detection to predictive analytics.

# **Customers, Revenue, and Global Reach**
# Palantir’s customer base spans 711 organizations as of 2024, with 55% of revenue coming from government clients (U.S. federal agencies, international defense, and public health) and 45% from commercial sectors like healthcare, energy, and finance. Their U.S. operations drive 66% of revenue, with the remaining 34% generated internationally. Top customers, such as the U.S. military and major financial institutions, see average annual revenue from Palantir exceeding $64 million—a testament to the scalability of their platforms.

# **A Sales Strategy Built on Trust**
# Palantir targets large-scale, mission-critical projects, leveraging a direct sales force, strategic partnerships (e.g., with Fujitsu), and cloud providers to expand access. Their "Developer Tier" initiative democratizes access to Foundry and AIP, allowing developers in the U.S. and select countries to experiment with their tools. This approach, combined with bootcamps and training, has helped Palantir navigate the risk-averse culture of government clients, where rapid deployment and proven outcomes are paramount.

# **Privacy: The Bedrock of Innovation**
# From the start, Palantir has prioritized privacy as a non-negotiable feature. Their "privacy by design" philosophy embeds data minimization, dynamic access controls, and auditability into every platform. For example, Foundry ensures compliance with GDPR and HIPAA through automated data retention policies and granular permissions. This commitment extends to AIP, where human oversight—rather than algorithmic automation—guides critical decisions. Palantir’s ethical framework, which includes accountability and transparency, has become a differentiator in an era of growing AI scrutiny.

# **Innovation at the Edge**
# Research and development drive Palantir’s evolution. Teams work on edge computing, AI integration, and user-centric design, often collaborating directly with customers to refine solutions. AIP bootcamps, for instance, enable real-world testing of AI applications, from healthcare diagnostics to supply chain optimization. These efforts ensure that Palantir’s tools remain at the forefront of technological progress, even as competitors—both traditional software giants and AI startups—strive to replicate their success.

# **Global Impact, Local Roots**
# Palantir’s platforms have transformed industries:
# - **Law Enforcement**: Streamlined investigations and resource allocation.
# - **Healthcare**: Enhanced disease tracking and drug discovery.
# - **Finance**: Strengthened fraud detection and compliance.
# With 3,936 employees globally (31% outside the U.S.), Palantir balances global ambition with local engagement, fostering inclusive workplaces and adhering to labor standards, including France’s works council requirements.

# **Challenges and the Road Ahead**
# Despite its strengths, Palantir faces hurdles: competition from internal development teams, the need to balance AI innovation with ethical boundaries, and the seasonal nature of enterprise software sales. Yet, its focus on privacy, scalability, and mission-driven partnerships positions it to navigate these challenges. As AIP and Foundry continue to evolve, Palantir remains committed to its founding vision: using technology not just to solve problems, but to do so responsibly,
# </error>

# And the corrected formatting:

# <correct>
# # Palantir: A Story of Innovation, Ethics, and Global Impact

# In 2003, a group of technologists, entrepreneurs, and former intelligence officers founded Palantir with a bold mission: to build software that could transform how the world handles data, decisions, and operations. What began as a tool for counterterrorism in the U.S. intelligence community evolved into a global leader in data integration, AI, and ethical technology. Today, Palantir’s platforms empower governments, corporations, and institutions to solve complex problems—from combating crime to advancing public health—while upholding rigorous privacy and ethical standards.

# ## The Power of Platforms
# At the heart of Palantir’s success are its four core platforms, each designed to tackle unique challenges:
# - **Gotham** serves as a central nervous system for intelligence and defense agencies, synthesizing vast datasets (e.g., signals intelligence, financial records) to uncover patterns and guide high-stakes decisions.
# - **Foundry** acts as a universal operating system for data, enabling organizations to build, manage, and optimize data pipelines across departments, from data engineers to executives.
# - **Apollo** ensures seamless, secure deployment of software across any cloud or on-premise environment, minimizing downtime and maximizing security.
# - **AIP (Artificial Intelligence Platform)**, a recent innovation, integrates cutting-edge machine learning and generative AI (e.g., large language models) with Palantir’s existing systems, allowing enterprises to operationalize AI in real-time, from fraud detection to predictive analytics.

# ### Customers, Revenue, and Global Reach
# Palantir’s customer base spans 711 organizations as of 2024, with 55% of revenue coming from government clients (U.S. federal agencies, international defense, and public health) and 45% from commercial sectors like healthcare, energy, and finance. Their U.S. operations drive 66% of revenue, with the remaining 34% generated internationally. Top customers, such as the U.S. military and major financial institutions, see average annual revenue from Palantir exceeding $64 million—a testament to the scalability of their platforms.

# ### A Sales Strategy Built on Trust
# Palantir targets large-scale, mission-critical projects, leveraging a direct sales force, strategic partnerships (e.g., with Fujitsu), and cloud providers to expand access. Their "Developer Tier" initiative democratizes access to Foundry and AIP, allowing developers in the U.S. and select countries to experiment with their tools. This approach, combined with bootcamps and training, has helped Palantir navigate the risk-averse culture of government clients, where rapid deployment and proven outcomes are paramount.

# ### Privacy: The Bedrock of Innovation
# From the start, Palantir has prioritized privacy as a non-negotiable feature. Their "privacy by design" philosophy embeds data minimization, dynamic access controls, and auditability into every platform. For example, Foundry ensures compliance with GDPR and HIPAA through automated data retention policies and granular permissions. This commitment extends to AIP, where human oversight—rather than algorithmic automation—guides critical decisions. Palantir’s ethical framework, which includes accountability and transparency, has become a differentiator in an era of growing AI scrutiny.

# ### Innovation at the Edge
# Research and development drive Palantir’s evolution. Teams work on edge computing, AI integration, and user-centric design, often collaborating directly with customers to refine solutions. AIP bootcamps, for instance, enable real-world testing of AI applications, from healthcare diagnostics to supply chain optimization. These efforts ensure that Palantir’s tools remain at the forefront of technological progress, even as competitors—both traditional software giants and AI startups—strive to replicate their success.

# ## Global Impact, Local Roots
# Palantir’s platforms have transformed industries:
# - **Law Enforcement**: Streamlined investigations and resource allocation.
# - **Healthcare**: Enhanced disease tracking and drug discovery.
# - **Finance**: Strengthened fraud detection and compliance.
# With 3,936 employees globally (31% outside the U.S.), Palantir balances global ambition with local engagement, fostering inclusive workplaces and adhering to labor standards, including France’s works council requirements.

# ## Challenges and the Road Ahead
# Despite its strengths, Palantir faces hurdles: competition from internal development teams, the need to balance AI innovation with ethical boundaries, and the seasonal nature of enterprise software sales. Yet, its focus on privacy, scalability, and mission-driven partnerships positions it to navigate these challenges. As AIP and Foundry continue to evolve, Palantir remains committed to its founding vision: using technology not just to solve problems, but to do so responsibly,
# </correct>
#         """,
# }
# markdown_prompt = create_prompt(data)

# create summary prompt
# i = len(get_prompt_ids()) + 1
# data = {
#     "name": f"Prompt #{i}",
#     "role": PromptRole.SYSTEM,
#     "description": "front_page_summary",
#     "content": """
#         You're a fantastic ad copy editor. The user will provide you with a document, which you need to boil down into its most concise essence.

#         Aim for no more than 50 words, 75 at the very most.

#         Use markdown (**bold text**, _italics_, - bullet points) to add emphasis to your response.
#     """,
# }
# front_page_prompt = create_prompt(data)


In [3]:
# Prompts (looked up by their stored ids) and the model settings used at each
# stage of the pipeline.

# Model tag shared by the markdown-formatting and front-page stages.
_GEMMA_12B_QAT = "hf.co/unsloth/gemma-3-12b-it-qat-GGUF:latest"

# Stage 1: one completion per filing document.
document_prompt = get_prompt("06878708-7a86-76e0-8000-8b1f4e2ed08c")
document_model_config = ModelConfig(name="qwen3:4b", num_ctx=24567, temperature=0.8, top_k=45)

# Stage 2: aggregates per document type. Risk factors get a dedicated prompt.
# Previously used aggregate prompt ids, kept for reference:
#   get_prompt("068786e2-1e9e-7eb6-8000-e716f8eb8a3f")
#   get_prompt("0687a819-b5cb-73a3-8000-4326f7819960")
aggregate_prompt = get_prompt("0687ab13-8e9b-7d45-8000-e36b40bd5fcd")
risk_aggregate_prompt = get_prompt("0687abe9-4f99-7ca7-8000-77b422a5c1dc")
aggregate_model_config = ModelConfig(name="qwen3:14b", num_ctx=9000, temperature=0.7, num_gpu=41)

# Alternative aggregate model, currently disabled:
# aggregate_model_config = ModelConfig(
#     name="hf.co/unsloth/gemma-3-12b-it-qat-GGUF:latest",
#     num_ctx=10000,
#     temperature=0.0,
# )

# Stage 3: markdown clean-up (temperature 0.0).
markdown_prompt = get_prompt("0687a885-7cc1-7e88-8000-a95d86e69d22")
markdown_model_config = ModelConfig(name=_GEMMA_12B_QAT, num_ctx=10000, temperature=0.0)

# Stage 4: short front-page summary.
front_page_prompt = get_prompt("0687a424-1757-743d-8000-01257e133b73")
front_page_model_config = ModelConfig(name=_GEMMA_12B_QAT, num_ctx=10000, temperature=0.8)



2025-07-29T08:18:30.989971Z [info     ] retrieved_prompt               caller="src/database/prompts.py:83" function=get_prompt prompt_id=06878708-7a86-76e0-8000-8b1f4e2ed08c
2025-07-29T08:18:30.991009Z [info     ] retrieved_prompt               caller="src/database/prompts.py:83" function=get_prompt prompt_id=0687ab13-8e9b-7d45-8000-e36b40bd5fcd
2025-07-29T08:18:30.991693Z [info     ] retrieved_prompt               caller="src/database/prompts.py:83" function=get_prompt prompt_id=0687abe9-4f99-7ca7-8000-77b422a5c1dc
2025-07-29T08:18:30.992891Z [info     ] retrieved_prompt               caller="src/database/prompts.py:83" function=get_prompt prompt_id=0687a885-7cc1-7e88-8000-a95d86e69d22
2025-07-29T08:18:30.993533Z [info     ] retrieved_prompt               caller="src/database/prompts.py:83" function=get_prompt prompt_id=0687a424-1757-743d-8000-01257e133b73


In [4]:
# Per-ticker pipeline:
#   1. run the document prompt over every document of every filing,
#   2. aggregate the completions per document type (MDA, description, risk factors),
#   3. format the description aggregate as markdown, condense it into a
#      front-page summary, and store that summary on the company record.
for (i, ticker) in enumerate(TICKERS):
    LOOP_START = time.time()

    # Resolve the company for *this* ticker before logging it; logging first
    # would report whichever company the previous iteration (or an earlier
    # cell) left in `company`.
    company = get_company_by_ticker(ticker)
    logger.info("get_completion_for_filing_documents", company=company.name, ticker=company.ticker, index=f"{i}/{len(TICKERS)}")
    filings = get_filings_by_company(company.id)
    # Optional: restrict to recent filings.
    # filings = [filing for filing in filings if filing.period_of_report.year > 2023]

    # Completions grouped by document type; these feed the aggregates below.
    # Builtin `list[...]` is used because module-level annotations are evaluated
    # at runtime and `typing.List` is not explicitly imported in this notebook.
    completions_by_type: defaultdict[DocumentType, list[Completion]] = defaultdict(list)

    logger.debug("document_prompt", content=f"'{document_prompt.content.strip()[:90]}...'")

    for (j, filing) in enumerate(filings):
        logger.info("filing_loop", filing=filing, index=f"{j}/{len(filings)}")
        documents = get_documents_by_filing(filing.id)
        if not documents:
            logger.warning("no documents")
            continue

        for (k, document) in enumerate(documents):
            # Progress is counted against this filing's documents, not the filings list.
            logger.info("document_loop", document=document, index=f"{k}/{len(documents)}")
            result = process_document_completion(document, document_prompt, document_model_config)
            completions_by_type[document.document_type].append(result)

    logger.info("get_aggregations_for_document_types")
    logger.debug("aggregate_prompt", content=f"'{aggregate_prompt.content.strip()[:90]}...'")

    logger.info("get_mda_aggregate")
    mda_aggregate = create_document_type_aggregate(DocumentType.MDA, completions_by_type[DocumentType.MDA], aggregate_prompt, aggregate_model_config)

    logger.info("get_business_description_aggregate")
    description_aggregate = create_document_type_aggregate(DocumentType.DESCRIPTION, completions_by_type[DocumentType.DESCRIPTION], aggregate_prompt, aggregate_model_config)

    # Risk factors use their own aggregate prompt.
    logger.info("get_risk_factors_aggregate")
    logger.info("risk_aggregate_prompt", content=f"'{risk_aggregate_prompt.content.strip()[:90]}...'")
    risk_aggregate = create_document_type_aggregate(DocumentType.RISK_FACTORS, completions_by_type[DocumentType.RISK_FACTORS], risk_aggregate_prompt, aggregate_model_config)

    # Build the company's front-page summary from the description aggregate:
    # remove <thinking>, format markdown, lint for miscellaneous wrapper text.
    logger.info("formatting_content")
    unformatted_markdown = remove_thinking_tags(description_aggregate.content)
    # Debug snapshots; both files are overwritten on every ticker. UTF-8 is
    # explicit because model output routinely contains non-ASCII punctuation.
    with open('unformatted_markdown.md', 'w', encoding='utf-8') as f:
        f.write(unformatted_markdown)

    markdown_response = get_generate_response(markdown_model_config, markdown_prompt.content, unformatted_markdown)
    with open('formatted_markdown.md', 'w', encoding='utf-8') as f:
        f.write(markdown_response.response)

    summary_response = get_generate_response(front_page_model_config, front_page_prompt.content, markdown_response.response)
    linted_summary = get_generate_response(markdown_model_config, "We are cleaning up for publication. Remove any introductory 'meta' lines from the provided document.", summary_response.response)
    company = update_company(company.id, {'summary': linted_summary.response})

    LOOP_END = time.time()
    logger.info("finished_company", duration=f"{LOOP_END - LOOP_START:.2f}s")


2025-07-29T08:18:30.998656Z [info     ] get_completion_for_filing_documents caller="/tmp/ipykernel_33445/4124532146.py:3" company=ADVANCED MICRO DEVICES INC function=<module> index=0/3 ticker=AMD
2025-07-29T08:18:31.001387Z [info     ] retrieved_company_by_ticker    caller="src/database/companies.py:285" company=ADVANCED MICRO DEVICES INC function=get_company_by_ticker ticker=AMD
2025-07-29T08:18:31.002161Z [info     ] retrieved_filings_by_company   caller="src/database/filings.py:275" count=5 function=get_filings_by_company
2025-07-29T08:18:31.003481Z [info     ] filing_loop                    caller="/tmp/ipykernel_33445/4124532146.py:15" filing=AMD 2020 10-K function=<module> index=0/5
2025-07-29T08:18:31.010153Z [info     ] retrieved_documents_by_filing  caller="src/database/documents.py:255" count=3 filing_id=0685e25b-10ea-779d-8000-cb13053c2e9e function=get_documents_by_filing
2025-07-29T08:18:31.010562Z [info     ] document_loop                  caller="/tmp/ipykernel_33445/4124

KeyboardInterrupt: 

In [None]:
# # regenerate an aggregate
# from src.database.aggregates import Aggregate
# from src.llm.prompts import format_aggregate_messages

# # Get Aggregates where `aggregate.company.id == company.id` and `aggregate.document_type == document_type`
# company = get_company_by_ticker(COMPANY)
# document_type = DocumentType.DESCRIPTION

# aggregates = (
#     session.query(Aggregate)
#     .filter(Aggregate.company_id == company.id)
#     .filter(Aggregate.document_type == document_type)
#     .all()
# )

# logger.info("retrieved_aggregates", company=company.ticker, document_type=document_type.value, count=len(aggregates))
# aggregate = max(aggregates, key=lambda agg: agg.created_at)
# logger.info("most_recent_aggregate", aggregate_id=aggregate.id, created_at=str(aggregate.created_at))
# completions = aggregate.source_completions
# logger.info("completions", length=len(completions))

# # check the formatting of context
# # formatted_context = format_aggregate_messages(aggregate_prompt, completions)
# # print(formatted_context[1]['content'])

# logger.info("aggregate_prompt", content=f"'{aggregate_prompt.content.strip()[:80]}...'")
# new_agg = create_document_type_aggregate(document_type, completions, aggregate_prompt, aggregate_model_config)
# new_agg.content = remove_thinking_tags(new_agg.content)
# with open('unformatted_markdown.md', 'w') as f:
#     f.write(new_agg.content)

# logger.info("formatting_markdown")
# new_agg = get_generate_response(markdown_model_config, markdown_prompt.content, new_agg.content)
# new_agg = get_generate_response(markdown_model_config, "We are cleaning up for publication. Remove any introductory 'meta' lines from the provided document.", new_agg.response)
# new_agg = get_generate_response(markdown_model_config, "This document will be presented with a separate heading. Trim ", new_agg.response)
# with open('formatted_markdown.md', 'w') as f:
#     f.write(new_agg.response)



In [None]:
# NOTE(review): import placed mid-notebook; `src.database.companies` is already
# imported from in the first cell, so this name belongs there.
from src.database.companies import get_all_company_tickers

# Despite the name, `companies` holds ticker strings (see the recorded output
# of this cell), not company records.
companies = get_all_company_tickers()
print(companies)


['AAL', 'AAPL', 'ACHR', 'AES', 'AGNC', 'AMD', 'AMZN', 'APLD', 'AUR', 'AVGO', 'BA', 'BAC', 'CCL', 'CDE', 'CLF', 'CLSK', 'CSCO', 'CVE', 'F', 'GME', 'GOOGL', 'HBANP', 'HL', 'HOOD', 'HPE', 'INTC', 'IONQ', 'IPG', 'JOBY', 'KGC', 'KVUE', 'LCID', 'MARA', 'MP', 'MSFT', 'MU', 'NFLX', 'NU', 'NVDA', 'ORCL', 'PCG', 'PFE', 'PLTR', 'PR', 'QUBT', 'RIG', 'RIOT', 'RKLB', 'RKT', 'RXRX', 'S', 'SMCI', 'SMR', 'SNAP', 'SOFI', 'T', 'TSLA', 'UBER', 'UNH', 'VZ', 'WBD', 'WFC', 'WMT', 'XOM']


In [None]:
# Scratch inspection: first document of the first filing. Relies on whatever
# `filings` the kernel currently holds from an earlier cell.
d = filings[0].documents[0]

In [None]:
# Show the sample document's type (recorded output: business_description).
d.document_type

business_description

In [None]:
# Display the sample document via the notebook's default cell output.
d

AMD 2020 10-K business_description

In [None]:
# Print the sample document; the recorded output shows the same text as the
# bare-expression display in the previous cell.
print(d)

AMD 2020 10-K business_description
