In [1]:
!pip install -U sec-downloader sec-parser openai



In [2]:
import json
import os
import re
import warnings

import sec_parser as sp
from openai import OpenAI
from sec_downloader import Downloader

In [3]:
# Downloader is constructed with a company name and a contact e-mail
# (presumably used to identify this client to SEC EDGAR — confirm in the
# sec-downloader docs).
# NOTE(review): a personal e-mail address is hard-coded here; consider reading
# it from configuration before sharing the notebook.
dl = Downloader("Vanguard", "attila_sajo@vanguard.com")

In [4]:
# Download the latest 10-K filing for the chosen ticker as HTML.
# (Ticker tried previously: "LNG".)
html = dl.get_filing_html(form="10-K", ticker="ILMN")

In [5]:
# The 10-Q parser is applied to a 10-K document here; the "Invalid section type"
# warnings are silenced (presumably they stem from that form mismatch — confirm).
parser = sp.Edgar10QParser()

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="Invalid section type for")
    elements: list = parser.parse(html)

# Arrange the flat element list into a title/content hierarchy.
tree_builder = sp.TreeBuilder()
tree: sp.SemanticTree = tree_builder.build(elements)

# Text rendering of the hierarchy, one element per line.
demo_output: str = sp.render(tree)
tree_text = demo_output

In [6]:
# Build the text that is embedded in the LLM prompt:
#   * drop lines mentioning table/image elements, and
#   * strip the ANSI colour escapes (e.g. ESC[1;34m ... ESC[0m) that the rendered
#     tree contains, so no terminal control codes end up in the prompt.
_ANSI_SGR = re.compile(r"\x1b\[[0-9;]*m")
_EXCLUDED_ELEMENTS = ("TableElement", "ImageElement")

tree_text = "\n".join(
    _ANSI_SGR.sub("", line)
    for line in demo_output.split("\n")
    if not any(kind in line for kind in _EXCLUDED_ELEMENTS)
)

In [7]:
# Show the filtered tree text that will be embedded in the prompt.
print(tree_text)

[1;34mTextElement[0m: 00011108032023FYfalsehttp://fas...mn:JacobThaysenMember2023-12-31
[1;34mTitleElement[0m: UNITED STATES
[1;34mTitleElement[0m: SECURITIES AND EXCHANGE COMMISSION
├── [1;34mTitleElement[0m: Washington, D.C. 20549
├── [1;34mTitleElement[0m: Form 10-K
│   └── [1;34mTitleElement[0m: ☑ANNUAL REPORT PURSUANT TO SECT...SECURITIES EXCHANGE ACT OF 1934
│       └── [1;34mTitleElement[0m: For the fiscal year ended December 31, 2023
├── [1;34mTitleElement[0m: or
│   └── [1;34mTitleElement[0m: ☐TRANSITION REPORT PURSUANT TO ...nsition period from          to
├── [1;34mTitleElement[0m: Commission file number: 001-35406
├── [1;34mTitleElement[0m: Illumina, Inc.
│   ├── [1;34mSupplementaryText[0m: (Exact name of registrant as specified in its charter)
├── [1;34mTitleElement[0m: 5200 Illumina Way, San Diego, CA 92122
│   ├── [1;34mSupplementaryText[0m: (Address of principal executive offices) (Zip code)
│   └── [1;34mTitleElement[0m: Registrant’s telep

# OpenAI

In [8]:
# The API key comes from the environment, never from the notebook itself.
# Passing it explicitly is optional: OPENAI_API_KEY is the client's default source.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [9]:
# Topic whose section titles the model should locate. It is also interpolated
# into the response key "<topic>_titles" that the following cells read.
topic = "risk analysis"

# Title-lookup prompt. The output instruction asks for plain JSON: this task
# extracts titles, it is not a sentiment analysis.
prompt = f"""You are a document preprocessing assistant. Your job is to search for {topic} related TitleElement in the provided Document tree between the three - characters.
The provided document represents a longer text hierarchy. It has the following elements:
  * TitleElement: represent a section title
  * SupplementaryText: represent some supplementally text for the title
  * TextElement: represents a text connected to the title
  * TableElement: represent a table
  * ImageElement: represent an Image

Give the result in JSON format.
Just return the JSON formatted result. Don't add any other comments.

To generate the end JSON think step by step, which requires the following actions:
1) Iterate over the documents and search for titles which are connected to {topic}
2) Create a JSON with the list of the relevant title
3) return the JSON

Document:---
{tree_text}
---
"""

In [10]:
# Ask the model for the topic-related section titles. The strict JSON schema
# constrains the reply to an object with a single "<topic>_titles" string array.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="gpt-4o",
    response_format={
        "type": "json_schema",
        "json_schema": {
            # Derived from `topic` (spaces replaced) so the schema follows the
            # configured topic, like the other completion calls in this notebook.
            "name": f"{topic.replace(' ', '_')}",
            "description": f"result of {topic}",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    f"{topic}_titles": {
                        "type": "array",
                        # Must be an f-string: otherwise the literal text
                        # "{topic}" is sent to the model instead of the topic.
                        "description": f"list of the {topic} related titles",
                        "items": {
                            "type": "string"
                        }
                    },
                },
                "required": [f"{topic}_titles"],
                "additionalProperties": False
            }
        }
    }
)

In [11]:
# Decode the model reply: trim surrounding whitespace and remove any ``` fence
# markers before parsing the JSON payload.
raw_reply = chat_completion.choices[0].message.content
result = json.loads(raw_reply.strip().replace("```", ''))

In [12]:
# Iterate over the selected titles and collect their text as Markdown

In [13]:
result_markdown = ""

def itterate_childrens(node, result_markdown, level=0, get_text=False, titles=None):
    """Append the Markdown rendering of ``node``'s subtree to ``result_markdown``.

    A node whose text is one of the selected ``titles`` is written as a heading
    and switches collection on for everything below it. While collection is on,
    nodes that have children are written as headings and leaf nodes as plain
    text lines. Nodes outside any selected title contribute nothing.

    Args:
        node: Tree node exposing ``text``, ``children`` and ``has_child``.
        result_markdown: Markdown accumulated so far.
        level: Depth of the parent; this node is rendered one level deeper.
        get_text: True when an ancestor was a selected title.
        titles: Collection of selected title strings. When omitted, the
            notebook globals are used (``result[f"{topic}_titles"]``). Passing
            it explicitly keeps the function usable after ``result`` has been
            rebound to a different payload by a later cell.

    Returns:
        ``result_markdown`` extended with the Markdown for this subtree.
    """
    if titles is None:
        titles = result[f"{topic}_titles"]
    if not isinstance(titles, (set, frozenset)):
        # Convert once; the same set is handed down the recursion.
        titles = frozenset(titles)

    level += 1
    # Markdown only defines heading levels 1-6; a longer run of '#' would be
    # rendered as ordinary text, so deeper nodes are clamped to level 6.
    heading = "\n" + "#" * min(level, 6) + " " + node.text + "\n\n"

    is_selected = node.text in titles
    if is_selected:
        result_markdown += heading
        get_text = True

    # NOTE(review): on sec-parser tree nodes `has_child` looks like a method
    # (always truthy when referenced without a call), so the length check is
    # what actually decides here — confirm against the library.
    if node.has_child and len(node.children) > 0:
        if get_text and not is_selected:
            # Intermediate node below a selected title: keep it as a sub-heading.
            result_markdown += heading
        for subnode in node.children:
            result_markdown = itterate_childrens(subnode, result_markdown, level, get_text, titles)
    elif get_text and not is_selected:
        # Leaf below a selected title: plain body text.
        result_markdown += node.text + "\n"

    return result_markdown

# Walk every root of the semantic tree, threading the growing Markdown string
# through each call.
for root_node in tree:
    result_markdown = itterate_childrens(root_node, result_markdown, level=0, get_text=False)

In [14]:
# Size estimate: character count divided by 4 (presumably a rough
# "~4 characters per token" approximation of the prompt size — confirm).
len(result_markdown)/4

20599.75

In [15]:
# Show the Markdown collected for the selected titles.
print(result_markdown)


## RISK FACTORS

Our business is subject to various risks, including those described below. In addition to the other information included in this report, the following issues could adversely affect our operating results or our stock price.
Risks Relating to Research, Development, Marketing, and Sales of Products and Services

### Our continued growth is dependent on continuously developing and commercializing new products.

Our target markets are characterized by rapid technological change, changes in customer needs, existing and emerging competition, strong price competition, and frequent new product introductions. Accordingly, our continued growth depends on developing and commercializing new products and services, including improving our existing products and services, in order to address evolving market requirements on a timely basis. If we fail to innovate or adequately invest in new technologies, we could lose our competitive position in the markets that we serve.To the extent t

## Chapter summaries

In [16]:
text_sums = ""

# Size heuristic: characters / 4 is compared against a 100000 budget
# (presumably an approximate token count — confirm). Only texts above the
# budget are summarised chapter by chapter; smaller ones are passed on unchanged.
_APPROX_CHARS_PER_TOKEN = 4
_MAX_DIRECT_TOKENS = 100000
_CHAPTER_SEPARATOR = "\n### "

if len(result_markdown) / _APPROX_CHARS_PER_TOKEN > _MAX_DIRECT_TOKENS:
    # NOTE(review): "energy" is hard-coded although the filing fetched earlier
    # in this notebook is for ticker ILMN; it looks like a leftover from the
    # previously used ticker (LNG). Set it to match the company being analysed.
    company_type = "energy"
    chapter_summaries = []
    for chapter_index, chapter in enumerate(result_markdown.split(_CHAPTER_SEPARATOR)):
        if not chapter.strip():
            # Nothing to summarise; avoid a pointless API call.
            continue
        if chapter_index > 0:
            # str.split() consumes the separator, so restore the heading marker
            # to keep the chapter title a Markdown heading in the prompt.
            chapter = "### " + chapter
        prompt = f"""Summarize the input text between the three - signs. The text is an {company_type} company financial SEC 10-K report. The input text is Markdown formatted.
Give the result in JSON format.  Just return the JSON formatted result. Don't add any other comments.

The JSON should have one string field: 'summarized_text'. This should be a Markdown formated text of the input text.

Think step by step, which requires the following actions:
1) separate the {company_type} company-specific information and general business information
2) summarize the general information in a few sentences
3) summarize the company-specific information in a way which keeps the specific information

Input Markdown text:---
{chapter}
---
        """
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-4o",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": f"{topic.replace(' ', '_')}",
                    # `topic` already reads "risk analysis"; no extra "risk" prefix.
                    "description": f"result of {topic}",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "summarized_text": {
                                "type": "string",
                                "description": "summarization of the input text",
                            }
                        },
                        "required": ["summarized_text"],
                        "additionalProperties": False
                    }
                }
            }
        )
        text_sum = json.loads(chat_completion.choices[0].message.content.strip().replace("```", ''))
        chapter_summaries.append(text_sum['summarized_text'])
    # Each summary is followed by a blank line, as a paragraph break.
    text_sums = "".join(summary + "\n\n" for summary in chapter_summaries)
else:
    text_sums = result_markdown

In [17]:
# Show the text (summarised or original) that goes into the risk-listing prompt.
print(text_sums)


## RISK FACTORS

Our business is subject to various risks, including those described below. In addition to the other information included in this report, the following issues could adversely affect our operating results or our stock price.
Risks Relating to Research, Development, Marketing, and Sales of Products and Services

### Our continued growth is dependent on continuously developing and commercializing new products.

Our target markets are characterized by rapid technological change, changes in customer needs, existing and emerging competition, strong price competition, and frequent new product introductions. Accordingly, our continued growth depends on developing and commercializing new products and services, including improving our existing products and services, in order to address evolving market requirements on a timely basis. If we fail to innovate or adequately invest in new technologies, we could lose our competitive position in the markets that we serve.To the extent t

## List risks

In [18]:
# Prompt for the final step: summarise the collected text and split the risks
# into an 'internal' and a 'not-internal' list.
# Step 2 says "too general", and step 6 refers to the 'internal' list, which is
# the list the output schema actually contains.
prompt = f"""Summarize and organize the financial Markdown formatted text between the three - sign.
Give the result in {topic} JSON format.  Just return the JSON formatted result. Don't add any other comments.

To generate the end JSON think step by step, which requires the following actions:
1) Create a summary of the most important risks. These summaries should be relatively long.
2) Delete the risk which are too general, and can happen with every company. 
3) Organize the risks in two lists: 'internal' and 'not-internal'. Decide whether a certain risk in the previous step is internally controlled or not. 
   It is NOT internal if the risk is NOT dependent on the company, but instead some outsider, for example, the government, or environment. 
4) Reorder the risk by importance. A risk is more important if the input text gives more details about it.
5) Create the output JSON, listing separately the internal and not-internal problems.
6) Double-check that the internal risk list only has risks which can be managed by the company. 

Markdown:---
{text_sums}
---
"""

In [19]:
# Ask the model for the final risk lists. The strict JSON schema constrains the
# reply to an object with two string arrays: "internal" and "not_internal".
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="gpt-4o",
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": f"{topic.replace(' ', '_')}",
            # `topic` already reads "risk analysis"; no extra "risk" prefix.
            "description": f"result of {topic}",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "internal": {
                        "type": "array",
                        "description": "list of the internal",
                        "items": {
                            "type": "string"
                        }
                    },
                    "not_internal": {
                        "type": "array",
                        "description": "list of the not internal",
                        "items": {
                            "type": "string"
                        }
                    },
                },
                "required": ["internal", "not_internal"],
                "additionalProperties": False
            }
        }
    }
)

In [20]:
# Decode the risk lists: trim surrounding whitespace and remove any ``` fence
# markers before parsing the JSON payload.
# Note: this rebinds `result`, replacing the title-lookup payload from earlier.
raw_reply = chat_completion.choices[0].message.content
result = json.loads(raw_reply.strip().replace("```", ''))

In [21]:
# One tab-indented bullet per internal risk.
internal_risks = result["internal"]
for risk in internal_risks:
    print(f"\t* {risk}")

	* Our continued growth is dependent on continuously developing and commercializing new products. Delays or failures in innovation and product introductions could lead to a loss of competitive position and negatively impact financial outcomes.
	* If we do not successfully manage the development, manufacturing, and launch of new products or services, including product transitions, our financial results could be adversely affected due to delays, increased costs, or lack of market acceptance.
	* We depend heavily on third-party manufacturers and suppliers for sub-assemblies, components, and materials. Disruptions in supply chains can adversely affect manufacturing and shipment timelines, impacting revenue.
	* Defects in products or failure to meet required quality standards may result in recalls, damage to reputation, and negative financial impacts.
	* Our acquisitions, including GRAIL, expose us to integration risks, legal issues, and additional liabilities. Unsuccessful integration coul

In [22]:
# One tab-indented bullet per risk that is outside the company's control.
external_risks = result["not_internal"]
for risk in external_risks:
    print(f"\t* {risk}")

	* We face intense competition from existing and emerging technologies that could render our products obsolete or pressure us into reducing prices.
	* Our success is contingent on the acceptance and demand for sequencing technologies and markets for genetic analysis, which may not grow as expected.
	* Public health crises such as the COVID-19 pandemic may disrupt business operations, supply chains, demand, and financial performance.
	* Government regulations and changes in research funding can be unpredictable and may affect our market and revenue, especially in diagnostics.
	* The GRAIL acquisition is subject to ongoing regulatory reviews and legal proceedings, posing risks of penalties and affecting operational stability and stock prices.
	* We are exposed to foreign currency risks that can impact financial statements and operational costs due to fluctuating exchange rates.
	* International geopolitical risks, including the conflict between Russia and Ukraine, can impact internationa