In [None]:
!pip install -U sec-downloader sec-parser openai

In [None]:
from sec_downloader import Downloader
import sec_parser as sp
import warnings
import os
from openai import OpenAI
import json

In [None]:
# Downloader instance used below to fetch filing HTML. The two positional strings
# look like an organisation name and a contact e-mail — presumably the identity
# reported to SEC EDGAR; confirm against the sec-downloader documentation.
# NOTE(review): the e-mail address is hard-coded; consider loading it from configuration.
dl = Downloader("Vanguard", "attila_sajo@vanguard.com")

In [None]:
# Fetch the 10-K filing HTML for the chosen ticker (original note: "get latest").
# Alternative ticker kept for reference:
# html = dl.get_filing_html(ticker="LNG", form="10-K")
html = dl.get_filing_html(ticker="ILMN", form="10-K")

In [None]:
# The 10-Q parser is applied to the 10-K HTML fetched above.
parser = sp.Edgar10QParser()

with warnings.catch_warnings():
    # Silence only the warnings whose message starts with "Invalid section type for".
    warnings.filterwarnings("ignore", message="Invalid section type for")
    elements: list = parser.parse(html)

# Build the semantic tree from the parsed elements and render it as text.
tree: sp.SemanticTree = sp.TreeBuilder().build(elements)
demo_output: str = sp.render(tree)

# Splitting on "\n" and re-joining with "\n" yields the very same string,
# so the rendered text is used as it is.
tree_text = demo_output

In [None]:
# Keep only the rendered lines that mention neither a table nor an image element.
tree_text = '\n'.join(
    line
    for line in demo_output.split('\n')
    if not ('TableElement' in line or 'ImageElement' in line)
)

In [None]:
# Show the tree outline text that is embedded in the title-search prompt below.
print(tree_text)

# OpenAI

In [None]:
# API client for the requests below. Reading OPENAI_API_KEY from the environment
# is also the library default, so the explicit argument could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Subject whose section titles are searched for in the document tree; it is also
# interpolated into the JSON key names used by the following cells.
topic = "risk analysis"

# Prompt for the title search. The requested result format now names the topic
# itself (it previously said "sentiment analysis", a leftover that did not match the task).
prompt = f"""You are a document preprocessing assistant. Your job is to search for {topic} related TitleElement in the provided Document tree between the three - characters.
The provided document represents a longer text hierarchy. It has the following elements:
  * TitleElement: represent a section title
  * SupplementaryText: represent some supplementally text for the title
  * TextElement: represents a text connected to the title
  * TableElement: represent a table
  * ImageElement: represent an Image

Give the result in {topic} JSON format.
Just return the JSON formatted result. Don't add any other comments.

To generate the end JSON think step by step, which requires the following actions:
1) Iterate over the documents and search for titles which are connected to {topic}
2) Create a JSON with the list of the relevant title
3) return the JSON

Document:---
{tree_text}
---
"""

In [None]:
# Ask the model which titles in the tree relate to `topic`. The reply is
# constrained by a strict JSON schema to a single object holding a list of strings
# under the key f"{topic}_titles".
chat_completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            # Derived from `topic` (spaces replaced by underscores), matching
            # how the later requests in this notebook name their schemas.
            "name": topic.replace(' ', '_'),
            "description": f"result of {topic}",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    f"{topic}_titles": {
                        "type": "array",
                        # f-string: the topic is interpolated instead of sending
                        # the literal text "{topic}" to the API.
                        "description": f"list of the {topic} related titles",
                        "items": {
                            "type": "string"
                        }
                    },
                },
                "required": [f"{topic}_titles"],
                "additionalProperties": False
            }
        }
    }
)

In [None]:
# Decode the title-search reply: trim surrounding whitespace, drop every "```"
# marker, then parse what remains as JSON.
result = json.loads(
    chat_completion.choices[0].message.content.strip().replace("```", '')
)

In [None]:
# Iterate over the titles and collect their text as Markdown

In [None]:
# Accumulator for the Markdown text assembled from the selected sections.
result_markdown = ""

def itterate_childrens(node, result_markdown, level=0, get_text = False, titles=None):
    """Append the Markdown rendering of the selected sections below ``node``.

    A node whose text is one of ``titles`` starts a selected section: it is
    emitted as a heading and everything underneath it is collected. Inside a
    selected section, nodes with children become sub-headings and nodes without
    children become plain text lines. Nodes outside any selected section add
    nothing, but their children are still visited.

    Args:
        node: Tree node exposing ``text`` (str) and ``children`` (sequence of nodes).
        result_markdown: Markdown collected so far.
        level: Depth of the caller; this node is rendered one level deeper, and
            that depth is the number of ``#`` characters in its heading.
        get_text: True when an ancestor of ``node`` was a selected title.
        titles: Optional collection of selected title strings. When omitted, the
            notebook globals are used (``result[f"{topic}_titles"]``), as before.

    Returns:
        ``result_markdown`` extended with the text collected from this subtree.
    """
    if titles is None:
        # Backward-compatible default: titles returned by the title-search request.
        titles = result[f"{topic}_titles"]
    if not isinstance(titles, (set, frozenset)):
        # Converted once at the top-level call; recursive calls reuse the set.
        titles = frozenset(titles)

    level += 1
    heading = "\n" + "#" * level + " " + node.text + "\n\n"
    is_selected_title = node.text in titles
    if is_selected_title:
        result_markdown += heading
        get_text = True

    if node.children:
        # A non-title node with children inside a selected section is a sub-heading.
        if get_text and not is_selected_title:
            result_markdown += heading
        for subnode in node.children:
            result_markdown = itterate_childrens(subnode, result_markdown, level, get_text, titles)
    elif get_text and not is_selected_title:
        # Leaf inside a selected section: plain text line.
        result_markdown += node.text + "\n"

    return result_markdown

# Visit each node yielded by the tree, threading the accumulated Markdown
# through every call; level and get_text start from their defaults (0, False).
for node in tree:
    result_markdown = itterate_childrens(node, result_markdown)

In [None]:
# Rough size check: character count divided by 4 — presumably a token estimate
# (about four characters per token; TODO confirm). The chapter-summary cell
# compares the same quantity against 100000.
len(result_markdown)/4

In [None]:
# Show the Markdown collected from the selected sections.
print(result_markdown)

## Chapter summaries

In [None]:
text_sums = ""

# Condense chapter by chapter only when the collected Markdown is very large
# (characters / 4 above 100000); otherwise it is passed on unchanged.
if len(result_markdown)/4 > 100000:
    # NOTE(review): the company type is hard-coded; confirm it matches the company
    # being analysed (the download cell currently requests ticker ILMN, with LNG
    # left as a commented-out alternative).
    company_type = "energy"
    chapter_summaries = []
    for position, piece in enumerate(result_markdown.split("\n### ")):
        if not piece.strip():
            # Nothing to summarize in this chunk; skip the API call.
            continue
        # split() consumed the "### " marker of every chunk except the first;
        # put it back so the model receives a proper Markdown heading.
        chapter = piece if position == 0 else "### " + piece
        prompt = f"""Summarize the input text between the three - signs. The text is an {company_type} company financial SEC 10-K report. The input text is Markdown formatted.
Give the result in JSON format.  Just return the JSON formatted result. Don't add any other comments.

The JSON should have one string field: 'summarized_text'. This should be a Markdown formated text of the input text.

Think step by step, which requires the following actions:
1) separate the {company_type} company-specific information and general business information
2) summarize the general information in a few sentences
3) summarize the company-specific information in a way which keeps the specific information

Input Markdown text:---
{chapter}
---
        """
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-4o",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": topic.replace(' ', '_'),
                    # Previously rendered as "result of risk risk analysis".
                    "description": f"chapter summary for {topic}",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "summarized_text": {
                                "type": "string",
                                "description": "summarization of the input text",
                            }
                        },
                        "required": ["summarized_text"],
                        "additionalProperties": False
                    }
                }
            }
        )
        # The strict schema makes the reply a bare JSON document, so it is parsed
        # directly; removing "```" would also delete code fences that belong to
        # the Markdown inside 'summarized_text'.
        text_sum = json.loads(chat_completion.choices[0].message.content)
        chapter_summaries.append(text_sum['summarized_text'])
    # Each summary is followed by a blank line, as before.
    text_sums = "".join(summary + "\n\n" for summary in chapter_summaries)
else:
    text_sums = result_markdown

In [None]:
# Show the text that is embedded in the risk-listing prompt below.
print(text_sums)

## List risks

In [None]:
# Prompt for the final request: summarize the collected text and sort the risks
# into an internal and a not-internal list. `topic` and `text_sums` are interpolated.
# NOTE(review): the wording contains typos ("two general", "Dublecheck", "can managed")
# and names the second list 'not-internal' while the response schema key is
# "not_internal"; left unchanged here because the text is sent to the model verbatim.
prompt = f"""Summarize and organize the financial Markdown formatted text between the three - sign.
Give the result in {topic} JSON format.  Just return the JSON formatted result. Don't add any other comments.

To generate the end JSON think step by step, which requires the following actions:
1) Create a summary of the most important risks. These summaries should be relatively long.
2) Delete the risk which are two general, and can happen with every company. 
3) Organize the risks in two lists: 'internal' and 'not-internal'. Decide whether a certain risk in the previous step is internally controlled or not. 
   It is NOT internal if the risk is NOT dependent on the company, but instead some outsider, for example, the government, or environment. 
4) Reorder the risk by importance. A risk is more important if the input text gives more details about it.
5) Create the output JSON, listing separately the internal and not-internal problems.
6) Dublecheck the initial risk list just has risks which can managed by the company. 

Markdown:---
{text_sums}
---
"""

In [None]:
# Ask the model for the organized risk lists. The reply is constrained by a
# strict JSON schema to an object with two string lists: "internal" and "not_internal".
chat_completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": topic.replace(' ', '_'),
            # Previously rendered as "result of risk risk analysis".
            "description": f"result of {topic}",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "internal": {
                        "type": "array",
                        "description": "list of the internal",
                        "items": {
                            "type": "string"
                        }
                    },
                    "not_internal": {
                        "type": "array",
                        "description": "list of the not internal",
                        "items": {
                            "type": "string"
                        }
                    },
                },
                "required": ["internal", "not_internal"],
                "additionalProperties": False
            }
        }
    }
)

In [None]:
# Decode the risk-listing reply: trim surrounding whitespace, drop every "```"
# marker, then parse what remains as JSON.
# NOTE(review): this rebinds `result`, which earlier held the title-search reply
# that itterate_childrens reads; re-running the collection cell after this one
# would no longer find the titles key.
result = json.loads(
    chat_completion.choices[0].message.content.strip().replace("```", '')
)

In [None]:
# One tab-indented bullet per internal risk.
for s in result["internal"]:
    print("\t*", s)

In [None]:
# One tab-indented bullet per not-internal risk.
for s in result["not_internal"]:
    print("\t*", s)