In [1]:
import os
import re
import requests
from datetime import datetime

from bs4 import BeautifulSoup, NavigableString

import vertexai
from vertexai.generative_models import (
    GenerativeModel,
    GenerationConfig,
    Tool,
    FunctionDeclaration,
    Part)

In [2]:
project_id = !gcloud config get project
project_id = project_id[0]

vertexai.init(project=project_id, location="us-central1")

In [3]:
# IMPORTANT! You must set these variables with REAL VALUES for the correct headers to be in place to access the API
PARTNER_COMPANY = "EDGAR"
PARTNER_WEBSITE = "https://www.sec.gov/edgar/search/"

headers = {"User-Agent": f"{PARTNER_COMPANY} +{PARTNER_WEBSITE}"}

In [4]:
BASE_DIR = "filings" # Directory for storing raw filings

def download_single_filing(url, cik, form, date, file_extension):
    """
    This function downloads and saves a SEC filing from a specified URL.

    Args:
        cik (str): Central Index Key (CIK) for the company.
        form (str): The type of SEC filing (e.g., 10-K, 10-Q).
        date (str): The filing date in YYYY-MM-DD format.
        file_extension (str): File extension for saved file (usually 'txt' or 'zip').
    """

    # Define request headers to simulate a browser visit
    response = requests.get(url, headers=headers)

    if response.status_code == 200:

        # Make the directory accord
        dir_name = f"{BASE_DIR}/{cik}"
        os.makedirs(dir_name, exist_ok=True)
        file_path = f"{dir_name}/{form}_{date}.{file_extension}"

        with open(file_path, 'wb') as file:
            file.write(response.content)

        print(f"Downloaded {form} for CIK {cik} on {date} to {file_path}")

        return file_path

    else:
        print(f"Failed to download {form} for CIK {cik} on {date}. Status code: {response.status_code}")

        return None

def download_range_of_filings(cik, starting_year_and_quarter,
                            ending_year_and_quarter, include_10q = False):
    """
    Download filings from EDGAR for a given CIK and clean them up.

    Args:
        cik (str): Central Index Key (CIK)
        starting_year_and_quarter (str): Specified in the format "2023 Q1"
        ending_year_and_quarter (str): Specified in the format "2024 Q4"
        include_10q (bool): Whether to include 10-Qs in addition to 10-Ks
    """

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    headers = {'User-Agent': 'Google Partner Learning Services Demo +https://partners.cloud.google.com/learn'}
    response = requests.get(url, headers=headers)

    forms_to_download = {"10-K", "10-Q"} if include_10q else {"10-K"}

    start_year, start_quarter = starting_year_and_quarter.split()
    end_year, end_quarter = ending_year_and_quarter.split()

    if response.status_code == 200:
        data = response.json()

        filing_paths = []
        for filing_date, form, accession_number in zip(data['filings']['recent']['filingDate'], data['filings']['recent']['form'], data['filings']['recent']['accessionNumber']):
            if (form in forms_to_download) and (int(start_year) <= datetime.strptime(filing_date, '%Y-%m-%d').year <= int(end_year)):
                file_path = download_single_filing(
                    f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number.replace('-', '')}/{accession_number}.txt",
                    cik, form, filing_date, 'htm'
                )
                filing_paths.append (file_path)
        return filing_paths
    else:
        print("Error from call.")
        print(response)

In [5]:
alphabet_cik = "0001652044"

download_range_of_filings(cik=alphabet_cik,
                starting_year_and_quarter="2024 Q1",
                ending_year_and_quarter="2024 Q4")

Downloaded 10-K for CIK 0001652044 on 2024-01-31 to filings/0001652044/10-K_2024-01-31.htm


['filings/0001652044/10-K_2024-01-31.htm']

In [6]:
model = GenerativeModel("gemini-2.0-flash",
                        generation_config=GenerationConfig(temperature=0),
                       )



In [7]:
downloaded_path = "filings/0001652044/10-K_2024-01-31.htm"

with open(downloaded_path, 'r') as f:
    filing_text = f.read()

response = model.count_tokens(filing_text)
print(response)    

total_tokens: 5798629
total_billable_characters: 13029439
prompt_tokens_details {
  modality: TEXT
  token_count: 5798629
}



In [8]:
items = ['Business',
        'Risk Factors',
        'Unresolved Staff Comments',
        'Properties', 'Legal Proceedings',
        'Mine Safety Disclosures',
        'Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities',
        'Management’s Discussion and Analysis of Financial Condition and Results of Operations',
        'Quantitative and Qualitative Disclosures About Market Risk',
        'Financial Statements and Supplementary Data',
        'Changes in and Disagreements with Accountants on Accounting and Financial Disclosure',
        'Controls and Procedures',
        'Other Information',
        'Disclosure Regarding Foreign Jurisdictions that Prevent Inspections',
        'Directors, Executive Officers, and Corporate Governance',
        'Executive Compensation',
        'Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters',
        'Certain Relationships and Related Transactions, and Director Independence',
        'Principal Accountant Fees and Services',
        'Exhibit and Financial Statement Schedules',
        'Form 10-K Summary']

def find_div_id(soup, item_name: str):
    found_tag = soup.find('a', string=item_name)
    if found_tag:
        return found_tag["href"].strip('#')
    else:
        print(f"Couldn't find a matching tag for: {item_name}")
        return None

def get_text_between(soup, cur_name, end_name):
    cur = soup.find('div', id=find_div_id(soup, cur_name)).next_sibling
    end = soup.find('div', id=find_div_id(soup, end_name))
    while cur and cur != end:
        if isinstance(cur, NavigableString):
            text = cur.strip()
            if len(text):
                yield text
        cur = cur.next_element

def get_items_from_filings(item_names, filing_paths):

    print("Items of interest: " + ", ".join(item_names) + "\n")
    item_strings = {item: f"<{item}>\n" for item in item_names}

    for path in filing_paths:
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'html.parser')

        for item in item_names:
            item_index = items.index(item)
            item_index = item_index if item_index < len(items) - 1 else 0
            item_output = ' '.join(text for text in get_text_between(soup, item, items[item_index + 1]))
            item_strings[item] += f"From {os.path.basename(path)}" + "\n" + item_output + "\n"

    return "\n\n".join(item_strings.values())    

In [11]:
item_names = ["Management’s Discussion and Analysis of Financial Condition and Results of Operations"]

report = get_items_from_filings(item_names, [downloaded_path])

# Print the first 2,000 characters as an example.
print(report[0:2000] + "...")

Items of interest: Management’s Discussion and Analysis of Financial Condition and Results of Operations

<Management’s Discussion and Analysis of Financial Condition and Results of Operations>
From 10-K_2024-01-31.htm
Table of Contents Alphabet Inc. ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS Please read the following discussion and analysis of our financial condition and results of operations together with “Note about Forward-Looking Statements,” Part I, Item 1 "Business," Part I, Item 1A "Risk Factors," and our consolidated financial statements and related notes included under Item 8 of this Annual Report on Form 10-K. The following section generally discusses 2023 results compared to 2022 results. Discussion of 2022 results compared to 2021 results to the extent not included in this report can be found in Item 7 of our 2022 Annual Report on Form 10-K. Understanding Alphabet’s Financial Results Alphabet is a collection of businesses 

In [12]:
response = model.count_tokens(report)
print(response)

total_tokens: 12688
total_billable_characters: 50620
prompt_tokens_details {
  modality: TEXT
  token_count: 12688
}



In [13]:
model.generate_content("What is Summit Therapeutics' CIK with the SEC?").text

"Summit Therapeutics Inc.'s CIK (Central Index Key) with the SEC is **0001349315**.\n"

In [14]:
from google import genai
from google.genai.types import Tool, GenerateContentConfig, GoogleSearch, HttpOptions
from IPython.display import display, HTML
import os

project_id = os.environ['GOOGLE_CLOUD_PROJECT'] = 'qwiklabs-gcp-00-43c6fa732d80'
location = os.environ['GOOGLE_CLOUD_LOCATION'] = 'us-central1'
os.environ['GOOGLE_GENAI_USE_VERTEXAI'] = 'True'

# Initialize the client, explicitly passing project and location for Vertex AI
client = genai.Client(project=project_id, location=location, http_options=HttpOptions(api_version="v1"))
model_id = "gemini-2.0-flash"

# Configure Google Search as a tool for grounding
search_tool = Tool(
    google_search=GoogleSearch()
)

# Helper function to lookup a company's SEC CIK
def lookup_cik(company_name: str) -> str:
    prompt = f"""
Find the Securities and Exchange Commission's Central Index Key (CIK)
for {company_name}.
Return only the 10-digit integer CIK including leading zeroes.
"""
    # Generate content with grounding via Google Search
    response = client.models.generate_content(
        model=model_id,
        contents=prompt,
        config=GenerateContentConfig(
            tools=[search_tool],
            response_modalities=["TEXT"],
        )
    )

    # Concatenate all text parts of the response
    cik = "".join(part.text for part in response.candidates[0].content.parts).strip()
    print(f"Lookup for CIK of {company_name} resulted in: {cik}\n")

    # Display the grounding metadata (search suggestions and sources)
    if hasattr(response.candidates[0], 'grounding_metadata'):
        display(
            HTML(
                response.candidates[0]
                .grounding_metadata.search_entry_point.rendered_content
            )
        )

    return cik

In [15]:
lookup_cik("Summit Therapeutics")

Lookup for CIK of Summit Therapeutics resulted in: The Securities and Exchange Commission (SEC) Central Index Key (CIK) for Summit Therapeutics is 0001599298.



'The Securities and Exchange Commission (SEC) Central Index Key (CIK) for Summit Therapeutics is 0001599298.'

In [16]:
lookup_cik_fd = FunctionDeclaration(
    name="lookup_cik",
    description="Look up a company's CIK used for its SEC filings.",
    parameters={
        "type": "object",
        "properties": {
            "company_name": {
                "type": "string",
                "description": "The name of the company to look up."
            },
        },
        "required": [
            "company_name"
        ]
    },
)

In [17]:
retrieve_filings_fd = FunctionDeclaration(
    name="retrieve_filings",
    description="Retrieve filings from the SEC EDGAR API for a company within a date range.",
    parameters={
        "type": "object",
        "properties": {
            "cik": {
                "type": "string",
                "description": "The CIK of a company whose documents will be retrieved"
            },
            "starting_year_and_quarter": {
                "type": "string",
                "description": "The first report quarter year and quarter in the format: 2024 Q1"
            },
            "ending_year_and_quarter": {
                "type": "string",
                "description": "The year and quarter in the format: 2024 Q1"
            },
            "include_quarterly_reports": {
                "type": "boolean",
                "description": "Whether to include 10-Q quarterly filings in addition to annual 10-K filings"
            },
            "items_of_interest": {
                "description": "An array of one or more section (called items) of interest from 10-K or 10-Q filings",
                "type": "array",
                "items": {
                "type": "string",
                "enum": [
                    "Business",
                    "Risk Factors",
                    "Unresolved Staff Comments",
                    "Properties",
                    "Legal Proceedings",
                    "Mine Safety Disclosures",
                    "Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
                    "Management’s Discussion and Analysis of Financial Condition and Results of Operations",
                    "Quantitative and Qualitative Disclosures About Market Risk",
                    "Financial Statements and Supplementary Data",
                    "Changes in and Disagreements with Accountants on Accounting and Financial Disclosure",
                    "Controls and Procedures",
                    "Other Information",
                    "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
                    "Directors, Executive Officers and Corporate Governance",
                    "Executive Compensation",
                    "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
                    "Certain Relationships and Related Transactions, and Director Independence",
                    "Principal Accountant Fees and Services",
                    "Exhibit and Financial Statement Schedules",
                    "Form 10-K Summary"
                ]
                }
        }
        },
        "required": [
            "cik",
            "starting_year_and_quarter",
            "ending_year_and_quarter",
            "items_of_interest"
        ]
    },
)    

In [18]:
sec_tool = Tool(function_declarations=[retrieve_filings_fd,
                                    lookup_cik_fd])    

In [19]:
response1 = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Find the SEC CIK for Acme Corp. Return only the 10-digit CIK.",
    config=GenerateContentConfig(tools=[search_tool], response_modalities=["TEXT"])
)
cik = "".join(p.text for p in response1.candidates[0].content.parts).strip()

from vertexai.generative_models import GenerativeModel, GenerationConfig, Tool, FunctionDeclaration

# Define your function declarations (lookup_cik_fd, retrieve_filings_fd)
sec_tool = Tool(function_declarations=[lookup_cik_fd, retrieve_filings_fd])

system_instruction = """
    - You are a research assistant to a financial analyst.
    - Answer the user's question with an analysis, basing your response on information
    used in filings from the SEC EDGAR database.
    - Quote the SEC filing documents to support your analysis.
    - If you are not certain about a CIK, use your tool to look it up.
    - The current date is {current_date}.
    """.format(
        current_date=datetime.today().strftime("%Y-%m-%d")
    )

model = GenerativeModel(
    model_name="gemini-2.0-flash",
    generation_config=GenerationConfig(temperature=0),
    system_instruction=system_instruction,
    tools=[sec_tool]
)

response2 = model.generate_content(
    contents=f"Retrieve the 10-K for CIK {cik} covering 2023 Q4, and extract 'Risk Factors'."
)

In [20]:
def handle_response(response):

    parts_for_inner_response = []
    for part in response.candidates[0].content.parts:
        # If the content part has an attribute called 'text'
        if hasattr(part, "text"):
            print("\n" + part.text)
        # If the content has an attribute called 'function_call'
        if part.function_call:
            function_call = part.function_call
            try:
                if function_call.name == "lookup_cik":
                    cik = lookup_cik(function_call.args["company_name"])
                    parts_for_inner_response.append(
                        Part.from_function_response(
                            name="lookup_cik",
                            response={
                                "content": cik,
                            },
                        )
                    )
                if function_call.name == "retrieve_filings":
                    context = ""
                    filing_paths = download_range_of_filings(function_call.args["cik"],
                                    function_call.args["starting_year_and_quarter"],
                                    function_call.args["ending_year_and_quarter"],
                                    function_call.args.get("include_10q", False)
                    )
                    if filing_paths:
                        # TODO: Load cleaned filing docs
                        report = get_items_from_filings(function_call.args["items_of_interest"], filing_paths)
                        parts_for_inner_response.append(
                            Part.from_function_response(
                                name="retrieve_filings",
                                response={
                                    "content": report,
                                },
                            )
                        )
                    else:
                        print("No valid filings found or an error was encountered in retrieving them.")
            except AttributeError as e:
                print("Exception:")
                print(response)
                print(part)
                print(e)
    if parts_for_inner_response:
        inner_response = chat.send_message(parts_for_inner_response)
        handle_response(inner_response)   

In [21]:
chat = model.start_chat()

In [22]:
response = chat.send_message("How do Alphabet's risks in 2024 compare to Amazon's?")
handle_response(response)

Lookup for CIK of Alphabet resulted in: The Securities and Exchange Commission's (SEC) Central Index Key (CIK) for Alphabet Inc. is 0001652044. The CIK is a 10-digit number that the SEC assigns to entities and individuals who file disclosures with them. The CIK is used to identify filings in online databases like EDGAR.



Lookup for CIK of Amazon resulted in: The Securities and Exchange Commission (SEC) Central Index Key (CIK) for Amazon is **0001018724**.




Okay, I have the CIKs for both companies. Now I will retrieve their risk factors from their 10-K filings for 2024.

Downloaded 10-K for CIK 0001652044 on 2024-01-31 to filings/0001652044/10-K_2024-01-31.htm
Items of interest: Risk Factors

Downloaded 10-K for CIK 0001018724 on 2024-02-02 to filings/0001018724/10-K_2024-02-02.htm
Items of interest: Risk Factors


Okay, I have retrieved the risk factors for both Alphabet and Amazon from their 2024 10-K filings. Here's a comparison of their risks:

**Similarities:**

*   **Competition:** Both companies acknowledge facing intense competition in their respective markets. Alphabet states, "We face intense competition. If we do not continue to innovate and provide products and services that are useful to users, customers, and other partners, we may not remain competitive..." Amazon says, "Our businesses are rapidly evolving and intensely competitive, and we have many competitors across geographies...".
*   **International Operations:** Both 

In [23]:
chat = model.start_chat()
response = chat.send_message("How has Home Depot changed the way it describes its business over the past 3 years?")
handle_response(response)

Lookup for CIK of Home Depot resulted in: The Securities and Exchange Commission (SEC) Central Index Key (CIK) for Home Depot is 0000354950.



Downloaded 10-K for CIK 0000354950 on 2025-03-21 to filings/0000354950/10-K_2025-03-21.htm
Downloaded 10-K for CIK 0000354950 on 2024-03-13 to filings/0000354950/10-K_2024-03-13.htm
Downloaded 10-K for CIK 0000354950 on 2023-03-15 to filings/0000354950/10-K_2023-03-15.htm
Downloaded 10-K for CIK 0000354950 on 2022-03-23 to filings/0000354950/10-K_2022-03-23.htm
Items of interest: Business


Over the past three years, Home Depot has made several notable changes in how it describes its business in its 10-K filings. Here's a summary of the key changes:

*   **Acquisition and Focus on the Pro Customer:** In 2024, Home Depot acquired SRS, a leading residential specialty trade distribution company. This acquisition is highlighted in the business description, emphasizing the company's focus on serving professional customers ("Pros") in roofing, landscaping, and pool contracting. "In fiscal 2024, we acquired SRS, which sells products to specialty trade roofers, landscapers, and pool contractor