In [3]:
print("hello")

hello


In [2]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import pandas as pd

def parse_income_statement(url):
    """
    Parse income statement data from an SEC filing.
    
    Args:
        url (str): URL to the SEC filing
        
    Returns:
        numpy.ndarray: Income statement data in a numpy array
    """
    # Fetch the document
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch document: {response.status_code}")
    
    # Parse HTML
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the income statement table
    # Usually tables with "Consolidated Statements of Income" or similar titles
    tables = soup.find_all('table')
    income_table = None
    
    for table in tables:
        heading_text = ""
        # Check for table caption or previous heading
        caption = table.find('caption')
        if caption:
            heading_text = caption.get_text().lower()
        
        # Look for keywords in heading or table text
        table_text = table.get_text().lower()
        keywords = ["consolidated statements of income", "consolidated income statements", 
                   "statements of operations", "income statement"]
        
        if any(keyword in heading_text for keyword in keywords) or any(keyword in table_text for keyword in keywords):
            income_table = table
            break
    
    if not income_table:
        raise Exception("Income statement table not found")
    
    # Extract rows from the table
    rows = income_table.find_all('tr')
    
    # Initialize lists to store data
    line_items = []
    values = []
    
    # Process rows
    for row in rows:
        cells = row.find_all(['th', 'td'])
        if len(cells) < 2:
            continue
            
        # First cell usually contains the line item description
        item = cells[0].get_text().strip()
        
        # Skip header rows or empty rows
        if not item or item.lower() in ['consolidated statements of income', 'in millions, except per share data']:
            continue
            
        # Extract values from other cells, convert to numbers
        row_values = []
        for cell in cells[1:]:
            text = cell.get_text().strip()
            
            # Handle dollar signs, parentheses (negative values), and commas
            if text:
                # Remove $ and commas
                text = text.replace('$', '').replace(',', '')
                
                # Handle parentheses for negative values
                if '(' in text and ')' in text:
                    text = text.replace('(', '-').replace(')', '')
                    
                try:
                    value = float(text)
                    row_values.append(value)
                except ValueError:
                    # If conversion fails, it might be a header or non-numeric cell
                    row_values.append(np.nan)
            else:
                row_values.append(np.nan)
                
        if row_values and not all(np.isnan(val) for val in row_values):
            line_items.append(item)
            values.append(row_values)
    
    # Convert to numpy array
    if values:
        # Find the maximum length of any row
        max_length = max(len(row) for row in values)
        
        # Pad shorter rows with NaN
        padded_values = [row + [np.nan] * (max_length - len(row)) for row in values]
        
        # Convert to numpy array
        np_data = np.array(padded_values)
        
        # Create a structured array with line items as labels
        df = pd.DataFrame(np_data, index=line_items)
        
        # Print the extracted data
        print("Income Statement Data:")
        print(df)
        
        return np_data, line_items
    else:
        raise Exception("Failed to extract values from income statement")

# Example usage
url = "https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126.htm"
income_data, line_items = parse_income_statement(url)

# To save the data to a file
np.save('income_statement_data.npy', income_data)
with open('line_items.txt', 'w') as f:
    for item in line_items:
        f.write(f"{item}\n")

print("Data saved to income_statement_data.npy and line_items.txt")

Exception: Failed to fetch document: 403

In [7]:
url = "https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126.htm#if3830601512b46079053ec0daaf407ac_103"
response = requests.get(url)

In [8]:
response

<Response [403]>

In [1]:
import requests
import os

# Define the URL
url = "https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126.htm"

# Set headers to mimic a browser (SEC requires a user-agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}

# Send GET request to the URL
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    # Save the content to a file
    with open("nvda_filing.html", "wb") as f:
        f.write(response.content)
    print(f"Successfully downloaded the filing to 'nvda_filing.html'")
    
    # If you want to extract just the specific section from the fragment identifier
    # You'll need to parse the HTML and extract that section
    # The fragment ID in your URL is: if3830601512b46079053ec0daaf407ac_103
else:
    print(f"Failed to download the filing. Status code: {response.status_code}")

Failed to download the filing. Status code: 403


In [10]:
pip install -U sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.7.0-py3-none-any.whl.metadata (25 kB)
Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.7.0-py3-none-any.whl (28 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.7.0 sec-edgar-downloader-5.0.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
from sec_edgar_downloader import Downloader

# Initialize a downloader instance. Download filings to the current
# working directory. Must declare company name and email address
# to form a user-agent string that complies with the SEC Edgar's
# programmatic downloading fair access policy.
# More info: https://www.sec.gov/os/webmaster-faq#code-support
# Company name and email are used to form a user-agent of the form:
# User-Agent: <Company Name> <Email Address>
dl = Downloader("Analyst Copilot", "liamdrew92@icloud.com")


# Get the five most recent 8-K filings for Apple
# dl.get("8-K", "AAPL", limit=5)

# Get the latest 10-K filing for Microsoft
dl.get("10-K", "NVDA", limit=1)

1

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import re

def parse_nvidia_income_statement(html_file_path):
    # Read the HTML file
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Look for the income statement table
    # Search for the table with the title containing "Consolidated Statements of Income"
    tables = soup.find_all('table')
    income_table = None
    
    for table in tables:
        # Check if this table contains the income statement
        if table.find_previous(text=re.compile("Consolidated Statements of Income", re.IGNORECASE)):
            income_table = table
            print("Found a table")
            break
    
    if not income_table:
        raise ValueError("Could not find the income statement table in the HTML file")
    
    # Extract the column headers (years)
    # the headers are getting messed up here
    headers = ["Item"]  # First column for row names

    
    # Find all header cells
    header_cells = income_table.find_all('th')
    for cell in header_cells:
        text = cell.get_text(strip=True)
        if "Jan" in text or "Year Ended" in text:
            headers.append(text)
    
    # If no headers found in th tags, look in the first row
    if len(headers) == 1:  # Only the "Item" header we added
        first_row = income_table.find('tr')
        if first_row:
            for cell in first_row.find_all('td'):
                text = cell.get_text(strip=True)
                if "Jan" in text or "Year Ended" in text:
                    headers.append(text)
    
    # Create a list to store all rows
    data = []
    
    # Get all rows from the table
    rows = income_table.find_all('tr')
    for row in rows:
        cells = row.find_all(['td', 'th'])
        # if len(cells) > 1:  # Skip empty rows
        row_data = []
        for cell in cells:
            text = cell.get_text(strip=False)
            # text = cell.get_text(strip=True)
            # # Replace empty cells with NaN
            # if text == "":
            #     text = float('nan')
            # # Try to convert to numeric if possible
            # try:
            #     # Remove commas, dollar signs, and parentheses for negative numbers
            #     text = text.replace('$', '').replace(',', '')
            #     if '(' in text and ')' in text:  # Handle negative numbers in parentheses
            #         text = text.replace('(', '-').replace(')', '')
            #     text = float(text)
            # except (ValueError, AttributeError):
            #     pass  # Keep as string if not numeric


            row_data.append(text)
        data.append(row_data)
    
    # Create DataFrame
    df = pd.DataFrame(data)

    print("Returning df")
    return df
    
    # # Clean up the DataFrame
    # # If we have the right number of headers, use them
    # print("Len headers is " + str(len(headers)))
    # print(headers[0])
    # print("DF shape is " + str(df.shape[1]))
    # if len(headers) == df.shape[1]:
    #     # print("correct number of headers")
    #     df.columns = headers
    # else:
    #     # If headers don't match columns, use default naming
    #     df.columns = ['Column_' + str(i) for i in range(df.shape[1])]
    #     # And put the first row as headers if it looks like headers
    #     if any(isinstance(x, str) and "Revenue" in x for x in df.iloc[0]):
    #         new_columns = df.iloc[0].tolist()
    #         df = df[1:]
    #         df.columns = new_columns
    
    # # Set the first column as index if it contains text descriptions
    # if df.iloc[:, 0].apply(lambda x: isinstance(x, str)).all():
    #     df = df.set_index(df.columns[0])
    
    # # Clean up the index/row names
    # df.index = df.index.str.strip() if hasattr(df.index, 'str') else df.index
    
    # return df

# Usage
income_statement = parse_nvidia_income_statement("sec-edgar-filings/NVDA/10-K/0001045810-25-000023/full-submission.html")
print(income_statement)

# # Save to CSV
# income_statement.to_csv("nvidia_income_statement.csv")

Found a table
Returning df
                                   0             1     2             3   \
0                                                                         
1                                        Year Ended  None          None   
2                                      Jan 26, 2025        Jan 28, 2024   
3                             Revenue        100.0      %                 
4                     Cost of revenue         25.0                        
5                        Gross profit         75.0                        
6                  Operating expenses                                     
7            Research and development          9.9                        
8   Sales, general and administrative          2.7                        
9            Total operating expenses         12.6                        
10                   Operating income         62.4                        
11                    Interest income          1.4                       

  if table.find_previous(text=re.compile("Consolidated Statements of Income", re.IGNORECASE)):


In [6]:
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd

def parse_10k(html_content):
    """
    Parse a 10-K HTML document and extract sections organized by Parts and Items.
    
    Args:
        html_content (str): The HTML content of the 10-K document
        
    Returns:
        dict: A nested dictionary with Parts and Items from the 10-K
    """
    # Create BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Dictionary to store the parsed results
    parsed_10k = {}
    
    # Regular expressions for matching Parts and Items
    part_pattern = re.compile(r'\s*PART\s+([IVX]+)\s*', re.IGNORECASE)
    item_pattern = re.compile(r'\s*Item\s+(\d+[A-Z]?)\.?\s*(.*?)(?:\s*\d+\s*)?$', re.IGNORECASE)
    
    # Find all headings that might contain Parts or Items
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'div', 'p', 'span'], 
                             class_=lambda c: c and ('title' in c.lower() or 'heading' in c.lower() or 'header' in c.lower()))
    
    # Add all strong and b tags which might also contain headers
    headings.extend(soup.find_all(['strong', 'b']))
    
    current_part = None
    current_item = None
    
    # Process all potential headings
    for heading in headings:
        text = heading.get_text().strip()
        
        # Check if it's a Part
        part_match = part_pattern.match(text)
        if part_match:
            current_part = f"Part {part_match.group(1)}"
            parsed_10k[current_part] = {}
            continue
            
        # Check if it's an Item
        item_match = item_pattern.match(text)
        if item_match and current_part:
            item_num = item_match.group(1)
            item_title = item_match.group(2).strip()
            current_item = f"Item {item_num}"
            
            # Create the item entry in the current part
            parsed_10k[current_part][current_item] = {
                'title': item_title,
                'content': '',  # We'll extract content in a second pass
                'page': None
            }
            
            # Try to find the page number that might follow the item title
            page_match = re.search(r'\d+$', text)
            if page_match:
                parsed_10k[current_part][current_item]['page'] = int(page_match.group())
    
    # Extract content for each item
    extract_item_content(soup, parsed_10k)
    
    return parsed_10k

def extract_item_content(soup, parsed_10k):
    """
    Extract content for each item in the 10-K document.
    
    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the HTML document
        parsed_10k (dict): The dictionary containing the structure of the 10-K
    """
    # Get all parts and items in order
    all_parts = list(parsed_10k.keys())
    
    # For each part, process its items
    for i, part in enumerate(all_parts):
        all_items = list(parsed_10k[part].keys())
        
        # For each item, find its content until the next item or part
        for j, item in enumerate(all_items):
            # Find the element that contains this item
            item_text = f"{item}. {parsed_10k[part][item]['title']}"
            item_elements = soup.find_all(text=lambda text: text and item_text in text)
            
            if not item_elements:
                # Try with a more flexible search
                item_elements = soup.find_all(text=lambda text: text and item.lower() in text.lower() and parsed_10k[part][item]['title'].lower() in text.lower())
            
            if item_elements:
                current_element = item_elements[0].parent
                content = []
                
                # Determine where to stop extracting content
                next_item = all_items[j+1] if j+1 < len(all_items) else None
                next_part = all_parts[i+1] if i+1 < len(all_parts) else None
                
                # Extract content until next item or part is found
                while current_element and current_element.next_sibling:
                    current_element = current_element.next_sibling
                    
                    # Check if we've reached the next item or part
                    elem_text = current_element.get_text().strip() if hasattr(current_element, 'get_text') else ""
                    
                    if next_item and next_item in elem_text:
                        break
                    if next_part and next_part in elem_text:
                        break
                    
                    if hasattr(current_element, 'get_text'):
                        content.append(current_element.get_text().strip())
                
                parsed_10k[part][item]['content'] = "\n".join(content)

def load_10k_from_url(url):
    """
    Load a 10-K document from a URL.
    
    Args:
        url (str): The URL of the 10-K document
        
    Returns:
        str: The HTML content of the 10-K document
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to load document from {url}. Status code: {response.status_code}")

def load_10k_from_file(file_path):
    """
    Load a 10-K document from a local file.
    
    Args:
        file_path (str): The path to the local 10-K HTML file
        
    Returns:
        str: The HTML content of the 10-K document
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_to_csv(parsed_10k, output_file):
    """
    Save the parsed 10-K to a CSV file.
    
    Args:
        parsed_10k (dict): The parsed 10-K document
        output_file (str): The path to save the CSV file
    """
    rows = []
    for part, items in parsed_10k.items():
        for item, details in items.items():
            rows.append({
                'Part': part,
                'Item': item,
                'Title': details['title'],
                'Page': details['page'],
                'Content': details['content']
            })
    
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    print(f"Saved parsed 10-K to {output_file}")

# Example usage
if __name__ == "__main__":
    # Option 1: Load from URL
    # url = "https://example.com/company-10k.html"
    # html_content = load_10k_from_url(url)
    
    # Option 2: Load from local file
    file_path = "company_10k.html"
    html_content = load_10k_from_file(file_path)
    
    # Parse the 10-K
    parsed_10k = parse_10k(html_content)
    
    # Print the structure
    for part, items in parsed_10k.items():
        print(f"\n{part}")
        for item, details in items.items():
            print(f"  {item}: {details['title']} (Page {details['page']})")
            # Uncomment to print content snippets
            # content_preview = details['content'][:100] + '...' if details['content'] else 'No content extracted'
            # print(f"    {content_preview}")
    
    
    # Save to CSV
    save_to_csv(parsed_10k, "parsed_10k.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'company_10k.html'

In [7]:
import re
from bs4 import BeautifulSoup
import pandas as pd

def parse_10k(html_content):
    """
    Parse a 10-K HTML document and extract sections organized by Parts and Items.
    
    Args:
        html_content (str): The HTML content of the 10-K document
        
    Returns:
        dict: A nested dictionary with Parts and Items from the 10-K
    """
    # Create BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Dictionary to store the parsed results
    parsed_10k = {}
    
    # Regular expressions for matching Parts and Items
    part_pattern = re.compile(r'\s*PART\s+([IVX]+)\s*', re.IGNORECASE)
    item_pattern = re.compile(r'\s*Item\s+(\d+[A-Z]?)\.?\s*(.*?)(?:\s*\d+\s*)?$', re.IGNORECASE)
    
    # Find all headings that might contain Parts or Items
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'div', 'p', 'span'], 
                             class_=lambda c: c and ('title' in c.lower() or 'heading' in c.lower() or 'header' in c.lower()))
    
    # Add all strong and b tags which might also contain headers
    headings.extend(soup.find_all(['strong', 'b']))
    
    current_part = None
    current_item = None
    
    # Process all potential headings
    for heading in headings:
        text = heading.get_text().strip()
        
        # Check if it's a Part
        part_match = part_pattern.match(text)
        if part_match:
            current_part = f"Part {part_match.group(1)}"
            parsed_10k[current_part] = {}
            continue
            
        # Check if it's an Item
        item_match = item_pattern.match(text)
        if item_match and current_part:
            item_num = item_match.group(1)
            item_title = item_match.group(2).strip()
            current_item = f"Item {item_num}"
            
            # Create the item entry in the current part
            parsed_10k[current_part][current_item] = {
                'title': item_title,
                'content': '',  # We'll extract content in a second pass
                'page': None
            }
            
            # Try to find the page number that might follow the item title
            page_match = re.search(r'\d+$', text)
            if page_match:
                parsed_10k[current_part][current_item]['page'] = int(page_match.group())
    
    # Extract content for each item
    extract_item_content(soup, parsed_10k)
    
    return parsed_10k

def extract_item_content(soup, parsed_10k):
    """
    Extract content for each item in the 10-K document.
    
    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the HTML document
        parsed_10k (dict): The dictionary containing the structure of the 10-K
    """
    # Get all parts and items in order
    all_parts = list(parsed_10k.keys())
    
    # For each part, process its items
    for i, part in enumerate(all_parts):
        all_items = list(parsed_10k[part].keys())
        
        # For each item, find its content until the next item or part
        for j, item in enumerate(all_items):
            # Find the element that contains this item
            item_text = f"{item}. {parsed_10k[part][item]['title']}"
            item_elements = soup.find_all(text=lambda text: text and item_text in text)
            
            if not item_elements:
                # Try with a more flexible search
                item_elements = soup.find_all(text=lambda text: text and item.lower() in text.lower() and parsed_10k[part][item]['title'].lower() in text.lower())
            
            if item_elements:
                current_element = item_elements[0].parent
                content = []
                
                # Determine where to stop extracting content
                next_item = all_items[j+1] if j+1 < len(all_items) else None
                next_part = all_parts[i+1] if i+1 < len(all_parts) else None
                
                # Extract content until next item or part is found
                while current_element and current_element.next_sibling:
                    current_element = current_element.next_sibling
                    
                    # Check if we've reached the next item or part
                    elem_text = current_element.get_text().strip() if hasattr(current_element, 'get_text') else ""
                    
                    if next_item and next_item in elem_text:
                        break
                    if next_part and next_part in elem_text:
                        break
                    
                    if hasattr(current_element, 'get_text'):
                        content.append(current_element.get_text().strip())
                
                parsed_10k[part][item]['content'] = "\n".join(content)

def load_10k_from_file(file_path):
    """
    Load a 10-K document from a local file.
    
    Args:
        file_path (str): The path to the local 10-K HTML file
        
    Returns:
        str: The HTML content of the 10-K document
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_to_csv(parsed_10k, output_file):
    """
    Save the parsed 10-K to a CSV file.
    
    Args:
        parsed_10k (dict): The parsed 10-K document
        output_file (str): The path to save the CSV file
    """
    rows = []
    for part, items in parsed_10k.items():
        for item, details in items.items():
            rows.append({
                'Part': part,
                'Item': item,
                'Title': details['title'],
                'Page': details['page'],
                'Content': details['content']
            })
    
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    print(f"Saved parsed 10-K to {output_file}")

# Use your specific file path
if __name__ == "__main__":
    file_path = "sec-edgar-filings/NVDA/10-K/0001045810-25-000023/full-submission.html"
    
    try:
        html_content = load_10k_from_file(file_path)
        print(f"Successfully loaded HTML file from {file_path}")
        
        # Parse the 10-K
        parsed_10k = parse_10k(html_content)
        
        # Print the structure
        for part, items in parsed_10k.items():
            print(f"\n{part}")
            for item, details in items.items():
                print(f"  {item}: {details['title']} (Page {details['page']})")
        
        # Save to CSV
        output_file = "nvidia_10k_parsed.csv"
        save_to_csv(parsed_10k, output_file)
        
    except Exception as e:
        print(f"Error processing the file: {e}")

Successfully loaded HTML file from sec-edgar-filings/NVDA/10-K/0001045810-25-000023/full-submission.html
Saved parsed 10-K to nvidia_10k_parsed.csv


In [8]:
import re
from bs4 import BeautifulSoup
import pandas as pd

def parse_10k(html_content):
    """
    Parse a 10-K HTML document and extract sections organized by Parts and Items.
    
    Args:
        html_content (str): The HTML content of the 10-K document
        
    Returns:
        dict: A nested dictionary with Parts and Items from the 10-K
    """
    # Create BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Dictionary to store the parsed results
    parsed_10k = {}
    
    # Regular expressions for matching Parts and Items
    part_pattern = re.compile(r'\s*PART\s+([IVX]+)\s*', re.IGNORECASE)
    item_pattern = re.compile(r'\s*Item\s+(\d+[A-Z]?)\.?\s*(.*?)(?:\s*\d+\s*)?$', re.IGNORECASE)
    
    # Find all headings that might contain Parts or Items
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'div', 'p', 'span'], 
                             class_=lambda c: c and ('title' in c.lower() or 'heading' in c.lower() or 'header' in c.lower()))
    
    # Add all strong and b tags which might also contain headers
    headings.extend(soup.find_all(['strong', 'b']))
    
    current_part = None
    current_item = None
    
    # Process all potential headings
    for heading in headings:
        text = heading.get_text().strip()
        
        # Check if it's a Part
        part_match = part_pattern.match(text)
        if part_match:
            current_part = f"Part {part_match.group(1)}"
            parsed_10k[current_part] = {}
            continue
            
        # Check if it's an Item
        item_match = item_pattern.match(text)
        if item_match and current_part:
            item_num = item_match.group(1)
            item_title = item_match.group(2).strip()
            current_item = f"Item {item_num}"
            
            # Create the item entry in the current part
            parsed_10k[current_part][current_item] = {
                'title': item_title,
                'content': '',  # We'll extract content in a second pass
                'page': None
            }
            
            # Try to find the page number that might follow the item title
            page_match = re.search(r'\d+$', text)
            if page_match:
                parsed_10k[current_part][current_item]['page'] = int(page_match.group())
    
    # Extract content for each item
    extract_item_content(soup, parsed_10k)
    
    return parsed_10k

def extract_item_content(soup, parsed_10k):
    """
    Extract content for each item in the 10-K document.
    
    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the HTML document
        parsed_10k (dict): The dictionary containing the structure of the 10-K
    """
    # Get all parts and items in order
    all_parts = list(parsed_10k.keys())
    
    # For each part, process its items
    for i, part in enumerate(all_parts):
        all_items = list(parsed_10k[part].keys())
        
        # For each item, find its content until the next item or part
        for j, item in enumerate(all_items):
            # Find the element that contains this item
            item_text = f"{item}. {parsed_10k[part][item]['title']}"
            item_elements = soup.find_all(text=lambda text: text and item_text in text)
            
            if not item_elements:
                # Try with a more flexible search
                item_elements = soup.find_all(text=lambda text: text and item.lower() in text.lower() and parsed_10k[part][item]['title'].lower() in text.lower())
            
            if item_elements:
                current_element = item_elements[0].parent
                content = []
                
                # Determine where to stop extracting content
                next_item = all_items[j+1] if j+1 < len(all_items) else None
                next_part = all_parts[i+1] if i+1 < len(all_parts) else None
                
                # Extract content until next item or part is found
                while current_element and current_element.next_sibling:
                    current_element = current_element.next_sibling
                    
                    # Check if we've reached the next item or part
                    elem_text = current_element.get_text().strip() if hasattr(current_element, 'get_text') else ""
                    
                    if next_item and next_item in elem_text:
                        break
                    if next_part and next_part in elem_text:
                        break
                    
                    if hasattr(current_element, 'get_text'):
                        content.append(current_element.get_text().strip())
                
                parsed_10k[part][item]['content'] = "\n".join(content)

def load_10k_from_file(file_path):
    """
    Load a 10-K document from a local file.
    
    Args:
        file_path (str): The path to the local 10-K HTML file
        
    Returns:
        str: The HTML content of the 10-K document
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_to_csv(parsed_10k, output_file):
    """
    Save the parsed 10-K to a CSV file.
    
    Args:
        parsed_10k (dict): The parsed 10-K document
        output_file (str): The path to save the CSV file
    """
    rows = []
    for part, items in parsed_10k.items():
        for item, details in items.items():
            rows.append({
                'Part': part,
                'Item': item,
                'Title': details['title'],
                'Page': details['page'],
                'Content': details['content']
            })
    
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    print(f"Saved parsed 10-K to {output_file}")

# Use your specific file path
if __name__ == "__main__":
    file_path = "sec-edgar-filings/NVDA/10-K/0001045810-25-000023/full-submission.html"
    
    try:
        html_content = load_10k_from_file(file_path)
        print(f"Successfully loaded HTML file from {file_path}")
        
        # Parse the 10-K
        parsed_10k = parse_10k(html_content)
        
        # Print the structure
        for part, items in parsed_10k.items():
            print(f"\n{part}")
            for item, details in items.items():
                print(f"  {item}: {details['title']} (Page {details['page']})")
        
        # Save to CSV
        output_file = "nvidia_10k_parsed.csv"
        save_to_csv(parsed_10k, output_file)
        
    except Exception as e:
        print(f"Error processing the file: {e}")

Successfully loaded HTML file from sec-edgar-filings/NVDA/10-K/0001045810-25-000023/full-submission.html
Saved parsed 10-K to nvidia_10k_parsed.csv


In [9]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.
