In [2]:
from sec_edgar_downloader import Downloader

In [76]:
# Build the EDGAR downloader and fetch one 10-K filing for AAPL.
# Presumably the arguments are requester name, contact e-mail and download
# folder -- TODO confirm against the sec_edgar_downloader documentation.
# NOTE(review): a personal e-mail address is hard-coded here; consider reading
# it from an environment variable before sharing this notebook.
# NOTE(review): this cell downloads AAPL, but the parsing cell below reads an
# NVDA filing path -- confirm which ticker is intended so a fresh
# Restart & Run All finds the file it expects.
# NOTE(review): this cell's execution count (76) is higher than the cells that
# follow it, so the notebook was last run out of order.
dl = Downloader("Analyst Copilot", "liamdrew92@icloud.com", "../filings")
dl.get("10-K", "AAPL", limit=1,download_details=True)

1

In [12]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re

def parse_xbrl_tables(file_path):
    """
    Load an XBRL/HTML document and convert every <table> in it to a DataFrame.

    Each table is stored under a key of the form "table_<index>". When a title
    can be found, a cleaned version of it is appended to the key. Title lookup
    order: the table's <caption>, then a heading (h1-h5) inside the table,
    then the closest heading that precedes the table in the document.

    A table that raises while being processed is reported on stdout and
    skipped; the remaining tables are still returned.

    Args:
        file_path (str): Path to the XBRL/HTML file

    Returns:
        dict: Mapping of table identifier -> pandas.DataFrame

    Raises:
        FileNotFoundError: If file_path does not exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as source:
        markup = source.read()

    document = BeautifulSoup(markup, 'html.parser')
    tables = document.find_all('table')
    print(f"Found {len(tables)} tables in the document")

    heading_names = ['h1', 'h2', 'h3', 'h4', 'h5']

    def stripped_text(element):
        """Return the element's stripped text, or '' when the element is missing."""
        return element.text.strip() if element else ''

    results = {}
    for i, table in enumerate(tables):
        try:
            # Title candidates that belong to the table itself
            caption_text = stripped_text(table.find('caption'))
            inner_heading_text = stripped_text(
                table.find(lambda tag: tag.name in heading_names and tag.text.strip())
            )

            title = caption_text or inner_heading_text
            if not title:
                # Fall back to the nearest heading before the table
                title = stripped_text(table.find_previous(heading_names))

            table_id = f"table_{i}_{clean_title(title)}" if title else f"table_{i}"

            df = parse_table_to_dataframe(table)
            results[table_id] = df
            print(f"Successfully parsed table: {table_id}")
            print(f"Shape: {df.shape}")

        except Exception as e:
            # Best effort: report the failing table and carry on with the rest
            print(f"Error parsing table {i}: {str(e)}")

    return results

def clean_title(title):
    """
    Normalise a title so it can be used as a filename fragment or dict key.

    Every run of whitespace becomes one underscore, any remaining character
    that is not a word character is dropped, and the result is cut to at most
    50 characters and lower-cased.
    """
    return re.sub(r'[^\w]', '', re.sub(r'\s+', '_', title))[:50].lower()

def parse_table_to_dataframe(table):
    """
    Parse an HTML table into a pandas DataFrame of stripped cell strings.

    The header row is the first <tr> inside <thead> when the table has one,
    otherwise the first <tr> of the table. Data rows are taken from <tbody>
    when present, otherwise from every <tr> in the table. The header row is
    never returned as a data row, including when it sits inside <tbody>
    (tables that have a <tbody> but no <thead>).

    Rows shorter than the header are padded with empty strings and rows longer
    than the header are truncated to the header width. When no header cells
    are found, columns are named "Column_0", "Column_1", ... and rows are
    padded to the widest row. colspan/rowspan attributes are not expanded.

    Args:
        table (bs4.element.Tag): BeautifulSoup table element

    Returns:
        pandas.DataFrame: DataFrame containing the table data; an empty
        DataFrame when the table has no data rows.
    """
    # Locate the header row and read its cell texts
    thead = table.find('thead')
    header_row = thead.find('tr') if thead is not None else table.find('tr')

    headers = []
    if header_row is not None:
        headers = [cell.text.strip() for cell in header_row.find_all(['th', 'td'])]

    # Collect data rows, preferring <tbody> when the table has one
    tbody = table.find('tbody')
    row_container = tbody if tbody is not None else table

    rows = []
    for tr in row_container.find_all('tr'):
        # Compare by identity: the header row must not be repeated as data,
        # whether it is the first row of the table or the first row of <tbody>.
        if tr is header_row:
            continue
        cells = [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        if cells:  # Skip rows without any cells
            rows.append(cells)

    if not rows:
        # Nothing but (at most) a header row
        return pd.DataFrame()

    if headers:
        columns = headers
        width = len(headers)
    else:
        # No header cells: fall back to generic column names
        width = max(len(row) for row in rows)
        columns = [f'Column_{i}' for i in range(width)]

    # Pad short rows with '' and cut long rows so every row matches the width
    normalized = [(row + [''] * (width - len(row)))[:width] for row in rows]

    return pd.DataFrame(normalized, columns=columns)

def save_dataframes_to_csv(dataframes, output_dir):
    """
    Write each DataFrame to "<output_dir>/<table_id>.csv" without the index.

    The output directory is created when it does not exist yet, and the path
    of every file written is printed.

    Args:
        dataframes (dict): Mapping of table identifier -> DataFrame
        output_dir (str): Directory that receives the CSV files
    """
    os.makedirs(output_dir, exist_ok=True)

    for table_id in dataframes:
        csv_path = os.path.join(output_dir, f"{table_id}.csv")
        dataframes[table_id].to_csv(csv_path, index=False)
        print(f"Saved: {csv_path}")



In [13]:
# Primary document of an NVDA 10-K filing under the local filings folder.
# NOTE(review): the accession folder is hard-coded, so this file must already
# exist on disk before this cell is run.
file_path = "../filings/sec-edgar-filings/NVDA/10-K/0001045810-25-000023/primary-document.html"
# Parse every <table> in the filing into a dict of DataFrames keyed by table id.
tables = parse_xbrl_tables(file_path)

Found 68 tables in the document
Successfully parsed table: table_0
Shape: (1, 6)
Successfully parsed table: table_1
Shape: (1, 6)
Successfully parsed table: table_2
Shape: (6, 6)
Successfully parsed table: table_3
Shape: (2, 9)
Successfully parsed table: table_4
Shape: (1, 30)
Successfully parsed table: table_5
Shape: (29, 9)
Successfully parsed table: table_6
Shape: (6, 15)
Successfully parsed table: table_7
Shape: (5, 27)
Successfully parsed table: table_8
Shape: (4, 36)
Successfully parsed table: table_9
Shape: (10, 18)
Successfully parsed table: table_10
Shape: (17, 12)
Successfully parsed table: table_11
Shape: (7, 24)
Successfully parsed table: table_12
Shape: (8, 24)
Successfully parsed table: table_13
Shape: (7, 12)
Successfully parsed table: table_14
Shape: (10, 24)
Successfully parsed table: table_15
Shape: (8, 20)
Successfully parsed table: table_16
Shape: (6, 12)
Successfully parsed table: table_17
Shape: (7, 12)
Successfully parsed table: table_18
Shape: (9, 9)
Successfull

In [64]:
# Pick out the three primary financial statements (balance sheet, cash flows,
# income statement) by searching each table's lower-cased text for a phrase
# that is characteristic of that statement.
financials = {}

for table_id, df in tables.items():
    searchable = df.to_string().lower()

    is_balance_sheet = "total liabilities and shareholders' equity" in searchable
    is_cash_flows = "cash and cash equivalents at end of period" in searchable
    is_income_statement = (
        "total operating expenses" in searchable
        and "net income per share" in searchable
    )

    if not (is_balance_sheet or is_cash_flows or is_income_statement):
        continue

    financials[table_id] = df

    # Priority when several phrases match: balance sheet, cash flows, income
    if is_balance_sheet:
        print(f"FOUND BALANCE SHEET: {table_id}")
    elif is_cash_flows:
        print(f"FOUND CASH FLOWS: {table_id}")
    else:
        print(f"FOUND INCOME STATEMENT: {table_id}")

# Print each matched table id followed by its full frame
for statement_id, statement_df in financials.items():
    print(statement_id)
    print(statement_df)

FOUND INCOME STATEMENT: table_21
FOUND BALANCE SHEET: table_23
FOUND CASH FLOWS: table_25


In [71]:
# Income statement: the classification cell reported it as table_21.
# NOTE(review): the id is hard-coded and depends on the table order in this filing.
financials['table_21']

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,,Year Ended,,,,,,,,,,,,,,,,
1,,"Jan 26, 2025",,"Jan 28, 2024",,"Jan 29, 2023",,,,,,,,,,,,
2,Revenue,$,130497.0,,,$,60922.0,,,$,26974.0,,,,,,,
3,Cost of revenue,32639,,,16621,,,11618,,,,,,,,,,
4,Gross profit,97858,,,44301,,,15356,,,,,,,,,,
5,Operating expenses,,,,,,,,,,,,,,,,,
6,Research and development,12914,,,8675,,,7339,,,,,,,,,,
7,"Sales, general and administrative",3491,,,2654,,,2440,,,,,,,,,,
8,Acquisition termination cost,—,,,—,,,1353,,,,,,,,,,
9,Total operating expenses,16405,,,11329,,,11132,,,,,,,,,,


In [72]:
# Balance sheet: the classification cell reported it as table_23.
# NOTE(review): the id is hard-coded and depends on the table order in this filing.
financials['table_23']

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,,"Jan 26, 2025",,"Jan 28, 2024",,,,,,,,
1,Assets,,,,,,,,,,,
2,Current assets:,,,,,,,,,,,
3,Cash and cash equivalents,$,8589.0,,,$,7280.0,,,,,
4,Marketable securities,34621,,,18704,,,,,,,
5,"Accounts receivable, net",23065,,,9999,,,,,,,
6,Inventories,10080,,,5282,,,,,,,
7,Prepaid expenses and other current assets,3771,,,3080,,,,,,,
8,Total current assets,80126,,,44345,,,,,,,
9,"Property and equipment, net",6283,,,3914,,,,,,,


In [73]:
# Cash flow statement: the classification cell reported it as table_25.
# NOTE(review): the id is hard-coded and depends on the table order in this filing.
financials['table_25']

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,,Year Ended,,,,,,,,,,,,,,,,
1,,"Jan 26, 2025",,"Jan 28, 2024",,"Jan 29, 2023",,,,,,,,,,,,
2,Cash flows from operating activities:,,,,,,,,,,,,,,,,,
3,Net income,$,72880.0,,,$,29760.0,,,$,4368.0,,,,,,,
4,Adjustments to reconcile net income to net cas...,,,,,,,,,,,,,,,,,
5,Stock-based compensation expense,4737,,,3549,,,2709,,,,,,,,,,
6,Depreciation and amortization,1864,,,1508,,,1544,,,,,,,,,,
7,Deferred income taxes,"(4,477)",,,"(2,489)",,,"(2,164)",,,,,,,,,,
8,(Gains) losses on non-marketable equity securi...,"(1,030)",,,(238),,,45,,,,,,,,,,
9,Acquisition termination cost,—,,,—,,,1353,,,,,,,,,,
