In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re

def extract_financial_tables(file_path, output_dir="financial_tables"):
    """
    Extract and save only the key financial tables (balance sheet, income statement, cash flow)
    from an XBRL/HTML document.
    
    Args:
        file_path (str): Path to the XBRL/HTML file
        output_dir (str): Directory to save the CSV files
        
    Returns:
        dict: Dictionary of financial DataFrames with table identifiers as keys
    """
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')
    
    # Find all tables
    tables = soup.find_all('table')
    print(f"Found {len(tables)} tables in the document")
    
    # Dictionary to store financial DataFrames
    financials = {}
    
    # Process each table
    for i, table in enumerate(tables):
        try:
            # Generate a table identifier
            # Try to find a caption or a title for better identification
            caption = table.find('caption')
            title_element = table.find(lambda tag: tag.name in ['h1', 'h2', 'h3', 'h4', 'h5'] and tag.text.strip())
            
            if caption and caption.text.strip():
                table_id = f"table_{i}_{clean_title(caption.text.strip())}"
            elif title_element and title_element.text.strip():
                table_id = f"table_{i}_{clean_title(title_element.text.strip())}"
            else:
                # Try to find nearby headings
                prev_heading = table.find_previous(['h1', 'h2', 'h3', 'h4', 'h5'])
                if prev_heading and prev_heading.text.strip():
                    table_id = f"table_{i}_{clean_title(prev_heading.text.strip())}"
                else:
                    table_id = f"table_{i}"
            
            # Parse the table into a pandas DataFrame
            df = parse_table_to_dataframe(table)
            
            # Skip empty tables and very small tables (likely not financial statements)
            if df.empty or (df.shape[0] < 5 and df.shape[1] < 3):
                continue
                
            # Convert DataFrame to string for keyword search
            df_text = df.to_string().lower()
            
            # Check for specific financial tables using key phrases
            if "total liabilities and shareholders' equity" in df_text:
                financials["balance_sheet"] = df
                print(f"FOUND BALANCE SHEET: {table_id}")
            elif "cash and cash equivalents at end of period" in df_text:
                financials["cash_flow_statement"] = df
                print(f"FOUND CASH FLOWS: {table_id}")
            elif "total operating expenses" in df_text and "net income per share" in df_text:
                financials["income_statement"] = df
                print(f"FOUND INCOME STATEMENT: {table_id}")
                
        except Exception as e:
            print(f"Error parsing table {i}: {str(e)}")
    
    # Create output directory if specified and financials found
    if output_dir and financials:
        os.makedirs(output_dir, exist_ok=True)
        
        # Save each financial table as CSV
        for table_name, df in financials.items():
            csv_path = os.path.join(output_dir, f"{table_name}.csv")
            df.to_csv(csv_path, index=False)
            print(f"Saved: {csv_path}")
    
    # Print summary of found financial tables
    print(f"\nFound {len(financials)} financial tables:")
    for table_name in financials.keys():
        print(f"- {table_name}")
    
    return financials

def clean_title(title):
    """Clean a title string to make it suitable for a filename or dict key"""
    # Replace multiple spaces with a single underscore
    title = re.sub(r'\s+', '_', title)
    # Remove special characters
    title = re.sub(r'[^\w]', '', title)
    # Truncate long titles
    return title[:50].lower()

def parse_table_to_dataframe(table):
    """
    Parse an HTML table into a pandas DataFrame.
    
    Args:
        table (bs4.element.Tag): BeautifulSoup table element
        
    Returns:
        pandas.DataFrame: DataFrame containing the table data
    """
    # Extract headers
    headers = []
    header_row = table.find('tr')
    
    # If there's a thead, use it for headers
    thead = table.find('thead')
    if thead:
        header_row = thead.find('tr')
    
    if header_row:
        headers = [th.text.strip() for th in header_row.find_all(['th', 'td'])]
    
    # Extract rows
    rows = []
    tbody = table.find('tbody')
    if tbody:
        # If tbody exists, get rows from there
        table_rows = tbody.find_all('tr')
    else:
        # Otherwise get all rows and skip the header if it exists
        table_rows = table.find_all('tr')
        if headers and len(table_rows) > 0:
            table_rows = table_rows[1:]
    
    for row in table_rows:
        cells = [td.text.strip() for td in row.find_all(['td', 'th'])]
        if cells:  # Skip empty rows
            rows.append(cells)
    
    # Create DataFrame
    if headers and rows:
        # Make sure all rows have the same length as headers
        for i, row in enumerate(rows):
            if len(row) < len(headers):
                # Pad with empty strings
                rows[i] = row + [''] * (len(headers) - len(row))
            elif len(row) > len(headers):
                # Truncate
                rows[i] = row[:len(headers)]
                
        df = pd.DataFrame(rows, columns=headers)
    elif rows:
        # No headers, use generic column names
        max_cols = max(len(row) for row in rows)
        cols = [f'Column_{i}' for i in range(max_cols)]
        
        # Ensure all rows have the same length
        for i, row in enumerate(rows):
            if len(row) < max_cols:
                rows[i] = row + [''] * (max_cols - len(row))
                
        df = pd.DataFrame(rows, columns=cols)
    else:
        # Empty table
        df = pd.DataFrame()
    
    return df

# Example usage
if __name__ == "__main__":
    # Path to the XBRL file
    file_path = "../filings/sec-edgar-filings/NVDA/10-K/0001045810-25-000023/nvda_primary-document.html"
    
    # Extract and save financial tables
    financial_tables = extract_financial_tables(file_path)
    
    # Display the found tables
    for table_name, df in financial_tables.items():
        print(f"\n{table_name.upper()}:")
        print(df.head())

Found 68 tables in the document
FOUND INCOME STATEMENT: table_21
FOUND BALANCE SHEET: table_23
FOUND CASH FLOWS: table_25
Saved: financial_tables/income_statement.csv
Saved: financial_tables/balance_sheet.csv
Saved: financial_tables/cash_flow_statement.csv

Found 3 financial tables:
- income_statement
- balance_sheet
- cash_flow_statement

INCOME_STATEMENT:
                                                                               \
0                     Year Ended                                                
1                   Jan 26, 2025           Jan 28, 2024          Jan 29, 2023   
2          Revenue             $  130,497                                   $   
3  Cost of revenue        32,639                         16,621                 
4     Gross profit        97,858                         44,301                 

                                              
0                                             
1                                             
2  60,922   

In [18]:
# import os
# from bs4 import BeautifulSoup
# import re

# def extract_risk_factors(file_path, output_file=None):
#     """
#     Extract the Risk Factors section from an XBRL/HTML 10-K document.
    
#     Args:
#         file_path (str): Path to the XBRL/HTML file
#         output_file (str, optional): Path to save the extracted text
        
#     Returns:
#         str: The extracted Risk Factors text
#     """
#     # Check if file exists
#     if not os.path.exists(file_path):
#         raise FileNotFoundError(f"File not found: {file_path}")
    
#     # Read the file
#     with open(file_path, 'r', encoding='utf-8') as file:
#         content = file.read()
    
#     # Parse the HTML content
#     soup = BeautifulSoup(content, 'html.parser')
    
#     # Search for the Risk Factors section
#     # We'll try different approaches to find it
    
#     # First approach: Look for heading elements containing the text
#     risk_start = None
#     risk_end = None
    
#     # Try to find the section headings (they might be in different heading levels)
#     for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'p']):
#         text = heading.get_text().strip()
        
#         # Look for the start of the Risk Factors section
#         if re.search(r'item\s+1a\.?\s+risk\s+factors', text, re.IGNORECASE):
#             risk_start = heading
#             print(f"Found Risk Factors section start: {text}")
            
#         # Look for the start of the next section
#         if risk_start and re.search(r'item\s+1b\.?\s+unresolved\s+staff\s+comments', text, re.IGNORECASE):
#             risk_end = heading
#             print(f"Found next section start: {text}")
#             break
    
#     # If we found both the start and end points
#     if risk_start and risk_end:
#         # Method 1: Extract all elements between start and end
#         risk_text = ""
#         current = risk_start.next_element
        
#         while current and current != risk_end:
#             if hasattr(current, 'get_text'):
#                 text = current.get_text().strip()
#                 if text:
#                     risk_text += text + "\n\n"
#             # Move to the next element
#             try:
#                 current = current.next_element
#             except AttributeError:
#                 break
                
#     else:
#         # Alternative method: Try to find the sections by text search
#         print("Trying alternative method: text search")
        
#         # Find the Risk Factors section by text search
#         risk_pattern = re.compile(r'item\s+1a\.?\s+risk\s+factors', re.IGNORECASE)
#         next_section_pattern = re.compile(r'item\s+1b\.?\s+unresolved\s+staff\s+comments', re.IGNORECASE)
        
#         # Get the full text
#         full_text = soup.get_text()
        
#         # Find the indices of the sections
#         matches = list(risk_pattern.finditer(full_text))
#         if matches:
#             start_idx = matches[0].start()
            
#             # Find the next section
#             next_matches = list(next_section_pattern.finditer(full_text))
#             if next_matches:
#                 end_idx = next_matches[0].start()
                
#                 # Extract the text between the two sections
#                 risk_text = full_text[start_idx:end_idx].strip()
#                 print(f"Found Risk Factors section via text search: {len(risk_text)} characters")
#             else:
#                 risk_text = "Next section marker not found."
#         else:
#             risk_text = "Risk Factors section not found."
    
#     # Clean up the text
#     # Remove excessive whitespace and normalize line breaks
#     risk_text = re.sub(r'\n\s*\n', '\n\n', risk_text)
#     risk_text = re.sub(r' +', ' ', risk_text)
    
#     # Save to file if requested
#     if output_file and risk_text:
#         with open(output_file, 'w', encoding='utf-8') as out_file:
#             out_file.write(risk_text)
#         print(f"Saved Risk Factors to: {output_file}")
    
#     return risk_text

In [16]:
# # Example usage
# if __name__ == "__main__":
#     # Path to the XBRL file
#     file_path = "../filings/sec-edgar-filings/NVDA/10-K/0001045810-25-000023/nvda_primary-document.html"
    
#     # Output file path
#     output_file = "nvidia_risk_factors.txt"
    
#     # Extract risk factors
#     risk_factors = extract_risk_factors(file_path, output_file)
    
#     # Print the first 500 characters as a preview
#     print("\nRISK FACTORS PREVIEW:")
#     # print(risk_factors[:500] + "...")
#     print(risk_factors)

In [15]:
# import os
# import re
# from bs4 import BeautifulSoup

# def extract_risk_factors(file_path, output_file=None):
#     """
#     Extract the Risk Factors section from an XBRL/HTML 10-K document.
    
#     Args:
#         file_path (str): Path to the XBRL/HTML file
#         output_file (str, optional): Path to save the extracted text
        
#     Returns:
#         str: The extracted Risk Factors text
#     """
#     # Check if file exists
#     if not os.path.exists(file_path):
#         raise FileNotFoundError(f"File not found: {file_path}")
    
#     # Read the file
#     with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
#         content = file.read()
    
#     # Parse the HTML content
#     soup = BeautifulSoup(content, 'html.parser')
    
#     # Remove script and style elements that might interfere with text extraction
#     for script in soup(['script', 'style']):
#         script.decompose()
    
#     # First, try to find the section by looking for specific headers or section markers
#     risk_section_pattern = re.compile(r'item\s+1a\.?\s+risk\s+factors', re.IGNORECASE)
#     next_section_pattern = re.compile(r'item\s+1b\.?\s+unresolved\s+staff\s+comments', re.IGNORECASE)
    
#     # Get all text elements
#     text_elements = []
#     for tag in soup.find_all(['div', 'p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
#         if tag.get_text().strip():
#             text_elements.append({
#                 'element': tag,
#                 'text': tag.get_text().strip()
#             })
    
#     # Find the Risk Factors section and the next section
#     risk_start_idx = None
#     risk_end_idx = None
    
#     for idx, elem in enumerate(text_elements):
#         if risk_section_pattern.search(elem['text']) and risk_start_idx is None:
#             risk_start_idx = idx
#             print(f"Found Risk Factors section: {elem['text']}")
        
#         if risk_start_idx is not None and next_section_pattern.search(elem['text']):
#             risk_end_idx = idx
#             print(f"Found next section: {elem['text']}")
#             break
    
#     # Extract the risk factors content
#     risk_factors_text = ""
    
#     if risk_start_idx is not None and risk_end_idx is not None:
#         # Extract text between the start and end sections
#         for idx in range(risk_start_idx, risk_end_idx):
#             text = text_elements[idx]['text']
#             if text and not re.match(r'^Table of Contents$', text):
#                 risk_factors_text += text + "\n\n"
#     else:
#         # Alternative method: Try full text search
#         print("Using alternative method: full text search")
#         full_text = soup.get_text()
        
#         matches = list(risk_section_pattern.finditer(full_text))
#         next_matches = list(next_section_pattern.finditer(full_text))
        
#         if matches and next_matches:
#             start_idx = matches[0].start()
#             end_idx = next_matches[0].start()
#             risk_factors_text = full_text[start_idx:end_idx].strip()
    
#     # Clean up the text
#     # Remove page numbers
#     risk_factors_text = re.sub(r'\n\d+\n', '\n', risk_factors_text)
    
#     # Remove "Table of Contents" references
#     risk_factors_text = re.sub(r'Table of Contents', '', risk_factors_text)
    
#     # Remove excessive whitespace and normalize line breaks
#     risk_factors_text = re.sub(r'\n\s*\n', '\n\n', risk_factors_text)
#     risk_factors_text = re.sub(r' +', ' ', risk_factors_text)
    
#     # Extract risk factors summary and categories
#     # Structure the output with headers and bullet points
#     structured_output = ""
    
#     # Add the title
#     if "Item 1A. Risk Factors" in risk_factors_text:
#         structured_output += "Item 1A. Risk Factors\n\n"
    
#     # Process and add the content with better formatting
#     content_lines = risk_factors_text.split('\n')
#     current_section = ""
    
#     for line in content_lines:
#         line = line.strip()
        
#         # Skip empty lines and page numbers
#         if not line or re.match(r'^\d+$', line):
#             continue
            
#         # Check if this is a new section heading
#         if "Risk Factors Summary" in line:
#             structured_output += "Risk Factors Summary\n\n"
#             current_section = "summary"
#         elif re.search(r'Risks Related to', line):
#             structured_output += f"\n{line}\n\n"
#             current_section = "category"
#         elif line.startswith("•") or line.startswith("-"):
#             structured_output += f"{line}\n"
#         else:
#             structured_output += f"{line}\n"
    
#     # Save to file if requested
#     if output_file and structured_output:
#         with open(output_file, 'w', encoding='utf-8') as out_file:
#             out_file.write(structured_output)
#         print(f"Saved Risk Factors to: {output_file}")
    
#     return structured_output

# # Function to clean HTML and extract plain text
# def extract_plain_text_from_html(html_content):
#     """
#     Extracts plain text from HTML content, preserving structure.
    
#     Args:
#         html_content (str): HTML content
        
#     Returns:
#         str: Plain text
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')
    
#     # Remove script and style elements
#     for script in soup(['script', 'style']):
#         script.decompose()
    
#     # Process the HTML structure
#     lines = []
    
#     # Process headings, paragraphs, list items
#     for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'div']):
#         text = tag.get_text(strip=True)
#         if text:
#             if tag.name.startswith('h'):
#                 lines.append(f"\n{text}\n")
#             elif tag.name == 'li':
#                 lines.append(f"• {text}")
#             else:
#                 lines.append(text)
    
#     return '\n'.join(lines)

# # Example usage
# if __name__ == "__main__":
#     # Path to the XBRL file
#     file_path = "../filings/sec-edgar-filings/NVDA/10-K/0001045810-25-000023/nvda_primary-document.html"
    
#     # Output file path
#     output_file = "nvidia_risk_factors.txt"
    
#     # Extract risk factors
#     risk_factors = extract_risk_factors(file_path, output_file)
    
#     # Print the first 1000 characters as a preview
#     print("\nRISK FACTORS PREVIEW:")
#     print(risk_factors[:1000] + "...")

In [2]:
import os
from bs4 import BeautifulSoup
import re

def extract_risk_factors(file_path, output_file=None):
    """
    Extract the Risk Factors section from an XBRL/HTML 10-K document.
    
    Args:
        file_path (str): Path to the XBRL/HTML file
        output_file (str, optional): Path to save the extracted text
        
    Returns:
        str: The extracted Risk Factors text
    """
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    
    # Read the file
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()
    
    # Parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')
    
    # Define patterns for finding the Risk Factors section
    risk_start_pattern = re.compile(r'Item\s+1A\.?\s+Risk\s+Factors', re.IGNORECASE)
    next_section_pattern = re.compile(r'Item\s+1B\.?\s+Unresolved\s+Staff\s+Comments', re.IGNORECASE)
    
    # Get the full text
    full_text = soup.get_text()
    
    # Find the start and end positions of the Risk Factors section
    risk_start_matches = list(risk_start_pattern.finditer(full_text))
    next_section_matches = list(next_section_pattern.finditer(full_text))
    
    if not risk_start_matches or not next_section_matches:
        print("Could not find the Risk Factors section or next section.")
        return ""
    
    # Get the text between the start and end positions
    risk_start_pos = risk_start_matches[0].start()
    next_section_pos = next_section_matches[0].start()
    
    risk_factors_text = full_text[risk_start_pos:next_section_pos].strip()
    print(f"Extracted {len(risk_factors_text)} characters of Risk Factors text")
    
    # Clean up the text
    # Remove page numbers
    risk_factors_text = re.sub(r'\n\d+\n', '\n', risk_factors_text)
    
    # Remove "Table of Contents" references
    risk_factors_text = re.sub(r'Table of Contents', '', risk_factors_text)
    
    # Remove excessive whitespace and normalize line breaks
    risk_factors_text = re.sub(r'\n\s*\n', '\n\n', risk_factors_text)
    risk_factors_text = re.sub(r' +', ' ', risk_factors_text)
    
    # Deduplicate the content
    content_lines = risk_factors_text.split('\n')
    unique_lines = []
    seen_lines = set()
    
    for line in content_lines:
        line = line.strip()
        # Skip empty lines and already seen lines
        if not line or line in seen_lines:
            continue
        seen_lines.add(line)
        unique_lines.append(line)
    
    deduped_text = '\n'.join(unique_lines)
    
    # Save to file if requested
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.write(deduped_text)
        print(f"Saved Risk Factors to: {output_file}")
    
    return deduped_text

# Example usage
if __name__ == "__main__":
    # Path to the XBRL file
    file_path = "../filings/sec-edgar-filings/NVDA/10-K/0001045810-25-000023/nvda_primary-document.html"
    
    # Output file path
    output_file = "nvidia_risk_factors.txt"
    
    # Extract risk factors
    risk_factors = extract_risk_factors(file_path, output_file)
    
    # Print the first 500 characters as a preview
    if risk_factors:
        print("\nRISK FACTORS PREVIEW:")
        print(risk_factors[:500] + "...")

Extracted 125813 characters of Risk Factors text
Saved Risk Factors to: nvidia_risk_factors.txt

RISK FACTORS PREVIEW:
Item 1A. Risk Factors – Risks Related to Regulatory, Legal, Our Stock and Other Matters” for a discussion of this potential impact.10Compliance with laws, rules, and regulations has not otherwise had a material effect upon our capital expenditures, results of operations, or competitive position and we do not currently anticipate material capital expenditures for environmental control facilities. Compliance with existing or future governmental regulations, including, but not limited to, those per...
