# Companies competitive relationship

In [1]:
import sys
import pandas as pd
from collections import defaultdict
sys.path.append("../")
from wallstreet_quant.edgar_extractor import fetch_10K_and_10Q_filings, extract_items_from_filing

In [2]:
symbols = pd.read_csv("russel1000.csv")
symbols = symbols['Ticker'].to_list()
filings = defaultdict(list)
symbols = ['TSLA', 'NVDA', 'AAPL', 'AMZN', 'GOOGL', 'MSFT', 'META', 'NFLX', 'AMD', 'INTC']
for s in symbols:
    try:
        filings[s] = fetch_10K_and_10Q_filings(s, "2023-01-01", "2025-6-6",form=["10-K"])
    except Exception as e:
        print(e)

In [3]:
import re
def get_table_of_contents(filing_obj):
    """
    Extracts the table of contents and returns it as a list of dictionaries.
    Uses the original logic to find items in the TOC.
    
    Returns:
        list: List of dictionaries with 'item' and 'title' keys
    """
    try:
        full_text = filing_obj.text()
        full_text = re.sub(r'[ \t]+', ' ', full_text)
        
        # Define expected keywords for each item type (original logic)
        item_keywords = {
            '1': ['BUSINESS', 'OVERVIEW', 'DESCRIPTION', 'OPERATIONS'],
            '1A': ['RISK FACTORS', 'RISKS', 'RISK'],
            '1B': ['UNRESOLVED', 'STAFF', 'COMMENTS'],
            '1C': ['CYBERSECURITY'],
            '2': ['PROPERTIES', 'PROPERTY'],
            '3': ['LEGAL PROCEEDINGS', 'LITIGATION', 'LEGAL'],
            '4': ['MINE SAFETY', 'MINING'],
            '5': ['MARKET', 'REGISTRANT', 'SECURITIES'],
            '6': ['SELECTED FINANCIAL', 'FINANCIAL DATA'],
            '7': ['MANAGEMENT\'S DISCUSSION', 'MD&A', 'FINANCIAL CONDITION', 'RESULTS OF OPERATIONS'],
            '7A': ['QUANTITATIVE', 'QUALITATIVE', 'MARKET RISK'],
            '8': ['FINANCIAL STATEMENTS', 'CONSOLIDATED'],
            '9': ['CHANGES', 'DISAGREEMENTS', 'ACCOUNTANTS'],
            '9A': ['CONTROLS', 'PROCEDURES', 'INTERNAL CONTROL'],
            '9B': ['OTHER INFORMATION'],
            '10': ['DIRECTORS', 'EXECUTIVE OFFICERS', 'GOVERNANCE'],
            '11': ['EXECUTIVE COMPENSATION', 'COMPENSATION'],
            '12': ['SECURITY OWNERSHIP', 'BENEFICIAL OWNERSHIP'],
            '13': ['CERTAIN RELATIONSHIPS', 'RELATED TRANSACTIONS'],
            '14': ['PRINCIPAL ACCOUNTING', 'FEES', 'SERVICES'],
            '15': ['EXHIBITS', 'FINANCIAL STATEMENT']
        }
        
        toc_list = []
        all_possible_items = ['1', '1A', '1B', '1C', '2', '3', '4', '5', '6', '7', '7A', '8', '9', '9A', '9B', '10', '11', '12', '13', '14', '15', '16']
        
        for item_num in all_possible_items:
            # Find all mentions of this item (original pattern)
            item_pattern = re.compile(rf'ITEM[\s\u00A0\u2000-\u200B\u2028\u2029]+{re.escape(item_num)}(?:\.|:|\s|\u00A0|$)', re.IGNORECASE)
            
            for match in item_pattern.finditer(full_text):
                start_pos = match.start()
                
                # Check context around this mention for relevant keywords (original logic)
                context_start = max(0, start_pos - 200)
                context_end = min(len(full_text), start_pos + 500)
                context = full_text[context_start:context_end].upper()
                
                # Check if any of the expected keywords appear in the context
                keywords = item_keywords.get(item_num, [])
                keyword_found = any(keyword in context for keyword in keywords)
                
                if keyword_found:
                    # Find where this item would end (next ITEM or end of line for TOC)
                    next_item_pattern = re.compile(rf'ITEM[\s\u00A0\u2000-\u200B\u2028\u2029]+\d+[A-Z]?', re.IGNORECASE)
                    next_match = next_item_pattern.search(full_text, start_pos + len(match.group(0)))
                    
                    if next_match:
                        content_end = next_match.start()
                    else:
                        # Look for end of line or section
                        line_end = full_text.find('\n', start_pos + len(match.group(0)))
                        content_end = line_end if line_end != -1 else len(full_text)
                    
                    # Extract the potential TOC entry
                    toc_entry = full_text[start_pos:content_end].strip()
                    
                    # If this looks like a TOC entry (short, contains item keywords), save it
                    if len(toc_entry) < 1000:  # TOC entries are typically short
                        # Extract the full title (everything after "ITEM X" part)
                        title_match = re.search(rf'ITEM[\s\u00A0\u2000-\u200B\u2028\u2029]+{re.escape(item_num)}[.\s]*(.+)', toc_entry, re.IGNORECASE)
                        if title_match:
                            full_title = title_match.group(1).strip()
                            # Clean up the title (remove page numbers, dots, etc.)
                            full_title = re.sub(r'\.{2,}.*$', '', full_title)  # Remove dotted leaders and page numbers
                            full_title = re.sub(r'\s+\d+\s*$', '', full_title)  # Remove trailing page numbers
                            full_title = full_title.strip()
                            
                            if full_title:
                                toc_list.append({
                                    'item': item_num,
                                    'title': full_title,
                                    'toc_entry': toc_entry
                                })
                                break  # Found the TOC entry for this item, stop looking
        
        return toc_list
        
    except Exception as e:
        print(f"Failed to extract TOC: {e}")
        return []

In [67]:


def extract_items_from_filing(filing_obj, items_to_extract):
    """
    Extracts textual sections (e.g., Item 1A, Item 7) from a filing object.
    First finds the table of contents, then uses full titles to locate sections.
    
    Parameters:
        filing_obj: A filing object returned by the 'edgar' package.
        items_to_extract: List of item names (e.g., ["1", "1A", "7", "3"]).
        
    Returns:
        dict: Dictionary of item number → extracted text
    """
    try:
        # Get the full filing text
        full_text = filing_obj.text()
        
        # Light normalization - preserve structure
        full_text = re.sub(r'[ \t]+', ' ', full_text)  # Normalize spaces/tabs only
        
        # Step 1: Get table of contents as a list
        toc_list = get_table_of_contents(filing_obj)
        
        # Step 2: Use TOC to find actual sections
        all_item_positions = find_sections_using_toc(full_text, toc_list)
        
        # Sort all found items by position
        all_item_positions.sort(key=lambda x: x['position'])
        
        # Extract content between item positions
        all_extracted = {}
        
        for i, item_info in enumerate(all_item_positions):
            item_num = item_info['item']
            start_pos = item_info['position']
            
            # Find where this item's content ends
            if i + 1 < len(all_item_positions):
                # Next item starts here
                end_pos = all_item_positions[i + 1]['position']
            else:
                # Last item - look for common ending markers
                end_markers = ['SIGNATURES', 'EXHIBIT INDEX', 'EXHIBITS']
                end_pos = len(full_text)
                
                for marker in end_markers:
                    marker_pattern = re.compile(rf'(?:^|\n)\s*{marker}', re.IGNORECASE | re.MULTILINE)
                    marker_match = marker_pattern.search(full_text, start_pos)
                    if marker_match:
                        end_pos = min(end_pos, marker_match.start())
            
            # Extract the content
            content = full_text[start_pos:end_pos].strip()
            
            if content:
                all_extracted[item_num] = content
        
        # Now filter to return only the requested items
        extracted = {}
        for item_num in items_to_extract:
            if item_num in all_extracted:
                extracted[item_num] = all_extracted[item_num]
        
        return extracted
        
    except Exception as e:
        print(f"Failed to extract items: {e}")
        return {}


# Note: get_table_of_contents method should be inserted here
# This method is provided separately and should not be modified

def find_sections_using_toc(full_text, toc_list):
    """
    Find actual sections using the titles from the table of contents.
    Uses the second match (first is TOC, second is actual section).
    """
    all_item_positions = []
   
    for toc_item in toc_list:
        item_num = toc_item['item']
        title = toc_item['title']

        # Clean the title to get the core title without page references
        # Remove common suffixes like "Pages X-Y", "Page X", "None"
        clean_title = re.sub(r'\s+Pages?\s*\d+.*$', '', title, flags=re.IGNORECASE)
        clean_title = re.sub(r'\s+None\s*$', '', clean_title, flags=re.IGNORECASE)
        if ':' in clean_title:
            clean_title = clean_title.split(':')[0].strip()    
        if not clean_title:
            continue

        # Create search patterns to find the actual section
        """
        escaped_title = re.escape(clean_title)
        # Allow flexible spacing  # After creating escaped_title, make apostrophes flexible
        # Start with your existing space normalizer
        flexible_title = escaped_title.replace(r'\ ', r'[\s\u00A0\u2000-\u200B\u2028\u2029]+')

        flexible_title = flexible_title.replace("’", r"['\\\'\u2019\u0027]")    # smart apostrophe (Unicode 8217)
        flexible_title = flexible_title.replace("'", r"['\\\'\u2019\u0027]")    # regular apostrophes
        flexible_title = flexible_title.replace(r"\'", r"['\\\'\u2019\u0027]")  # escaped apostrophe
        
        flexible_title = flexible_title.replace(r'\"', r'["\"""]')              # quotes
        flexible_title = flexible_title.replace(r'\-', r'[-–—]')                # dashes
        flexible_title = flexible_title.replace(r'\&', r'(?:&|and)')            # ampersand (remove extra backslash)
        flexible_title = flexible_title.replace(r'\.', r'\.?')                  # optional periods
        flexible_title = flexible_title.replace(r'\,', r'\,?')                  # optional commas
        flexible_title = flexible_title.replace(r'\:', r'\:?')                  # optional colons"""

        # --- build flexible_title --------------------------------------------------
        # 1) normalize smart apostrophes  ’/‘ → '
        #clean_title = clean_title.replace('\u2019', "'").replace('\u2018', "'")
        clean_title = clean_title.replace('\u2019', "'").replace('\u2018', "'")
        # 2) escape regex metachars
        escaped_title = re.escape(clean_title)

        # 3) relax spacing, punctuation, quotes, etc.
        escaped_title = escaped_title.replace(r'\ ', r'[\s\u00A0\u2000-\u200B\u2028\u2029]+')
        escaped_title = escaped_title.replace(r"'", r"[\'’]")    # straight or curly apostrophe, or none
        escaped_title = escaped_title.replace(r'\"', r'["“”]?')   # straight or curly quote, or none
        escaped_title = escaped_title.replace(r'\-', r'[-–—]')
        escaped_title = escaped_title.replace(r'\&', r'(?:&|and)')
        escaped_title = escaped_title.replace(r'\.', r'\.?')
        escaped_title = escaped_title.replace(r'\,', r'\,?')
        escaped_title = escaped_title.replace(r'\:', r'\:?')

        # 4) assign
        flexible_title = escaped_title


        print(f"Original title: '{title}'")
        print(f"Clean title: '{clean_title}'") 
        print(f"Flexible title: '{flexible_title}'")

        # Multiple patterns to try
        patterns = [
            rf'ITEM\s+{re.escape(item_num)}\b[\s\S]*?{flexible_title}',
        ]

        found = False
        for pattern in patterns:
            try:
                item_pattern = re.compile(pattern, re.IGNORECASE | re.MULTILINE)

                # Find ALL matches in the document
                matches = list(item_pattern.finditer(full_text))

                # Take the second match if it exists (first is TOC, second is actual section)
                if len(matches) >= 2:
                    actual_section_match = matches[1]
                    position = actual_section_match.start()

                    all_item_positions.append({
                        'item': item_num,
                        'position': position,
                        'match_text': actual_section_match.group(0).strip(),
                        'title': clean_title
                    })
                    found = True
                    break

            except re.error:
                continue
            
        # If we didn't find a second match with any pattern, this item might not have a section
        if not found:
            print(f"Warning: Could not find actual section for Item {item_num}")

    return all_item_positions


In [68]:
items_needed = ['1', '7']  
curr = extract_items_from_filing(filings['NVDA'][0], items_needed)
curr



Original title: 'Business'
Clean title: 'Business'
Flexible title: 'Business'
Original title: 'Risk Factors'
Clean title: 'Risk Factors'
Flexible title: 'Risk[\s\u00A0\u2000-\u200B\u2028\u2029]+Factors'
Original title: 'Unresolved Staff Comments'
Clean title: 'Unresolved Staff Comments'
Flexible title: 'Unresolved[\s\u00A0\u2000-\u200B\u2028\u2029]+Staff[\s\u00A0\u2000-\u200B\u2028\u2029]+Comments'
Original title: 'Cybersecurity'
Clean title: 'Cybersecurity'
Flexible title: 'Cybersecurity'
Original title: 'Properties'
Clean title: 'Properties'
Flexible title: 'Properties'
Original title: 'Legal Proceedings'
Clean title: 'Legal Proceedings'
Flexible title: 'Legal[\s\u00A0\u2000-\u200B\u2028\u2029]+Proceedings'
Original title: 'Mine Safety Disclosures'
Clean title: 'Mine Safety Disclosures'
Flexible title: 'Mine[\s\u00A0\u2000-\u200B\u2028\u2029]+Safety[\s\u00A0\u2000-\u200B\u2028\u2029]+Disclosures'
Original title: 'Market for Registrant’s Common Equity, Related Stockholder Matters and 

{'1': "Item 1. Business │\n│ │\n╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n \n Our Company \n \nNVIDIA pioneered accelerated computing to help solve the most challenging computational problems. NVIDIA is now a\nfull-stack computing infrastructure company with data-center-scale offerings that are reshaping industry.\n\nOur full-stack includes the foundational CUDA programming model that runs on all NVIDIA GPUs, as well as hundreds\nof domain-specific software libraries, software development kits, or SDKs, and Application Programming Interfaces,\nor APIs. This deep and broad software stack accelerates the performance and eases the deployment of NVIDIA\naccelerated computing for computationally intensive workloads such as artificial intelligence, or AI, model\ntraining and inference, data analytics, scientific computing, and 3D graphics, with vertical-specific optimizations\nto address industries ranging from health

In [11]:
import re

s = filings['NVDA'][0].text().lower()

# matches “item 1” (word boundary after the 1) … up to “business”
pattern = re.compile(r'item\s*7\b[\s\S]*?management\'s discussion and analysis of financial condition and results of operations', re.IGNORECASE | re.MULTILINE)

positions = [m.start() for m in pattern.finditer(s)]

print(positions)

[11976]


In [12]:
print(s[219876: 219876+8000])

item 7. management's discussion and analysis of financial condition and results of operations                  │
│                                                                                                                 │
╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
the following discussion and analysis of our financial condition and results of operations should be read in
conjunction with “item 1a. risk factors,” our consolidated financial statements and related notes thereto, as well
as other cautionary statements and risks described elsewhere in this annual report on form 10-k, before deciding to
purchase, hold, or sell shares of our common stock.
                                                                                                                   
   overview                                                                                                        
                               

In [6]:
get_table_of_contents(filings['NVDA'][0])

[{'item': '1', 'title': 'Business', 'toc_entry': 'Item 1. Business 4'},
 {'item': '1A',
  'title': 'Risk Factors',
  'toc_entry': 'Item 1A. Risk Factors 13'},
 {'item': '1B',
  'title': 'Unresolved Staff Comments',
  'toc_entry': 'Item 1B. Unresolved Staff Comments 32'},
 {'item': '1C',
  'title': 'Cybersecurity',
  'toc_entry': 'Item 1C Cybersecurity 32'},
 {'item': '2', 'title': 'Properties', 'toc_entry': 'Item 2. Properties 33'},
 {'item': '3',
  'title': 'Legal Proceedings',
  'toc_entry': 'Item 3. Legal Proceedings 33'},
 {'item': '4',
  'title': 'Mine Safety Disclosures',
  'toc_entry': 'Item 4. Mine Safety Disclosures 33 \n Part II'},
 {'item': '5',
  'title': 'Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases',
  'toc_entry': 'Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases 33 \n of Equity Securities'},
 {'item': '7',
  'title': 'Management’s Discussion and Analysis of Financial Condition and 