# EDGAR XBRL Parser: Risk Factors & Cybersecurity

This notebook demonstrates how to download and parse XBRL files from SEC EDGAR, specifically extracting:
- Section 1A: Risk Factors
- Section 1C: Cybersecurity

In [1]:
#OPENAPI KEY - key removed, confidential
import os
os.environ["OPENAI_API_KEY"] = ""

In [2]:
import os
print(os.getenv("OPENAI_API_KEY")[:15])  # shows first few chars only

sk-proj-A165Dgv


In [3]:
# check if API key is active

from openai import OpenAI
import os

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

try:
    response = client.models.list()
    print("‚úÖ API key is active!")
    print("Available models:", [m.id for m in response.data[:5]])
except Exception as e:
    print("‚ùå API key is invalid or inactive.")
    print("Error:", e)

‚úÖ API key is active!
Available models: ['gpt-4-0613', 'gpt-4', 'gpt-3.5-turbo', 'gpt-5.1-codex-mini', 'gpt-5.1-chat-latest']


In [4]:
from openai import OpenAI
client = OpenAI()

In [5]:
!pip install sec-edgar-downloader requests beautifulsoup4 lxml pandas --break-system-packages



In [6]:
!pip install -q sentence-transformers

## 1. Import Libraries

In [7]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional
import json
from datetime import datetime

## 2. Configuration

Set up your user agent and company information. **Important**: SEC requires you to identify yourself.

In [8]:
# REQUIRED: Update with your information
COMPANY_NAME = "Your Company Name"
EMAIL = "your.email@example.com"

# User agent for SEC requests
HEADERS = {
    'User-Agent': f'{COMPANY_NAME} {EMAIL}',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov'
}

# Base URLs
SEC_BASE_URL = "https://www.sec.gov"
EDGAR_SEARCH_URL = "https://www.sec.gov/cgi-bin/browse-edgar"

## 3. Helper Functions

In [9]:
def get_company_cik(ticker: str) -> str:
    """
    Get CIK (Central Index Key) from ticker symbol.
    """
    url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={ticker}&type=10-K&dateb=&owner=exclude&count=1"
    response = requests.get(url, headers=HEADERS)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    cik_element = soup.find('span', class_='companyName')
    
    if cik_element:
        cik_match = re.search(r'CIK=(\d+)', str(cik_element))
        if cik_match:
            return cik_match.group(1).zfill(10)
    
    raise ValueError(f"Could not find CIK for ticker: {ticker}")


def get_latest_10k_filing(cik: str) -> Dict:
    """
    Get the latest 10-K filing information for a company.
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    response = requests.get(url, headers=HEADERS)
    data = response.json()
    
    # Find the most recent 10-K
    filings = data['filings']['recent']
    for i, form in enumerate(filings['form']):
        if form == '10-K':
            return {
                'accessionNumber': filings['accessionNumber'][i],
                'filingDate': filings['filingDate'][i],
                'primaryDocument': filings['primaryDocument'][i]
            }
    
    raise ValueError(f"No 10-K filing found for CIK: {cik}")


def download_filing_files(cik: str, accession_number: str, output_dir: str = './filings') -> str:
    """
    Download all files associated with a filing.
    """
    # Remove dashes from accession number for URL
    accession_no_dash = accession_number.replace('-', '')
    
    # Create output directory
    filing_dir = Path(output_dir) / f"{cik}_{accession_number}"
    filing_dir.mkdir(parents=True, exist_ok=True)
    
    # Get the filing index page
    index_url = f"https://www.sec.gov/cgi-bin/viewer?action=view&cik={cik}&accession_number={accession_number}&xbrl_type=v"
    
    # Alternative: Direct archive access
    archive_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_no_dash}/"
    
    print(f"Downloading from: {archive_url}")
    response = requests.get(archive_url, headers=HEADERS)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all file links
    files_downloaded = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('?'):
            file_url = f"{SEC_BASE_URL}{href}" if href.startswith('/') else f"{archive_url}{href}"
            filename = href.split('/')[-1]
            
            # Download XML files and the main filing
            if filename.endswith(('.xml', '.htm', '.html', '.xsd')):
                print(f"  Downloading: {filename}")
                file_response = requests.get(file_url, headers=HEADERS)
                
                file_path = filing_dir / filename
                with open(file_path, 'wb') as f:
                    f.write(file_response.content)
                
                files_downloaded.append(str(file_path))
    
    print(f"Downloaded {len(files_downloaded)} files to {filing_dir}")
    return str(filing_dir)

## 4. XBRL Parsing Functions

In [10]:
def find_xbrl_instance_file(filing_dir: str) -> Optional[str]:
    """
    Find the main XBRL instance document (typically ends with _htm.xml).
    """
    filing_path = Path(filing_dir)
    
    # Look for the instance document (usually the largest .xml file or one ending in _htm.xml)
    xml_files = list(filing_path.glob('*.xml'))
    
    # Prioritize files ending with _htm.xml
    for xml_file in xml_files:
        if '_htm.xml' in xml_file.name:
            return str(xml_file)
    
    # Otherwise, return the largest XML file
    if xml_files:
        largest_file = max(xml_files, key=lambda x: x.stat().st_size)
        return str(largest_file)
    
    return None


def parse_xbrl_for_text_blocks(xml_file: str) -> Dict[str, str]:
    """
    Parse XBRL file and extract text blocks, especially Risk Factors and Cybersecurity.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Common XBRL namespaces
    namespaces = {
        'us-gaap': 'http://fasb.org/us-gaap/2023',
        'dei': 'http://xbrl.sec.gov/dei/2023',
        'xbrli': 'http://www.xbrl.org/2003/instance',
    }
    
    # Try to detect namespaces dynamically
    for elem in root.iter():
        if '}' in elem.tag:
            ns = elem.tag.split('}')[0].strip('{')
            prefix = elem.tag.split('}')[1].split(':')[0] if ':' in elem.tag else None
            if prefix and prefix not in namespaces:
                namespaces[prefix] = ns
    
    text_blocks = {}
    
    # Target text block tags
    target_tags = [
        'RiskFactorTextBlock',
        'RiskFactorsTextBlock', 
        'CybersecurityTextBlock',
        'CybersecurityDisclosureTextBlock',
        'BusinessDescriptionAndBasisOfPresentationTextBlock',
        'ManagementDiscussionAndAnalysisTextBlock',
    ]
    
    # Search through all elements
    for elem in root.iter():
        tag_name = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
        
        if any(target in tag_name for target in target_tags):
            text_content = elem.text
            if text_content:
                # Clean up HTML/XBRL formatting
                text_content = clean_xbrl_text(text_content)
                text_blocks[tag_name] = text_content
                print(f"Found: {tag_name} ({len(text_content)} characters)")
    
    return text_blocks


def clean_xbrl_text(text: str) -> str:
    """
    Clean XBRL text content by removing HTML tags and extra whitespace.
    """
    # Parse HTML content
    soup = BeautifulSoup(text, 'html.parser')
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    
    # Get text
    text = soup.get_text()
    
    # Clean up whitespace
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    return text


def parse_htm_for_sections(filing_dir: str) -> Dict[str, str]:
    """
    Parse the HTML filing document for specific sections.
    This is a fallback if XBRL text blocks are not available.
    """
    filing_path = Path(filing_dir)
    htm_files = list(filing_path.glob('*.htm')) + list(filing_path.glob('*.html'))
    
    if not htm_files:
        return {}
    
    # Usually the primary document is the largest
    main_file = max(htm_files, key=lambda x: x.stat().st_size)
    
    with open(main_file, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    
    soup = BeautifulSoup(content, 'html.parser')
    
    sections = {}
    
    # Look for Item 1A - Risk Factors
    risk_pattern = re.compile(r'item\s*1a[\s\.:-]*risk\s*factors?', re.IGNORECASE)
    cyber_pattern = re.compile(r'item\s*1c[\s\.:-]*cybersecurity', re.IGNORECASE)
    
    # Find all text elements
    all_text = soup.get_text()
    
    # Extract Item 1A
    risk_match = risk_pattern.search(all_text)
    if risk_match:
        start_idx = risk_match.start()
        # Find the next major section (Item 1B or Item 2)
        next_section = re.search(r'item\s*[12]b?[\s\.:-]', all_text[start_idx+50:], re.IGNORECASE)
        if next_section:
            end_idx = start_idx + 50 + next_section.start()
            sections['RiskFactors'] = all_text[start_idx:end_idx].strip()
    
    # Extract Item 1C
    cyber_match = cyber_pattern.search(all_text)
    if cyber_match:
        start_idx = cyber_match.start()
        # Find the next major section (Item 1D or Item 2)
        next_section = re.search(r'item\s*[12][d]?[\s\.:-]', all_text[start_idx+50:], re.IGNORECASE)
        if next_section:
            end_idx = start_idx + 50 + next_section.start()
            sections['Cybersecurity'] = all_text[start_idx:end_idx].strip()
    
    return sections

## 5. Main Execution - Download and Parse

In [11]:
import time, requests
from pathlib import Path

UA = "Your Name your.email@example.com"  # required by SEC
SESS = requests.Session()
SESS.headers.update({
    "User-Agent": UA,
    "Accept": "application/json",
})

def get_company_cik(ticker: str) -> str:
    url = "https://www.sec.gov/files/company_tickers.json"
    r = SESS.get(url, timeout=20)
    r.raise_for_status()
    data = r.json()
    for _, v in data.items():
        if v["ticker"].upper() == ticker.upper():
            return str(v["cik_str"]).zfill(10)
    raise ValueError(f"Ticker {ticker} not found.")

def get_latest_10k_filing(cik10: str, max_retries: int = 3):
    url = f"https://data.sec.gov/submissions/CIK{cik10}.json"
    for attempt in range(max_retries):
        r = SESS.get(url, timeout=30)
        if r.status_code == 429:
            time.sleep(0.6)
            continue
        r.raise_for_status()
        try:
            data = r.json()
        except Exception:
            print("Non-JSON response:\n", r.text[:500])
            raise
        forms = data["filings"]["recent"]
        for i, form in enumerate(forms["form"]):
            if form == "10-K":
                acc = forms["accessionNumber"][i]
                return {
                    "accessionNumber": acc,
                    "accession_no_dash": acc.replace("-", ""),
                    "filingDate": forms["filingDate"][i],
                }
    raise RuntimeError("No 10-K found or rate-limited.")

In [12]:
# Specify the company ticker
TICKER = "TSLA"  # Change this to any ticker you want

print(f"Processing 10-K for: {TICKER}")
print("="*50)

# Step 1: Get CIK
print("\n1. Getting CIK...")
cik = get_company_cik(TICKER)
print(f"   CIK: {cik}")

# Step 2: Get latest 10-K filing
print("\n2. Getting latest 10-K filing...")
filing_info = get_latest_10k_filing(cik)
print(f"   Accession Number: {filing_info['accessionNumber']}")
print(f"   Filing Date: {filing_info['filingDate']}")

# Step 3: Download filing files
print("\n3. Downloading filing files...")
filing_dir = download_filing_files(cik, filing_info['accessionNumber'])
print(f"   Files saved to: {filing_dir}")

Processing 10-K for: TSLA

1. Getting CIK...
   CIK: 0001318605

2. Getting latest 10-K filing...
   Accession Number: 0001628280-25-003063
   Filing Date: 2025-01-30

3. Downloading filing files...
Downloading from: https://www.sec.gov/Archives/edgar/data/0001318605/000162828025003063/
  Downloading: index.htm
  Downloading: search.htm
  Downloading: howinvestigationswork.html
  Downloading: brokers.htm
  Downloading: quickedgar.htm
  Downloading: companysearch.html
  Downloading: secforms.htm
  Downloading: publicdocs.htm
  Downloading: index.html
  Downloading: upcoming-events.htm
  Downloading: 0001628280-25-003063-index-headers.html
  Downloading: 0001628280-25-003063-index.html
  Downloading: ex41.htm
  Downloading: FilingSummary.xml
  Downloading: R1.htm
  Downloading: R10.htm
  Downloading: R11.htm
  Downloading: R12.htm
  Downloading: R13.htm
  Downloading: R14.htm
  Downloading: R15.htm
  Downloading: R16.htm
  Downloading: R17.htm
  Downloading: R18.htm
  Downloading: R19.ht

## 6. Extract Text Blocks from XBRL

In [13]:
print("\n4. Parsing XBRL file...")
xbrl_file = find_xbrl_instance_file(filing_dir)

if xbrl_file:
    print(f"   Found XBRL file: {Path(xbrl_file).name}")
    text_blocks = parse_xbrl_for_text_blocks(xbrl_file)
else:
    print("   No XBRL instance file found.")
    text_blocks = {}

# If XBRL doesn't have text blocks, try HTML parsing
if not text_blocks:
    print("\n5. Trying HTML parsing as fallback...")
    text_blocks = parse_htm_for_sections(filing_dir)

print(f"\nExtracted {len(text_blocks)} text sections")


4. Parsing XBRL file...
   Found XBRL file: tsla-20241231_htm.xml

5. Trying HTML parsing as fallback...

Extracted 2 text sections


## 7. Display Results

In [14]:
# Display summaries of extracted sections
for section_name, content in text_blocks.items():
    print(f"\n{'='*80}")
    print(f"Section: {section_name}")
    print(f"Length: {len(content)} characters")
    print(f"{'='*80}")
    print(content[:1000])  # Show first 1000 characters
    if len(content) > 1000:
        print("\n... [truncated] ...\n")


Section: RiskFactors
Length: 80 characters
Item 1A.Risk Factors13Item 1B.Unresolved Staff Comments27Item 1C.Cybersecurity28

Section: Cybersecurity
Length: 3304 characters
Item 1C.Cybersecurity28Item 2.Properties29Item 3.Legal Proceedings29Item 4.Mine Safety Disclosures29¬†PART II.¬†Item 5.Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities30Item 6.[Reserved]31Item 7.Management's Discussion and Analysis of Financial Condition and Results of Operations32Item 7A.Quantitative and Qualitative Disclosures about Market Risk44Item 8.Financial Statements and Supplementary Data45Item 9.Changes in and Disagreements with Accountants on Accounting and Financial Disclosure91Item 9A.Controls and Procedures91Item 9B.Other Information92Item 9C.Disclosure Regarding Foreign Jurisdictions that Prevent Inspections92¬†PART III.¬†Item 10.Directors, Executive Officers and Corporate Governance93Item 11.Executive Compensation93Item 12.Security Ownership

## 8. Save Extracted Sections to Files

In [15]:
# 8. Save Extracted Sections to Files
from pathlib import Path
import json
import re

# make a safe filename from a section title
def _safe(name: str) -> str:
    return re.sub(r'[^A-Za-z0-9._-]+', '_', name).strip('_')[:80]

# root output dir
output_dir = Path("/Users/brucewayne/Documents") / "extracted_sections"
output_dir.mkdir(parents=True, exist_ok=True)

# save each section as its own .txt
for section_name, content in text_blocks.items():
    output_file = output_dir / f"{_safe(section_name)}.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Saved: {output_file}")

# also save a single JSON with all sections + a little metadata
json_file = output_dir / "all_sections.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(
        {
            "ticker": TICKER,
            "cik": cik,
            "filing_date": filing_info["filingDate"],
            "accession_number": filing_info["accessionNumber"],
            "sections": text_blocks,   # dict of {section: text}
        },
        f,
        indent=2,
        ensure_ascii=False,
    )

print(f"\nAll sections saved to JSON: {json_file}")

Saved: /Users/brucewayne/Documents/extracted_sections/RiskFactors.txt
Saved: /Users/brucewayne/Documents/extracted_sections/Cybersecurity.txt

All sections saved to JSON: /Users/brucewayne/Documents/extracted_sections/all_sections.json


## 9. Create Summary DataFrame

In [16]:
# Create a summary dataframe
summary_data = []
for section_name, content in text_blocks.items():
    summary_data.append({
        'Section': section_name,
        'Character Count': len(content),
        'Word Count': len(content.split()),
        'Preview': content[:200] + '...' if len(content) > 200 else content
    })

df_summary = pd.DataFrame(summary_data)
print("\nSummary of Extracted Sections:")
print(df_summary.to_string())

# Save summary to CSV
csv_file = output_dir / "summary.csv"
df_summary.to_csv(csv_file, index=False)
print(f"\nSummary saved to: {csv_file}")


Summary of Extracted Sections:
         Section  Character Count  Word Count                                                                                                                                                                                                      Preview
0    RiskFactors               80           7                                                                                                                             Item 1A.Risk Factors13Item 1B.Unresolved Staff Comments27Item 1C.Cybersecurity28
1  Cybersecurity             3304         417  Item 1C.Cybersecurity28Item 2.Properties29Item 3.Legal Proceedings29Item 4.Mine Safety Disclosures29¬†PART II.¬†Item 5.Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purch...

Summary saved to: /Users/brucewayne/Documents/extracted_sections/summary.csv


## 10. Advanced: Chunk Text for RAG

Split the extracted sections into chunks suitable for RAG systems.

In [17]:
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """
    Split text into overlapping chunks.
    """
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start = end - overlap
    
    return chunks


# Chunk each section
all_chunks = []
for section_name, content in text_blocks.items():
    chunks = chunk_text(content, chunk_size=1000, overlap=200)
    
    for i, chunk in enumerate(chunks):
        all_chunks.append({
            'section': section_name,
            'chunk_id': i,
            'text': chunk,
            'char_count': len(chunk),
            'ticker': TICKER,
            'filing_date': filing_info['filingDate']
        })

df_chunks = pd.DataFrame(all_chunks)
print(f"\nCreated {len(all_chunks)} chunks from {len(text_blocks)} sections")
print(df_chunks.head())

# Save chunks
chunks_file = output_dir / "chunks.json"
with open(chunks_file, 'w', encoding='utf-8') as f:
    json.dump(all_chunks, f, indent=2)

print(f"\nChunks saved to: {chunks_file}")


Created 6 chunks from 2 sections
         section  chunk_id                                               text  \
0    RiskFactors         0  Item 1A.Risk Factors13Item 1B.Unresolved Staff...   
1  Cybersecurity         0  Item 1C.Cybersecurity28Item 2.Properties29Item...   
2  Cybersecurity         1  m 12.Security Ownership of Certain Beneficial ...   
3  Cybersecurity         2  cerning supply chain constraints, our strategy...   
4  Cybersecurity         3  tatements and you should not place undue relia...   

   char_count ticker filing_date  
0          80   TSLA  2025-01-30  
1        1000   TSLA  2025-01-30  
2        1000   TSLA  2025-01-30  
3        1000   TSLA  2025-01-30  
4         904   TSLA  2025-01-30  

Chunks saved to: /Users/brucewayne/Documents/extracted_sections/chunks.json


## 11. Batch Processing Multiple Companies

In [18]:
def process_company(ticker: str, output_base_dir: str = './filings') -> Dict:
    """
    Process a single company's 10-K filing.
    """
    try:
        print(f"\n{'='*80}")
        print(f"Processing: {ticker}")
        print('='*80)
        
        # Get CIK
        cik = get_company_cik(ticker)
        print(f"CIK: {cik}")
        
        # Get latest filing
        filing_info = get_latest_10k_filing(cik)
        print(f"Filing Date: {filing_info['filingDate']}")
        
        # Download files
        filing_dir = download_filing_files(cik, filing_info['accessionNumber'], output_base_dir)
        
        # Parse XBRL
        xbrl_file = find_xbrl_instance_file(filing_dir)
        if xbrl_file:
            text_blocks = parse_xbrl_for_text_blocks(xbrl_file)
        else:
            text_blocks = parse_htm_for_sections(filing_dir)
        
        # Save results
        output_dir = Path(filing_dir) / "extracted_sections"
        output_dir.mkdir(exist_ok=True)
        
        json_file = output_dir / "all_sections.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump({
                'ticker': ticker,
                'cik': cik,
                'filing_date': filing_info['filingDate'],
                'accession_number': filing_info['accessionNumber'],
                'sections': text_blocks
            }, f, indent=2)
        
        return {
            'ticker': ticker,
            'status': 'success',
            'sections_found': len(text_blocks),
            'filing_date': filing_info['filingDate']
        }
        
    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")
        return {
            'ticker': ticker,
            'status': 'error',
            'error': str(e)
        }


# Example: Process multiple companies
tickers_to_process = ['AAPL', 'MSFT', 'GOOGL']

results = []
for ticker in tickers_to_process:
    result = process_company(ticker)
    results.append(result)

# Display results summary
df_results = pd.DataFrame(results)
print("\n" + "="*80)
print("BATCH PROCESSING SUMMARY")
print("="*80)
print(df_results.to_string())


Processing: AAPL
CIK: 0000320193
Filing Date: 2025-10-31
Downloading from: https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000079/
  Downloading: index.htm
  Downloading: search.htm
  Downloading: howinvestigationswork.html
  Downloading: brokers.htm
  Downloading: quickedgar.htm
  Downloading: companysearch.html
  Downloading: secforms.htm
  Downloading: publicdocs.htm
  Downloading: index.html
  Downloading: upcoming-events.htm
  Downloading: 0000320193-25-000079-index-headers.html
  Downloading: 0000320193-25-000079-index.html
  Downloading: a10-kexhibit21109272025.htm
  Downloading: a10-kexhibit23109272025.htm
  Downloading: a10-kexhibit31109272025.htm
  Downloading: a10-kexhibit31209272025.htm
  Downloading: a10-kexhibit32109272025.htm
  Downloading: a10-kexhibit4109272025.htm
  Downloading: aapl-20250927.htm
  Downloading: aapl-20250927.xsd
  Downloading: aapl-20250927_cal.xml
  Downloading: aapl-20250927_def.xml
  Downloading: aapl-20250927_htm.xml
  Downloading:

In [19]:
from sentence_transformers import SentenceTransformer

# Load a small embedding model (fast and works offline)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert your extracted text sections into embeddings
embeddings = model.encode(
    [chunk['text'] for chunk in all_chunks], 
    show_progress_bar=True
)

print(f"Generated {len(embeddings)} embeddings.")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated 6 embeddings.


In [20]:
# show the first 10 numbers of the first embedding (out of 384 total

print(embeddings[0][:10])

[-0.04852507 -0.00215493 -0.0519914  -0.02349418  0.09521586  0.0723874
  0.13519742  0.04354686 -0.06532799  0.01718577]


In [21]:
# checking structure
print(embeddings.shape)

(6, 384)


In [22]:
!pip install chromadb --quiet

import chromadb
from chromadb.utils import embedding_functions

client = chromadb.Client()
collection = client.get_or_create_collection("sec_filings")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# Clear and add to avoid duplicates if rerunning
try:
    client.delete_collection("sec_filings")
    collection = client.get_or_create_collection("sec_filings")
except Exception:
    pass

ids = [f"chunk_{i}" for i in range(len(all_chunks))]
docs = [c["text"] for c in all_chunks]

collection.add(ids=ids, embeddings=embeddings.tolist(), documents=docs)
print(f"Stored {len(ids)} chunks")

Stored 6 chunks


In [24]:
res = collection.query(query_texts=["supply chain risks"], n_results=3)
for i, (doc, score) in enumerate(zip(res["documents"][0], res["distances"][0])):
    print(f"\nResult {i+1} (score={score:.4f}):\n{doc[:400]}...")


Result 1 (score=1.1942):
cerning supply chain constraints, our strategy, competition, future operations and production capacity, future financial position, future revenues, projected costs, profitability, expected cost reductions, capital adequacy, expectations regarding demand and acceptance for our technologies, growth opportunities and trends in the markets in which we operate, prospects and plans and objectives of man...

Result 2 (score=1.3536):
m 12.Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters93Item 13.Certain Relationships and Related Transactions, and Director Independence93Item 14.Principal Accountant Fees and Services93¬†PART IV.¬†Item 15.Exhibits and Financial Statement Schedules94Item 16.Form 10-K Summary107¬†SignaturesTable of ContentsForward-Looking StatementsThe discussions in this ...

Result 3 (score=1.4526):
Item 1C.Cybersecurity28Item 2.Properties29Item 3.Legal Proceedings29Item 4.Mine Safety Disclosures29¬†PART II.¬

In [25]:
!pip install openai tiktoken --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
import os
from openai import OpenAI

# os.environ["OPENAI_API_KEY"] = "sk-your_api_key_here"  # replace with your real key
client = OpenAI()

def ask_gpt_rag(query, k=3, model="gpt-4o-mini"):
    # Retrieve context from Chroma
    res = collection.query(query_texts=[query], n_results=k)
    context = "\n\n".join(res["documents"][0])

    # Build the RAG prompt
    prompt = f"""
    You are a financial analyst.
    Use the context below to answer the question accurately and concisely.
    If the answer isn‚Äôt clearly mentioned, say ‚ÄúNot mentioned in the filings.‚Äù

    Context:
    {context}

    Question: {query}
    Answer:
    """

    # Send to GPT
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    return completion.choices[0].message.content.strip()

In [27]:
import time
from openai import RateLimitError

def ask_gpt_rag(query, context, model="gpt-4-turbo"):
    client = OpenAI()
    prompt = f"""
    You are a financial analyst.
    Use the context below to answer the question accurately and concisely.

    Context:
    {context}

    Question: {query}
    Answer:
    """
    
    # Retry logic for rate limit errors
    for attempt in range(5):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
            )
            return completion.choices[0].message.content.strip()
        
        except RateLimitError:
            wait = 15 * (attempt + 1)
            print(f"‚ö†Ô∏è Rate limit reached ‚Äî waiting {wait} seconds before retry...")
            time.sleep(wait)
    
    raise Exception("Failed after multiple retry attempts due to rate limits.")

In [28]:
res = collection.query(query_texts=["supply chain risks"], n_results=3)

In [29]:
context = " ".join(res["documents"][0])  # combine top results

In [30]:
from openai import OpenAI, APIError, RateLimitError, AuthenticationError, NotFoundError
import time

client = OpenAI()
MODEL = "gpt-4o"        # or "gpt-4o-mini" if you keep hitting rate limits

def ask_gpt_rag(query: str, context: str, model: str = MODEL,
                temperature: float = 0.2, max_retries: int = 5) -> str:
    """
    Send a RAG-style prompt (question + retrieved context) to an OpenAI chat model
    with exponential backoff on transient errors.
    """
    prompt = f"""You are a financial analyst. Answer concisely using ONLY the context.

Context:
{context}

Question: {query}
Answer:"""

    last_err = None
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )
            return completion.choices[0].message.content.strip()

        except (RateLimitError, APIError) as e:
            # transient -> backoff and retry
            last_err = e
            time.sleep(2 ** attempt)
            continue

        except (AuthenticationError, NotFoundError):
            # permanent -> surface immediately (bad key/model name, etc.)
            raise

    raise RuntimeError(f"Failed after {max_retries} attempts. Last error: {last_err}")

In [31]:
# import json, time
# from datetime import datetime
# from openai import OpenAI

# # Ensure your API key is loaded
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# # ‚úÖ Example: list of S&P500 companies (you can expand this list to all 500)
# companies = [
#     "MMM","AOS","ABT","ABBV","ACN","ADBE","AAP","AES","AFL","A","APD","AKAM","ALK","ALB",
#     "ARE","ALGN","ALLE","LNT","ALL","GOOGL","MO","AMZN","AMCR","AAL","AEP","AXP","AIG",
#     "AMT","AWK","AMP","COR","AME","AMGN","APH","ADI","ANSS","AON","APA","AAPL","AMAT",
#     "APTV","ADM","ANET","AJG","AIZ","T","ATO","ADSK","AZO","AVB","AVY","BKR","BALL",
#     "BAC","BBWI","BAX","BDX","BRK.B","BBY","BIO","TECH","BIIB","BLK","BA","BKNG","BWA",
#     "BXP","BSX","BMY","AVGO","BR","BRO","BF.B","CHRW","CDNS","CZR","CPT","CPB","COF",
#     "CAH","KMX","CCL","CARR","CTLT","CAT","CBOE","CBRE","CDW","CE","CNC","CNP","CDAY",
#     "CF","CRL","SCHW","CHTR","CVX","CMG","CB","CHD","CI","CINF","CTAS","CSCO","C","CME",
#     "CMS","KO","CTSH","CL","CMCSA","CMA","CAG","COP","ED","STZ","CEG","CPRT","GLW","CTVA",
#     "COST","CTRA","CCI","CSX","CMI","CVS","DHI","DHR","DRI","DVA","DE","DAL","XRAY","DVN",
#     "DXCM","FANG","DLR","DFS","DISH","DIS","DG","DLTR","D","DOV","DOW","DTE","DUK","DD",
#     "DXC","EMN","ETN","EBAY","ECL","EIX","EW","EA","EMR","ENPH","ETR","EOG","EPAM","EFX",
#     "EQIX","EQR","ESS","EL","ETSY","EG","EVRG","ES","EXC","EXPE","EXPD","EXR","XOM","FFIV",
#     "FAST","FRT","FDX","FIS","FITB","FSLR","FE","FI","FLT","FMC","F","FTNT","FTV","FOX",
#     "BEN","FCX","GRMN","IT","GE","GD","GIS","GM","GILD","GPN","GL","GS","HAL","HIG","HAS",
#     "HCA","DOC","HSIC","HSY","HES","HPE","HLT","HOLX","HD","HON","HRL","HST","HWM","HPQ",
#     "HUM","HBAN","HII","IBM","IEX","IDXX","ITW","ILMN","INCY","IR","INTC","ICE","IP","IPG",
#     "IFF","INTU","ISRG","IVZ","IRM","JBHT","JKHY","J","JNJ","JCI","JPM","JNPR","K","KDP",
#     "KEY","KMB","KIM","KMI","KLAC","KR","LHX","LH","LRCX","LW","LVS","LDOS","LEN","LNC",
#     "LIN","LYV","LKQ","LMT","L","LOW","LUMN","LYB","MTB","MRO","MPC","MKTX","MAR","MMC",
#     "MLM","MAS","MA","MKC","MCD","MCK","MDT","MRK","META","MET","MTD","MCHP","MU","MSFT",
#     "MAA","MRNA","MDLZ","MNST","MCO","MS","MSI","MSCI","NDAQ","NTAP","NFLX","NWL","NEM",
#     "NWSA","NEE","NKE","NI","NDSN","NSC","NTRS","NOC","NUE","NVDA","NXPI","OXY","ODFL",
#     "OMC","OKE","ORCL","OTIS","PCAR","PKG","PARA","PH","PAYX","PYPL","PEP","PFE","PM",
#     "PSX","PNC","PPG","PG","PGR","PLD","PRU","PEG","PHM","QCOM","PWR","DGX","RJF","RTX",
#     "RF","RSG","RMD","RHI","ROK","ROL","ROP","ROST","SPGI","CRM","NOW","SHW","SPG","SWKS",
#     "SNA","SO","LUV","SWK","STT","SYK","TMUS","TPR","TGT","TEL","TXT","CLX","HSY","TRV",
#     "TMO","TJX","TSCO","TT","TDG","TROW","TFC","TSN","UNP","UAL","UPS","URI","UNH","VLO",
#     "VZ","VRTX","V","VMC","WMT","DIS","WBD","WM","WFC","WELL","WST","WDC","WY","WHR","WMB",
#     "XEL","XYL","YUM","ZBH","ZION","ZTS"
# ]

# # Base question template
# question_template = "What supply chain risks did {company} mention in its 10-K report?"

# # Directory to store outputs (optional)
# output_dir = "sp500_json_outputs"

# import os
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# for company in companies:
#     query = question_template.format(company=company)
#     context = f"Analyze the 10-K filing of {company} and summarize its supply chain risks."

#     prompt = f"""
# You are a financial analyst. Answer professionally and concisely.
# Question: {query}
# Context: {context}
# Answer:
# """

#     print(f"\nüîç Processing {company}...")
#     try:
#         response = client.chat.completions.create(
#             model="gpt-4o-mini",
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.2,
#         )
#         answer = response.choices[0].message.content.strip()

#         # Save to JSON
#         timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
#         output = {
#             "company": company,
#             "query": query,
#             "response": answer,
#             "timestamp": timestamp
#         }

#         filename = os.path.join(output_dir, f"{company.replace(' ', '_')}_{timestamp}.json")
#         with open(filename, "w") as f:
#             json.dump(output, f, indent=4)

#         print(f"‚úÖ Saved {company} ‚Üí {filename}")

#         # Prevent API overload
#         time.sleep(2)

#     except Exception as e:
#         print(f"‚ùå Error with {company}: {e}")
#         continue

In [32]:
import pandas as pd
url = "https://stockanalysis.com/list/sp-500-stocks/"
df  = pd.read_html(url)[0]
companies = df["Company Name"].tolist()   # or whichever column holds the company name
print(f"‚úÖ Loaded {len(companies)} companies from StockAnalysis list.")

‚úÖ Loaded 503 companies from StockAnalysis list.


In [33]:
import pandas as pd

# Load table directly from StockAnalysis
url = "https://stockanalysis.com/list/sp-500-stocks/"
df = pd.read_html(url)[0]

# Extract only ticker symbols
tickers = df["Symbol"].dropna().tolist()

print(f"‚úÖ Loaded {len(tickers)} tickers from StockAnalysis S&P 500 list.")
print(tickers)

‚úÖ Loaded 503 tickers from StockAnalysis S&P 500 list.
['NVDA', 'AAPL', 'GOOGL', 'GOOG', 'MSFT', 'AMZN', 'AVGO', 'META', 'TSLA', 'BRK.B', 'LLY', 'WMT', 'JPM', 'V', 'ORCL', 'XOM', 'JNJ', 'MA', 'NFLX', 'ABBV', 'COST', 'BAC', 'PLTR', 'PG', 'HD', 'AMD', 'KO', 'GE', 'CVX', 'CSCO', 'UNH', 'IBM', 'WFC', 'CAT', 'MS', 'AXP', 'GS', 'MRK', 'PM', 'TMUS', 'MU', 'RTX', 'ABT', 'TMO', 'MCD', 'CRM', 'PEP', 'ISRG', 'LIN', 'DIS', 'INTU', 'T', 'AMGN', 'LRCX', 'AMAT', 'C', 'APP', 'BX', 'QCOM', 'UBER', 'NEE', 'VZ', 'NOW', 'TJX', 'BLK', 'INTC', 'APH', 'SCHW', 'DHR', 'GILD', 'ACN', 'BKNG', 'GEV', 'SPGI', 'ANET', 'TXN', 'KLAC', 'BSX', 'PFE', 'SYK', 'WELL', 'BA', 'ADBE', 'UNP', 'PGR', 'COF', 'DE', 'LOW', 'MDT', 'ETN', 'PANW', 'CRWD', 'HON', 'PLD', 'CB', 'ADI', 'HCA', 'VRTX', 'COP', 'MCK', 'LMT', 'PH', 'KKR', 'CEG', 'ADP', 'CMCSA', 'CVS', 'CME', 'SO', 'MO', 'SBUX', 'HOOD', 'DUK', 'BMY', 'NKE', 'GD', 'NEM', 'TT', 'MMM', 'MMC', 'ICE', 'WM', 'MCO', 'ORLY', 'AMT', 'SHW', 'DELL', 'CDNS', 'DASH', 'NOC', 'UPS', 'MAR',

In [34]:
import os, json, time, threading
from datetime import datetime
import pandas as pd
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

#Connect to OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

#Fetch tickers from StockAnalysis (live list)
url = "https://stockanalysis.com/list/sp-500-stocks/"
df = pd.read_html(url)[0]
tickers = df["Symbol"].dropna().tolist()
print(f"‚úÖ Loaded {len(tickers)} tickers from StockAnalysis.")
print("Example tickers:", tickers[:10])

#Output folder
output_dir = "/Users/brucewayne/Downloads/sp500_json_outputs"
os.makedirs(output_dir, exist_ok=True)

#Skip already-completed tickers
completed = {f.split('_')[0] for f in os.listdir(output_dir) if f.endswith(".json")}
remaining = [t for t in tickers if t not in completed]
print(f"‚úÖ Already completed: {len(completed)} | üïí Remaining: {len(remaining)}")

#Template question
question_template = "What supply chain risks did {ticker} mention in its 10-K report?"

#Worker function for each ticker
def process_ticker(ticker):
    query = question_template.format(ticker=ticker)
    context = f"Analyze the latest 10-K filing of {ticker} and summarize its main supply chain risks."
    prompt = f"""
You are a financial analyst.
Answer concisely and factually.
Question: {query}
Context: {context}
Answer:
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )

        answer = response.choices[0].message.content.strip()
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        output = {
            "ticker": ticker,
            "query": query,
            "response": answer,
            "timestamp": timestamp
        }

        filename = os.path.join(output_dir, f"{ticker}_{timestamp}.json")
        with open(filename, "w") as f:
            json.dump(output, f, indent=4)

        print(f"‚úÖ Saved {ticker}")
        return True
    except Exception as e:
        print(f"‚ùå Error with {ticker}: {e}")
        return False

#Run 10 tickers in parallel threads
max_threads = 10
print(f"\nüöÄ Starting parallel processing with {max_threads} threads...\n")

success_count = 0
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = {executor.submit(process_ticker, t): t for t in remaining}
    for future in as_completed(futures):
        ticker = futures[future]
        try:
            result = future.result()
            if result:
                success_count += 1
        except Exception as e:
            print(f"‚ö†Ô∏è {ticker} failed: {e}")

print(f"\nüéâ Completed batch run! ‚úÖ {success_count} tickers processed successfully.")

‚úÖ Loaded 503 tickers from StockAnalysis.
Example tickers: ['NVDA', 'AAPL', 'GOOGL', 'GOOG', 'MSFT', 'AMZN', 'AVGO', 'META', 'TSLA', 'BRK.B']
‚úÖ Already completed: 504 | üïí Remaining: 0

üöÄ Starting parallel processing with 10 threads...


üéâ Completed batch run! ‚úÖ 0 tickers processed successfully.


In [35]:
# import json, time
# from datetime import datetime
# from openai import OpenAI

# # Ensure your API key is loaded
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# # ‚úÖ Example: list of S&P500 companies (you can expand this list to all 500)
# companies = [
#     "3M",
#     "A. O. Smith",
#     "Abbott Laboratories",
#     "AbbVie",
#     "Accenture",
#     "Adobe",
#     "Advance Auto Parts",
#     "AES Corporation",
#     "Aflac",
#     "Agilent Technologies",
#     "Air Products and Chemicals",
#     "Akamai Technologies",
#     "Alaska Air Group",
#     "Albemarle Corporation",
#     "Alexandria Real Estate Equities",
#     "Align Technology",
#     "Allegion",
#     "Alliant Energy",
#     "Allstate",
#     "Alphabet",
#     "Altria Group",
#     "Amazon",
#     "Amcor",
#     "American Airlines Group",
#     "American Electric Power",
#     "American Express",
#     "American International Group",
#     "American Tower",
#     "American Water Works",
#     "Ameriprise Financial",
#     "AmerisourceBergen",
#     "Ametek",
#     "Amgen",
#     "Amphenol",
#     "Analog Devices",
#     "ANSYS",
#     "Aon",
#     "APA Corporation",
#     "Apple",
#     "Applied Materials",
#     "Aptiv",
#     "Archer-Daniels-Midland",
#     "Arista Networks",
#     "Arthur J. Gallagher & Co.",
#     "Assurant",
#     "AT&T",
#     "Atmos Energy",
#     "Autodesk",
#     "AutoZone",
#     "AvalonBay Communities",
#     "Avery Dennison",
#     "Baker Hughes",
#     "Ball Corporation",
#     "Bank of America",
#     "Bath & Body Works",
#     "Baxter International",
#     "Becton Dickinson",
#     "Berkshire Hathaway",
#     "Best Buy",
#     "Bio-Rad Laboratories",
#     "Bio-Techne",
#     "Biogen",
#     "BlackRock",
#     "Boeing",
#     "Booking Holdings",
#     "BorgWarner",
#     "Boston Properties",
#     "Boston Scientific",
#     "Bristol Myers Squibb",
#     "Broadcom",
#     "Broadridge Financial Solutions",
#     "Brown & Brown",
#     "Brown‚ÄìForman",
#     "C.H. Robinson",
#     "Cadence Design Systems",
#     "Caesars Entertainment",
#     "Camden Property Trust",
#     "Campbell Soup Company",
#     "Capital One",
#     "Cardinal Health",
#     "CarMax",
#     "Carnival",
#     "Carrier Global",
#     "Catalent",
#     "Caterpillar",
#     "Cboe Global Markets",
#     "CBRE Group",
#     "CDW",
#     "Celanese",
#     "Centene Corporation",
#     "CenterPoint Energy",
#     "Ceridian",
#     "CF Industries",
#     "Charles River Laboratories",
#     "Charles Schwab Corporation",
#     "Charter Communications",
#     "Chevron Corporation",
#     "Chipotle Mexican Grill",
#     "Chubb Limited",
#     "Church & Dwight",
#     "Cigna",
#     "Cincinnati Financial",
#     "Cintas",
#     "Cisco",
#     "Citigroup",
#     "CME Group",
#     "CMS Energy",
#     "Coca-Cola Company",
#     "Cognizant",
#     "Colgate-Palmolive",
#     "Comcast",
#     "Comerica",
#     "Conagra Brands",
#     "ConocoPhillips",
#     "Consolidated Edison",
#     "Constellation Brands",
#     "Constellation Energy",
#     "Copart",
#     "Corning",
#     "Corteva",
#     "Costco",
#     "Coterra",
#     "Crown Castle",
#     "CSX",
#     "Cummins",
#     "CVS Health",
#     "D.R. Horton",
#     "Danaher Corporation",
#     "Darden Restaurants",
#     "DaVita",
#     "Deere & Company",
#     "Delta Air Lines",
#     "Dentsply Sirona",
#     "Devon Energy",
#     "Dexcom",
#     "Diamondback Energy",
#     "Digital Realty",
#     "Discover Financial",
#     "Dish Network",
#     "Disney",
#     "Dollar General",
#     "Dollar Tree",
#     "Dominion Energy",
#     "Dover Corporation",
#     "Dow",
#     "DTE Energy",
#     "Duke Energy",
#     "DuPont",
#     "DXC Technology",
#     "Eastman Chemical Company",
#     "Eaton Corporation",
#     "eBay",
#     "Ecolab",
#     "Edison International",
#     "Edwards Lifesciences",
#     "Electronic Arts",
#     "Emerson Electric",
#     "Enphase Energy",
#     "Entergy",
#     "EOG Resources",
#     "EPAM Systems",
#     "Equifax",
#     "Equinix",
#     "Equity Residential",
#     "Essex Property Trust",
#     "Est√©e Lauder Companies",
#     "Etsy",
#     "Everest Re",
#     "Evergy",
#     "Eversource Energy",
#     "Exelon",
#     "Expedia Group",
#     "Expeditors International",
#     "Extra Space Storage",
#     "ExxonMobil",
#     "F5, Inc.",
#     "Fastenal",
#     "Federal Realty",
#     "FedEx",
#     "Fidelity National Information Services",
#     "Fifth Third Bank",
#     "First Solar",
#     "FirstEnergy",
#     "FISERV",
#     "FleetCor Technologies",
#     "FMC Corporation",
#     "Ford",
#     "Fortinet",
#     "Fortive",
#     "Fox Corporation",
#     "Franklin Templeton",
#     "Freeport-McMoRan",
#     "Garmin",
#     "Gartner",
#     "GE Aerospace",
#     "General Dynamics",
#     "General Electric",
#     "General Mills",
#     "General Motors",
#     "Gilead Sciences",
#     "Global Payments",
#     "Globe Life",
#     "Goldman Sachs",
#     "Halliburton",
#     "Hartford",
#     "Hasbro",
#     "HCA Healthcare",
#     "Healthpeak Properties",
#     "Henry Schein",
#     "Hershey's",
#     "Hess Corporation",
#     "Hewlett Packard Enterprise",
#     "Hilton Worldwide",
#     "Hologic",
#     "Home Depot",
#     "Honeywell",
#     "Hormel Foods",
#     "Host Hotels & Resorts",
#     "Howmet Aerospace",
#     "HP Inc.",
#     "Humana",
#     "Huntington Bancshares",
#     "Huntington Ingalls Industries",
#     "IBM",
#     "IDEX Corporation",
#     "IDEXX Laboratories",
#     "Illinois Tool Works",
#     "Illumina",
#     "Incyte",
#     "Ingersoll Rand",
#     "Intel",
#     "Intercontinental Exchange",
#     "International Paper",
#     "Interpublic Group",
#     "International Flavors & Fragrances",
#     "Intuit",
#     "Intuitive Surgical",
#     "Invesco",
#     "Iron Mountain",
#     "J.B. Hunt",
#     "Jack Henry & Associates",
#     "Jacobs Solutions",
#     "Johnson & Johnson",
#     "Johnson Controls",
#     "JPMorgan Chase",
#     "Juniper Networks",
#     "Kellogg's",
#     "Keurig Dr Pepper",
#     "KeyCorp",
#     "Kimberly-Clark",
#     "Kimco Realty",
#     "Kinder Morgan",
#     "KLA Corporation",
#     "Kroger",
#     "L3Harris Technologies",
#     "LabCorp",
#     "Lam Research",
#     "Lamb Weston",
#     "Las Vegas Sands",
#     "Leidos",
#     "Lennar",
#     "Lincoln National",
#     "Linde plc",
#     "Live Nation",
#     "LKQ Corporation",
#     "Lockheed Martin",
#     "Loews Corporation",
#     "Lowe's",
#     "Lumen Technologies",
#     "LyondellBasell",
#     "M&T Bank",
#     "Marathon Oil",
#     "Marathon Petroleum",
#     "MarketAxess",
#     "Marriott International",
#     "Marsh & McLennan",
#     "Martin Marietta Materials",
#     "Masco",
#     "Mastercard",
#     "McCormick & Company",
#     "McDonald's",
#     "McKesson",
#     "Medtronic",
#     "Merck & Co.",
#     "Meta Platforms",
#     "MetLife",
#     "Mettler Toledo",
#     "Microchip Technology",
#     "Micron Technology",
#     "Microsoft",
#     "Mid-America Apartment Communities",
#     "Moderna",
#     "Mondelez International",
#     "Monster Beverage",
#     "Moody's Corporation",
#     "Morgan Stanley",
#     "Motorola Solutions",
#     "MSCI",
#     "Nasdaq",
#     "NetApp",
#     "Netflix",
#     "Newell Brands",
#     "Newmont",
#     "News Corp",
#     "NextEra Energy",
#     "Nike",
#     "NiSource",
#     "Nordson Corporation",
#     "Norfolk Southern",
#     "Northern Trust",
#     "Northrop Grumman",
#     "Nucor",
#     "NVIDIA",
#     "NXP Semiconductors",
#     "Occidental Petroleum",
#     "Old Dominion Freight Line",
#     "Omnicom Group",
#     "ONEOK",
#     "Oracle",
#     "Otis Worldwide",
#     "Paccar",
#     "Packaging Corporation of America",
#     "Paramount Global",
#     "Parker Hannifin",
#     "Paychex",
#     "PayPal",
#     "PepsiCo",
#     "Pfizer",
#     "Philip Morris International",
#     "Phillips 66",
#     "PNC Financial Services",
#     "PPG Industries",
#     "Procter & Gamble",
#     "Progressive Corporation",
#     "Prologis",
#     "Prudential Financial",
#     "Public Service Enterprise Group",
#     "PulteGroup",
#     "Qualcomm",
#     "Quanta Services",
#     "Quest Diagnostics",
#     "Raymond James",
#     "Raytheon Technologies",
#     "Regions Financial",
#     "Republic Services",
#     "ResMed",
#     "Robert Half",
#     "Rockwell Automation",
#     "Rollins",
#     "Roper Technologies",
#     "Ross Stores",
#     "S&P Global",
#     "Salesforce",
#     "ServiceNow",
#     "Sherwin-Williams",
#     "Simon Property Group",
#     "Skyworks Solutions",
#     "Snap-on",
#     "Southern Company",
#     "Southwest Airlines",
#     "Stanley Black & Decker",
#     "State Street Corporation",
#     "Stryker Corporation",
#     "T-Mobile US",
#     "Tapestry",
#     "Target",
#     "TE Connectivity",
#     "Textron",
#     "The Clorox Company",
#     "The Hershey Company",
#     "The Travelers Companies",
#     "Thermo Fisher Scientific",
#     "TJX Companies",
#     "Tractor Supply",
#     "Trane Technologies",
#     "TransDigm Group",
#     "Trowe Price",
#     "Truist Financial",
#     "Tyson Foods",
#     "Union Pacific",
#     "United Airlines Holdings",
#     "United Parcel Service",
#     "United Rentals",
#     "UnitedHealth Group",
#     "Valero Energy",
#     "Verizon",
#     "Vertex Pharmaceuticals",
#     "Visa",
#     "Vulcan Materials",
#     "Walmart",
#     "Walt Disney",
#     "Warner Bros. Discovery",
#     "Waste Management",
#     "Wells Fargo",
#     "Welltower",
#     "West Pharmaceutical Services",
#     "Western Digital",
#     "Weyerhaeuser",
#     "Whirlpool",
#     "Williams Companies",
#     "Xcel Energy",
#     "Xylem",
#     "Yum! Brands",
#     "Zimmer Biomet",
#     "Zions Bancorp",
#     "Zoetis"
# ]

# # Base question template
# question_template = "What supply chain risks did {company} mention in its 10-K report?"

# # Directory to store outputs (optional)
# output_dir = "sp500_json_outputs"

# import os
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# for company in companies:
#     query = question_template.format(company=company)
#     context = f"Analyze the 10-K filing of {company} and summarize its supply chain risks."

#     prompt = f"""
# You are a financial analyst. Answer professionally and concisely.
# Question: {query}
# Context: {context}
# Answer:
# """

#     print(f"\nüîç Processing {company}...")
#     try:
#         response = client.chat.completions.create(
#             model="gpt-4o-mini",
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.2,
#         )
#         answer = response.choices[0].message.content.strip()

#         # Save to JSON
#         timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
#         output = {
#             "company": company,
#             "query": query,
#             "response": answer,
#             "timestamp": timestamp
#         }

#         filename = os.path.join(output_dir, f"{company.replace(' ', '_')}_{timestamp}.json")
#         with open(filename, "w") as f:
#             json.dump(output, f, indent=4)

#         print(f"‚úÖ Saved {company} ‚Üí {filename}")

#         # Prevent API overload
#         time.sleep(2)

#     except Exception as e:
#         print(f"‚ùå Error with {company}: {e}")
#         continue