In [None]:
!pip install sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.9.0-py3-none-any.whl.metadata (28 kB)
Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.9.0-py3-none-any.whl (33 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.9.0 sec-edgar-downloader-5.0.3


In [None]:
from sec_edgar_downloader import Downloader

In [None]:
dl = Downloader("MyRAGProject", "my.email@example.com")

In [None]:
try:
    # Get the single latest 10-K filing for Apple (ticker: AAPL)
    dl.get("10-K", "AAPL", limit=1)

    print("✅ Download successful!")
    print("Check the 'sec-edgar-filings' folder in the Colab file browser.")

except Exception as e:
    print(f"An error occurred: {e}")

✅ Download successful!
Check the 'sec-edgar-filings' folder in the Colab file browser.


In [None]:
import os
import re

# The exact path you provided for the downloaded filing
file_path = "sec-edgar-filings/AAPL/10-K/0000320193-24-000123/full-submission.txt"

# Check if the file exists before proceeding
if not os.path.exists(file_path):
    print(f"Error: The file was not found at the specified path: {file_path}")
    print("Please make sure the download was successful and the path is correct.")
else:
    print(f"Processing file: {file_path}")

    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Define the regular expression to find the 10-K document
    doc_start_pattern = re.compile(r'<DOCUMENT>\s*<TYPE>10-K', re.IGNORECASE)
    doc_end_pattern = re.compile(r'</DOCUMENT>')

    # Find the start of the 10-K document
    doc_start_match = doc_start_pattern.search(content)

    if doc_start_match:
        # Find the end of the document, starting the search after the start match
        doc_end_match = doc_end_pattern.search(content, doc_start_match.end())

        if doc_end_match:
            # Extract the content
            document_content = content[doc_start_match.start():doc_end_match.end()]

            # Save the extracted content to an HTML file
            output_filename = 'extracted_10k_report.html'
            with open(output_filename, 'w', encoding='utf-8') as f:
                f.write(document_content)

            print(f"✅ Successfully extracted the 10-K report.")
            print(f"Clean report saved as: '{output_filename}'")
        else:
            print("Error: Could not find the end of the 10-K document.")
    else:
        print("Error: Could not find the start of a 10-K document in the file.")

Processing file: sec-edgar-filings/AAPL/10-K/0000320193-24-000123/full-submission.txt
✅ Successfully extracted the 10-K report.
Clean report saved as: 'extracted_10k_report.html'


# COMPANY: APPLE

#### Downloading the .txt files


In [None]:
!pip install sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.9.0-py3-none-any.whl.metadata (28 kB)
Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.9.0-py3-none-any.whl (33 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.9.0 sec-edgar-downloader-5.0.3


In [None]:
from sec_edgar_downloader import Downloader

In [None]:
dl = Downloader("MyRAGProject", "my.email@example.com")


In [None]:
# Run in a notebook cell (Colab). Re-run if you restart the runtime.
!pip install --upgrade pip
!pip install "unstructured[all-docs]" lxml beautifulsoup4 html5lib


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting unstructured[all-docs]
  Downloading unstructured-0.18.14-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured[all-docs])
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured[all-docs])
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured[all-docs])
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting dataclasses-json (from unstructured[all-docs])
  Downloading dataclasses_json-0

In [None]:
tickers = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "TSLA", "META", "BRK-B", "JPM",
    "JNJ", "V", "PG", "MA", "UNH", "HD", "BAC", "PFE", "XOM", "DIS", "WMT",
    "CSCO", "ADBE", "NFLX", "KO", "PEP", "TMO", "AVGO", "COST", "ORCL", "CRM"
]



In [None]:
print(f"Starting the download process for {len(tickers)} companies...")
print("This may take a significant amount of time depending on the connection speed.")

for ticker in tickers:
    try:
        print(f"Downloading 10-K filings for {ticker}...")
        # Get the 8 most recent 10-K filings
        dl.get("10-K", ticker, limit=8)
        print(f"Successfully downloaded filings for {ticker}.")
    except Exception as e:
        print(f"Could not download filings for {ticker}. Error: {e}")

print("\n✅ All downloads attempted. Check the 'sec-edgar-filings' folder in the Colab file browser.")

Starting the download process for 30 companies...
This may take a significant amount of time depending on the connection speed.
Downloading 10-K filings for AAPL...
Successfully downloaded filings for AAPL.
Downloading 10-K filings for MSFT...
Successfully downloaded filings for MSFT.
Downloading 10-K filings for GOOGL...
Successfully downloaded filings for GOOGL.
Downloading 10-K filings for AMZN...
Successfully downloaded filings for AMZN.
Downloading 10-K filings for NVDA...
Successfully downloaded filings for NVDA.
Downloading 10-K filings for TSLA...
Successfully downloaded filings for TSLA.
Downloading 10-K filings for META...
Successfully downloaded filings for META.
Downloading 10-K filings for BRK-B...
Successfully downloaded filings for BRK-B.
Downloading 10-K filings for JPM...
Successfully downloaded filings for JPM.
Downloading 10-K filings for JNJ...
Successfully downloaded filings for JNJ.
Downloading 10-K filings for V...
Successfully downloaded filings for V.
Downloadi

## Using SEC-API to extract 10K SEC FILE: AAPL 2024:

#### EXTRACTING .txt and .html files

In [None]:
# Google Colab-ready script: download metadata + all 10-K items using sec-api
# Replace API_KEY with your API key before running.
# Tested approach: Query API for metadata, Extractor API for each item.

# Install dependencies (uncomment in Colab if not already installed)
!pip install requests tqdm

import os
import time
import json
import requests
from urllib.parse import urlparse
from tqdm import tqdm

# ---------------------------
# CONFIGURATION (edit this)
# ---------------------------
API_KEY = "cae6da746ce1ec9bc2226c594762a7607a52d94cb2e650dd37cd0ea7dd9333f2"   # <-- substitute your sec-api key here
FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"
OUTPUT_DIR = "./sec_extraction_output"
REQUEST_DELAY_SEC = 0.25        # polite delay between requests (adjust if necessary)
# ---------------------------

# Items to extract for a 10-K (supported list by sec-api)
TEN_K_ITEMS = [
    "1","1A","1B","1C","2","3","4","5","6","7","7A","8","9","9A","9B",
    "10","11","12","13","14","15"
]

# Per your instruction: these items should be requested as HTML (tables preserved)
HTML_ITEMS = {"5", "6", "7A", "8", "15"}

# API endpoints
QUERY_API_URL = "https://api.sec-api.io"       # POST for query metadata
EXTRACTOR_API_URL = "https://api.sec-api.io/extractor"  # GET for item extraction

# Ensure output directories exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
items_dir = os.path.join(OUTPUT_DIR, "items")
os.makedirs(items_dir, exist_ok=True)

# Utility: get filename part of filing URL
def get_filing_filename(filing_url):
    p = urlparse(filing_url)
    return os.path.basename(p.path)

# 1) Query API: get metadata about this filing
def fetch_filing_metadata(filing_url, api_key):
    """
    Uses Query API to find metadata for the filing. Searches by the filing filename.
    Returns the first matching filing metadata JSON object (or None).
    """
    filename = get_filing_filename(filing_url)
    payload = {
        "query": f'linkToFilingDetails:\"{filename}\"',
        "from": "0",
        "size": "1",
        "sort": [{ "filedAt": { "order": "desc" }}]
    }
    headers = {
        "Authorization": api_key,
        "Content-Type": "application/json"
    }
    print(f"[metadata] querying for filing with filename: {filename}")
    resp = requests.post(QUERY_API_URL, headers=headers, json=payload, timeout=30)
    if resp.status_code != 200:
        raise RuntimeError(f"Query API error {resp.status_code}: {resp.text}")
    rj = resp.json()
    # result shape: rj may have 'results' or direct array; doc sample shows array top-level response
    # Try common shapes:
    hits = None
    if isinstance(rj, dict):
        # Many responses include 'results' or 'hits'
        if "results" in rj:
            hits = rj["results"]
        elif "hits" in rj:
            hits = rj["hits"]
        elif "data" in rj:
            hits = rj["data"]
        else:
            # sometimes the API returns array under 'filings' or root 'results'
            # fallback: try top-level as list (rare)
            hits = rj.get("items") or rj.get("filings") or []
    elif isinstance(rj, list):
        hits = rj
    else:
        hits = []

    if not hits:
        print("[metadata] no hits found via Query API for that filename. Falling back to minimal metadata produced from URL.")
        # fallback metadata from URL alone:
        return {
            "source_url": filing_url,
            "note": "No metadata found via Query API for filename. Use raw filing URL as source."
        }
    # pick first
    metadata = hits[0]
    # save metadata raw to disk
    metadata_path = os.path.join(OUTPUT_DIR, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as fh:
        json.dump(metadata, fh, indent=2, ensure_ascii=False)
    print(f"[metadata] saved metadata to {metadata_path}")
    return metadata

# 2) Extractor API: fetch a single item
def fetch_item_section(filing_url, item_code, return_type, api_key):
    """
    Calls the Extractor API: GET https://api.sec-api.io/extractor?url=...&item=...&type=...&token=...
    return_type: 'text' or 'html'
    Returns tuple(status_code, text_response, response_headers)
    """
    params = {
        "url": filing_url,
        "item": item_code,
        "type": return_type,
        "token": api_key
    }
    resp = requests.get(EXTRACTOR_API_URL, params=params, timeout=60)
    return resp.status_code, resp.text, resp.headers

# 3) Orchestrator
def extract_all_items(filing_url, api_key, output_dir, items_list, html_items_set, delay=0.25):
    """
    Extract the requested items. Save to disk per item as .txt (for text) or .html (for html).
    Also create an index JSON describing saved files and extraction status.
    """
    index = {
        "filing_url": filing_url,
        "extracted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "items": {}
    }

    for item in tqdm(items_list, desc="Items"):
        typ = "html" if item.upper() in html_items_set else "text"
        print(f"\n[extract] item={item}  type={typ}")
        try:
            status_code, body, headers = fetch_item_section(filing_url, item, typ, api_key)
        except Exception as e:
            print(f"[error] request failed for item {item}: {e}")
            index["items"][item] = {
                "status": "error",
                "error": str(e)
            }
            # polite delay and continue
            time.sleep(delay)
            continue

        if status_code != 200:
            print(f"[error] Extractor API returned {status_code} for item {item}. Response snippet: {body[:500]}")
            index["items"][item] = {
                "status": "http_error",
                "http_status": status_code,
                "response_snippet": body[:1000]
            }
            # delay and continue
            time.sleep(delay)
            continue

        # choose extension
        ext = "html" if typ == "html" else "txt"
        safe_item_name = f"item_{item}.{ext}"
        outpath = os.path.join(output_dir, safe_item_name)
        with open(outpath, "w", encoding="utf-8") as fh:
            fh.write(body)

        print(f"[saved] item {item} -> {outpath} (size={len(body):,} bytes)")
        index["items"][item] = {
            "status": "ok",
            "type": typ,
            "path": outpath,
            "size_bytes": len(body)
        }

        # polite delay to avoid rate-limiting
        time.sleep(delay)

    # save index
    index_path = os.path.join(output_dir, "items_index.json")
    with open(index_path, "w", encoding="utf-8") as fh:
        json.dump(index, fh, indent=2, ensure_ascii=False)
    print(f"\n[index] saved to {index_path}")
    return index

# ---------------------------
# MAIN
# ---------------------------
def main():
    if API_KEY == "" or API_KEY.startswith("YOUR_API_KEY"):
        raise RuntimeError("Please set API_KEY variable in the script to your sec-api key before running.")

    print("Starting sec-api extraction for filing:", FILING_URL)
    # 1) fetch metadata via Query API
    metadata = fetch_filing_metadata(FILING_URL, API_KEY)

    # save a copy of the original filing URL HTML as well (optional)
    try:
        r = requests.get(FILING_URL, timeout=30)
        if r.status_code == 200:
            raw_filing_path = os.path.join(OUTPUT_DIR, get_filing_filename(FILING_URL))
            with open(raw_filing_path, "w", encoding="utf-8") as fh:
                fh.write(r.text)
            print(f"[raw] saved raw filing HTML to {raw_filing_path}")
        else:
            print(f"[raw] couldn't download raw filing HTML; status {r.status_code}")
    except Exception as e:
        print(f"[raw] error downloading raw filing HTML: {e}")

    # 2) extract all items
    index = extract_all_items(
        filing_url=FILING_URL,
        api_key=API_KEY,
        output_dir=items_dir,
        items_list=TEN_K_ITEMS,
        html_items_set=set([x.upper() for x in HTML_ITEMS]),
        delay=REQUEST_DELAY_SEC
    )

    print("\nAll done. Output directory:", OUTPUT_DIR)
    print("Files saved:")
    for it, info in index["items"].items():
        print("  ", it, "->", info.get("path", info.get("error")))

if __name__ == "__main__":
    main()


Starting sec-api extraction for filing: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
[metadata] querying for filing with filename: aapl-20240928.htm
[metadata] saved metadata to ./sec_extraction_output/metadata.json
[raw] couldn't download raw filing HTML; status 403


Items:   0%|          | 0/21 [00:00<?, ?it/s]


[extract] item=1  type=text
[saved] item 1 -> ./sec_extraction_output/items/item_1.txt (size=16,339 bytes)


Items:   5%|▍         | 1/21 [00:00<00:11,  1.76it/s]


[extract] item=1A  type=text
[saved] item 1A -> ./sec_extraction_output/items/item_1A.txt (size=70,585 bytes)


Items:  10%|▉         | 2/21 [00:01<00:11,  1.66it/s]


[extract] item=1B  type=text
[saved] item 1B -> ./sec_extraction_output/items/item_1B.txt (size=46 bytes)


Items:  14%|█▍        | 3/21 [00:01<00:10,  1.74it/s]


[extract] item=1C  type=text
[saved] item 1C -> ./sec_extraction_output/items/item_1C.txt (size=2,840 bytes)


Items:  19%|█▉        | 4/21 [00:02<00:09,  1.74it/s]


[extract] item=2  type=text
[saved] item 2 -> ./sec_extraction_output/items/item_2.txt (size=499 bytes)


Items:  24%|██▍       | 5/21 [00:02<00:09,  1.70it/s]


[extract] item=3  type=text
[saved] item 3 -> ./sec_extraction_output/items/item_3.txt (size=4,496 bytes)


Items:  29%|██▊       | 6/21 [00:03<00:08,  1.72it/s]


[extract] item=4  type=text
[saved] item 4 -> ./sec_extraction_output/items/item_4.txt (size=98 bytes)


Items:  33%|███▎      | 7/21 [00:04<00:07,  1.75it/s]


[extract] item=5  type=html
[saved] item 5 -> ./sec_extraction_output/items/item_5.html (size=31,969 bytes)


Items:  38%|███▊      | 8/21 [00:04<00:07,  1.74it/s]


[extract] item=6  type=html
[saved] item 6 -> ./sec_extraction_output/items/item_6.html (size=710 bytes)


Items:  43%|████▎     | 9/21 [00:05<00:06,  1.77it/s]


[extract] item=7  type=text
[saved] item 7 -> ./sec_extraction_output/items/item_7.txt (size=16,122 bytes)


Items:  48%|████▊     | 10/21 [00:05<00:06,  1.75it/s]


[extract] item=7A  type=html
[saved] item 7A -> ./sec_extraction_output/items/item_7A.html (size=11,355 bytes)


Items:  52%|█████▏    | 11/21 [00:06<00:05,  1.77it/s]


[extract] item=8  type=html
[saved] item 8 -> ./sec_extraction_output/items/item_8.html (size=865,455 bytes)


Items:  57%|█████▋    | 12/21 [00:07<00:05,  1.64it/s]


[extract] item=9  type=text
[saved] item 9 -> ./sec_extraction_output/items/item_9.txt (size=104 bytes)


Items:  62%|██████▏   | 13/21 [00:07<00:04,  1.70it/s]


[extract] item=9A  type=text
[saved] item 9A -> ./sec_extraction_output/items/item_9A.txt (size=4,665 bytes)


Items:  67%|██████▋   | 14/21 [00:08<00:04,  1.74it/s]


[extract] item=9B  type=text
[saved] item 9B -> ./sec_extraction_output/items/item_9B.txt (size=1,401 bytes)


Items:  71%|███████▏  | 15/21 [00:08<00:03,  1.76it/s]


[extract] item=10  type=text
[saved] item 10 -> ./sec_extraction_output/items/item_10.txt (size=1,074 bytes)


Items:  76%|███████▌  | 16/21 [00:09<00:02,  1.78it/s]


[extract] item=11  type=text
[saved] item 11 -> ./sec_extraction_output/items/item_11.txt (size=162 bytes)


Items:  81%|████████  | 17/21 [00:09<00:02,  1.76it/s]


[extract] item=12  type=text
[saved] item 12 -> ./sec_extraction_output/items/item_12.txt (size=234 bytes)


Items:  86%|████████▌ | 18/21 [00:10<00:01,  1.78it/s]


[extract] item=13  type=text
[saved] item 13 -> ./sec_extraction_output/items/item_13.txt (size=213 bytes)


Items:  90%|█████████ | 19/21 [00:10<00:01,  1.79it/s]


[extract] item=14  type=text
[saved] item 14 -> ./sec_extraction_output/items/item_14.txt (size=223 bytes)


Items:  95%|█████████▌| 20/21 [00:11<00:00,  1.80it/s]


[extract] item=15  type=html
[saved] item 15 -> ./sec_extraction_output/items/item_15.html (size=145,349 bytes)


Items: 100%|██████████| 21/21 [00:12<00:00,  1.75it/s]


[index] saved to ./sec_extraction_output/items/items_index.json

All done. Output directory: ./sec_extraction_output
Files saved:
   1 -> ./sec_extraction_output/items/item_1.txt
   1A -> ./sec_extraction_output/items/item_1A.txt
   1B -> ./sec_extraction_output/items/item_1B.txt
   1C -> ./sec_extraction_output/items/item_1C.txt
   2 -> ./sec_extraction_output/items/item_2.txt
   3 -> ./sec_extraction_output/items/item_3.txt
   4 -> ./sec_extraction_output/items/item_4.txt
   5 -> ./sec_extraction_output/items/item_5.html
   6 -> ./sec_extraction_output/items/item_6.html
   7 -> ./sec_extraction_output/items/item_7.txt
   7A -> ./sec_extraction_output/items/item_7A.html
   8 -> ./sec_extraction_output/items/item_8.html
   9 -> ./sec_extraction_output/items/item_9.txt
   9A -> ./sec_extraction_output/items/item_9A.txt
   9B -> ./sec_extraction_output/items/item_9B.txt
   10 -> ./sec_extraction_output/items/item_10.txt
   11 -> ./sec_extraction_output/items/item_11.txt
   12 -> ./sec_e




In [None]:
# Google Colab-ready script: download metadata + all 10-K items using sec-api
# Replace API_KEY with your API key before running.
# Tested approach: Query API for metadata, Extractor API for each item.
#
# EDIT: For Item 5 this script will fetch and save BOTH type=text and type=html outputs.

# Install dependencies (uncomment in Colab if not already installed)
# !pip install requests tqdm

import os
import time
import json
import requests
from urllib.parse import urlparse
from tqdm import tqdm

# ---------------------------
# CONFIGURATION (edit this)
# ---------------------------
API_KEY = "cae6da746ce1ec9bc2226c594762a7607a52d94cb2e650dd37cd0ea7dd9333f2"   # <-- substitute your sec-api key here
FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"
OUTPUT_DIR = "./sec_extraction_output"
REQUEST_DELAY_SEC = 0.25        # polite delay between requests (adjust if necessary)
# ---------------------------

# Items to extract for a 10-K (supported list by sec-api)
TEN_K_ITEMS = [
    "1","1A","1B","1C","2","3","4","5","6","7","7A","8","9","9A","9B",
    "10","11","12","13","14","15"
]

# Per your instruction: these items should be requested as HTML (tables preserved)
HTML_ITEMS = {"5", "6", "7A", "8", "15"}

# API endpoints
QUERY_API_URL = "https://api.sec-api.io"       # POST for query metadata
EXTRACTOR_API_URL = "https://api.sec-api.io/extractor"  # GET for item extraction

# Ensure output directories exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
items_dir = os.path.join(OUTPUT_DIR, "items")
os.makedirs(items_dir, exist_ok=True)

# Utility: get filename part of filing URL
def get_filing_filename(filing_url):
    p = urlparse(filing_url)
    return os.path.basename(p.path)

# 1) Query API: get metadata about this filing
def fetch_filing_metadata(filing_url, api_key):
    """
    Uses Query API to find metadata for the filing. Searches by the filing filename.
    Returns the first matching filing metadata JSON object (or None).
    """
    filename = get_filing_filename(filing_url)
    payload = {
        "query": f'linkToFilingDetails:\"{filename}\"',
        "from": "0",
        "size": "1",
        "sort": [{ "filedAt": { "order": "desc" }}]
    }
    headers = {
        "Authorization": api_key,
        "Content-Type": "application/json"
    }
    print(f"[metadata] querying for filing with filename: {filename}")
    resp = requests.post(QUERY_API_URL, headers=headers, json=payload, timeout=30)
    if resp.status_code != 200:
        raise RuntimeError(f"Query API error {resp.status_code}: {resp.text}")
    rj = resp.json()
    # result shape: rj may have 'results' or direct array; doc sample shows array top-level response
    # Try common shapes:
    hits = None
    if isinstance(rj, dict):
        # Many responses include 'results' or 'hits'
        if "results" in rj:
            hits = rj["results"]
        elif "hits" in rj:
            hits = rj["hits"]
        elif "data" in rj:
            hits = rj["data"]
        else:
            # sometimes the API returns array under 'filings' or root 'results'
            # fallback: try top-level as list (rare)
            hits = rj.get("items") or rj.get("filings") or []
    elif isinstance(rj, list):
        hits = rj
    else:
        hits = []

    if not hits:
        print("[metadata] no hits found via Query API for that filename. Falling back to minimal metadata produced from URL.")
        # fallback metadata from URL alone:
        return {
            "source_url": filing_url,
            "note": "No metadata found via Query API for filename. Use raw filing URL as source."
        }
    # pick first
    metadata = hits[0]
    # save metadata raw to disk
    metadata_path = os.path.join(OUTPUT_DIR, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as fh:
        json.dump(metadata, fh, indent=2, ensure_ascii=False)
    print(f"[metadata] saved metadata to {metadata_path}")
    return metadata

# 2) Extractor API: fetch a single item
def fetch_item_section(filing_url, item_code, return_type, api_key):
    """
    Calls the Extractor API: GET https://api.sec-api.io/extractor?url=...&item=...&type=...&token=...
    return_type: 'text' or 'html'
    Returns tuple(status_code, text_response, response_headers)
    """
    params = {
        "url": filing_url,
        "item": item_code,
        "type": return_type,
        "token": api_key
    }
    resp = requests.get(EXTRACTOR_API_URL, params=params, timeout=60)
    return resp.status_code, resp.text, resp.headers

# 3) Orchestrator
def extract_all_items(filing_url, api_key, output_dir, items_list, html_items_set, delay=0.25):
    """
    Extract the requested items. Save to disk per item as .txt (for text) or .html (for html).
    For item '5' this function will save BOTH .txt and .html versions.
    Also create an index JSON describing saved files and extraction status.
    """
    index = {
        "filing_url": filing_url,
        "extracted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "items": {}
    }

    for item in tqdm(items_list, desc="Items"):
        # For item '5' specifically, fetch BOTH text and html
        if str(item).strip() == "5":
            index["items"][item] = {}
            # 1) fetch text
            try:
                print(f"\n[extract] item={item}  type=text (saving both text and html for this item)")
                status_code, body, headers = fetch_item_section(filing_url, item, "text", api_key)
            except Exception as e:
                print(f"[error] request failed for item {item} (text): {e}")
                index["items"][item]["text"] = {
                    "status": "error",
                    "error": str(e)
                }
                # proceed to try html version after delay
                time.sleep(delay)
                # continue to html fetch below
            else:
                if status_code != 200:
                    print(f"[error] Extractor API returned {status_code} for item {item} (text). Response snippet: {body[:500]}")
                    index["items"][item]["text"] = {
                        "status": "http_error",
                        "http_status": status_code,
                        "response_snippet": body[:1000]
                    }
                else:
                    ext = "txt"
                    safe_item_name = f"item_{item}.{ext}"
                    outpath = os.path.join(output_dir, safe_item_name)
                    with open(outpath, "w", encoding="utf-8") as fh:
                        fh.write(body)
                    print(f"[saved] item {item} (text) -> {outpath} (size={len(body):,} bytes)")
                    index["items"][item]["text"] = {
                        "status": "ok",
                        "type": "text",
                        "path": outpath,
                        "size_bytes": len(body)
                    }
            # polite delay before html fetch
            time.sleep(delay)

            # 2) fetch html
            try:
                print(f"[extract] item={item}  type=html (saving both text and html for this item)")
                status_code, body, headers = fetch_item_section(filing_url, item, "html", api_key)
            except Exception as e:
                print(f"[error] request failed for item {item} (html): {e}")
                index["items"][item]["html"] = {
                    "status": "error",
                    "error": str(e)
                }
                time.sleep(delay)
                continue
            if status_code != 200:
                print(f"[error] Extractor API returned {status_code} for item {item} (html). Response snippet: {body[:500]}")
                index["items"][item]["html"] = {
                    "status": "http_error",
                    "http_status": status_code,
                    "response_snippet": body[:1000]
                }
            else:
                ext = "html"
                safe_item_name = f"item_{item}.{ext}"
                outpath = os.path.join(output_dir, safe_item_name)
                with open(outpath, "w", encoding="utf-8") as fh:
                    fh.write(body)
                print(f"[saved] item {item} (html) -> {outpath} (size={len(body):,} bytes)")
                index["items"][item]["html"] = {
                    "status": "ok",
                    "type": "html",
                    "path": outpath,
                    "size_bytes": len(body)
                }

            # polite delay before moving to next item
            time.sleep(delay)
            continue  # done with item 5, move to next item

        # For all other items, follow original policy: request html if listed in HTML_ITEMS else text
        typ = "html" if str(item).upper() in html_items_set else "text"
        print(f"\n[extract] item={item}  type={typ}")
        try:
            status_code, body, headers = fetch_item_section(filing_url, item, typ, api_key)
        except Exception as e:
            print(f"[error] request failed for item {item}: {e}")
            index["items"][item] = {
                "status": "error",
                "error": str(e)
            }
            # polite delay and continue
            time.sleep(delay)
            continue

        if status_code != 200:
            print(f"[error] Extractor API returned {status_code} for item {item}. Response snippet: {body[:500]}")
            index["items"][item] = {
                "status": "http_error",
                "http_status": status_code,
                "response_snippet": body[:1000]
            }
            # delay and continue
            time.sleep(delay)
            continue

        # choose extension
        ext = "html" if typ == "html" else "txt"
        safe_item_name = f"item_{item}.{ext}"
        outpath = os.path.join(output_dir, safe_item_name)
        with open(outpath, "w", encoding="utf-8") as fh:
            fh.write(body)

        print(f"[saved] item {item} -> {outpath} (size={len(body):,} bytes)")
        index["items"][item] = {
            "status": "ok",
            "type": typ,
            "path": outpath,
            "size_bytes": len(body)
        }

        # polite delay to avoid rate-limiting
        time.sleep(delay)

    # save index
    index_path = os.path.join(output_dir, "items_index.json")
    with open(index_path, "w", encoding="utf-8") as fh:
        json.dump(index, fh, indent=2, ensure_ascii=False)
    print(f"\n[index] saved to {index_path}")
    return index

# ---------------------------
# MAIN
# ---------------------------
def main():
    if API_KEY == "" or API_KEY.startswith("YOUR_API_KEY"):
        raise RuntimeError("Please set API_KEY variable in the script to your sec-api key before running.")

    print("Starting sec-api extraction for filing:", FILING_URL)
    # 1) fetch metadata via Query API
    metadata = fetch_filing_metadata(FILING_URL, API_KEY)

    # save a copy of the original filing URL HTML as well (optional)
    try:
        r = requests.get(FILING_URL, timeout=30)
        if r.status_code == 200:
            raw_filing_path = os.path.join(OUTPUT_DIR, get_filing_filename(FILING_URL))
            with open(raw_filing_path, "w", encoding="utf-8") as fh:
                fh.write(r.text)
            print(f"[raw] saved raw filing HTML to {raw_filing_path}")
        else:
            print(f"[raw] couldn't download raw filing HTML; status {r.status_code}")
    except Exception as e:
        print(f"[raw] error downloading raw filing HTML: {e}")

    # 2) extract all items
    index = extract_all_items(
        filing_url=FILING_URL,
        api_key=API_KEY,
        output_dir=items_dir,
        items_list=TEN_K_ITEMS,
        html_items_set=set([x.upper() for x in HTML_ITEMS]),
        delay=REQUEST_DELAY_SEC
    )

    print("\nAll done. Output directory:", OUTPUT_DIR)
    print("Files saved:")
    for it, info in index["items"].items():
        if isinstance(info, dict) and ("text" in info or "html" in info):
            print("  ", it)
            if "text" in info:
                print("    text ->", info["text"].get("path", info["text"].get("error")))
            if "html" in info:
                print("    html ->", info["html"].get("path", info["html"].get("error")))
        else:
            print("  ", it, "->", info.get("path", info.get("error")))

if __name__ == "__main__":
    main()


Starting sec-api extraction for filing: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
[metadata] querying for filing with filename: aapl-20240928.htm
[metadata] saved metadata to ./sec_extraction_output/metadata.json
[raw] couldn't download raw filing HTML; status 403


Items:   0%|          | 0/21 [00:00<?, ?it/s]


[extract] item=1  type=text
[saved] item 1 -> ./sec_extraction_output/items/item_1.txt (size=16,339 bytes)


Items:   5%|▍         | 1/21 [00:01<00:23,  1.18s/it]


[extract] item=1A  type=text
[saved] item 1A -> ./sec_extraction_output/items/item_1A.txt (size=70,585 bytes)


Items:  10%|▉         | 2/21 [00:02<00:23,  1.24s/it]


[extract] item=1B  type=text
[saved] item 1B -> ./sec_extraction_output/items/item_1B.txt (size=46 bytes)


Items:  14%|█▍        | 3/21 [00:03<00:21,  1.18s/it]


[extract] item=1C  type=text
[saved] item 1C -> ./sec_extraction_output/items/item_1C.txt (size=2,840 bytes)


Items:  19%|█▉        | 4/21 [00:04<00:19,  1.13s/it]


[extract] item=2  type=text
[saved] item 2 -> ./sec_extraction_output/items/item_2.txt (size=499 bytes)


Items:  24%|██▍       | 5/21 [00:05<00:17,  1.10s/it]


[extract] item=3  type=text
[saved] item 3 -> ./sec_extraction_output/items/item_3.txt (size=4,496 bytes)


Items:  29%|██▊       | 6/21 [00:06<00:16,  1.13s/it]


[extract] item=4  type=text
[saved] item 4 -> ./sec_extraction_output/items/item_4.txt (size=98 bytes)


Items:  33%|███▎      | 7/21 [00:07<00:15,  1.09s/it]


[extract] item=5  type=text (saving both text and html for this item)
[saved] item 5 (text) -> ./sec_extraction_output/items/item_5.txt (size=2,725 bytes)
[extract] item=5  type=html (saving both text and html for this item)
[saved] item 5 (html) -> ./sec_extraction_output/items/item_5.html (size=31,969 bytes)


Items:  38%|███▊      | 8/21 [00:10<00:19,  1.46s/it]


[extract] item=6  type=html
[saved] item 6 -> ./sec_extraction_output/items/item_6.html (size=710 bytes)


Items:  43%|████▎     | 9/21 [00:11<00:16,  1.35s/it]


[extract] item=7  type=text
[saved] item 7 -> ./sec_extraction_output/items/item_7.txt (size=16,122 bytes)


Items:  48%|████▊     | 10/21 [00:12<00:14,  1.28s/it]


[extract] item=7A  type=html
[saved] item 7A -> ./sec_extraction_output/items/item_7A.html (size=11,355 bytes)


Items:  52%|█████▏    | 11/21 [00:13<00:12,  1.21s/it]


[extract] item=8  type=html
[saved] item 8 -> ./sec_extraction_output/items/item_8.html (size=865,455 bytes)


Items:  57%|█████▋    | 12/21 [00:14<00:11,  1.29s/it]


[extract] item=9  type=text
[saved] item 9 -> ./sec_extraction_output/items/item_9.txt (size=104 bytes)


Items:  62%|██████▏   | 13/21 [00:15<00:09,  1.24s/it]


[extract] item=9A  type=text
[saved] item 9A -> ./sec_extraction_output/items/item_9A.txt (size=4,665 bytes)


Items:  67%|██████▋   | 14/21 [00:17<00:08,  1.18s/it]


[extract] item=9B  type=text
[saved] item 9B -> ./sec_extraction_output/items/item_9B.txt (size=1,401 bytes)


Items:  71%|███████▏  | 15/21 [00:18<00:06,  1.14s/it]


[extract] item=10  type=text
[saved] item 10 -> ./sec_extraction_output/items/item_10.txt (size=1,074 bytes)


Items:  76%|███████▌  | 16/21 [00:19<00:05,  1.12s/it]


[extract] item=11  type=text
[saved] item 11 -> ./sec_extraction_output/items/item_11.txt (size=162 bytes)


Items:  81%|████████  | 17/21 [00:20<00:04,  1.10s/it]


[extract] item=12  type=text
[saved] item 12 -> ./sec_extraction_output/items/item_12.txt (size=234 bytes)


Items:  86%|████████▌ | 18/21 [00:21<00:03,  1.16s/it]


[extract] item=13  type=text
[saved] item 13 -> ./sec_extraction_output/items/item_13.txt (size=213 bytes)


Items:  90%|█████████ | 19/21 [00:22<00:02,  1.13s/it]


[extract] item=14  type=text
[saved] item 14 -> ./sec_extraction_output/items/item_14.txt (size=223 bytes)


Items:  95%|█████████▌| 20/21 [00:23<00:01,  1.11s/it]


[extract] item=15  type=html
[saved] item 15 -> ./sec_extraction_output/items/item_15.html (size=145,349 bytes)


Items: 100%|██████████| 21/21 [00:24<00:00,  1.18s/it]


[index] saved to ./sec_extraction_output/items/items_index.json

All done. Output directory: ./sec_extraction_output
Files saved:
   1 -> ./sec_extraction_output/items/item_1.txt
   1A -> ./sec_extraction_output/items/item_1A.txt
   1B -> ./sec_extraction_output/items/item_1B.txt
   1C -> ./sec_extraction_output/items/item_1C.txt
   2 -> ./sec_extraction_output/items/item_2.txt
   3 -> ./sec_extraction_output/items/item_3.txt
   4 -> ./sec_extraction_output/items/item_4.txt
   5
    text -> ./sec_extraction_output/items/item_5.txt
    html -> ./sec_extraction_output/items/item_5.html
   6 -> ./sec_extraction_output/items/item_6.html
   7 -> ./sec_extraction_output/items/item_7.txt
   7A -> ./sec_extraction_output/items/item_7A.html
   8 -> ./sec_extraction_output/items/item_8.html
   9 -> ./sec_extraction_output/items/item_9.txt
   9A -> ./sec_extraction_output/items/item_9A.txt
   9B -> ./sec_extraction_output/items/item_9B.txt
   10 -> ./sec_extraction_output/items/item_10.txt
   11




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = '/content/drive/MyDrive/item_1.txt'

try:
        with open(file_path, 'r') as file:
            content = file.read()  # Reads the entire content of the file
            print(content)
except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
        print(f"An error occurred: {e}")

 Item 1. Business 

Company Background 

The Company designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services. The Company&#8217;s fiscal year is the 52- or 53-week period that ends on the last Saturday of September. 

Products 

iPhone 

iPhone &#174; is the Company&#8217;s line of smartphones based on its iOS operating system. The iPhone line includes iPhone 16 Pro, iPhone 16, iPhone 15, iPhone 14 and iPhone SE &#174; . 

Mac 

Mac &#174; is the Company&#8217;s line of personal computers based on its macOS &#174; operating system. The Mac line includes laptops MacBook Air &#174; and MacBook Pro &#174; , as well as desktops iMac &#174; , Mac mini &#174; , Mac Studio &#174; and Mac Pro &#174; . 

iPad 

iPad &#174; is the Company&#8217;s line of multipurpose tablets based on its iPadOS &#174; operating system. The iPad line includes iPad Pro &#174; , iPad Air &#174; , iPad and iPad mini &#174; . 

Wea

In [None]:
from google.colab import drive
import os


# 2. Set the folder path where your .txt files are stored
# Example: "/content/drive/My Drive/sec_filings"
folder_path = "/content/drive/My Drive/SEC-API/AAPL/txt_files"

# 3. List and print contents of all .txt files
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(root, file)
            print(f"\n--- {file_path} ---")
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    print(content[:2000])  # print first 2000 chars for preview
            except Exception as e:
                print(f"Error reading {file_path}: {e}")



--- /content/drive/My Drive/SEC-API/AAPL/txt_files/item_13.txt ---
 Item 13. Certain Relationships and Related Transactions, and Director Independence 

The information required by this Item will be included in the 2025 Proxy Statement, and is incorporated herein by reference. 



--- /content/drive/My Drive/SEC-API/AAPL/txt_files/item_9B.txt ---
 Item 9B. Other Information 

Insider Trading Arrangements 

On August 27, 2024 , Deirdre O&#8217;Brien , the Company&#8217;s Senior Vice President, Retail , entered into a trading plan intended to satisfy the affirmative defense conditions of Rule 10b5-1(c) under the Exchange Act. The plan provides for the sale, subject to certain price limits, of shares vesting between April 1, 2025 and October 1, 2026, pursuant to certain equity awards granted to Ms. O&#8217;Brien, excluding any shares withheld by the Company to satisfy income tax withholding and remittance obligations. Ms. O&#8217;Brien&#8217;s plan will expire on December 31, 2026 , subj

#### .txt -> JSON conversion

In [None]:
# Converting all the .txt files to JSON files
# Colab: txt -> structured JSON (items, headings, subsections, paragraphs)
# Edit paths below to point to your txt files and desired output locations.

import os, re, json, time, html, unicodedata
from datetime import datetime

# ----------------- CONFIG (edit these) -----------------
INPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/txt_files"   # path with .txt files
OUTPUT_DIR_LOCAL = "/content/converted_json_local"           # local colab output (viewable in Files)
OUTPUT_DIR_DRIVE = "/content/drive/My Drive/SEC-API/AAPL/JSON_files"  # optional: set to None to skip saving to Drive
# ------------------------------------------------------

os.makedirs(OUTPUT_DIR_LOCAL, exist_ok=True)
if OUTPUT_DIR_DRIVE:
    os.makedirs(OUTPUT_DIR_DRIVE, exist_ok=True)

# ----------------- Helpers -----------------
def normalize_text(s: str) -> str:
    """Decode HTML entities and normalize unicode, but keep paragraphs/newlines."""
    if not isinstance(s, str):
        s = str(s)
    s = html.unescape(s)                         # &#8217; -> ’ etc.
    s = unicodedata.normalize("NFKC", s)         # canonicalize
    # Normalize line endings
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # Remove trailing spaces on each line but preserve blank lines
    s = "\n".join([ln.rstrip() for ln in s.split("\n")])
    # Collapse multiple trailing spaces
    s = re.sub(r"[ \t]+", " ", s)
    return s

def paragraph_iterator_with_offsets(text: str):
    """
    Yield (para_text, start_offset, end_offset) for each paragraph.
    Paragraphs are sequences separated by 1+ blank lines.
    """
    # Use regex to capture blocks separated by blank lines
    pattern = re.compile(r'(.*?)(?:\n\s*\n|$)', re.S)
    for m in pattern.finditer(text):
        para = m.group(1)
        if para is None:
            continue
        # strip leading/trailing newlines/spaces
        stripped = para.strip()
        if not stripped:
            continue
        start = m.start(1) + (len(para) - len(para.lstrip()))  # adjust for leading spaces trimmed by strip
        end = m.end(1) - (len(para) - len(para.rstrip()))
        # ensure offsets correspond to stripped text
        yield stripped, start, end

# Heading heuristics
ITEM_RE = re.compile(r'^\s*Item\s+(\d+[A-Za-z]?)(?:[\.\:\-]?\s*)(.*)$', re.IGNORECASE)
# Titles that strongly indicate a subsection (domain-specific common section titles)
TITLE_KEYWORDS = set([
    "Business","Risk Factors","Management","Financial Statements", "Executive Compensation",
    "Directors", "Legal Proceedings", "Properties", "Market for Registrant", "Controls and Procedures",
    "Management’s Discussion", "MD&A", "Quantitative and Qualitative", "Liquidity", "Capital Resources"
])

def detect_heading(para_text: str):
    """
    Returns:
      {
        'is_item': bool,
        'item_num': str or None,
        'item_title': str or None,
        'is_subheading': bool,
        'heading_text': str or None,
        'heading_level': int (1=item,2=subheading,3=subsub),
        'confidence': float 0..1
      }
    """
    pt = para_text.strip()
    # 1) Match explicit Item headings first
    m = ITEM_RE.match(pt)
    if m:
        num = m.group(1).strip()
        title = m.group(2).strip() if m.group(2) else ""
        if not title:
            # Sometimes the next paragraph is the title; still classify as item header
            title = ""
        return {
            "is_item": True,
            "item_num": num,
            "item_title": title,
            "is_subheading": False,
            "heading_text": f"Item {num} {title}".strip(),
            "heading_level": 1,
            "confidence": 0.99
        }

    # 2) Look for short title-like paragraphs (subheadings)
    words = re.findall(r"\w+['’]?\w*|\w+", pt)
    word_count = len(words)
    punct_count = len(re.findall(r'[.!?;:,]', pt))
    # capitalized-word ratio
    cap_words = sum(1 for w in words if len(w)>0 and w[0].isupper())
    cap_ratio = cap_words / max(1, word_count)

    # If ends with colon it's likely a heading
    if pt.endswith(":") or pt.endswith("—"):
        return {
            "is_item": False,
            "item_num": None,
            "item_title": None,
            "is_subheading": True,
            "heading_text": pt.rstrip(":—"),
            "heading_level": 2,
            "confidence": 0.9
        }

    # Short paragraphs (<= 120 chars) with many Title-case words and low punctuation => heading
    if len(pt) <= 140 and cap_ratio > 0.55 and punct_count <= 1 and word_count <= 12:
        # If contains known keyword, raise confidence
        kws = [k for k in TITLE_KEYWORDS if k.lower() in pt.lower()]
        conf = 0.85 if kws else 0.7
        # level: ALL CAPS or high cap_ratio => level 2, else 3
        is_all_caps = pt.upper() == pt and sum(1 for ch in pt if ch.isalpha())>0
        level = 2 if is_all_caps or cap_ratio > 0.8 else 3
        return {
            "is_item": False,
            "item_num": None,
            "item_title": None,
            "is_subheading": True,
            "heading_text": pt,
            "heading_level": level,
            "confidence": conf
        }

    # If paragraph length < ~40 and contains few tokens, treat as possible heading
    if len(pt) < 60 and word_count <= 6 and punct_count <= 1:
        return {
            "is_item": False,
            "item_num": None,
            "item_title": None,
            "is_subheading": True,
            "heading_text": pt,
            "heading_level": 3,
            "confidence": 0.6
        }

    # default: not a heading
    return {
        "is_item": False,
        "item_num": None,
        "item_title": None,
        "is_subheading": False,
        "heading_text": None,
        "heading_level": None,
        "confidence": 0.0
    }

# Detect table placeholders (common markers or cues)
TABLE_MARKERS = re.compile(r'(##TABLE_START|##TABLE_END|\bTable\b|\bTABLE\b|\btable\b|\bSchedule\b)', re.IGNORECASE)

def contains_table_marker(para_text: str):
    return bool(TABLE_MARKERS.search(para_text))

# ----------------- Core conversion -----------------
def parse_txt_file_to_structure(filepath):
    with open(filepath, "r", encoding="utf-8", errors="replace") as fh:
        raw = fh.read()

    clean = normalize_text(raw)

    # Build base document meta
    doc = {
        "file_name": os.path.basename(filepath),
        "file_path": os.path.abspath(filepath),
        "parsed_at": datetime.utcnow().isoformat() + "Z",
        "original_size_bytes": len(raw),
        "items": []
    }

    # iterate paragraphs with offsets
    current_item = None
    # We keep a stack for subsection nesting: list of dicts (subsection nodes)
    subsection_stack = []

    for para_text, start_off, end_off in paragraph_iterator_with_offsets(clean):
        # detect table placeholder
        table_present = contains_table_marker(para_text)

        # detect heading (item or subsection)
        heading = detect_heading(para_text)

        # If it's an Item header -> start new item
        if heading["is_item"]:
            # finalize previous item
            current_item = {
                "item_number": heading["item_num"],
                "item_title": heading["item_title"] or heading["heading_text"],
                "heading_text": heading["heading_text"],
                "start_offset": start_off,
                "end_offset": end_off,
                "confidence": heading["confidence"],
                "raw_text": para_text,
                "clean_text": para_text,
                "subsections": [],
                "paragraphs": [],
                "tables": []
            }
            doc["items"].append(current_item)
            subsection_stack = []  # reset subsection context
            continue

        # If no current item yet, create a default top-level pseudo-item
        if current_item is None:
            current_item = {
                "item_number": None,
                "item_title": None,
                "heading_text": None,
                "start_offset": 0,
                "end_offset": 0,
                "confidence": 0.0,
                "raw_text": "",
                "clean_text": "",
                "subsections": [],
                "paragraphs": [],
                "tables": []
            }
            doc["items"].append(current_item)
            subsection_stack = []

        # If heading detected (subheading)
        if heading["is_subheading"]:
            node = {
                "title": heading["heading_text"],
                "heading_level": heading["heading_level"],
                "confidence": heading["confidence"],
                "start_offset": start_off,
                "end_offset": end_off,
                "raw_text": para_text,
                "clean_text": para_text,
                "paragraphs": [],
                "subsections": []
            }
            # Decide placement by level relative to last subsection on stack
            if not subsection_stack:
                current_item["subsections"].append(node)
                subsection_stack.append(node)
            else:
                # pop levels until stack top has heading_level < current node level
                while subsection_stack and (subsection_stack[-1].get("heading_level") or 2) >= node["heading_level"]:
                    subsection_stack.pop()
                if subsection_stack:
                    subsection_stack[-1]["subsections"].append(node)
                else:
                    current_item["subsections"].append(node)
                subsection_stack.append(node)
            continue

        # Otherwise treat as normal paragraph: attach to deepest subsection if present, else to item paragraphs
        p_obj = {
            "text": para_text,
            "clean_text": para_text,
            "raw_text": para_text,
            "start_offset": start_off,
            "end_offset": end_off,
            "contains_table_marker": table_present
        }
        if subsection_stack:
            subsection_stack[-1]["paragraphs"].append(p_obj)
        else:
            current_item["paragraphs"].append(p_obj)

        # record table placeholders referencing current item/subsection
        if table_present:
            table_obj = {
                "placeholder": True,
                "note": "table marker detected in paragraph",
                "paragraph_start_offset": start_off,
                "paragraph_end_offset": end_off,
                "paragraph_snippet": para_text[:240]
            }
            current_item["tables"].append(table_obj)

        # update item end offset
        current_item["end_offset"] = end_off

    # After iteration, set a few summary fields
    for it in doc["items"]:
        if it.get("item_title") is None:
            # try to infer from first subsection title if present
            if it["subsections"]:
                it["item_title"] = it["subsections"][0].get("title")
        # update sizes
        it["paragraph_count"] = len(it.get("paragraphs", []))
        # count nested paragraphs
        nested_count = 0
        def count_sub_ps(snode):
            nonlocal nested_count
            nested_count += len(snode.get("paragraphs", []))
            for ss in snode.get("subsections", []):
                count_sub_ps(ss)
        for s in it.get("subsections", []):
            count_sub_ps(s)
        it["nested_paragraph_count"] = nested_count

    return doc

# ----------------- Run over INPUT_DIR -----------------
def process_all_txts(input_dir, out_local, out_drive=None, verbose=True):
    stats = {"files_processed": 0, "errors": []}
    for root, dirs, files in os.walk(input_dir):
        for fname in sorted(files):
            if not fname.lower().endswith(".txt"):
                continue
            fpath = os.path.join(root, fname)
            try:
                docobj = parse_txt_file_to_structure(fpath)
                basename = os.path.splitext(fname)[0]
                outname = basename + ".json"
                local_path = os.path.join(out_local, outname)
                with open(local_path, "w", encoding="utf-8") as fh:
                    json.dump(docobj, fh, indent=2, ensure_ascii=False)
                if out_drive:
                    drive_path = os.path.join(out_drive, outname)
                    with open(drive_path, "w", encoding="utf-8") as fh2:
                        json.dump(docobj, fh2, indent=2, ensure_ascii=False)
                stats["files_processed"] += 1
                if verbose:
                    print(f"[OK] parsed: {fpath} → {local_path} (items: {len(docobj['items'])})")
            except Exception as e:
                stats["errors"].append({"file": fpath, "error": str(e)})
                print(f"[ERROR] {fpath}: {e}")
    return stats

# ----------------- Execute -----------------
print("INPUT_DIR:", INPUT_DIR)
print("OUTPUT_DIR_LOCAL:", OUTPUT_DIR_LOCAL)
print("OUTPUT_DIR_DRIVE:", OUTPUT_DIR_DRIVE)
print("Starting processing ...")
t0 = time.time()
stats = process_all_txts(INPUT_DIR, OUTPUT_DIR_LOCAL, OUTPUT_DIR_DRIVE, verbose=True)
t1 = time.time()
print("\nDone. Files processed:", stats["files_processed"], "Errors:", len(stats["errors"]))
print("Elapsed: %.2f sec" % (t1-t0))
if stats["errors"]:
    print("Error samples:", stats["errors"][:3])


INPUT_DIR: /content/drive/My Drive/SEC-API/AAPL/txt_files
OUTPUT_DIR_LOCAL: /content/converted_json_local
OUTPUT_DIR_DRIVE: /content/drive/My Drive/SEC-API/AAPL/JSON_files
Starting processing ...
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_1.txt → /content/converted_json_local/item_1.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_10.txt → /content/converted_json_local/item_10.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_11.txt → /content/converted_json_local/item_11.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_12.txt → /content/converted_json_local/item_12.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_13.txt → /content/converted_json_local/item_13.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_14.txt → /content/converted_json_local/item_14.json (items: 1)


  "parsed_at": datetime.utcnow().isoformat() + "Z",


[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_1A.txt → /content/converted_json_local/item_1A.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_1B.txt → /content/converted_json_local/item_1B.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_1C.txt → /content/converted_json_local/item_1C.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_2.txt → /content/converted_json_local/item_2.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_3.txt → /content/converted_json_local/item_3.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_4.txt → /content/converted_json_local/item_4.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_7.txt → /content/converted_json_local/item_7.json (items: 1)
[OK] parsed: /content/drive/My Drive/SEC-API/AAPL/txt_files/item_9.txt → /content/converted_json_local/item_9.js

raw_text

The exact substring from the normalized input that we used for that unit (paragraph / heading). It preserves any punctuation and spacing left after initial normalization steps (i.e., entity-decoded and unicode-normalized).

Use this for provenance and to display the original snippet.

clean_text

A post-processed, more machine-friendly version of the raw text used for indexing and embeddings: trimmed, entity-decoded, normalized whitespace, and optionally cleaned of odd control characters. It should be friendly for tokenizers and quick display. (In the script we currently set clean_text same as raw_text after normalization; you can further lower-case, remove extra whitespace, or perform additional replacements depending on the model.)

start_offset / end_offset

Character offsets (integers) into the normalized text showing where the unit begins and ends. These let you map back into the original .txt (and into the raw file if you keep that mapping).

Offsets make it possible to extract the exact snippet later for auditing, or to display original formatting.

confidence

A small heuristic score (0–1) expressing how confident the parser is that the paragraph is a heading or an item boundary. Higher numbers mean stronger signals (explicit Item 7. heading gets very high confidence; a small title-case paragraph has lower confidence). Use this to flag low-confidence units for manual review.

What unicodedata.normalize() does — explained deeply

unicodedata.normalize(form, text) performs Unicode canonicalization. There are four common forms:

NFC (Normalization Form C) — Canonical Composition. Characters that can be represented as a single composed code point are composed. (e.g., e + ́ → é). Good default for display and storage.

NFD (Normalization Form D) — Canonical Decomposition. Characters are decomposed into a base character plus combining characters. Useful if you need to strip diacritics.

NFKC / NFKD — Compatibility forms. They additionally map compatibility characters to their equivalents (e.g., ligatures, superscripts, circled numbers, fullwidth characters). NFKC is commonly used to collapse presentation differences (e.g., “①” → “1”, “ﬁ” ligature → fi). It can alter appearance but improves text canonicalization for comparison/search.

unicodedata.normalize by itself is necessary but not always sufficient for clean textual input for NLP. Additional normalization tasks you commonly need:

HTML entity decoding (html.unescape) — converts &#8217; → ’ and &amp; → &. (Always do this before offsets calculation.)

Whitespace normalization — convert \r\n, \r → \n; collapse repeated spaces; standardize NBSP to space.

Control / zero-width char removal — eliminate weird zero-width joiners or other invisible characters.

Smart quotes/dash normalization (optional) — map curly quotes to straight quotes, em-dash to — or - depending on your model’s tokenizer.

De-hyphenation — fix words split at end-of-line hyphens (only if your extraction introduces them).

Numeric normalization — separate processing to canonicalize amounts (remove commas, convert ($1,234) → -1234, apply in thousands scaling when present).

Language-specific fixes — e.g., special punctuation in some filings.

So: use unicodedata.normalize (NFKC recommended for canonicalization) + HTML entity decoding + whitespace/control cleaning. That combination is robust for most SEC TXT content. You may add more domain-specific normalization later.

#### .html -> JSON Conversion

In [None]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Path where your HTML files are stored
folder_path = "/content/drive/My Drive/SEC-API/AAPL/html_files"  # <-- change this

# 3. List and preview .html files
for root, dirs, files in os.walk(folder_path):
    for file in sorted(files):
        if file.endswith(".html"):
            file_path = os.path.join(root, file)
            print(f"\n--- {file_path} ---")
            try:
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
                    print(content[:2000])  # preview first 2000 characters
            except Exception as e:
                print(f"Error reading {file_path}: {e}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

--- /content/drive/My Drive/SEC-API/AAPL/html_files/item_15.html ---
<span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">Item 15. Exhibit and Financial Statement Schedules</span></div><div style="margin-top:9pt;padding-left:18pt;text-align:justify;text-indent:-18pt"><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">(a)</span><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%;padding-left:7.02pt">Documents filed as part of this report</span></div><div style="margin-top:9pt;padding-left:18pt;text-align:justify;text-indent:-18pt"><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">(1)</span><span style="color:#000000;font-family:'H

In [None]:
# 2. Path to the specific HTML file you want to view
html_file_path = "/content/drive/My Drive/SEC-API/AAPL/html_files/item_5.html"  # <-- change this

# 3. Read and print the complete HTML file
with open(html_file_path, "r", encoding="utf-8", errors="ignore") as f:
    html_content = f.read()

print(html_content)  # prints the entire file (can be very large)

<span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">Item 5. Market for Registrant&#8217;s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities</span></div><div style="margin-top:9pt;text-align:justify"><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:120%">The Company&#8217;s common stock is traded on The Nasdaq Stock Market LLC under the symbol AAPL.</span></div><div style="margin-top:18pt;text-align:justify"><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">Holders</span></div><div style="margin-top:6pt;text-align:justify"><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:120%">As of October 18, 2024, there were 23,301 shareholders of record.</span></div><div style="margin-top:18pt;text-align:justify"><span style="color:#000000;f

In [None]:
from IPython.display import HTML, display

display(HTML(html_content))


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
,,,,,,,,,,,,,,,,,,,,,,,,,,
Periods,Periods,Periods,,,,Total Number of Shares Purchased,Total Number of Shares Purchased,Total Number of Shares Purchased,,,,Average Price Paid Per Share,Average Price Paid Per Share,Average Price Paid Per Share,,,,Total Number of Shares Purchased as Part of Publicly Announced Plans or Programs,Total Number of Shares Purchased as Part of Publicly Announced Plans or Programs,Total Number of Shares Purchased as Part of Publicly Announced Plans or Programs,,,,Approximate Dollar Value ofShares That May Yet Be PurchasedUnder the Plans or Programs (1),Approximate Dollar Value ofShares That May Yet Be PurchasedUnder the Plans or Programs (1),Approximate Dollar Value ofShares That May Yet Be PurchasedUnder the Plans or Programs (1)
"June 30, 2024 to August 3, 2024:","June 30, 2024 to August 3, 2024:","June 30, 2024 to August 3, 2024:",,,,,,,,,,,,,,,,,,,,,,,,
Open market and privately negotiated purchases,Open market and privately negotiated purchases,Open market and privately negotiated purchases,,,,35697,35697,,,,,$,224.11,,,,,35697,35697,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,
"August 4, 2024 to August 31, 2024:","August 4, 2024 to August 31, 2024:","August 4, 2024 to August 31, 2024:",,,,,,,,,,,,,,,,,,,,,,,,
Open market and privately negotiated purchases,Open market and privately negotiated purchases,Open market and privately negotiated purchases,,,,42910,42910,,,,,$,221.39,,,,,42910,42910,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,
"September 1, 2024 to September 28, 2024:","September 1, 2024 to September 28, 2024:","September 1, 2024 to September 28, 2024:",,,,,,,,,,,,,,,,,,,,,,,,
Open market and privately negotiated purchases,Open market and privately negotiated purchases,Open market and privately negotiated purchases,,,,33653,33653,,,,,$,222.86,,,,,33653,33653,,,,,,,

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,September 2019,September 2019,September 2019,,,,September 2020,September 2020,September 2020,,,,September 2021,September 2021,September 2021,,,,September 2022,September 2022,September 2022,,,,September 2023,September 2023,September 2023,,,,September 2024,September 2024,September 2024
Apple Inc.,Apple Inc.,Apple Inc.,,,,$,100,,,,,$,207,,,,,$,273,,,,,$,281,,,,,$,322,,,,,$,430,
S&P 500 Index,S&P 500 Index,S&P 500 Index,,,,$,100,,,,,$,113,,,,,$,156,,,,,$,131,,,,,$,155,,,,,$,210,
Dow Jones U.S. Technology Supersector Index,Dow Jones U.S. Technology Supersector Index,Dow Jones U.S. Technology Supersector Index,,,,$,100,,,,,$,146,,,,,$,216,,,,,$,156,,,,,$,215,,,,,$,322,


In [None]:
from bs4 import BeautifulSoup

# Parse HTML and prettify it with indentation + line breaks
soup = BeautifulSoup(html_content, "html.parser")
pretty_html = soup.prettify()

print(pretty_html)


<span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">
 Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities
</span>
<div style="margin-top:9pt;text-align:justify">
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:120%">
  The Company’s common stock is traded on The Nasdaq Stock Market LLC under the symbol AAPL.
 </span>
</div>
<div style="margin-top:18pt;text-align:justify">
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">
  Holders
 </span>
</div>
<div style="margin-top:6pt;text-align:justify">
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:120%">
  As of October 18, 2024, there were 23,301 shareholders of record.
 </span>
</div>
<div style="margin-top:18pt;text-align:justify">
 <span style="

In [None]:
# 2. Path to the specific HTML file you want to view
html_file_path = "/content/drive/My Drive/SEC-API/AAPL/html_files/item_15.html"  # <-- change this

# 3. Read and print the complete HTML file
with open(html_file_path, "r", encoding="utf-8", errors="ignore") as f:
    html_content = f.read()

from bs4 import BeautifulSoup

# Parse HTML and prettify it with indentation + line breaks
soup = BeautifulSoup(html_content, "html.parser")
pretty_html = soup.prettify()

print(pretty_html)



<span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">
 Item 15. Exhibit and Financial Statement Schedules
</span>
<div style="margin-top:9pt;padding-left:18pt;text-align:justify;text-indent:-18pt">
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">
  (a)
 </span>
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%;padding-left:7.02pt">
  Documents filed as part of this report
 </span>
</div>
<div style="margin-top:9pt;padding-left:18pt;text-align:justify;text-indent:-18pt">
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%">
  (1)
 </span>
 <span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:700;line-height:120%;padding-left:7.02pt">
  All financial statements
 </span>
</div>
<div style="margin-top:3pt;text-align:justi

#### APPROACH 6: only parsing via type=text and XBRL facts separately

In [None]:
!pip install -q sec-api


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m61.4/66.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# EXTRACTING THE XBRL FACTS FROM A 10K SEC FILING LINK
# Colab / local script: download XBRL JSON (all facts) using sec-api XbrlApi.xbrl_to_json()
# Edit CONFIG below with your API key and filing URL, then run.

# Install the sec-api client (uncomment in Colab or run once)
# !pip install -q sec-api pandas

import os
import json
import csv
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple
from pprint import pprint

# --------------- CONFIG (EDIT) ----------------
API_KEY = "327f556515bcbb181578971950353ecb41012af4b728189fe6d024bee61b6c8c"   # <<--- put your sec-api key here
FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"
OUTPUT_DIR = "./xbrl_output_aapl_10k"   # local dir to save JSON and CSV output
RETRY_COUNT = 3
RETRY_DELAY_SEC = 3
# ----------------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------- Helper functions -----------------
def safe_save_json(obj: Any, path: str):
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh, indent=2, ensure_ascii=False)

def flatten_xbrl_json(x: Any, path_stack: List[str] = None, facts_out: List[Dict] = None):
    """
    Walk the returned XBRL JSON recursively and collect fact-like objects.
    We heuristically collect dicts that include a 'value' key or have 'period' keys.
    This returns a list of facts with provenance.
    """
    if path_stack is None:
        path_stack = []
    if facts_out is None:
        facts_out = []

    if isinstance(x, dict):
        # heuristics: if node looks like a fact (has 'value' or 'period' fields), record it
        # Many sec-api outputs use: fact = {"value":..., "period": {"startDate":..,"endDate":..}, ...}
        is_fact = False
        if "value" in x or ("period" in x and ("value" in x or "amount" in x or "fact" in x)):
            is_fact = True
        # also accept nodes with 'val' or 'text' as possible fact fields
        if any(k.lower() in ("value", "val", "amount", "text", "fact") for k in x.keys()):
            # but confirm there is some scalar value
            for k in ("value", "val", "amount", "text", "fact"):
                if k in x and (x[k] is None or isinstance(x[k], (dict, list))):
                    continue
                if k in x and (not isinstance(x[k], (dict, list))):
                    is_fact = True
                    break

        if is_fact:
            fact_entry = {
                "provenance_path": "/".join(path_stack) if path_stack else "",
                "raw": x
            }
            # try to standardize a few common fields
            # concept name may be present in parent key or in node itself:
            concept = x.get("concept") or x.get("name") or x.get("tag") or x.get("label") or None
            # period may be nested
            period = x.get("period") or x.get("context") or None
            unit = x.get("unit") or x.get("unitRef") or None
            value = x.get("value") or x.get("val") or x.get("amount") or x.get("text") or None
            if concept is None and path_stack:
                # last path element sometimes is the GAAP item name
                concept = path_stack[-1]
            fact_entry.update({
                "concept": concept,
                "value": value,
                "unit": unit,
                "period": period
            })
            facts_out.append(fact_entry)

        # recurse into children
        for k, v in x.items():
            flatten_xbrl_json(v, path_stack + [str(k)], facts_out)
    elif isinstance(x, list):
        for idx, v in enumerate(x):
            flatten_xbrl_json(v, path_stack + [f"[{idx}]"], facts_out)
    # else: primitive -> nothing to do
    return facts_out

# ----------------- Main: call sec-api XbrlApi -----------------
def run_xbrl_to_json(api_key: str, htm_url: str, out_dir: str) -> Tuple[str, Dict]:
    """
    Uses sec-api Python client (sec_api.XbrlApi) to call xbrl_to_json(htm_url=...).
    Saves the full returned JSON to out_dir/xbrl_full.json and also writes facts_flat.json and facts_flat.csv.
    Returns (path_to_saved_json, parsed_json)
    """
    # try lazy import (install sec-api if missing)
    try:
        from sec_api import XbrlApi
    except Exception as e:
        raise RuntimeError("Please install the sec-api Python package (pip install sec-api) before running. "
                           "Error: " + str(e))

    xbrl_api = XbrlApi(api_key)

    # retry logic around the call
    last_exc = None
    for attempt in range(1, RETRY_COUNT + 1):
        try:
            print(f"[sec-api] calling xbrl_to_json (attempt {attempt}) for: {htm_url}")
            result = xbrl_api.xbrl_to_json(htm_url=htm_url)
            break
        except Exception as e:
            last_exc = e
            print(f"  [warn] attempt {attempt} failed: {e}")
            if attempt < RETRY_COUNT:
                time.sleep(RETRY_DELAY_SEC * attempt)
            else:
                raise RuntimeError(f"Failed to call xbrl_to_json after {RETRY_COUNT} attempts. Last error: {e}")

    # save full JSON
    full_path = os.path.join(out_dir, "xbrl_full.json")
    safe_save_json(result, full_path)
    print("[ok] saved full XBRL JSON to:", full_path)

    # flatten heuristically
    print("[info] flattening XBRL JSON to extract fact-like objects...")
    facts = flatten_xbrl_json(result, [], [])
    print(f"[info] heuristically found {len(facts)} fact-like objects (may include duplicates/provenance entries).")

    # save flattened JSON
    flat_json_path = os.path.join(out_dir, "xbrl_facts_flat.json")
    safe_save_json(facts, flat_json_path)
    print("[ok] saved flattened facts JSON to:", flat_json_path)

    # also produce a CSV summary with one row per fact (attempt to extract a few fields)
    csv_path = os.path.join(out_dir, "xbrl_facts_flat.csv")
    with open(csv_path, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["concept", "value", "unit", "period_summary", "provenance_path"])
        for f in facts:
            concept = f.get("concept")
            value = f.get("value")
            unit = f.get("unit")
            period = f.get("period")
            period_summary = ""
            try:
                if isinstance(period, dict):
                    start = period.get("startDate") or period.get("instant") or ""
                    end = period.get("endDate") or ""
                    period_summary = f"{start}/{end}" if end else str(start)
                else:
                    period_summary = str(period)
            except Exception:
                period_summary = str(period)
            writer.writerow([concept, value, unit, period_summary, f.get("provenance_path")])
    print("[ok] saved CSV summary to:", csv_path)

    return full_path, result

# ----------------- Run -----------------
if __name__ == "__main__":
    if API_KEY is None or API_KEY.strip() == "" or API_KEY.startswith("YOUR_"):
        raise RuntimeError("Please set your SEC-API API_KEY in the top of this script before running.")
    print("Filing URL:", FILING_URL)
    print("Output directory:", OUTPUT_DIR)
    saved_path, parsed_json = run_xbrl_to_json(API_KEY, FILING_URL, OUTPUT_DIR)
    print("Done. Full JSON:", saved_path)
    # Optionally print a few top-level keys to inspect
    print("\nTop-level keys in returned XBRL JSON:")
    try:
        print(list(parsed_json.keys()))
    except Exception:
        pprint(parsed_json if isinstance(parsed_json, dict) else parsed_json[:200])


Filing URL: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
Output directory: ./xbrl_output_aapl_10k
[sec-api] calling xbrl_to_json (attempt 1) for: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
[ok] saved full XBRL JSON to: ./xbrl_output_aapl_10k/xbrl_full.json
[info] flattening XBRL JSON to extract fact-like objects...
[info] heuristically found 2341 fact-like objects (may include duplicates/provenance entries).
[ok] saved flattened facts JSON to: ./xbrl_output_aapl_10k/xbrl_facts_flat.json
[ok] saved CSV summary to: ./xbrl_output_aapl_10k/xbrl_facts_flat.csv
Done. Full JSON: ./xbrl_output_aapl_10k/xbrl_full.json

Top-level keys in returned XBRL JSON:
['CoverPage', 'AuditorInformation', 'StatementsOfIncome', 'StatementsOfComprehensiveIncome', 'BalanceSheets', 'BalanceSheetsParenthetical', 'StatementsOfShareholdersEquity', 'StatementsOfCashFlows', 'SummaryofSignificantAccountingPolicies', 'Revenue', 'EarningsPer

## CHUNKING: on .txt and xbrl json files

### Downloading .txt and XBRL facts

In [None]:
# metadata_extractor_no_links.py
# Reads a sec-api style metadata.json and produces a compact canonical metadata dict
# WITHOUT extracting any document URLs. Also provides a helper to attach this filing
# metadata to per-chunk metadata before embedding.

import json
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, Optional

# ----------------- CONFIG -----------------
METADATA_PATH = "/content/mnt/data/metadata.json"   # path to input metadata.json (edit if needed)
OUTPUT_DIR = "/content/mnt/data"                    # where to save outputs
EXTRACTED_META_OUT = os.path.join(OUTPUT_DIR, "metadata_extracted_nolinks.json")
# ------------------------------------------

# ---------- utilities ----------
def load_json(path: str) -> Dict[str, Any]:
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)

def save_json(obj: Any, path: str) -> None:
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh, indent=2, ensure_ascii=False)

def _parse_iso_to_utc(iso_str: Optional[str]) -> Optional[str]:
    """Try to parse known ISO/date strings and return canonical UTC ISO (Z)."""
    if not iso_str:
        return None
    try:
        dt = datetime.fromisoformat(str(iso_str))
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        dt_utc = dt.astimezone(timezone.utc)
        return dt_utc.replace(tzinfo=timezone.utc).isoformat().replace("+00:00", "Z")
    except Exception:
        # fallback: accept YYYY-MM-DD prefix
        m = re.match(r"^(\d{4}-\d{2}-\d{2})", str(iso_str))
        if m:
            try:
                d = datetime.fromisoformat(m.group(1))
                return d.replace(tzinfo=timezone.utc).isoformat().replace("+00:00", "Z")
            except Exception:
                return None
        return None

def normalize_cik(cik_val: Optional[str]) -> Dict[str, Optional[str]]:
    """
    Return canonical CIK forms: raw (digits only), padded 10-digit string, integer.
    """
    if not cik_val:
        return {"cik_raw": None, "cik_padded": None, "cik_int": None}
    s = re.sub(r"\D", "", str(cik_val))
    if not s:
        return {"cik_raw": None, "cik_padded": None, "cik_int": None}
    try:
        cik_int = int(s)
    except Exception:
        cik_int = None
    cik_padded = s.zfill(10)
    return {"cik_raw": s, "cik_padded": cik_padded, "cik_int": cik_int}

def parse_accession(accession: Optional[str]) -> Dict[str, Optional[str]]:
    """
    Break accession like "0000320193-24-000123" into parts and a no-dash form.
    """
    if not accession:
        return {"accession": None, "accession_nodash": None, "accession_parts": None}
    acc = str(accession)
    parts = acc.split("-")
    return {"accession": acc, "accession_nodash": "".join(parts), "accession_parts": parts}

# ---------- core extractor ----------
def extract_canonical_filing_metadata_no_links(raw_metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Produce a compact canonical filing-level metadata dictionary without any document URLs.
    """
    out: Dict[str, Any] = {}

    # Basic identity fields
    out["ticker"] = raw_metadata.get("ticker") or raw_metadata.get("symbol") or None
    out["company_name"] = raw_metadata.get("companyName") or raw_metadata.get("companyNameLong") or None
    out["form_type"] = raw_metadata.get("formType") or raw_metadata.get("form_type") or None

    # Accession and CIK
    out["accession_raw"] = raw_metadata.get("accessionNo") or None
    out.update(parse_accession(out["accession_raw"]))

    cik_source = None
    # try typical places for cik
    if raw_metadata.get("cik"):
        cik_source = raw_metadata.get("cik")
    else:
        entities = raw_metadata.get("entities") or []
        if entities and isinstance(entities, list) and len(entities) > 0:
            cik_source = entities[0].get("cik") or cik_source
    out.update(normalize_cik(cik_source))

    # Dates
    filedAt_raw = raw_metadata.get("filedAt") or raw_metadata.get("filingDate") or None
    out["filed_at_raw"] = filedAt_raw
    out["filed_at_utc"] = _parse_iso_to_utc(filedAt_raw)

    out["period_of_report"] = raw_metadata.get("periodOfReport") or raw_metadata.get("period_of_report") or None

    # Entity-level fields (first entity if present)
    entities = raw_metadata.get("entities") or []
    entity0 = entities[0] if (isinstance(entities, list) and len(entities) > 0) else {}
    out["fiscal_year_end"] = entity0.get("fiscalYearEnd") or entity0.get("fiscal_year_end") or None
    out["state_of_incorporation"] = entity0.get("stateOfIncorporation") or None
    out["sic"] = entity0.get("sic") or None
    out["irs_no"] = entity0.get("irsNo") or None
    out["file_no"] = entity0.get("fileNo") or None

    # Optional identifiers & counts
    out["metadata_id"] = raw_metadata.get("id") or None
    out["description"] = raw_metadata.get("description") or None

    # counts (document/data files) - do not store URLs, only counts if present
    out["num_document_files"] = len(raw_metadata.get("documentFormatFiles") or [])
    out["num_data_files"] = len(raw_metadata.get("dataFiles") or [])

    # Do NOT include any document links or linkTo... fields per your request.

    # small note: do not include the full raw metadata by default to keep chunk metadata small
    out["_raw_included"] = False

    return out

# helper to attach to a chunk
def attach_filing_metadata_to_chunk(chunk: Dict[str, Any], filing_meta: Dict[str, Any], keep_fields: Optional[list] = None) -> Dict[str, Any]:
    """
    Attach select filing-level metadata to a chunk's 'metadata' dict.
    - chunk: dict with at least 'chunk_id' and 'text'. If 'metadata' missing, it's created.
    - filing_meta: output of extract_canonical_filing_metadata_no_links.
    - keep_fields: optional list of keys from filing_meta to attach; if None, a default compact set is attached.
    Returns the same chunk (mutated).
    """
    if "metadata" not in chunk or not isinstance(chunk["metadata"], dict):
        chunk["metadata"] = {}

    default_keep = [
        "ticker", "company_name", "form_type",
        "accession", "accession_nodash", "accession_parts",
        "cik_padded", "cik_raw", "cik_int",
        "filed_at_utc", "filed_at_raw", "period_of_report",
        "fiscal_year_end", "state_of_incorporation", "sic",
        "num_document_files", "num_data_files", "metadata_id", "description"
    ]
    keys = keep_fields or default_keep
    for k in keys:
        if k in filing_meta:
            chunk["metadata"][k] = filing_meta[k]

    # add ingestion timestamp for traceability
    chunk["metadata"]["ingested_at_utc"] = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    return chunk

# ---------- example usage ----------
def main():
    if not os.path.exists(METADATA_PATH):
        raise FileNotFoundError(f"Metadata file not found: {METADATA_PATH}")

    raw = load_json(METADATA_PATH)
    canonical = extract_canonical_filing_metadata_no_links(raw)

    save_json(canonical, EXTRACTED_META_OUT)
    print("Saved canonical (no-links) metadata to:", EXTRACTED_META_OUT)
    print("Preview (selected fields):")
    for k in ["ticker","company_name","cik_padded","accession","filed_at_utc","period_of_report","fiscal_year_end","sic"]:
        print(f" - {k}: {canonical.get(k)}")

    # Example chunk
    example_chunk = {
        "chunk_id": "AAPL_2024_item7_chunk_0001",
        "text": "Example chunk text from Item 7 ...",
        "metadata": {"item": "7", "heading": "Management's Discussion and Analysis"}
    }
    enriched = attach_filing_metadata_to_chunk(example_chunk, canonical)
    example_out = os.path.join(OUTPUT_DIR, "example_chunk_enriched_nolinks.json")
    save_json(enriched, example_out)
    print("Saved example enriched chunk (no-links) to:", example_out)
    print("Example enriched metadata preview:")
    for k,v in enriched["metadata"].items():
        print(" ", k, ":", v)

if __name__ == "__main__":
    main()


Saved canonical (no-links) metadata to: /content/mnt/data/metadata_extracted_nolinks.json
Preview (selected fields):
 - ticker: AAPL
 - company_name: Apple Inc.
 - cik_padded: 0000320193
 - accession: 0000320193-24-000123
 - filed_at_utc: 2024-11-01T10:01:36Z
 - period_of_report: 2024-09-28
 - fiscal_year_end: 0928
 - sic: 3571 Electronic Computers
Saved example enriched chunk (no-links) to: /content/mnt/data/example_chunk_enriched_nolinks.json
Example enriched metadata preview:
  item : 7
  heading : Management's Discussion and Analysis
  ticker : AAPL
  company_name : Apple Inc.
  form_type : 10-K
  accession : 0000320193-24-000123
  accession_nodash : 000032019324000123
  accession_parts : ['0000320193', '24', '000123']
  cik_padded : 0000320193
  cik_raw : 320193
  cik_int : 320193
  filed_at_utc : 2024-11-01T10:01:36Z
  filed_at_raw : 2024-11-01T06:01:36-04:00
  period_of_report : 2024-09-28
  fiscal_year_end : 0928
  state_of_incorporation : CA
  sic : 3571 Electronic Computers
 

In [None]:
# DOWNLOADING ALL THE .TXT FILES
# Google Colab-ready script: download metadata + all 10-K items using sec-api (all items as type=text)
# Replace API_KEY with your API key before running.
# Installs minimal dependencies and saves each item as .txt

!pip install requests tqdm --quiet

import os
import time
import json
import requests
from urllib.parse import urlparse
from tqdm import tqdm

# --------------------------- CONFIGURATION (edit this) ---------------------------
API_KEY = "327f556515bcbb181578971950353ecb41012af4b728189fe6d024bee61b6c8c"   # <-- substitute your sec-api key here
FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"
OUTPUT_DIR = "./sec_extraction_output"
REQUEST_DELAY_SEC = 0.25        # polite delay between requests (adjust if necessary)
# -------------------------------------------------------------------------------

# Items to extract for a 10-K
TEN_K_ITEMS = [
    "1","1A","1B","1C","2","3","4","5","6","7","7A","8","9","9A","9B",
    "10","11","12","13","14","15"
]

# API endpoints
QUERY_API_URL = "https://api.sec-api.io"       # POST for query metadata
EXTRACTOR_API_URL = "https://api.sec-api.io/extractor"  # GET for item extraction

# Ensure output directories exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
items_dir = os.path.join(OUTPUT_DIR, "items")
os.makedirs(items_dir, exist_ok=True)

# --------------------------- Helpers ---------------------------
def get_filing_filename(filing_url):
    p = urlparse(filing_url)
    return os.path.basename(p.path)

def fetch_filing_metadata(filing_url, api_key):
    """
    Uses Query API to find metadata for the filing. Searches by the filing filename.
    Returns the first matching filing metadata JSON object (or fallback minimal metadata dict).
    """
    filename = get_filing_filename(filing_url)
    payload = {
        "query": f'linkToFilingDetails:\"{filename}\"',
        "from": "0",
        "size": "1",
        "sort": [{ "filedAt": { "order": "desc" }}]
    }
    headers = {
        "Authorization": api_key,
        "Content-Type": "application/json"
    }
    print(f"[metadata] querying for filing with filename: {filename}")
    try:
        resp = requests.post(QUERY_API_URL, headers=headers, json=payload, timeout=30)
    except Exception as e:
        print("[metadata] Query API request failed:", e)
        return {"source_url": filing_url, "note": "Query API request failed; fallback metadata."}

    if resp.status_code != 200:
        print(f"[metadata] Query API returned status {resp.status_code}; returning fallback minimal metadata.")
        return {"source_url": filing_url, "note": f"Query API returned {resp.status_code}."}
    rj = resp.json()

    # Try to find hits in common shapes
    hits = None
    if isinstance(rj, dict):
        if "results" in rj:
            hits = rj["results"]
        elif "hits" in rj:
            hits = rj["hits"]
        elif "data" in rj:
            hits = rj["data"]
        else:
            hits = rj.get("items") or rj.get("filings") or []
    elif isinstance(rj, list):
        hits = rj
    else:
        hits = []

    if not hits:
        print("[metadata] no hits found via Query API for that filename. Returning fallback minimal metadata.")
        return {"source_url": filing_url, "note": "No metadata found via Query API for filename."}

    metadata = hits[0]
    metadata_path = os.path.join(OUTPUT_DIR, "metadata.json")
    with open(metadata_path, "w", encoding="utf-8") as fh:
        json.dump(metadata, fh, indent=2, ensure_ascii=False)
    print(f"[metadata] saved metadata to {metadata_path}")
    return metadata

def fetch_item_section_with_retries(filing_url, item_code, return_type, api_key, max_retries=6):
    """
    Calls the Extractor API with exponential backoff on 429/5xx errors.
    Returns tuple(status_code, text_response, response_headers)
    """
    params = {
        "url": filing_url,
        "item": item_code,
        "type": return_type,
        "token": api_key
    }
    backoff = 1.0
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(EXTRACTOR_API_URL, params=params, timeout=60)
        except Exception as e:
            # network or other unexpected error — retry
            print(f"[warn] network error on attempt {attempt} for item {item_code}: {e}")
            time.sleep(backoff)
            backoff *= 2.0
            continue

        # if success
        if resp.status_code == 200:
            return resp.status_code, resp.text, resp.headers

        # if rate limited or server error, retry with backoff
        if resp.status_code in (429, 500, 502, 503, 504):
            print(f"[warn] HTTP {resp.status_code} -> retry #{attempt} for item {item_code} after {backoff:.1f}s")
            time.sleep(backoff)
            backoff *= 2.0
            continue

        # other HTTP error: return immediately
        return resp.status_code, resp.text, resp.headers

    # final failure after retries
    return resp.status_code if 'resp' in locals() else 0, (resp.text if 'resp' in locals() else ""), (resp.headers if 'resp' in locals() else {})

def extract_all_items_as_text(filing_url, api_key, output_dir, items_list, delay=0.25):
    """
    Extract requested items as type=text and save to disk as .txt files.
    Returns an index dictionary describing saved files and extraction status.
    """
    index = {
        "filing_url": filing_url,
        "extracted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "items": {}
    }

    for item in tqdm(items_list, desc="Items"):
        typ = "text"  # force text for every item
        print(f"\n[extract] item={item}  type={typ}")
        status_code, body, headers = fetch_item_section_with_retries(filing_url, item, typ, api_key)

        if status_code != 200:
            snippet = body[:1000] if isinstance(body, str) else ""
            print(f"[error] Extractor API returned {status_code} for item {item}. Snippet: {snippet[:300]}")
            index["items"][item] = {
                "status": "http_error" if status_code else "network_error",
                "http_status": status_code,
                "response_snippet": snippet
            }
            # polite delay and continue
            time.sleep(delay)
            continue

        # choose extension .txt
        ext = "txt"
        safe_item_name = f"item_{item}.{ext}"
        outpath = os.path.join(output_dir, safe_item_name)
        with open(outpath, "w", encoding="utf-8") as fh:
            fh.write(body)

        print(f"[saved] item {item} -> {outpath} (size={len(body):,} bytes)")
        index["items"][item] = {
            "status": "ok",
            "type": typ,
            "path": outpath,
            "size_bytes": len(body)
        }

        # polite delay to avoid rate-limiting
        time.sleep(delay)

    # save index
    index_path = os.path.join(output_dir, "items_index.json")
    with open(index_path, "w", encoding="utf-8") as fh:
        json.dump(index, fh, indent=2, ensure_ascii=False)
    print(f"\n[index] saved to {index_path}")
    return index

# --------------------------- MAIN ---------------------------
def main():
    if API_KEY == "" or API_KEY.startswith("YOUR_API_KEY"):
        raise RuntimeError("Please set API_KEY variable in the script to your sec-api key before running.")

    print("Starting sec-api extraction for filing:", FILING_URL)

    # 1) fetch metadata via Query API (best-effort)
    try:
        _meta = fetch_filing_metadata(FILING_URL, API_KEY)
    except Exception as e:
        print("[warn] metadata extraction failed:", e)
        _meta = {"source_url": FILING_URL}

    # 2) attempt to download the raw filing HTML as a backup (optional)
    try:
        r = requests.get(FILING_URL, timeout=30)
        if r.status_code == 200:
            raw_filing_path = os.path.join(OUTPUT_DIR, get_filing_filename(FILING_URL))
            with open(raw_filing_path, "w", encoding="utf-8") as fh:
                fh.write(r.text)
            print(f"[raw] saved raw filing HTML to {raw_filing_path}")
        else:
            print(f"[raw] couldn't download raw filing HTML; status {r.status_code}")
    except Exception as e:
        print(f"[raw] error downloading raw filing HTML: {e}")

    # 3) extract all items as text
    index = extract_all_items_as_text(
        filing_url=FILING_URL,
        api_key=API_KEY,
        output_dir=items_dir,
        items_list=TEN_K_ITEMS,
        delay=REQUEST_DELAY_SEC
    )

    print("\nAll done. Output directory:", OUTPUT_DIR)
    print("Files saved:")
    for it, info in index["items"].items():
        print("  ", it, "->", info.get("path", info.get("error")))

if __name__ == "__main__":
    main()


Starting sec-api extraction for filing: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
[metadata] querying for filing with filename: aapl-20240928.htm
[metadata] saved metadata to ./sec_extraction_output/metadata.json
[raw] couldn't download raw filing HTML; status 403


Items:   0%|          | 0/21 [00:00<?, ?it/s]


[extract] item=1  type=text
[saved] item 1 -> ./sec_extraction_output/items/item_1.txt (size=16,339 bytes)


Items:   5%|▍         | 1/21 [00:01<00:23,  1.19s/it]


[extract] item=1A  type=text
[saved] item 1A -> ./sec_extraction_output/items/item_1A.txt (size=70,585 bytes)


Items:  10%|▉         | 2/21 [00:02<00:23,  1.24s/it]


[extract] item=1B  type=text
[saved] item 1B -> ./sec_extraction_output/items/item_1B.txt (size=46 bytes)


Items:  14%|█▍        | 3/21 [00:03<00:20,  1.15s/it]


[extract] item=1C  type=text
[saved] item 1C -> ./sec_extraction_output/items/item_1C.txt (size=2,840 bytes)


Items:  19%|█▉        | 4/21 [00:04<00:19,  1.13s/it]


[extract] item=2  type=text
[saved] item 2 -> ./sec_extraction_output/items/item_2.txt (size=499 bytes)


Items:  24%|██▍       | 5/21 [00:05<00:17,  1.11s/it]


[extract] item=3  type=text
[saved] item 3 -> ./sec_extraction_output/items/item_3.txt (size=4,496 bytes)


Items:  29%|██▊       | 6/21 [00:06<00:16,  1.10s/it]


[extract] item=4  type=text
[saved] item 4 -> ./sec_extraction_output/items/item_4.txt (size=98 bytes)


Items:  33%|███▎      | 7/21 [00:07<00:15,  1.10s/it]


[extract] item=5  type=text
[saved] item 5 -> ./sec_extraction_output/items/item_5.txt (size=2,725 bytes)


Items:  38%|███▊      | 8/21 [00:08<00:14,  1.08s/it]


[extract] item=6  type=text
[saved] item 6 -> ./sec_extraction_output/items/item_6.txt (size=57 bytes)


Items:  43%|████▎     | 9/21 [00:09<00:12,  1.07s/it]


[extract] item=7  type=text
[saved] item 7 -> ./sec_extraction_output/items/item_7.txt (size=16,122 bytes)


Items:  48%|████▊     | 10/21 [00:11<00:11,  1.08s/it]


[extract] item=7A  type=text
[saved] item 7A -> ./sec_extraction_output/items/item_7A.txt (size=3,177 bytes)


Items:  52%|█████▏    | 11/21 [00:12<00:10,  1.07s/it]


[extract] item=8  type=text
[saved] item 8 -> ./sec_extraction_output/items/item_8.txt (size=67,721 bytes)


Items:  57%|█████▋    | 12/21 [00:13<00:10,  1.13s/it]


[extract] item=9  type=text
[saved] item 9 -> ./sec_extraction_output/items/item_9.txt (size=104 bytes)


Items:  62%|██████▏   | 13/21 [00:14<00:09,  1.13s/it]


[extract] item=9A  type=text
[saved] item 9A -> ./sec_extraction_output/items/item_9A.txt (size=4,665 bytes)


Items:  67%|██████▋   | 14/21 [00:15<00:07,  1.11s/it]


[extract] item=9B  type=text
[saved] item 9B -> ./sec_extraction_output/items/item_9B.txt (size=1,401 bytes)


Items:  71%|███████▏  | 15/21 [00:16<00:06,  1.10s/it]


[extract] item=10  type=text
[saved] item 10 -> ./sec_extraction_output/items/item_10.txt (size=1,074 bytes)


Items:  76%|███████▌  | 16/21 [00:17<00:05,  1.08s/it]


[extract] item=11  type=text
[saved] item 11 -> ./sec_extraction_output/items/item_11.txt (size=162 bytes)


Items:  81%|████████  | 17/21 [00:18<00:04,  1.09s/it]


[extract] item=12  type=text
[saved] item 12 -> ./sec_extraction_output/items/item_12.txt (size=234 bytes)


Items:  86%|████████▌ | 18/21 [00:19<00:03,  1.08s/it]


[extract] item=13  type=text
[saved] item 13 -> ./sec_extraction_output/items/item_13.txt (size=213 bytes)


Items:  90%|█████████ | 19/21 [00:20<00:02,  1.10s/it]


[extract] item=14  type=text
[saved] item 14 -> ./sec_extraction_output/items/item_14.txt (size=223 bytes)


Items:  95%|█████████▌| 20/21 [00:22<00:01,  1.08s/it]


[extract] item=15  type=text
[saved] item 15 -> ./sec_extraction_output/items/item_15.txt (size=15,179 bytes)


Items: 100%|██████████| 21/21 [00:23<00:00,  1.10s/it]


[index] saved to ./sec_extraction_output/items/items_index.json

All done. Output directory: ./sec_extraction_output
Files saved:
   1 -> ./sec_extraction_output/items/item_1.txt
   1A -> ./sec_extraction_output/items/item_1A.txt
   1B -> ./sec_extraction_output/items/item_1B.txt
   1C -> ./sec_extraction_output/items/item_1C.txt
   2 -> ./sec_extraction_output/items/item_2.txt
   3 -> ./sec_extraction_output/items/item_3.txt
   4 -> ./sec_extraction_output/items/item_4.txt
   5 -> ./sec_extraction_output/items/item_5.txt
   6 -> ./sec_extraction_output/items/item_6.txt
   7 -> ./sec_extraction_output/items/item_7.txt
   7A -> ./sec_extraction_output/items/item_7A.txt
   8 -> ./sec_extraction_output/items/item_8.txt
   9 -> ./sec_extraction_output/items/item_9.txt
   9A -> ./sec_extraction_output/items/item_9A.txt
   9B -> ./sec_extraction_output/items/item_9B.txt
   10 -> ./sec_extraction_output/items/item_10.txt
   11 -> ./sec_extraction_output/items/item_11.txt
   12 -> ./sec_extra




### Normalizing the text from .txt file

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
Normalize SEC item .txt files and REMOVE table blocks.

- Scans INPUT_DIR for .txt files (recursively).
- Removes table blocks delimited by markers (e.g. ##TABLE_START ... ##TABLE_END).
- Runs normalization pipeline:
    * HTML entity decoding
    * Unicode NFKC canonicalization
    * Remove zero-width/control characters
    * Optional quotes/dash normalization
    * De-hyphenation across line breaks and paragraph reflow
    * Collapse extra blank lines, preserve paragraph boundaries
- Writes for each input <basename>.normalized.txt and <basename>.normalized.json
  JSON contains metadata and paragraph list with start/end offsets in the normalized text.

EDIT: set INPUT_DIR and OUTPUT_DIR before running (Google Drive or local).
"""

import os
import re
import json
import html
import unicodedata
from datetime import datetime
from typing import List, Dict

# -------------- CONFIG --------------
INPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/txt_files"   # <-- set to your folder with .txt files
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item"  # <-- output folder

# Behavior toggles
DEHYPHENATE = True               # join words broken by line-end hyphenation
NORMALIZE_QUOTES_DASHES = True   # map curly quotes to straight and long dashes to '-'
INCLUDE_NORMALIZED_TEXT_IN_JSON = True
MAX_JSON_TEXT_CHARS = None       # if not None, truncate normalized_text stored in JSON
# -------------- end CONFIG --------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------- regex / constants --------------
# Remove table blocks that are explicitly delimited in the .txt files.
# Common marker observed: ##TABLE_START ... ##TABLE_END
# This will remove inclusive of markers.
TABLE_BLOCK_RE = re.compile(r"##TABLE_START.*?##TABLE_END", flags=re.S | re.I)

# (Optional additional patterns that sometimes appear in .txt extracts)
# - Remove simple placeholder tokens like [TABLE_1], [TABLE_2] if present
TABLE_PLACEHOLDER_RE = re.compile(r"\[TABLE_\d+\]", flags=re.I)

# Zero-width / invisible characters to remove
ZERO_WIDTH_RE = re.compile(r'[\u200B\u200C\u200D\uFEFF\u00AD]')

# Control chars except newline and tab
CONTROL_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')

# Paragraph split by blank lines
PARA_SPLIT_RE = re.compile(r'\n\s*\n', flags=re.M)

# Collapse many blank lines into two (single paragraph separator)
BLANK_LINES_RE = re.compile(r'\n{3,}')

# Quote/dash normalization mappings
QUOTE_DASH_MAP = {
    "\u2018": "'", "\u2019": "'",  # left/right single quote
    "\u201C": '"', "\u201D": '"',  # left/right double quote
    "\u2013": "-", "\u2014": "-",  # en-dash, em-dash
    "\u00A0": " ",                 # NBSP -> space
    "\u2212": "-"                  # minus sign -> hyphen-minus
}
QUOTE_DASH_PATTERN = re.compile("[" + "".join(re.escape(k) for k in QUOTE_DASH_MAP.keys()) + "]")
# -------------- end regex/constants --------------

# -------------- normalization helpers --------------
def decode_and_nfkc(text: str) -> str:
    """HTML-unescape and Unicode NFKC canonicalization; normalize line endings to \\n."""
    if not isinstance(text, str):
        text = str(text)
    text = html.unescape(text)
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return text

def remove_invisible_and_control(text: str) -> str:
    """Remove common zero-width characters and control characters (keep newline/tab)."""
    text = ZERO_WIDTH_RE.sub("", text)
    text = CONTROL_RE.sub("", text)
    return text

def normalize_quotes_and_dashes(text: str) -> str:
    """Map curly quotes / long dashes to ASCII equivalents (if enabled)."""
    if not NORMALIZE_QUOTES_DASHES:
        return text
    return QUOTE_DASH_PATTERN.sub(lambda m: QUOTE_DASH_MAP.get(m.group(0), m.group(0)), text)

def dehyphenate_paragraph(par: str) -> str:
    """
    Fix hyphenation caused by line breaks (e.g., 'exam-\nple' -> 'example'),
    then replace internal newlines with spaces and collapse spaces.
    """
    # Join letter-digit sequences split by hyphen+newline
    par = re.sub(r'(?P<prefix>\S)-\n(?P<suffix>\S)', lambda m: m.group('prefix') + m.group('suffix'), par)
    # Replace remaining newlines in paragraph with space
    par = par.replace("\n", " ")
    # Collapse whitespace
    par = re.sub(r"[ \t]+", " ", par)
    return par.strip()

def reflow_paragraph(par: str) -> str:
    """Reflow paragraph text; optionally dehyphenate."""
    if DEHYPHENATE:
        return dehyphenate_paragraph(par)
    else:
        s = par.replace("\n", " ")
        s = re.sub(r"[ \t]+", " ", s)
        return s.strip()

def remove_table_blocks(raw: str) -> str:
    """
    Remove any explicit table blocks marked in the .txt (###TABLE_START...##TABLE_END),
    and remove simple placeholders like [TABLE_n].
    """
    if not raw:
        return raw
    s = TABLE_BLOCK_RE.sub("\n\n", raw)  # replace table block with blank lines to preserve paragraph split
    s = TABLE_PLACEHOLDER_RE.sub(" ", s)
    return s

def normalize_full_text(raw_text: str) -> str:
    """
    Full normalization flow:
    - remove table blocks
    - decode & NFKC
    - remove invisible/control characters
    - normalize quotes/dashes (optional)
    - split into paragraphs on blank lines, reflow each paragraph
    - join paragraphs with single blank line
    - collapse excessive blank lines
    """
    # 1) remove explicit table content first
    without_tables = remove_table_blocks(raw_text)

    # 2) decode html entities and canonicalize unicode
    s = decode_and_nfkc(without_tables)

    # 3) remove invisible & control chars
    s = remove_invisible_and_control(s)

    # 4) normalize quotes/dashes
    s = normalize_quotes_and_dashes(s)

    # 5) split into paragraph blocks and reflow each
    parts = [p for p in PARA_SPLIT_RE.split(s) if p and p.strip()]
    reflowed = [reflow_paragraph(p) for p in parts if p and p.strip()]

    normalized = "\n\n".join(reflowed).strip()
    normalized = BLANK_LINES_RE.sub("\n\n", normalized)
    if normalized and not normalized.endswith("\n"):
        normalized += "\n"
    return normalized

# -------------- paragraph offsets --------------
def paragraphs_with_offsets(normalized_text: str) -> List[Dict]:
    """
    Given normalized_text with paragraphs separated by double-newline,
    return list of dicts: {'text','start_offset','end_offset','char_count'}
    Offsets are 0-based character positions in normalized_text.
    """
    blocks = []
    if not normalized_text:
        return blocks
    idx = 0
    L = len(normalized_text)
    while idx < L:
        # skip leading whitespace/newlines
        if normalized_text[idx] in ("\n", " ", "\t"):
            idx += 1
            continue
        next_break = normalized_text.find("\n\n", idx)
        if next_break == -1:
            para = normalized_text[idx:].rstrip("\n")
            start = idx
            end = idx + len(para)
            blocks.append({"text": para, "start_offset": start, "end_offset": end, "char_count": end - start})
            break
        else:
            para = normalized_text[idx:next_break]
            start = idx
            end = next_break
            blocks.append({"text": para, "start_offset": start, "end_offset": end, "char_count": end - start})
            idx = next_break + 2
    return blocks

# -------------- file processing --------------
def process_file(in_path: str, out_dir: str) -> Dict:
    """
    Read input .txt file, normalize (removing tables), write normalized .txt and .json
    Return summary dict with paths and paragraph count.
    """
    base = os.path.basename(in_path)
    name_noext = os.path.splitext(base)[0]

    with open(in_path, "r", encoding="utf-8", errors="replace") as fh:
        raw = fh.read()

    original_size = len(raw)

    normalized = normalize_full_text(raw)
    normalized_size = len(normalized)

    paragraphs = paragraphs_with_offsets(normalized)

    meta = {
        "file_name": base,
        "input_path": os.path.abspath(in_path),
        "processed_at_utc": datetime.utcnow().isoformat() + "Z",
        "original_size_bytes": original_size,
        "normalized_size_bytes": normalized_size,
        "paragraph_count": len(paragraphs),
        "paragraphs": paragraphs
    }

    # Write normalized text
    out_txt_path = os.path.join(out_dir, f"{name_noext}.normalized.txt")
    with open(out_txt_path, "w", encoding="utf-8") as fh:
        fh.write(normalized)

    # Write JSON metadata (optionally include normalized text)
    out_json_payload = {
        "file_name": meta["file_name"],
        "input_path": meta["input_path"],
        "processed_at_utc": meta["processed_at_utc"],
        "original_size_bytes": meta["original_size_bytes"],
        "normalized_size_bytes": meta["normalized_size_bytes"],
        "paragraph_count": meta["paragraph_count"],
        "paragraphs": meta["paragraphs"]
    }
    if INCLUDE_NORMALIZED_TEXT_IN_JSON:
        out_json_payload["normalized_text"] = normalized if MAX_JSON_TEXT_CHARS is None else normalized[:MAX_JSON_TEXT_CHARS]

    out_json_path = os.path.join(out_dir, f"{name_noext}.normalized.json")
    with open(out_json_path, "w", encoding="utf-8") as fh:
        json.dump(out_json_payload, fh, indent=2, ensure_ascii=False)

    return {
        "input": in_path,
        "normalized_txt": out_txt_path,
        "normalized_json": out_json_path,
        "paragraphs": len(paragraphs)
    }

def batch_normalize(input_dir: str, output_dir: str) -> List[Dict]:
    os.makedirs(output_dir, exist_ok=True)
    results = []
    count = 0
    for root, _, files in os.walk(input_dir):
        for fname in sorted(files):
            if not fname.lower().endswith(".txt"):
                continue
            in_path = os.path.join(root, fname)
            try:
                info = process_file(in_path, output_dir)
                results.append(info)
                count += 1
                print(f"[OK] {count}. {fname} -> paragraphs: {info['paragraphs']}")
            except Exception as e:
                print(f"[ERROR] processing {in_path}: {e}")
    print(f"\nDone. Processed {count} files. Normalized outputs in: {os.path.abspath(output_dir)}")
    return results

# -------------- main --------------
if __name__ == "__main__":
    print("INPUT_DIR:", INPUT_DIR)
    print("OUTPUT_DIR:", OUTPUT_DIR)
    print("DEHYPHENATE:", DEHYPHENATE, "NORMALIZE_QUOTES_DASHES:", NORMALIZE_QUOTES_DASHES)
    batch_normalize(INPUT_DIR, OUTPUT_DIR)


INPUT_DIR: /content/drive/My Drive/SEC-API/AAPL/txt_files
OUTPUT_DIR: /content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item
DEHYPHENATE: True NORMALIZE_QUOTES_DASHES: True


  "processed_at_utc": datetime.utcnow().isoformat() + "Z",


[OK] 1. item_1.txt -> paragraphs: 66
[OK] 2. item_10.txt -> paragraphs: 3
[OK] 3. item_11.txt -> paragraphs: 2
[OK] 4. item_12.txt -> paragraphs: 2
[OK] 5. item_13.txt -> paragraphs: 2
[OK] 6. item_14.txt -> paragraphs: 4
[OK] 7. item_15.txt -> paragraphs: 23
[OK] 8. item_1A.txt -> paragraphs: 128
[OK] 9. item_1B.txt -> paragraphs: 2
[OK] 10. item_1C.txt -> paragraphs: 6
[OK] 11. item_2.txt -> paragraphs: 2
[OK] 12. item_3.txt -> paragraphs: 9
[OK] 13. item_4.txt -> paragraphs: 4
[OK] 14. item_5.txt -> paragraphs: 10
[OK] 15. item_6.txt -> paragraphs: 2
[OK] 16. item_7.txt -> paragraphs: 103
[OK] 17. item_7A.txt -> paragraphs: 9
[OK] 18. item_8.txt -> paragraphs: 211
[OK] 19. item_9.txt -> paragraphs: 2
[OK] 20. item_9A.txt -> paragraphs: 14
[OK] 21. item_9B.txt -> paragraphs: 4

Done. Processed 21 files. Normalized outputs in: /content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item


### Chunking1 : only the .txt files here
Strategy used: item based -> if item's content > token limit then paragraph based -> if paragraphs's content > token limit(this rarely happens) then sentence based

Later Better and more optimal chunking is done

In [None]:
!pip install --upgrade transformers sentencepiece



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Full script: Hybrid chunking with paragraph + sentence fallback, metadata attached to each chunk.
# Colab-ready. Set paths below.

# Uncomment to install if running in fresh Colab:
# !pip install --upgrade transformers sentencepiece

import os
import re
import json
from pathlib import Path
from typing import List, Dict, Any, Tuple
from tqdm import tqdm

# ---------------- CONFIG ----------------
INPUT_JSON_DIR = "/content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item"  # folder with per-item normalized JSONs
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata"

MODEL_NAME = "BAAI/bge-large-en-v1.5"  # tokenizer (used for token-counting)
TOKEN_LIMIT = 512                      # embedding model max tokens
PARA_OVERLAP = 3                       # paragraphs overlap when chunking by paragraphs
SENT_OVERLAP = 3                       # sentences overlap when chunking long paragraphs
MIN_TOKS_HEADING = 12                  # used to detect short headings (optional)
# ---------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- tokenizer ----------
try:
    from transformers import AutoTokenizer
except Exception as e:
    raise RuntimeError("Install transformers (pip install transformers sentencepiece) before running. Error: " + str(e))

print(f"[info] loading tokenizer {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("[ok] tokenizer loaded.")

def count_tokens(text: str) -> int:
    if not text:
        return 0
    return len(tokenizer.encode(text, add_special_tokens=False))

# Simple sentence splitter fallback (keeps punctuation)
SENT_SPLIT_RE = re.compile(r'(?<=[\.\?\!\…])\s+')

def split_into_sentences(paragraph: str) -> List[str]:
    if not paragraph or not paragraph.strip():
        return []
    sents = SENT_SPLIT_RE.split(paragraph.strip())
    # fallback: if no punctuation splitting, split by newlines
    if len(sents) <= 1 and "\n" in paragraph:
        sents = [s.strip() for s in paragraph.split("\n") if s.strip()]
    # strip
    sents = [s.strip() for s in sents if s and s.strip()]
    return sents

# ---------- load metadata ----------
if not os.path.exists(METADATA_FILE):
    raise FileNotFoundError(f"Metadata file not found: {METADATA_FILE}")
with open(METADATA_FILE, "r", encoding="utf-8") as fh:
    filing_metadata = json.load(fh)

# safe subset of metadata fields to attach
META_FIELDS = {
    "ticker": filing_metadata.get("ticker"),
    "company_name": filing_metadata.get("company_name"),
    "form_type": filing_metadata.get("form_type"),
    "accession": filing_metadata.get("accession"),
    "accession_nodash": filing_metadata.get("accession_nodash"),
    "cik": filing_metadata.get("cik_padded") or filing_metadata.get("cik"),
    "period_of_report": filing_metadata.get("period_of_report"),
    "fiscal_year_end": filing_metadata.get("fiscal_year_end"),
    "file_no": filing_metadata.get("file_no"),
    "filed_at_utc": filing_metadata.get("filed_at_utc"),
}

# ---------- utility to extract paragraphs from item JSON ----------
def extract_paragraphs_from_item_json(item_json: Dict[str, Any]) -> List[str]:
    paras: List[str] = []
    # prefer explicit paragraphs list
    if "paragraphs" in item_json and isinstance(item_json["paragraphs"], list) and item_json["paragraphs"]:
        for p in item_json["paragraphs"]:
            if isinstance(p, dict):
                # support 'text', 'clean_text'
                if "text" in p:
                    paras.append(p["text"].strip())
                elif "clean_text" in p:
                    paras.append(p["clean_text"].strip())
                elif "raw_text" in p:
                    paras.append(p["raw_text"].strip())
            elif isinstance(p, str):
                paras.append(p.strip())
    # fallback: normalized_text split by blank lines
    elif "normalized_text" in item_json and isinstance(item_json["normalized_text"], str):
        paras = [s.strip() for s in re.split(r'\n\s*\n', item_json["normalized_text"]) if s.strip()]
    elif "raw_text" in item_json and isinstance(item_json["raw_text"], str):
        paras = [s.strip() for s in re.split(r'\n\s*\n', item_json["raw_text"]) if s.strip()]
    return paras

# ---------- sentence-level chunking for a long paragraph ----------
def chunk_long_paragraph(paragraph: str, paragraph_idx: int, token_limit: int, sent_overlap: int):
    """
    Split a single very long paragraph into sentence-level chunks with overlap.
    Returns list of chunk dicts with fields:
        chunk_text, start_paragraph, end_paragraph, paragraph_indices, sentence_indices, token_count
    """
    sents = split_into_sentences(paragraph)
    if not sents:
        # fallback: force-split paragraph by token windows (rare)
        tokens = tokenizer.encode(paragraph, add_special_tokens=False)
        # approximate by breaking the raw text into pieces of token_limit tokens - this is fallback and unusual
        chunks = []
        tok_text = tokenizer.decode(tokens)
        # but we'll just put entire paragraph as single chunk if splitting not meaningful
        chunks.append({
            "text": paragraph,
            "start_paragraph": paragraph_idx,
            "end_paragraph": paragraph_idx,
            "paragraph_indices": [paragraph_idx],
            "sentence_indices": None,
            "token_count": count_tokens(paragraph)
        })
        return chunks

    s_tok = [count_tokens(s) for s in sents]
    chunks = []
    s_ptr = 0
    chunk_id_local = 0

    while s_ptr < len(sents):
        cur_tokens = 0
        start_s = s_ptr
        included = []
        # add sentences while under token limit
        while s_ptr < len(sents) and cur_tokens + s_tok[s_ptr] <= token_limit:
            included.append(s_ptr)
            cur_tokens += s_tok[s_ptr]
            s_ptr += 1

        if not included:
            # This means single sentence too long; force include single sentence (rare)
            included = [s_ptr]
            cur_tokens = s_tok[s_ptr]
            s_ptr += 1

        chunk_text = " ".join(sents[i] for i in included)
        chunks.append({
            "text": chunk_text,
            "start_paragraph": paragraph_idx,
            "end_paragraph": paragraph_idx,
            "paragraph_indices": [paragraph_idx],
            "sentence_indices": included.copy(),
            "token_count": cur_tokens
        })
        chunk_id_local += 1

        # compute next start with sentence-level overlap
        last_sent = included[-1]
        next_start = last_sent - sent_overlap + 1
        if next_start <= start_s:
            # ensure progress at least by one sentence
            next_start = start_s + 1
        if next_start < 0:
            next_start = 0
        s_ptr = next_start

    return chunks

# ---------- paragraph-level chunking main algorithm ----------
def chunk_paragraphs_with_fallback(paragraphs: List[str],
                                   token_limit: int,
                                   para_overlap: int,
                                   sent_overlap: int,
                                   min_heading_tokens: int) -> Tuple[List[Dict[str, Any]], List[int]]:
    """
    Main chunking algorithm:
      - If whole item <= token_limit, returns single chunk
      - Else chunk by paragraphs (never break paragraphs)
      - If a paragraph > token_limit, fallback to sentence-level chunking for that paragraph
    Returns (chunks, paragraph_token_counts)
    Each produced chunk is a dict with keys:
      text, start_paragraph, end_paragraph, paragraph_indices, token_count, (optional sentence_indices)
    """
    n = len(paragraphs)
    para_tokens = [count_tokens(p) for p in paragraphs]
    chunks: List[Dict[str, Any]] = []

    # if the entire item fits under the token limit -> single chunk
    item_joined = "\n\n".join(paragraphs)
    if count_tokens(item_joined) <= token_limit:
        chunks.append({
            "text": item_joined,
            "start_paragraph": 0,
            "end_paragraph": n - 1,
            "paragraph_indices": list(range(0, n)),
            "token_count": count_tokens(item_joined)
        })
        return chunks, para_tokens

    # otherwise iterate with window
    start = 0
    while start < n:
        # If this paragraph itself is too big, handle via sentence fallback
        if para_tokens[start] > token_limit:
            # finalize no pending window; produce sentence-level chunks for this paragraph
            long_chunks = chunk_long_paragraph(paragraphs[start], start, token_limit, sent_overlap)
            for lc in long_chunks:
                chunks.append(lc)
            start += 1
            continue

        # greedily extend window from 'start' by adding whole paragraphs
        cur_tokens = 0
        included = []
        j = start
        while j < n and para_tokens[j] <= token_limit and cur_tokens + para_tokens[j] <= token_limit:
            cur_tokens += para_tokens[j]
            included.append(j)
            j += 1

        if not included:
            # if still empty (should not happen because we checked para_tokens[start] <= token_limit)
            # force include the start paragraph (defensive)
            included = [start]
            cur_tokens = para_tokens[start]
            j = start + 1

        # create chunk from included paragraphs
        chunk_text = "\n\n".join(paragraphs[i] for i in included)
        chunks.append({
            "text": chunk_text,
            "start_paragraph": included[0],
            "end_paragraph": included[-1],
            "paragraph_indices": included.copy(),
            "token_count": cur_tokens
        })

        # compute paragraph-overlap adjustment: if the last para(s) are tiny headings, expand overlap by 1
        cand_last = included[-para_overlap:] if len(included) >= para_overlap else included[:]
        all_short = all(para_tokens[i] < min_heading_tokens for i in cand_last) if cand_last else False
        overlap_count = para_overlap + 1 if all_short else para_overlap

        # next window start index: keep overlap_count paragraphs from the end of this chunk
        next_start = included[-1] - overlap_count + 1
        # ensure progress: next_start must be > included[0] otherwise advance at least by 1 para
        if next_start <= included[0]:
            next_start = included[0] + 1

        start = min(max(next_start, 0), n)

    return chunks, para_tokens

# ---------- driver: iterate item files and write per-item chunk outputs ----------
def process_all_items(input_dir: str, output_dir: str, token_limit: int,
                      para_overlap: int, sent_overlap: int, min_heading_tokens: int):
    p = Path(input_dir)
    files = sorted([f for f in p.glob("*.json")])
    if not files:
        raise RuntimeError(f"No .json files found in {input_dir}")

    summary = []
    for fp in tqdm(files, desc="Items"):
        try:
            with open(fp, "r", encoding="utf-8") as fh:
                item_json = json.load(fh)
        except Exception as e:
            print(f"[warn] failed to load {fp}: {e}")
            continue

        paragraphs = extract_paragraphs_from_item_json(item_json)
        if not paragraphs:
            print(f"[warn] no paragraphs found in {fp}; skipping")
            continue

        # chunk with fallback
        chunks, para_tok_counts = chunk_paragraphs_with_fallback(
            paragraphs,
            token_limit,
            para_overlap,
            sent_overlap,
            min_heading_tokens
        )

        # enrich with metadata and item id
        item_id = item_json.get("item_number") or item_json.get("item_title") or fp.stem
        enriched = []
        for idx, c in enumerate(chunks):
            enriched_chunk = {
                "chunk_id": f"{fp.stem}_chunk_{idx}",
                "item_id": item_id,
                "text": c["text"],
                "start_paragraph": c.get("start_paragraph"),
                "end_paragraph": c.get("end_paragraph"),
                "paragraph_indices": c.get("paragraph_indices"),
                "sentence_indices": c.get("sentence_indices") if "sentence_indices" in c else c.get("sentence_indices_in_paragraph"),
                "token_count": c["token_count"],
                "metadata": META_FIELDS,
                "source_item_file": str(fp)
            }
            enriched.append(enriched_chunk)

        out_obj = {
            "item_file": str(fp),
            "item_basename": fp.stem,
            "model_name": MODEL_NAME,
            "token_limit": token_limit,
            "para_overlap": para_overlap,
            "sent_overlap": sent_overlap,
            "paragraph_count": len(paragraphs),
            "paragraph_token_counts": para_tok_counts,
            "chunks": enriched
        }

        out_file = Path(output_dir) / f"{fp.stem}.chunks_with_metadata.json"
        with open(out_file, "w", encoding="utf-8") as fh:
            json.dump(out_obj, fh, ensure_ascii=False, indent=2)

        summary.append({
            "item": fp.stem,
            "paragraphs": len(paragraphs),
            "chunks": len(enriched),
            "out_file": str(out_file)
        })

    return summary

# ---------- run ----------
if __name__ == "__main__":
    print("INPUT_JSON_DIR:", INPUT_JSON_DIR)
    print("OUTPUT_DIR:", OUTPUT_DIR)
    print("MODEL:", MODEL_NAME)
    print("TOKEN_LIMIT:", TOKEN_LIMIT, "PARA_OVERLAP:", PARA_OVERLAP, "SENT_OVERLAP:", SENT_OVERLAP)
    summary = process_all_items(INPUT_JSON_DIR, OUTPUT_DIR, TOKEN_LIMIT, PARA_OVERLAP, SENT_OVERLAP, MIN_TOKS_HEADING)
    print("\nDone. Summary:")
    for s in summary:
        print(" ", s)


[info] loading tokenizer BAAI/bge-large-en-v1.5 ...
[ok] tokenizer loaded.
INPUT_JSON_DIR: /content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item
OUTPUT_DIR: /content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata
MODEL: BAAI/bge-large-en-v1.5
TOKEN_LIMIT: 512 PARA_OVERLAP: 3 SENT_OVERLAP: 3


Items:   0%|          | 0/21 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2849 > 512). Running this sequence through the model will result in indexing errors
Items: 100%|██████████| 21/21 [00:00<00:00, 30.06it/s]


Done. Summary:
  {'item': 'item_1.normalized', 'paragraphs': 66, 'chunks': 12, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata/item_1.normalized.chunks_with_metadata.json'}
  {'item': 'item_10.normalized', 'paragraphs': 3, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata/item_10.normalized.chunks_with_metadata.json'}
  {'item': 'item_11.normalized', 'paragraphs': 2, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata/item_11.normalized.chunks_with_metadata.json'}
  {'item': 'item_12.normalized', 'paragraphs': 2, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata/item_12.normalized.chunks_with_metadata.json'}
  {'item': 'item_13.normalized', 'paragraphs': 2, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/chunks_with_metadata/item_13.normalized.chunks_with_metadata.json'}
  {'item': 'item_14.normalized', 'paragraphs': 4, 'chunks': 1, 'out_file': '/co




### CONVERTING the XBRL facts into sentences for better embdeddings

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1st Method: Not so optimal
#----------------------------
# XBRL facts -> row/group-level sentences
# ----------------------------
# Colab / local-ready script.
# Edit INPUT paths below as needed.
# ----------------------------
import os
import json
import re
from decimal import Decimal
from datetime import datetime
from typing import List, Dict, Any, Optional
from collections import defaultdict

# ----------------- CONFIG (EDIT as needed) -----------------
# Path to your files (change to your Drive paths if needed)
INPUT_XBRL_FLAT = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_facts_flat.json"   # path to xbrl_facts_flat.json
INPUT_XBRL_FULL = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_full.json"        # path to xbrl_full.json (optional, used for labels/context)
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output"              # output directory (auto-created)
# ---------------------------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Helpers ----------
def safe_load_json(path: str):
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)

def humanize_concept(concept: str) -> str:
    """Turn an XBRL concept code into a readable phrase."""
    if not concept:
        return ""
    # remove array-like placeholders: [0], [1]
    if re.fullmatch(r"\[\d+\]", concept.strip()):
        return concept.strip()
    # some concept names are placeholders like "[0]" - keep as-is in that case
    # Try splitting camelCase or PascalCase or snake_case or dotted names
    s = concept
    # if name contains colon (namespace:LocalName) take local part
    if ":" in s:
        s = s.split(":", 1)[1]
    # replace underscores, dashes
    s = re.sub(r"[_\-\.]+", " ", s)
    # insert space before capital letters (ABAP-safe)
    s = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', s)
    # split camel-case sequences like 'EntityCommonStockSharesOutstanding' -> 'Entity Common Stock Shares Outstanding'
    s = re.sub(r'([A-Z][a-z]+)', r' \1', s).strip()
    # Remove extra spaces and lowerfirst? keep capitalization
    s = re.sub(r'\s+', ' ', s).strip()
    # Make it readable: if all uppercase, keep as is; else title-case
    if s.isupper():
        return s
    return s.replace("  ", " ").strip()

def format_number_str(val_str: str, decimals: Optional[int] = None, unit: Optional[str] = None) -> str:
    """Format value with commas and apply decimals if provided in XBRL raw."""
    # Some XBRL values are strings representing integers or decimals.
    try:
        # handle booleans 'true'/'false'
        if isinstance(val_str, bool):
            return str(val_str)
        if val_str is None:
            return ""
        vs = str(val_str).strip()
        # return preserved textual values if not numeric
        # Recognize numbers with possible scientific notation
        if re.match(r"^-?\d+(\.\d+)?([eE][-+]?\d+)?$", vs):
            num = Decimal(vs)
            if decimals is not None:
                # XBRL decimals: if decimals = -6, it means value shown * 10^6 scaling or similar.
                # Many times sec-api returns raw 'decimals' for display guidance (we'll not rescale numeric)
                # Instead interpret decimals as number of decimal places (if a non-negative integer).
                try:
                    d = int(decimals)
                    if d >= 0:
                        # format with exactly d decimals
                        fmt = f"{{:,.{d}f}}"
                        return fmt.format(num)
                except Exception:
                    pass
            # fallback: format with no decimals if integer, else 2 decimals
            if num == num.to_integral():
                return f"{int(num):,}"
            else:
                # show up to 6 significant digits but keep thousands separators
                return "{:,.2f}".format(num.normalize())
        # not a plain numeric string: return raw
        return vs
    except Exception:
        return str(val_str)

def format_period(period_obj: Optional[Dict[str,Any]]) -> str:
    """Return readable period string."""
    if not period_obj:
        return ""
    if 'instant' in period_obj:
        return f"as of {period_obj['instant']}"
    if 'startDate' in period_obj and 'endDate' in period_obj:
        s = period_obj['startDate']
        e = period_obj['endDate']
        # format nicely (optional)
        try:
            sd = datetime.fromisoformat(s).strftime("%b %d, %Y")
            ed = datetime.fromisoformat(e).strftime("%b %d, %Y")
            return f"for the period {sd} – {ed}"
        except Exception:
            return f"for the period {s} to {e}"
    return str(period_obj)

def parent_group_key(provenance_path: str) -> str:
    """
    Normalize provenance path to a group key:
    remove trailing '/[index]' or '/[index]/segment' or trailing '/segment' etc.
    Example:
      StatementsOfIncome/Revenue/[0]/segment -> StatementsOfIncome/Revenue
    """
    if not provenance_path:
        return provenance_path or ""
    # Remove trailing '/segment' components first
    p = provenance_path
    # remove '/segment' occurrences
    p = re.sub(r'/segment$', '', p)
    # remove trailing '/[digits]' (but keep earlier indices)
    p = re.sub(r'/\[\d+\]$', '', p)
    # also remove trailing '/\[\d+\]/segment'
    p = re.sub(r'/\[\d+\]/segment$', '', p)
    return p

def extract_row_index(provenance_path: str) -> Optional[int]:
    """Return the integer row index if path ends with /[N], else None."""
    m = re.search(r'/\[(\d+)\](?:/segment)?$', provenance_path)
    if m:
        return int(m.group(1))
    return None

# ----------------- Load input files -----------------
print("Loading files...")
facts = safe_load_json(INPUT_XBRL_FLAT)
try:
    xbrl_full = safe_load_json(INPUT_XBRL_FULL)
except Exception:
    xbrl_full = {}
print(f"Loaded {len(facts)} facts (flat). xbrl_full keys: {list(xbrl_full.keys())[:20]}")

# Optionally build a label lookup from xbrl_full (if it contains label mapping)
label_lookup = {}
# heuristics: xbrl_full sometimes includes keys mapping to HTML text blocks using readable keys.
# We will populate label_lookup with simple keys if possible (best-effort).
for k, v in (xbrl_full.items() if isinstance(xbrl_full, dict) else []):
    # if key looks like something mapping to a label or text block
    if isinstance(k, str) and "TextBlock" in k and isinstance(v, str):
        # e.g., RevenueFromContractWithCustomerTextBlock -> label "Revenue"
        lab = re.sub(r'TextBlock$', '', k)
        label_lookup[lab] = re.sub(r'<[^>]+>', '', v).strip()[:400]  # store first 400 chars as context

# ----------------- Group facts by parent group and row index -----------------
groups = defaultdict(lambda: defaultdict(list))
# groups[group_key][row_index] -> list of facts
# we will use row_index = None for facts without index (group-level facts)

for f in facts:
    prov = f.get("provenance_path") or ""
    gk = parent_group_key(prov)
    ridx = extract_row_index(prov)  # None if not row-indexed
    groups[gk][ridx].append(f)

print(f"Detected {len(groups)} groups.")

# ----------------- Convert each group/row to sentences -----------------
all_group_sentences = []
flat_sentences = []  # flat list for convenient embedding/storage

for gk, rows in groups.items():
    # group-level title/hint
    group_title = gk.split("/")[-1] if gk else "UnknownGroup"
    human_group = humanize_concept(group_title)
    # try to pull group-level context from label_lookup
    group_context = label_lookup.get(group_title, None)

    # If group has numbered rows (0..N) we iterate row by row sorted by index
    row_indices = sorted([ri for ri in rows.keys() if ri is not None])
    # Also capture non-indexed facts as group-level facts
    nonindexed_facts = rows.get(None, [])

    group_entry = {
        "group_key": gk,
        "group_label": human_group,
        "group_context_snippet": group_context,
        "rows": []
    }

    # process non-indexed facts as standalone sentences
    for nf in nonindexed_facts:
        concept = nf.get("concept") or ""
        human_conc = humanize_concept(concept)
        period = nf.get("period")
        value = nf.get("value")
        raw = nf.get("raw", {})
        decimals = raw.get("decimals") if isinstance(raw, dict) else None
        unit = nf.get("unit") or (raw.get("unitRef") if isinstance(raw, dict) else None)
        vstr = format_number_str(value, decimals, unit)
        pstr = format_period(period)
        unit_str = f" {unit}" if unit else ""
        sentence = ""
        if pstr:
            sentence = f"{human_conc}: {vstr}{unit_str} ({pstr})."
        else:
            sentence = f"{human_conc}: {vstr}{unit_str}."
        row_obj = {
            "row_index": None,
            "sentence": sentence,
            "facts": [nf],
            "provenance": nf.get("provenance_path")
        }
        group_entry["rows"].append(row_obj)
        flat_sentences.append({
            "group": gk,
            "row_index": None,
            "sentence": sentence,
            "provenance": nf.get("provenance_path"),
            "concepts": [concept],
            "tokens_estimate": None
        })

    # process row-indexed entries -> build per-row sentences
    if row_indices:
        # Determine column concepts found across rows, to help ordering in sentence
        # Build set of concepts across the first row to use as order
        first_row_idx = row_indices[0]
        col_order = [c.get("concept") for c in rows.get(first_row_idx, [])]
        # fallback: unique set
        # col_order = list(dict.fromkeys(col_order))
        for ri in row_indices:
            facts_in_row = rows.get(ri, [])
            # prefer stable ordering: by concept then by provenance
            facts_in_row_sorted = sorted(facts_in_row, key=lambda x: (x.get("concept") or "", x.get("provenance_path") or ""))
            # Build a small map concept->(value, unit, period)
            parts = []
            used_concepts = []
            period_for_row = None
            for ff in facts_in_row_sorted:
                concept = ff.get("concept") or ""
                used_concepts.append(concept)
                human_conc = humanize_concept(concept)
                raw = ff.get("raw", {}) or {}
                decimals = raw.get("decimals") if isinstance(raw, dict) else None
                unit = ff.get("unit") or raw.get("unitRef")
                value = ff.get("value")
                # sometimes segment/dimension is another fact in the same row, include it in parentheses
                # gather period if present
                if not period_for_row and ff.get("period"):
                    period_for_row = ff.get("period")
                vstr = format_number_str(value, decimals, unit)
                if unit:
                    part = f"{human_conc}: {vstr} {unit}"
                else:
                    part = f"{human_conc}: {vstr}"
                parts.append(part)
            # Compose row sentence
            pstr = format_period(period_for_row)
            if pstr:
                sentence = f"{human_group} row {ri} ({pstr}) — " + "; ".join(parts) + "."
            else:
                sentence = f"{human_group} row {ri} — " + "; ".join(parts) + "."
            row_obj = {
                "row_index": ri,
                "sentence": sentence,
                "facts": facts_in_row_sorted,
                "provenance": [ff.get("provenance_path") for ff in facts_in_row_sorted]
            }
            group_entry["rows"].append(row_obj)
            flat_sentences.append({
                "group": gk,
                "row_index": ri,
                "sentence": sentence,
                "provenance": [ff.get("provenance_path") for ff in facts_in_row_sorted],
                "concepts": used_concepts,
                "tokens_estimate": None
            })

    # Add group entry
    all_group_sentences.append(group_entry)

# Save outputs
group_out_path = os.path.join(OUTPUT_DIR, "xbrl_sentences_by_group.json")
flat_out_path = os.path.join(OUTPUT_DIR, "xbrl_sentences_flat.json")

with open(group_out_path, "w", encoding="utf-8") as fh:
    json.dump(all_group_sentences, fh, indent=2, ensure_ascii=False)

with open(flat_out_path, "w", encoding="utf-8") as fh:
    json.dump(flat_sentences, fh, indent=2, ensure_ascii=False)

print("Done. Outputs:")
print(" - Grouped sentences:", group_out_path)
print(" - Flat sentences (one per row/fact-group):", flat_out_path)
print(f"Number of groups written: {len(all_group_sentences)}")
print(f"Number of flat sentences written: {len(flat_sentences)}")

# Print a few sample sentences to inspect
print("\nSample sentences (first 20):")
for i, s in enumerate(flat_sentences[:20]):
    print(i+1, "->", s["sentence"])


Loading files...
Loaded 2341 facts (flat). xbrl_full keys: ['CoverPage', 'AuditorInformation', 'StatementsOfIncome', 'StatementsOfComprehensiveIncome', 'BalanceSheets', 'BalanceSheetsParenthetical', 'StatementsOfShareholdersEquity', 'StatementsOfCashFlows', 'SummaryofSignificantAccountingPolicies', 'Revenue', 'EarningsPerShare', 'FinancialInstruments', 'PropertyPlantandEquipment', 'ConsolidatedFinancialStatementDetails', 'IncomeTaxes', 'Leases', 'Debt', 'ShareholdersEquity', 'ShareBasedCompensation', 'CommitmentsContingenciesandSupplyConcentrations']
Detected 331 groups.
Done. Outputs:
 - Grouped sentences: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_by_group.json
 - Flat sentences (one per row/fact-group): /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_flat.json
Number of groups written: 331
Number of flat sentences written: 1369

Sample sentences (first 20):
1 -> Security12b Title row 0 (for the period Oct 01, 2023 – Sep 28, 20

In [None]:
# 2nd Method:
"""
XBRL facts -> cleaned sentences + grouping + attach filing metadata

Outputs:
  - xbrl_sentences_by_group_with_meta.json
  - xbrl_sentences_flat_with_meta.json

CONFIGURE paths in the CONFIG section below before running.
"""

import os
import json
import re
from datetime import datetime
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple

# ---------------------- CONFIG ----------------------
INPUT_FACTS_FLAT = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_facts_flat.json"  # path to xbrl_facts_flat.json
INPUT_FULL = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_full.json"              # path to xbrl_full.json (optional but recommended)
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"  # optional; if present, will be attached
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output"       # where outputs will be saved
# ----------------------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------ Helpers / Utilities ------------------

def load_json_safe(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def humanize_concept_name(concept: str) -> str:
    """Turn something like us-gaap:CashAndCashEquivalents -> Cash And Cash Equivalents"""
    if not concept:
        return ""
    # strip namespace
    if ":" in concept:
        concept = concept.split(":", 1)[1]
    # underscores/dashes -> spaces
    s = concept.replace("_", " ").replace("-", " ")
    # split camelCase boundaries
    s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", s)
    s = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    # Smart casing: keep major words capitalized except small words
    small = {"and","or","the","of","in","on","at","for","to","by","with","a","an"}
    parts = s.split()
    parts = [p.capitalize() if p.lower() not in small else p.lower() for p in parts]
    # ensure first word capitalized
    if parts:
        parts[0] = parts[0].capitalize()
    return " ".join(parts)

def format_number(value: Any) -> str:
    """Format number-like strings with commas. If can't convert, return original trimmed."""
    if value is None:
        return ""
    s = str(value).strip()
    # remove surrounding brackets used for negatives e.g., (123)
    negative = False
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1]
    # remove commas
    s_clean = s.replace(",", "")
    # attempt float/int
    try:
        if "." in s_clean or "e" in s_clean.lower():
            f = float(s_clean)
            # if it's effectively integer:
            if abs(f - int(f)) < 1e-9:
                out = f"{int(f):,}"
            else:
                # round to up to 6 decimals, strip trailing zeros
                out = f"{f:,.6f}".rstrip("0").rstrip(".")
        else:
            i = int(s_clean)
            out = f"{i:,}"
        if negative:
            out = "-" + out
        return out
    except Exception:
        # not numeric: return original
        return str(value).strip()

def canonical_unit(unit: Optional[str]) -> Optional[str]:
    """Normalize unit string: prefer USD -> 'USD', else uppercase if short."""
    if unit is None:
        return None
    u = str(unit).strip()
    # Common patterns: 'usd', 'USD', 'iso4217:USD', '$'
    if not u:
        return None
    if "usd" in u.lower() or "iso4217" in u.lower() and "usd" in u.lower() or u == "$":
        return "USD"
    # else uppercase short units
    if len(u) <= 10:
        return u.upper()
    return u

def period_to_type_and_string(period_field: Any) -> Tuple[str, Optional[str]]:
    """
    Interpret period info. Many shapes possible:
      - {"instant":"2023-09-30"}
      - {"startDate":"2022-10-01","endDate":"2023-09-30"}
      - some other nested shapes
    Returns tuple (period_type, period_string) where period_string is ISO-like suitable for embedding
    """
    if period_field is None:
        return ("unknown", None)
    # if string like '2023-09-30' directly
    if isinstance(period_field, str):
        # assume instant
        return ("instant", period_field)
    if isinstance(period_field, dict):
        if "instant" in period_field:
            return ("instant", period_field.get("instant"))
        if "startDate" in period_field and "endDate" in period_field:
            return ("duration", f"{period_field.get('startDate')} to {period_field.get('endDate')}")
        # common alternative keys
        if "start" in period_field and "end" in period_field:
            return ("duration", f"{period_field.get('start')} to {period_field.get('end')}")
        # sometimes simple keys:
        for k in ("date", "dateInstant"):
            if k in period_field:
                return ("instant", period_field.get(k))
    # fallback to stringified
    try:
        return ("unknown", json.dumps(period_field))
    except Exception:
        return ("unknown", str(period_field))

def canonical_segment_key(segment_field: Any) -> Optional[str]:
    """Produce canonical string key for segment/dimension to group on."""
    if not segment_field:
        return None
    if isinstance(segment_field, str):
        return segment_field
    if isinstance(segment_field, dict):
        # try to combine dimension & value
        dim = segment_field.get("dimension") or segment_field.get("axis") or segment_field.get("concept")
        val = segment_field.get("value") or segment_field.get("member") or segment_field.get("label")
        if dim and val:
            return f"{dim}={val}"
        # fallback to JSON string
        try:
            return json.dumps(segment_field, sort_keys=True)
        except Exception:
            return str(segment_field)
    if isinstance(segment_field, list):
        parts = []
        for s in segment_field:
            parts.append(canonical_segment_key(s) or str(s))
        return "|".join(parts)
    # other fallbacks
    return str(segment_field)

def strip_row_and_bracket_markers(text: str) -> str:
    """Remove 'row 14' and '[14]:' and similar artifacts from a string."""
    if not text:
        return text
    s = text
    # remove 'row N' patterns
    s = re.sub(r'\brow\s*\d+\b', '', s, flags=re.IGNORECASE)
    # remove bracket numbers like [14]: or [14]
    s = re.sub(r'\[\s*\d+\s*\]\s*:?', '', s)
    # remove leftover multiple spaces and punctuation duplicates
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\s+:\s+', ': ', s)
    return s.strip(" -;:.")

def pick_concept_label(concept: str, full_map: Dict[str,str]) -> str:
    """Pick label from mapping or humanize fallback."""
    if not concept:
        return ""
    if concept in full_map:
        return full_map[concept]
    # also try without prefix
    if ":" in concept:
        key = concept.split(":",1)[1]
        if key in full_map:
            return full_map[key]
    return humanize_concept_name(concept)

# ---------------- Build label map from xbrl_full (best-effort) ----------------

def build_label_map_from_full(xbrl_full: Dict) -> Dict[str,str]:
    """
    Best-effort construction of concept -> label mapping from the xbrl_full structure.
    xbrl_full layouts vary by provider. We'll attempt a few heuristics.
    """
    mapping = {}
    if not isinstance(xbrl_full, dict):
        return mapping

    # heuristic 1: 'concepts' or 'elements' keys
    for key in ("concepts","elements","labels","facts","data"):
        if key in xbrl_full and isinstance(xbrl_full[key], dict):
            for k,v in xbrl_full[key].items():
                if isinstance(k, str) and k not in mapping:
                    # if v has 'label' use it
                    if isinstance(v, dict):
                        lab = v.get("label") or v.get("name") or v.get("label_text")
                        if isinstance(lab, str):
                            mapping[k] = lab
                        else:
                            mapping[k] = humanize_concept_name(k)

    # fallback scanning: look for keys that look like concept identifiers and their label fields
    def deep_scan(obj):
        if isinstance(obj, dict):
            for k,v in obj.items():
                # if v is a dict and contains 'label' or 'name', we can assign
                if isinstance(k, str) and isinstance(v, dict):
                    if "label" in v and isinstance(v["label"], str):
                        mapping.setdefault(k, v["label"])
                    elif "name" in v and isinstance(v["name"], str):
                        mapping.setdefault(k, v["name"])
                # continue deep
                deep_scan(v)
        elif isinstance(obj, list):
            for it in obj:
                deep_scan(it)

    deep_scan(xbrl_full)
    # final pass: ensure every key is humanized
    # no removal of labeling if found, else set humanized form
    return mapping

# ----------------- Main grouping + cleaning -----------------

def extract_facts_list(xbrl_facts_flat_obj: Any) -> List[Dict]:
    """Normalize the incoming facts container into a list of fact dicts."""
    if isinstance(xbrl_facts_flat_obj, list):
        return xbrl_facts_flat_obj
    if isinstance(xbrl_facts_flat_obj, dict):
        # common wrappers
        for poss in ("facts","items","data","factsList"):
            if poss in xbrl_facts_flat_obj and isinstance(xbrl_facts_flat_obj[poss], list):
                return xbrl_facts_flat_obj[poss]
        # else flatten dictionary-of-lists keyed by concept
        flat = []
        for k,v in xbrl_facts_flat_obj.items():
            if isinstance(v, list):
                for it in v:
                    if isinstance(it, dict):
                        it2 = dict(it)
                        it2.setdefault("concept", k)
                        flat.append(it2)
        return flat
    # else unknown structure
    return []

def build_outputs(facts_list: List[Dict], label_map: Dict[str,str], filing_meta: Dict[str,Any]):
    """
    Build group-level and flat-level cleaned sentences and attach metadata.
    Returns (groups, flat_entries)
    """
    groups = {}
    flat_entries = []

    def group_key(fact):
        concept = fact.get("concept") or fact.get("name") or fact.get("conceptName")
        p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw")
        p_type, p_str = period_to_type_and_string(p_field)
        unit = canonical_unit(fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref"))
        decimals = fact.get("decimals")
        seg = canonical_segment_key(fact.get("segment"))
        return (concept, p_type, p_str, unit, decimals, seg)

    for idx, fact in enumerate(facts_list):
        # robustly get fields
        concept = fact.get("concept") or fact.get("name") or fact.get("conceptName") or fact.get("label")
        if not concept:
            # skip if no concept
            continue
        label = pick_concept_label(concept, label_map)
        # period
        p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw")
        p_type, p_str = period_to_type_and_string(p_field)
        # value
        raw_value = fact.get("value") if "value" in fact else fact.get("val") if "val" in fact else fact.get("amount")
        formatted_value = format_number(raw_value) if raw_value is not None else ""
        # unit
        unit_raw = fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref")
        unit = canonical_unit(unit_raw)
        # segment
        seg = canonical_segment_key(fact.get("segment"))
        # sanitize any leftover row markers in label or fact text (rare)
        label = strip_row_and_bracket_markers(label)

        # Build canonical embed sentence for this fact (match requested style)
        # Example required: "Stockholders Equity as of 2021-09-25 is 163,000,000 USD."
        if p_type == "instant" and p_str:
            # ensure p_str is ISO-like date; if it's full "YYYY-MM-DD" keep as-is
            period_text = p_str
            if formatted_value:
                embed_text = f"{label} as of {period_text} is {formatted_value}"
                if unit:
                    embed_text += f" {unit}"
                embed_text += "."
            else:
                embed_text = f"{label} as of {period_text} (no numeric value parsed)."
        elif p_type == "duration" and p_str:
            embed_text = f"{label} for the period {p_str} is {formatted_value}"
            if unit:
                embed_text += f" {unit}"
            embed_text += "."
        else:
            # fallback
            embed_text = f"{label}: {formatted_value}"
            if unit:
                embed_text += f" {unit}"
            embed_text += "."

        # display text (more verbose/provenance)
        display = embed_text
        # add small provenance (source file/index) if present in fact
        provenance = {
            "fact_index": idx,
            "concept": concept,
            "label": label,
            "period_type": p_type,
            "period_string": p_str,
            "unit": unit,
            "decimals": fact.get("decimals"),
            "segment": seg,
            "original_fact_id": fact.get("id") or fact.get("fact_id") or None
        }

        # attach filing metadata (same format as you requested)
        metadata_block = {
            "ticker": filing_meta.get("ticker"),
            "company_name": filing_meta.get("company_name"),
            "form_type": filing_meta.get("form_type"),
            "accession": filing_meta.get("accession"),
            "accession_nodash": filing_meta.get("accession_nodash"),
            "cik": filing_meta.get("cik"),
            "period_of_report": filing_meta.get("period_of_report"),
            "fiscal_year_end": filing_meta.get("fiscal_year_end"),
            "file_no": filing_meta.get("file_no"),
            "filed_at_utc": filing_meta.get("filed_at_utc")
        }

        flat_entry = {
            "embed_text": embed_text,
            "display_text": display,
            "provenance": provenance,
            "metadata": metadata_block,
            "original_fact": fact
        }
        flat_entries.append(flat_entry)

        # add to group
        gk = group_key(fact)
        if gk not in groups:
            groups[gk] = {
                "concept": concept,
                "label": label,
                "period_type": p_type,
                "period_string": p_str,
                "unit": unit,
                "decimals": fact.get("decimals"),
                "segment_key": seg,
                "facts": [],
            }
        groups[gk]["facts"].append(flat_entry)

    # Build group-level sentences combining multiple facts if present
    group_list = []
    for gi, (gk, g) in enumerate(groups.items()):
        members = g["facts"]
        if len(members) == 1:
            group_sentence = members[0]["embed_text"]
        else:
            # join multiple members compactly (value (segment)) separated by "; "
            parts = []
            for m in members:
                val_str = m["provenance"].get("value_formatted") if "value_formatted" in m["provenance"] else None
                # we didn't include value_formatted in provenance; instead parse from embed_text quickly:
                # Just use the embed_text tail after 'is ' as fallback
                et = m["embed_text"]
                # attempt to find "is <value>"
                mval = re.search(r"is\s+(.+)$", et)
                valpiece = mval.group(1) if mval else et
                parts.append(valpiece.strip().rstrip("."))
            if g["period_type"] == "instant":
                group_sentence = f"{g['label']} as of {g['period_string']}: " + "; ".join(parts) + "."
            elif g["period_type"] == "duration":
                group_sentence = f"{g['label']} for the period {g['period_string']}: " + "; ".join(parts) + "."
            else:
                group_sentence = f"{g['label']} ({g['period_string']}): " + "; ".join(parts) + "."

        # attach metadata & provenance summary
        group_obj = {
            "group_id": f"group_{gi}",
            "concept": g["concept"],
            "label": g["label"],
            "period_type": g["period_type"],
            "period_string": g["period_string"],
            "unit": g["unit"],
            "decimals": g["decimals"],
            "segment_key": g["segment_key"],
            "member_count": len(members),
            "sentence": group_sentence,
            "facts": g["facts"],
            "metadata": metadata_block
        }
        group_list.append(group_obj)

    # sort group_list for determinism
    group_list_sorted = sorted(group_list, key=lambda x: (x["concept"], str(x["period_string"]), str(x["segment_key"] or "")))
    return group_list_sorted, flat_entries

# ---------------- Main runner ----------------

def main():
    print("[info] Loading inputs...")
    if not os.path.exists(INPUT_FACTS_FLAT):
        raise FileNotFoundError(f"Input facts flat file not found: {INPUT_FACTS_FLAT}")
    facts_obj = load_json_safe(INPUT_FACTS_FLAT)

    if os.path.exists(INPUT_FULL):
        xbrl_full = load_json_safe(INPUT_FULL)
    else:
        xbrl_full = {}

    # optional metadata source
    filing_meta = {}
    if METADATA_FILE and os.path.exists(METADATA_FILE):
        try:
            filing_meta = load_json_safe(METADATA_FILE)
            print("[info] loaded metadata file:", METADATA_FILE)
        except Exception:
            filing_meta = {}
    else:
        # try to extract simple metadata from xbrl_full (best-effort)
        # look for top-level CoverPage or metadata fields
        if isinstance(xbrl_full, dict):
            cp = xbrl_full.get("CoverPage") or xbrl_full.get("cover") or xbrl_full.get("cover_page") or xbrl_full.get("metadata")
            if cp and isinstance(cp, dict):
                filing_meta = {
                    "ticker": cp.get("ticker") or cp.get("stock_ticker"),
                    "company_name": cp.get("companyName") or cp.get("entityName") or cp.get("registrantName"),
                    "form_type": cp.get("formType") or cp.get("form_type"),
                    "accession": cp.get("accessionNumber") or cp.get("accession"),
                    "accession_nodash": (cp.get("accessionNumber") or cp.get("accession") or "").replace("-", ""),
                    "cik": cp.get("cik"),
                    "period_of_report": cp.get("periodOfReport") or cp.get("period_of_report"),
                    "fiscal_year_end": cp.get("fiscalYearEnd") or cp.get("fiscal_year_end"),
                    "file_no": cp.get("fileNumber") or cp.get("file_no"),
                    "filed_at_utc": cp.get("filingDate") or cp.get("filedAt")
                }
            else:
                filing_meta = {}
    # ensure keys exist (fill with None if missing)
    meta_template_keys = ["ticker","company_name","form_type","accession","accession_nodash","cik","period_of_report","fiscal_year_end","file_no","filed_at_utc"]
    filing_meta = {k: filing_meta.get(k) if filing_meta.get(k) is not None else None for k in meta_template_keys}

    print("[info] facts container loaded; normalizing to list...")
    facts_list = []
    # Normalize facts container to list shape
    if isinstance(facts_obj, list):
        facts_list = facts_obj
    elif isinstance(facts_obj, dict):
        # common wrapper keys
        if "facts" in facts_obj and isinstance(facts_obj["facts"], list):
            facts_list = facts_obj["facts"]
        else:
            # flatten possible mapping
            for k,v in facts_obj.items():
                if isinstance(v, list):
                    for it in v:
                        if isinstance(it, dict):
                            it2 = dict(it)
                            if "concept" not in it2:
                                it2["concept"] = k
                            facts_list.append(it2)
    else:
        raise RuntimeError("Unrecognized shape for xbrl_facts_flat.json")

    print(f"[info] number of facts found: {len(facts_list)}")

    # build label map from xbrl_full
    print("[info] building concept label map (best-effort) ...")
    label_map = build_label_map_from_full(xbrl_full) if xbrl_full else {}
    print(f"[info] label map size: {len(label_map)}")

    # Build group-level and flat outputs, attach metadata
    print("[info] building cleaned outputs and grouping ...")
    groups, flat = build_outputs(facts_list, label_map, filing_meta)

    # write outputs
    out_group = os.path.join(OUTPUT_DIR, "xbrl_sentences_by_group_with_meta.json")
    out_flat = os.path.join(OUTPUT_DIR, "xbrl_sentences_flat_with_meta.json")
    print(f"[info] writing grouped output to: {out_group} (groups: {len(groups)})")
    with open(out_group, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "group_count": len(groups),
            "groups": groups
        }, fh, indent=2, ensure_ascii=False)

    print(f"[info] writing flat output to: {out_flat} (entries: {len(flat)})")
    with open(out_flat, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "count": len(flat),
            "facts": flat
        }, fh, indent=2, ensure_ascii=False)

    print("[ok] Done. Outputs written to:", OUTPUT_DIR)

if __name__ == "__main__":
    main()


[info] Loading inputs...
[info] loaded metadata file: /content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json
[info] facts container loaded; normalizing to list...
[info] number of facts found: 2341
[info] building concept label map (best-effort) ...
[info] label map size: 0
[info] building cleaned outputs and grouping ...
[info] writing grouped output to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_by_group_with_meta.json (groups: 222)


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


[info] writing flat output to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_flat_with_meta.json (entries: 2341)


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


[ok] Done. Outputs written to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output


In [None]:
# 3rd Method:
"""
Fixed XBRL -> sentences script (handles empty/missing labels and non-numeric values)
+ Converts ISO dates (YYYY-MM-DD) to human-readable format like 'September 25, 2021'
Outputs (in OUTPUT_DIR):
 - xbrl_sentences_by_group_with_meta.json
 - xbrl_sentences_flat_with_meta.json
Edit CONFIG paths before running.
"""

import os
import json
import re
from datetime import datetime
from collections import defaultdict
from typing import Any, Dict, List, Tuple, Optional

# ---------------------- CONFIG ----------------------
INPUT_FACTS_FLAT = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_facts_flat.json"  # path to xbrl_facts_flat.json
INPUT_FULL = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_full.json"              # path to xbrl_full.json (optional but recommended)
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"  # optional; if present, will be attached
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output"       # where outputs will be saved
# ----------------------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------ Helpers / Utilities ------------------

def load_json_safe(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def humanize_concept_name(concept: str) -> str:
    if not concept:
        return ""
    if ":" in concept:
        concept = concept.split(":", 1)[1]
    s = concept.replace("_", " ").replace("-", " ")
    s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", s)
    s = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    small = {"and","or","the","of","in","on","at","for","to","by","with","a","an"}
    parts = s.split()
    parts = [p.capitalize() if p.lower() not in small else p.lower() for p in parts]
    if parts:
        parts[0] = parts[0].capitalize()
    return " ".join(parts)

def format_number(value: Any) -> str:
    if value is None:
        return ""
    s = str(value).strip()
    negative = False
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1].strip()
    # Keep original if it looks clearly non-numeric (letters present)
    s_clean = s.replace(",", "").replace(" ", "")
    # allow percent sign
    if re.search(r'[A-Za-z]', s_clean) and not re.match(r'^[0-9\.\-\,\%]+$', s_clean):
        return s  # keep as-is (e.g., "Common Stock, $0.00001 par value per share")
    # attempt numeric
    try:
        # handle percentage like '0.000%'
        if s_clean.endswith("%"):
            num = float(s_clean[:-1])
            out = f"{num:.6f}".rstrip("0").rstrip(".") + "%"
        elif "." in s_clean or "e" in s_clean.lower():
            f = float(s_clean)
            if abs(f - int(f)) < 1e-9:
                out = f"{int(f):,}"
            else:
                out = f"{f:,.6f}".rstrip("0").rstrip(".")
        else:
            i = int(s_clean)
            out = f"{i:,}"
        if negative:
            out = "-" + out
        return out
    except Exception:
        return s

def canonical_unit(unit: Optional[str]) -> Optional[str]:
    if unit is None:
        return None
    u = str(unit).strip()
    if not u:
        return None
    if "usd" in u.lower() or ("iso4217" in u.lower() and "usd" in u.lower()) or u == "$":
        return "USD"
    return u.upper() if len(u) <= 10 else u

# ------------------ Date formatting helpers ------------------

def try_parse_date_str(date_str: str) -> Optional[datetime]:
    """
    Try multiple common date formats; return a datetime.date-bearing object on success.
    """
    if not date_str or not isinstance(date_str, str):
        return None
    s = date_str.strip()
    # remove timezone if present for simplistic parsing (we only need date)
    if "T" in s:
        s = s.split("T")[0]
    # try iso/dashed format and some alternatives
    patterns = [
        "%Y-%m-%d",
        "%Y%m%d",
        "%Y-%m-%dZ",
        "%Y-%m-%d%z",
        "%Y-%m-%d %H:%M:%S",
        "%Y/%m/%d",
    ]
    for p in patterns:
        try:
            return datetime.strptime(s, p)
        except Exception:
            continue
    # fallback: if string looks like YYYY-MM, create first day
    m = re.match(r"^(\d{4})-(\d{2})$", s)
    if m:
        try:
            return datetime(int(m.group(1)), int(m.group(2)), 1)
        except Exception:
            pass
    # if purely digits length 8 (YYYYMMDD) already handled, else no more tries
    return None

def format_date_human(date_str: Optional[str]) -> Optional[str]:
    """
    Convert a date-like string (YYYY-MM-DD etc.) to 'September 25, 2021' form.
    Returns None if cannot parse.
    """
    if not date_str:
        return None
    if not isinstance(date_str, str):
        date_str = str(date_str)
    dt = try_parse_date_str(date_str)
    if not dt:
        return None
    # Format e.g., "September 25, 2021" with day without zero padding
    return f"{dt.strftime('%B')} {dt.day}, {dt.year}"

def period_to_type_and_string(period_field: Any) -> Tuple[str, Optional[str]]:
    """
    Convert period field into (period_type, period_string) where period_string uses
    human-readable dates (e.g., 'September 25, 2021' or 'October 1, 2023 to September 28, 2024').
    """
    if period_field is None:
        return ("unknown", None)
    # if provided as simple string date
    if isinstance(period_field, str):
        human = format_date_human(period_field)
        return ("instant", human or period_field)
    # dictionary cases
    if isinstance(period_field, dict):
        # inline-xbrl style: {"instant": "2023-09-30"} or {"startDate":..., "endDate":...}
        if "instant" in period_field:
            human = format_date_human(period_field.get("instant"))
            return ("instant", human or period_field.get("instant"))
        if "startDate" in period_field and "endDate" in period_field:
            s = format_date_human(period_field.get("startDate")) or period_field.get("startDate")
            e = format_date_human(period_field.get("endDate")) or period_field.get("endDate")
            return ("duration", f"{s} to {e}")
        if "start" in period_field and "end" in period_field:
            s = format_date_human(period_field.get("start")) or period_field.get("start")
            e = format_date_human(period_field.get("end")) or period_field.get("end")
            return ("duration", f"{s} to {e}")
        for k in ("date","dateInstant"):
            if k in period_field:
                human = format_date_human(period_field.get(k))
                return ("instant", human or period_field.get(k))
    # fallback: try to coerce to string
    try:
        s = str(period_field)
        # try to extract a date chunk and format
        m = re.search(r"(\d{4}-\d{2}-\d{2})", s)
        if m:
            human = format_date_human(m.group(1))
            return ("unknown", human or s)
        return ("unknown", s)
    except Exception:
        return ("unknown", None)

def canonical_segment_key(segment_field: Any) -> Optional[str]:
    if not segment_field:
        return None
    if isinstance(segment_field, str):
        return segment_field
    if isinstance(segment_field, dict):
        dim = segment_field.get("dimension") or segment_field.get("axis") or segment_field.get("concept")
        val = segment_field.get("value") or segment_field.get("member") or segment_field.get("label")
        if dim and val:
            return f"{dim}={val}"
        try:
            return json.dumps(segment_field, sort_keys=True)
        except Exception:
            return str(segment_field)
    if isinstance(segment_field, list):
        parts = []
        for s in segment_field:
            parts.append(canonical_segment_key(s) or str(s))
        return "|".join(parts)
    return str(segment_field)

def strip_row_and_bracket_markers(text: str) -> str:
    if not text:
        return text
    s = text
    s = re.sub(r'\brow\s*\d+\b', '', s, flags=re.IGNORECASE)
    s = re.sub(r'\[\s*\d+\s*\]\s*:?','', s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\s+:\s+', ': ', s)
    return s.strip(" -;:.")

def pick_concept_label(concept: str, full_map: Dict[str,str]) -> str:
    if not concept:
        return ""
    if concept in full_map and full_map[concept]:
        return full_map[concept]
    if ":" in concept:
        key = concept.split(":",1)[1]
        if key in full_map and full_map[key]:
            return full_map[key]
    # avoid returning things like '[0]'
    if re.match(r'^\[\d+\]$', concept):
        return ""
    return humanize_concept_name(concept)

def is_value_numeric_like(v: Any) -> bool:
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    s = str(v).strip()
    if s == "":
        return False
    # if letters present (other than %), treat as non-numeric
    no_pct = s.replace("%","")
    s2 = re.sub(r'[,\(\)\s\$£€]', '', no_pct)
    if re.search(r'[A-Za-z]', s2):
        return False
    try:
        float(s2)
        return True
    except Exception:
        return False

# ---------------- Build label map from xbrl_full (best-effort) ----------------

def build_label_map_from_full(xbrl_full: Dict) -> Dict[str,str]:
    mapping = {}
    if not isinstance(xbrl_full, dict):
        return mapping
    # heuristics
    def deep_scan(obj, parent_key=None):
        if isinstance(obj, dict):
            for k,v in obj.items():
                if isinstance(v, dict):
                    if "label" in v and isinstance(v["label"], str):
                        mapping.setdefault(k, v["label"])
                    if "name" in v and isinstance(v["name"], str):
                        mapping.setdefault(k, v["name"])
                deep_scan(v, k)
        elif isinstance(obj, list):
            for it in obj:
                deep_scan(it, parent_key)
    deep_scan(xbrl_full)
    return mapping

# ----------------- Main grouping + cleaning -----------------

def extract_facts_list(xbrl_facts_flat_obj: Any) -> List[Dict]:
    if isinstance(xbrl_facts_flat_obj, list):
        return xbrl_facts_flat_obj
    if isinstance(xbrl_facts_flat_obj, dict):
        for poss in ("facts","items","data","factsList"):
            if poss in xbrl_facts_flat_obj and isinstance(xbrl_facts_flat_obj[poss], list):
                return xbrl_facts_flat_obj[poss]
        flat = []
        for k,v in xbrl_facts_flat_obj.items():
            if isinstance(v, list):
                for it in v:
                    if isinstance(it, dict):
                        it2 = dict(it)
                        it2.setdefault("concept", k)
                        flat.append(it2)
        return flat
    return []

def build_outputs(facts_list: List[Dict], label_map: Dict[str,str], filing_meta: Dict[str,Any]):
    groups = {}
    flat_entries = []

    def group_key(fact):
        concept = fact.get("concept") or fact.get("name") or fact.get("conceptName")
        p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw")
        p_type, p_str = period_to_type_and_string(p_field)
        unit = canonical_unit(fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref"))
        decimals = fact.get("decimals")
        seg = canonical_segment_key(fact.get("segment"))
        return (concept, p_type, p_str, unit, decimals, seg)

    for idx, fact in enumerate(facts_list):
        concept = fact.get("concept") or fact.get("name") or fact.get("conceptName") or fact.get("label")
        if not concept:
            # fallback: if original_fact holds a provenance_path with a key, use that
            concept = fact.get("original_fact", {}).get("concept") or fact.get("original_fact", {}).get("provenance_path") or "[unknown]"

        label = pick_concept_label(concept, label_map) or ""
        p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw")
        p_type, p_str = period_to_type_and_string(p_field)
        raw_value = None
        # prefer explicit keys
        if "value" in fact:
            raw_value = fact.get("value")
        elif "val" in fact:
            raw_value = fact.get("val")
        elif "amount" in fact:
            raw_value = fact.get("amount")
        else:
            # try nested original raw
            raw_value = fact.get("original_fact",{}).get("raw", {}).get("value") if fact.get("original_fact") else None

        formatted_value = format_number(raw_value) if raw_value is not None else ""
        unit_raw = fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref")
        unit = canonical_unit(unit_raw)
        seg = canonical_segment_key(fact.get("segment"))

        # strip markers in label
        label = strip_row_and_bracket_markers(label).strip()

        # Build embed_text robustly:
        embed_text = ""
        display_text = ""
        # If label exists (good canonical label), create "Label as of ... is VALUE"
        if label:
            if p_type == "instant" and p_str:
                embed_text = f"{label} as of {p_str} is {formatted_value if formatted_value else (raw_value or '')}"
            elif p_type == "duration" and p_str:
                embed_text = f"{label} for the period {p_str} is {formatted_value if formatted_value else (raw_value or '')}"
            else:
                embed_text = f"{label}: {formatted_value if formatted_value else (raw_value or '')}"
        else:
            # label missing: often the fact itself contains textual 'value' (cover page items)
            if raw_value:
                if p_type == "instant" and p_str:
                    embed_text = f"{raw_value} as of {p_str}."
                elif p_type == "duration" and p_str:
                    embed_text = f"{raw_value} for the period {p_str}."
                else:
                    embed_text = f"{raw_value}."
            else:
                # no label and no raw_value: use concept or provenance path
                concept_display = concept if concept else "Fact"
                if p_str:
                    embed_text = f"{concept_display} ({p_str})."
                else:
                    embed_text = str(concept_display) + "."

        # Final cleanup: remove duplicate spaces and ensure no leading/trailing whitespace
        embed_text = re.sub(r'\s+', ' ', str(embed_text)).strip()
        display_text = embed_text

        # build provenance
        provenance = {
            "fact_index": idx,
            "concept": concept,
            "label": label,
            "period_type": p_type,
            "period_string": p_str,
            "unit": unit,
            "decimals": fact.get("decimals"),
            "segment": seg,
            "original_fact_id": fact.get("id") or fact.get("fact_id") or None
        }

        # metadata block
        metadata_block = {
            "ticker": filing_meta.get("ticker"),
            "company_name": filing_meta.get("company_name"),
            "form_type": filing_meta.get("form_type"),
            "accession": filing_meta.get("accession"),
            "accession_nodash": filing_meta.get("accession_nodash"),
            "cik": filing_meta.get("cik"),
            "period_of_report": filing_meta.get("period_of_report"),
            "fiscal_year_end": filing_meta.get("fiscal_year_end"),
            "file_no": filing_meta.get("file_no"),
            "filed_at_utc": filing_meta.get("filed_at_utc")
        }

        flat_entry = {
            "embed_text": embed_text,
            "display_text": display_text,
            "provenance": provenance,
            "metadata": metadata_block,
            "original_fact": fact
        }
        flat_entries.append(flat_entry)

        # group
        gk = group_key(fact)
        if gk not in groups:
            groups[gk] = {
                "concept": concept,
                "label": label,
                "period_type": p_type,
                "period_string": p_str,
                "unit": unit,
                "decimals": fact.get("decimals"),
                "segment_key": seg,
                "facts": [],
            }
        groups[gk]["facts"].append(flat_entry)

    # Build group-level sentences
    group_list = []
    for gi, (gk, g) in enumerate(groups.items()):
        members = g["facts"]
        if len(members) == 1:
            group_sentence = members[0]["embed_text"]
        else:
            parts = []
            for m in members:
                parts.append(m["embed_text"].rstrip("."))
            if g["period_type"] == "instant":
                group_sentence = f"{g['label'] or g['concept']} as of {g['period_string']}: " + "; ".join(parts) + "."
            elif g["period_type"] == "duration":
                group_sentence = f"{g['label'] or g['concept']} for the period {g['period_string']}: " + "; ".join(parts) + "."
            else:
                group_sentence = f"{g['label'] or g['concept']} ({g['period_string']}): " + "; ".join(parts) + "."

        group_obj = {
            "group_id": f"group_{gi}",
            "concept": g["concept"],
            "label": g["label"],
            "period_type": g["period_type"],
            "period_string": g["period_string"],
            "unit": g["unit"],
            "decimals": g["decimals"],
            "segment_key": g["segment_key"],
            "member_count": len(members),
            "sentence": re.sub(r'\s+', ' ', group_sentence).strip(),
            "facts": g["facts"],
            "metadata": metadata_block
        }
        group_list.append(group_obj)

    group_list_sorted = sorted(group_list, key=lambda x: (str(x.get("concept")), str(x.get("period_string") or ""), str(x.get("segment_key") or "")))
    return group_list_sorted, flat_entries

# ---------------- Main runner ----------------

def main():
    print("[info] Loading inputs...")
    if not os.path.exists(INPUT_FACTS_FLAT):
        raise FileNotFoundError(f"Input facts flat file not found: {INPUT_FACTS_FLAT}")
    facts_obj = load_json_safe(INPUT_FACTS_FLAT)

    xbrl_full = {}
    if os.path.exists(INPUT_FULL):
        try:
            xbrl_full = load_json_safe(INPUT_FULL)
        except Exception:
            xbrl_full = {}

    filing_meta = {}
    if METADATA_FILE and os.path.exists(METADATA_FILE):
        try:
            filing_meta = load_json_safe(METADATA_FILE)
            print("[info] loaded metadata file:", METADATA_FILE)
        except Exception:
            filing_meta = {}
    else:
        # try small extraction from xbrl_full if possible
        if isinstance(xbrl_full, dict):
            cp = xbrl_full.get("CoverPage") or xbrl_full.get("cover") or xbrl_full.get("metadata") or {}
            if isinstance(cp, dict):
                filing_meta = {
                    "ticker": cp.get("ticker"),
                    "company_name": cp.get("companyName") or cp.get("entityName"),
                    "form_type": cp.get("formType"),
                    "accession": cp.get("accessionNumber") or cp.get("accession"),
                    "accession_nodash": (cp.get("accessionNumber") or cp.get("accession") or "").replace("-", ""),
                    "cik": cp.get("cik"),
                    "period_of_report": cp.get("periodOfReport"),
                    "fiscal_year_end": cp.get("fiscalYearEnd"),
                    "file_no": cp.get("fileNumber"),
                    "filed_at_utc": cp.get("filingDate")
                }
    meta_template_keys = ["ticker","company_name","form_type","accession","accession_nodash","cik","period_of_report","fiscal_year_end","file_no","filed_at_utc"]
    filing_meta = {k: filing_meta.get(k) if filing_meta.get(k) is not None else None for k in meta_template_keys}

    facts_list = extract_facts_list(facts_obj)
    print(f"[info] number of facts found: {len(facts_list)}")

    print("[info] building label map (best-effort) from full XBRL ...")
    label_map = build_label_map_from_full(xbrl_full) if xbrl_full else {}
    print(f"[info] label map entries: {len(label_map)}")

    print("[info] building cleaned outputs and grouping ...")
    groups, flat = build_outputs(facts_list, label_map, filing_meta)

    out_group = os.path.join(OUTPUT_DIR, "xbrl_sentences_by_group_with_meta.json")
    out_flat = os.path.join(OUTPUT_DIR, "xbrl_sentences_flat_with_meta.json")

    print(f"[info] writing grouped output to: {out_group} (groups: {len(groups)})")
    with open(out_group, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "group_count": len(groups),
            "groups": groups
        }, fh, indent=2, ensure_ascii=False)

    print(f"[info] writing flat output to: {out_flat} (entries: {len(flat)})")
    with open(out_flat, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "count": len(flat),
            "facts": flat
        }, fh, indent=2, ensure_ascii=False)

    print("[ok] Done. Outputs written to:", OUTPUT_DIR)

if __name__ == "__main__":
    main()


[info] Loading inputs...
[info] loaded metadata file: /content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json
[info] number of facts found: 2341
[info] building label map (best-effort) from full XBRL ...
[info] label map entries: 0
[info] building cleaned outputs and grouping ...
[info] writing grouped output to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_by_group_with_meta.json (groups: 222)


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


[info] writing flat output to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_flat_with_meta.json (entries: 2341)
[ok] Done. Outputs written to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


In [None]:
# 4th Method:
"""
Fixed XBRL -> sentences script (handles empty/missing labels and non-numeric values)
+ Converts ISO dates (YYYY-MM-DD) to human-readable format like 'September 25, 2021'
+ Ensures units are appended for numeric facts and humanizes segment/member values
Outputs (in OUTPUT_DIR):
 - xbrl_sentences_by_group_with_meta.json
 - xbrl_sentences_flat_with_meta.json
Edit CONFIG paths before running.
"""

import os
import json
import re
from datetime import datetime
from collections import defaultdict
from typing import Any, Dict, List, Tuple, Optional

# ---------------------- CONFIG ----------------------
INPUT_FACTS_FLAT = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_facts_flat.json"  # path to xbrl_facts_flat.json
INPUT_FULL = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_full.json"              # path to xbrl_full.json (optional but recommended)
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"  # optional; if present, will be attached
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output"       # where outputs will be saved
# ----------------------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------ Helpers / Utilities ------------------

def load_json_safe(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def humanize_concept_name(concept: str) -> str:
    if not concept:
        return ""
    if ":" in concept:
        concept = concept.split(":", 1)[1]
    s = concept.replace("_", " ").replace("-", " ")
    # Break camel-case sequences
    s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", s)
    s = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    small = {"and","or","the","of","in","on","at","for","to","by","with","a","an"}
    parts = s.split()
    parts = [p.capitalize() if p.lower() not in small else p.lower() for p in parts]
    if parts:
        parts[0] = parts[0].capitalize()
    return " ".join(parts)

def format_number(value: Any) -> str:
    """Try to produce a readable numeric string with commas/percent handling."""
    if value is None:
        return ""
    s = str(value).strip()
    negative = False
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1].strip()
    # Keep original if it looks clearly non-numeric (letters present, not percent)
    s_clean = s.replace(",", "").replace(" ", "")
    if re.search(r'[A-Za-z]', s_clean) and not re.match(r'^[0-9\.\-\,\%]+$', s_clean):
        return s  # textual facts keep original
    try:
        # percentage
        if s_clean.endswith("%"):
            num = float(s_clean[:-1])
            out = f"{num:.6f}".rstrip("0").rstrip(".") + "%"
        elif "." in s_clean or "e" in s_clean.lower():
            f = float(s_clean)
            if abs(f - int(f)) < 1e-9:
                out = f"{int(f):,}"
            else:
                out = f"{f:,.6f}".rstrip("0").rstrip(".")
        else:
            i = int(s_clean)
            out = f"{i:,}"
        if negative:
            out = "-" + out
        return out
    except Exception:
        # fallback to original string
        return s

def canonical_unit(unit: Optional[str]) -> Optional[str]:
    if unit is None:
        return None
    u = str(unit).strip()
    if not u:
        return None
    # common USD detection
    if "usd" in u.lower() or ("iso4217" in u.lower() and "usd" in u.lower()) or u == "$":
        return "USD"
    # return uppercase short form for readability
    u_upper = u.upper()
    # Trim long namespace-like values, prefer readable token
    u_upper = re.sub(r'[^A-Z0-9\%]+', '', u_upper)
    return u_upper if u_upper else None

# ------------------ Date formatting helpers ------------------

def try_parse_date_str(date_str: str) -> Optional[datetime]:
    """
    Try multiple common date formats; return a datetime.date-bearing object on success.
    """
    if not date_str or not isinstance(date_str, str):
        return None
    s = date_str.strip()
    # Cut off time part
    if "T" in s:
        s = s.split("T")[0]
    patterns = [
        "%Y-%m-%d",
        "%Y%m%d",
        "%Y-%m-%dZ",
        "%Y-%m-%d%z",
        "%Y-%m-%d %H:%M:%S",
        "%Y/%m/%d",
    ]
    for p in patterns:
        try:
            return datetime.strptime(s, p)
        except Exception:
            continue
    m = re.match(r"^(\d{4})-(\d{2})$", s)
    if m:
        try:
            return datetime(int(m.group(1)), int(m.group(2)), 1)
        except Exception:
            pass
    return None

def format_date_human(date_str: Optional[str]) -> Optional[str]:
    """
    Convert a date-like string (YYYY-MM-DD etc.) to 'September 25, 2021' form.
    Returns None if cannot parse.
    """
    if not date_str:
        return None
    if not isinstance(date_str, str):
        date_str = str(date_str)
    dt = try_parse_date_str(date_str)
    if not dt:
        return None
    # Full month name e.g., "September 25, 2021"
    return f"{dt.strftime('%B')} {dt.day}, {dt.year}"

def period_to_type_and_string(period_field: Any) -> Tuple[str, Optional[str]]:
    """
    Convert period field into (period_type, period_string) where period_string uses
    human-readable dates (e.g., 'September 25, 2021' or 'October 1, 2023 to September 28, 2024').
    """
    if period_field is None:
        return ("unknown", None)
    if isinstance(period_field, str):
        human = format_date_human(period_field)
        return ("instant", human or period_field)
    if isinstance(period_field, dict):
        if "instant" in period_field:
            human = format_date_human(period_field.get("instant"))
            return ("instant", human or period_field.get("instant"))
        if "startDate" in period_field and "endDate" in period_field:
            s = format_date_human(period_field.get("startDate")) or period_field.get("startDate")
            e = format_date_human(period_field.get("endDate")) or period_field.get("endDate")
            return ("duration", f"{s} to {e}")
        if "start" in period_field and "end" in period_field:
            s = format_date_human(period_field.get("start")) or period_field.get("start")
            e = format_date_human(period_field.get("end")) or period_field.get("end")
            return ("duration", f"{s} to {e}")
        for k in ("date","dateInstant"):
            if k in period_field:
                human = format_date_human(period_field.get(k))
                return ("instant", human or period_field.get(k))
    try:
        s = str(period_field)
        m = re.search(r"(\d{4}-\d{2}-\d{2})", s)
        if m:
            human = format_date_human(m.group(1))
            return ("unknown", human or s)
        return ("unknown", s)
    except Exception:
        return ("unknown", None)

def canonical_segment_key(segment_field: Any) -> Optional[str]:
    if not segment_field:
        return None
    if isinstance(segment_field, str):
        return segment_field
    if isinstance(segment_field, dict):
        dim = segment_field.get("dimension") or segment_field.get("axis") or segment_field.get("concept")
        val = segment_field.get("value") or segment_field.get("member") or segment_field.get("label")
        if dim and val:
            return f"{dim}={val}"
        try:
            return json.dumps(segment_field, sort_keys=True)
        except Exception:
            return str(segment_field)
    if isinstance(segment_field, list):
        parts = []
        for s in segment_field:
            parts.append(canonical_segment_key(s) or str(s))
        return "|".join(parts)
    return str(segment_field)

def strip_row_and_bracket_markers(text: str) -> str:
    if not text:
        return text
    s = text
    s = re.sub(r'\brow\s*\d+\b', '', s, flags=re.IGNORECASE)
    s = re.sub(r'\[\s*\d+\s*\]\s*:?','', s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\s+:\s+', ': ', s)
    return s.strip(" -;:.")

def pick_concept_label(concept: str, full_map: Dict[str,str]) -> str:
    if not concept:
        return ""
    if concept in full_map and full_map[concept]:
        return full_map[concept]
    if ":" in concept:
        key = concept.split(":",1)[1]
        if key in full_map and full_map[key]:
            return full_map[key]
    if re.match(r'^\[\d+\]$', concept):
        return ""
    return humanize_concept_name(concept)

def is_value_numeric_like(v: Any) -> bool:
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    s = str(v).strip()
    if s == "":
        return False
    no_pct = s.replace("%","")
    s2 = re.sub(r'[,\(\)\s\$£€]', '', no_pct)
    if re.search(r'[A-Za-z]', s2):
        return False
    try:
        float(s2)
        return True
    except Exception:
        return False

# --------------- New helpers for better display ----------------

def prettify_segment_value(val: str) -> str:
    """Turn things like 'aapl:EuropeSegmentMember' into 'Europe' (human-readable)."""
    if not val or not isinstance(val, str):
        return str(val or "")
    s = val.strip()
    # If contains colon, take suffix
    if ":" in s:
        s = s.split(":",1)[1]
    # Remove Member/Sector/Segment suffixes often present
    s = re.sub(r'(Segment|Member|MemberId|MemberType)$', '', s, flags=re.IGNORECASE)
    # Replace underscores/dashes/camelcase
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    # Capitalize words reasonably
    parts = s.split()
    parts = [p.capitalize() for p in parts if p]
    return " ".join(parts)

def append_unit_to_value_string(value_str: str, unit: Optional[str]) -> str:
    if not value_str:
        return value_str
    if unit:
        return f"{value_str} {unit}"
    return value_str

# ---------------- Build label map from xbrl_full (best-effort) ----------------

def build_label_map_from_full(xbrl_full: Dict) -> Dict[str,str]:
    mapping = {}
    if not isinstance(xbrl_full, dict):
        return mapping
    def deep_scan(obj, parent_key=None):
        if isinstance(obj, dict):
            for k,v in obj.items():
                if isinstance(v, dict):
                    if "label" in v and isinstance(v["label"], str):
                        mapping.setdefault(k, v["label"])
                    if "name" in v and isinstance(v["name"], str):
                        mapping.setdefault(k, v["name"])
                deep_scan(v, k)
        elif isinstance(obj, list):
            for it in obj:
                deep_scan(it, parent_key)
    deep_scan(xbrl_full)
    return mapping

# ----------------- Main grouping + cleaning -----------------

def extract_facts_list(xbrl_facts_flat_obj: Any) -> List[Dict]:
    if isinstance(xbrl_facts_flat_obj, list):
        return xbrl_facts_flat_obj
    if isinstance(xbrl_facts_flat_obj, dict):
        for poss in ("facts","items","data","factsList"):
            if poss in xbrl_facts_flat_obj and isinstance(xbrl_facts_flat_obj[poss], list):
                return xbrl_facts_flat_obj[poss]
        flat = []
        for k,v in xbrl_facts_flat_obj.items():
            if isinstance(v, list):
                for it in v:
                    if isinstance(it, dict):
                        it2 = dict(it)
                        it2.setdefault("concept", k)
                        flat.append(it2)
        return flat
    return []

def build_outputs(facts_list: List[Dict], label_map: Dict[str,str], filing_meta: Dict[str,Any]):
    groups = {}
    flat_entries = []

    def group_key(fact):
        concept = fact.get("concept") or fact.get("name") or fact.get("conceptName")
        p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw")
        p_type, p_str = period_to_type_and_string(p_field)
        unit = canonical_unit(fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref"))
        decimals = fact.get("decimals")
        seg = canonical_segment_key(fact.get("segment"))
        return (concept, p_type, p_str, unit, decimals, seg)

    for idx, fact in enumerate(facts_list):
        # Determine concept and label
        concept = fact.get("concept") or fact.get("name") or fact.get("conceptName") or fact.get("label")
        if not concept:
            concept = fact.get("original_fact", {}).get("concept") or fact.get("original_fact", {}).get("provenance_path") or "[unknown]"

        # Try to find human label from label map; can be empty
        label = pick_concept_label(concept, label_map) or ""

        # Period handling
        p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw")
        p_type, p_str = period_to_type_and_string(p_field)

        # Value extraction
        raw_value = None
        if "value" in fact:
            raw_value = fact.get("value")
        elif "val" in fact:
            raw_value = fact.get("val")
        elif "amount" in fact:
            raw_value = fact.get("amount")
        else:
            raw_value = fact.get("original_fact",{}).get("raw", {}).get("value") if fact.get("original_fact") else None

        # Format numeric values
        formatted_value = format_number(raw_value) if raw_value is not None else ""
        # Unit canonicalization
        unit_raw = fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref") or fact.get("original_fact",{}).get("raw",{}).get("unitRef")
        unit = canonical_unit(unit_raw)

        # Segment info
        seg = canonical_segment_key(fact.get("segment"))

        # Clean label
        label_clean = strip_row_and_bracket_markers(label).strip()

        # Build embed_text and display_text robustly
        embed_text = ""
        display_text = ""

        # If this looks like a segment/member fact (or label is "Segment"), pretty-format the member
        is_segment_like = False
        if (label_clean and label_clean.lower() == "segment") or (str(concept).lower() == "segment") or ("segment" in str(concept).lower()) or (fact.get("original_fact",{}).get("provenance_path","").lower().endswith("/segment")):
            is_segment_like = True

        if is_segment_like and raw_value:
            seg_pretty = prettify_segment_value(str(raw_value))
            embed_text = f"Segment: {seg_pretty}"
        else:
            # Numeric-like path (prefer label if present)
            if label_clean:
                # Prepare value string with unit if numeric-like; otherwise raw text
                if formatted_value:
                    value_with_unit = append_unit_to_value_string(formatted_value, unit)
                else:
                    # textual label-value cases (e.g., cover page text)
                    value_with_unit = raw_value if raw_value is not None else ""
                if p_type == "instant" and p_str:
                    embed_text = f"{label_clean} as of {p_str} is {value_with_unit}."
                elif p_type == "duration" and p_str:
                    embed_text = f"{label_clean} for the period {p_str} is {value_with_unit}."
                else:
                    embed_text = f"{label_clean}: {value_with_unit}"
            else:
                # no label — try to be explicit and not start with raw number alone
                if formatted_value:
                    value_with_unit = append_unit_to_value_string(formatted_value, unit)
                    if p_type == "instant" and p_str:
                        embed_text = f"Amount as of {p_str} is {value_with_unit}."
                    elif p_type == "duration" and p_str:
                        embed_text = f"Amount for the period {p_str} is {value_with_unit}."
                    else:
                        embed_text = f"Amount: {value_with_unit}."
                elif raw_value:
                    # textual raw value (non-numeric) — preserve but add context if period exists
                    rv = str(raw_value)
                    if p_type == "instant" and p_str:
                        embed_text = f"{rv} as of {p_str}."
                    elif p_type == "duration" and p_str:
                        embed_text = f"{rv} for the period {p_str}."
                    else:
                        embed_text = f"{rv}."
                else:
                    # fallback to concept or provenance
                    concept_display = concept if concept else "Fact"
                    if p_str:
                        embed_text = f"{concept_display} ({p_str})."
                    else:
                        embed_text = str(concept_display) + "."

        # Final cleanup: collapse whitespace, ensure punctuation
        embed_text = re.sub(r'\s+', ' ', str(embed_text)).strip()
        # Ensure punctuation at end
        if embed_text and embed_text[-1] not in ".?!":
            embed_text = embed_text + "."

        display_text = embed_text

        # build provenance
        provenance = {
            "fact_index": idx,
            "concept": concept,
            "label": label_clean,
            "period_type": p_type,
            "period_string": p_str,
            "unit": unit,
            "decimals": fact.get("decimals"),
            "segment": seg,
            "original_fact_id": fact.get("id") or fact.get("fact_id") or None
        }

        # metadata block (keeps the same keys you requested)
        metadata_block = {
            "ticker": filing_meta.get("ticker"),
            "company_name": filing_meta.get("company_name"),
            "form_type": filing_meta.get("form_type"),
            "accession": filing_meta.get("accession"),
            "accession_nodash": filing_meta.get("accession_nodash"),
            "cik": filing_meta.get("cik"),
            "period_of_report": filing_meta.get("period_of_report"),
            "fiscal_year_end": filing_meta.get("fiscal_year_end"),
            "file_no": filing_meta.get("file_no"),
            "filed_at_utc": filing_meta.get("filed_at_utc")
        }

        flat_entry = {
            "embed_text": embed_text,
            "display_text": display_text,
            "provenance": provenance,
            "metadata": metadata_block,
            "original_fact": fact
        }
        flat_entries.append(flat_entry)

        # group by (concept, period, unit, decimals, segment)
        gk = group_key(fact)
        if gk not in groups:
            groups[gk] = {
                "concept": concept,
                "label": label_clean,
                "period_type": p_type,
                "period_string": p_str,
                "unit": unit,
                "decimals": fact.get("decimals"),
                "segment_key": seg,
                "facts": [],
            }
        groups[gk]["facts"].append(flat_entry)

    # Build group-level sentences (aggregate multiple members of the same group)
    group_list = []
    for gi, (gk, g) in enumerate(groups.items()):
        members = g["facts"]
        if len(members) == 1:
            group_sentence = members[0]["embed_text"]
        else:
            parts = []
            for m in members:
                parts.append(m["embed_text"].rstrip("."))
            if g["period_type"] == "instant":
                group_sentence = f"{g['label'] or g['concept']} as of {g['period_string']}: " + "; ".join(parts) + "."
            elif g["period_type"] == "duration":
                group_sentence = f"{g['label'] or g['concept']} for the period {g['period_string']}: " + "; ".join(parts) + "."
            else:
                ps = g.get("period_string") or ""
                group_sentence = f"{g['label'] or g['concept']} ({ps}): " + "; ".join(parts) + "."

        group_obj = {
            "group_id": f"group_{gi}",
            "concept": g["concept"],
            "label": g["label"],
            "period_type": g["period_type"],
            "period_string": g["period_string"],
            "unit": g["unit"],
            "decimals": g["decimals"],
            "segment_key": g["segment_key"],
            "member_count": len(members),
            "sentence": re.sub(r'\s+', ' ', group_sentence).strip(),
            "facts": g["facts"],
            "metadata": metadata_block
        }
        group_list.append(group_obj)

    group_list_sorted = sorted(group_list, key=lambda x: (str(x.get("concept")), str(x.get("period_string") or ""), str(x.get("segment_key") or "")))
    return group_list_sorted, flat_entries

# ---------------- Main runner ----------------

def main():
    print("[info] Loading inputs...")
    if not os.path.exists(INPUT_FACTS_FLAT):
        raise FileNotFoundError(f"Input facts flat file not found: {INPUT_FACTS_FLAT}")
    facts_obj = load_json_safe(INPUT_FACTS_FLAT)

    xbrl_full = {}
    if os.path.exists(INPUT_FULL):
        try:
            xbrl_full = load_json_safe(INPUT_FULL)
        except Exception:
            xbrl_full = {}

    filing_meta = {}
    if METADATA_FILE and os.path.exists(METADATA_FILE):
        try:
            filing_meta = load_json_safe(METADATA_FILE)
            print("[info] loaded metadata file:", METADATA_FILE)
        except Exception:
            filing_meta = {}
    else:
        if isinstance(xbrl_full, dict):
            cp = xbrl_full.get("CoverPage") or xbrl_full.get("cover") or xbrl_full.get("metadata") or {}
            if isinstance(cp, dict):
                filing_meta = {
                    "ticker": cp.get("ticker"),
                    "company_name": cp.get("companyName") or cp.get("entityName"),
                    "form_type": cp.get("formType"),
                    "accession": cp.get("accessionNumber") or cp.get("accession"),
                    "accession_nodash": (cp.get("accessionNumber") or cp.get("accession") or "").replace("-", ""),
                    "cik": cp.get("cik"),
                    "period_of_report": cp.get("periodOfReport"),
                    "fiscal_year_end": cp.get("fiscalYearEnd"),
                    "file_no": cp.get("fileNumber"),
                    "filed_at_utc": cp.get("filingDate")
                }
    meta_template_keys = ["ticker","company_name","form_type","accession","accession_nodash","cik","period_of_report","fiscal_year_end","file_no","filed_at_utc"]
    filing_meta = {k: filing_meta.get(k) if filing_meta.get(k) is not None else None for k in meta_template_keys}

    facts_list = extract_facts_list(facts_obj)
    print(f"[info] number of facts found: {len(facts_list)}")

    print("[info] building label map (best-effort) from full XBRL ...")
    label_map = build_label_map_from_full(xbrl_full) if xbrl_full else {}
    print(f"[info] label map entries: {len(label_map)}")

    print("[info] building cleaned outputs and grouping ...")
    groups, flat = build_outputs(facts_list, label_map, filing_meta)

    out_group = os.path.join(OUTPUT_DIR, "xbrl_sentences_by_group_with_meta.json")
    out_flat = os.path.join(OUTPUT_DIR, "xbrl_sentences_flat_with_meta.json")

    print(f"[info] writing grouped output to: {out_group} (groups: {len(groups)})")
    with open(out_group, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "group_count": len(groups),
            "groups": groups
        }, fh, indent=2, ensure_ascii=False)

    print(f"[info] writing flat output to: {out_flat} (entries: {len(flat)})")
    with open(out_flat, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "count": len(flat),
            "facts": flat
        }, fh, indent=2, ensure_ascii=False)

    print("[ok] Done. Outputs written to:", OUTPUT_DIR)

if __name__ == "__main__":
    main()


[info] Loading inputs...
[info] loaded metadata file: /content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json
[info] number of facts found: 2341
[info] building label map (best-effort) from full XBRL ...
[info] label map entries: 0
[info] building cleaned outputs and grouping ...
[info] writing grouped output to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_by_group_with_meta.json (groups: 222)


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


[info] writing flat output to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output/xbrl_sentences_flat_with_meta.json (entries: 2341)


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


[ok] Done. Outputs written to: /content/drive/My Drive/SEC-API/AAPL/XBRL_sentences_output


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 5th Method: using non-segment and segment semantics
# xbrl_sentences_from_provenance.py
# Colab-friendly single-file script.
#
# Edit the CONFIG paths below before running.

import os
import json
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Iterable

# ---------------------- CONFIG ----------------------
# Change these paths to the actual locations in your Drive or local filesystem.
INPUT_FACTS_FLAT = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_facts_flat.json"
INPUT_FULL = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/xbrl_full.json"         # optional (for labels)
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"  # optional
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/xbrl_sentences_output"
# ---------------------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)


# ------------------ Helpers ------------------

def load_json_safe(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def try_get(obj: dict, *keys, default=None):
    for k in keys:
        if isinstance(obj, dict) and k in obj:
            return obj[k]
    return default


# Humanize: strip namespace, split camel/underscores, titlecase with small words lowered
def humanize_concept_name(concept: Optional[str]) -> str:
    if not concept:
        return ""
    s = str(concept).strip()
    # If concept looks like label default like [0], return empty
    if re.match(r'^\[\d+\]$', s):
        return ""
    # remove namespace prefixes like us-gaap:, aapl:
    if ":" in s:
        s = s.split(":", 1)[1]
    # remove trailing words like Member if present
    s = re.sub(r'(Member|Axis|AxisMember|Member)$', '', s, flags=re.IGNORECASE).strip()
    # split camelCase + underscores + dashes
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    # title-case but keep small words lower
    small = {"and","or","the","of","in","on","at","for","to","by","with","a","an","per","as"}
    parts = [p.capitalize() if p.lower() not in small else p.lower() for p in s.split()]
    if parts:
        parts[0] = parts[0].capitalize()
    return " ".join(parts)


def strip_row_and_bracket_markers(text: str) -> str:
    if not text:
        return ""
    s = str(text)
    s = re.sub(r'\[\s*\d+\s*\]\s*:?','', s)       # remove [123] markers
    s = re.sub(r'\brow\s*\d+\b', '', s, flags=re.IGNORECASE)
    s = re.sub(r'\s+', ' ', s).strip()
    return s.strip(" -;:.")


def format_number(value: Any, decimals: Optional[Any] = None) -> str:
    if value is None:
        return ""
    s = str(value).strip()
    if s == "":
        return ""
    # If contains HTML or letters other than allowed symbols, keep as-is
    # (cover-page textual values like "Common Stock, $0.00001 par value per share")
    no_html = re.sub(r'<[^>]+>', '', s)
    if re.search(r'[A-Za-z]', no_html) and not re.search(r'^[\d\-\+\.,Ee\(\)%]+$', no_html):
        return no_html
    # handle parentheses negative like "(123)"
    negative = False
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1].strip()
    # strip spaces and commas for numeric parsing
    s_clean = s.replace(",", "").replace(" ", "")
    # handle percent
    if s_clean.endswith("%"):
        try:
            num = float(s_clean[:-1])
            out = f"{num:.6f}".rstrip("0").rstrip(".") + "%"
            return "-" + out if negative else out
        except Exception:
            return ("-" if negative else "") + s
    # attempt float convert (handles E notation)
    try:
        f = float(s_clean)
        # if basically an integer, print no decimals
        if abs(f - int(f)) < 1e-6:
            out = f"{int(round(f)):,}"
        else:
            # show up to 6 decimal places trimmed
            out = f"{f:,.6f}".rstrip("0").rstrip(".")
        return "-" + out if negative else out
    except Exception:
        return ("-" if negative else "") + s


def canonical_unit(unit: Optional[str]) -> Optional[str]:
    if unit is None:
        return None
    u = str(unit).strip()
    if not u:
        return None
    if "usd" in u.lower() or ("iso4217" in u.lower() and "usd" in u.lower()) or u == "$":
        return "USD"
    # return uppercase short unit if short
    u_clean = re.sub(r'[^A-Za-z0-9]', '', u)
    return u_clean.upper() if len(u_clean) <= 10 else u


# Date parsing + humanization with full month name (e.g., September 25, 2021)
def parse_iso_date_try(date_str: str) -> Optional[datetime]:
    if not date_str or not isinstance(date_str, str):
        return None
    s = date_str.strip()
    # cut time if present
    if "T" in s:
        s = s.split("T")[0]
    s = s.strip()
    patterns = ["%Y-%m-%d", "%Y%m%d", "%Y/%m/%d", "%d-%m-%Y", "%d/%m/%Y"]
    for p in patterns:
        try:
            return datetime.strptime(s, p)
        except Exception:
            continue
    # fallback: try to find substring like YYYY-MM-DD
    m = re.search(r'(\d{4}-\d{2}-\d{2})', s)
    if m:
        try:
            return datetime.strptime(m.group(1), "%Y-%m-%d")
        except Exception:
            pass
    return None


def format_date_full_month(date_str: Optional[str]) -> Optional[str]:
    dt = None
    if date_str is None:
        return None
    if isinstance(date_str, datetime):
        dt = date_str
    else:
        dt = parse_iso_date_try(str(date_str))
    if not dt:
        return None
    # Example: "September 25, 2021"
    # use dt.day (no leading zero)
    return f"{dt.strftime('%B')} {dt.day}, {dt.year}"


def period_to_type_and_string(period_field: Any) -> Tuple[str, Optional[str]]:
    """
    Input may be:
     - a dict with startDate/endDate or instant
     - a string like '2023-10-01 to 2024-09-28' or '2023-10-01'
     - None
    """
    if period_field is None:
        return ("unknown", None)
    if isinstance(period_field, str):
        # try to detect a range "YYYY-MM-DD/YYYY-MM-DD" or "startDate/endDate"
        s = period_field.strip()
        # separators maybe "/" or " to " or "-" (ambiguous)
        if "/" in s and re.search(r'\d{4}-\d{2}-\d{2}', s):
            parts = s.split("/")
            if len(parts) >= 2:
                start = format_date_full_month(parts[0].strip())
                end = format_date_full_month(parts[1].strip())
                if start and end:
                    return ("duration", f"{start} to {end}")
        if " to " in s and re.search(r'\d{4}-\d{2}-\d{2}', s):
            a, b = s.split(" to ", 1)
            start = format_date_full_month(a.strip())
            end = format_date_full_month(b.strip())
            if start and end:
                return ("duration", f"{start} to {end}")
        # fallback: try parse single date
        human = format_date_full_month(s)
        if human:
            return ("instant", human)
        return ("unknown", s)
    if isinstance(period_field, dict):
        if "instant" in period_field and period_field.get("instant"):
            h = format_date_full_month(period_field.get("instant"))
            return ("instant", h or period_field.get("instant"))
        if "startDate" in period_field and "endDate" in period_field:
            s = format_date_full_month(period_field.get("startDate")) or period_field.get("startDate")
            e = format_date_full_month(period_field.get("endDate")) or period_field.get("endDate")
            return ("duration", f"{s} to {e}")
        # alternative keys
        if "start" in period_field and "end" in period_field:
            s = format_date_full_month(period_field.get("start")) or period_field.get("start")
            e = format_date_full_month(period_field.get("end")) or period_field.get("end")
            return ("duration", f"{s} to {e}")
        # look for date-like strings within dict
        for k in ("date","dateInstant","period"):
            if k in period_field:
                h = format_date_full_month(period_field.get(k))
                return ("instant", h or str(period_field.get(k)))
    # fallback: stringify
    try:
        s = str(period_field)
        m = re.search(r'(\d{4}-\d{2}-\d{2})', s)
        if m:
            h = format_date_full_month(m.group(1))
            return ("unknown", h or s)
        return ("unknown", s)
    except Exception:
        return ("unknown", None)


# strip html tags (basic)
def strip_html_tags(s: str) -> str:
    if s is None:
        return ""
    s = re.sub(r'<\s*br\s*/?\s*>', '\n', s, flags=re.I)
    s = re.sub(r'<[^>]+>', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


# ---------------- Build label map from xbrl_full (best-effort) ----------------
def build_label_map_from_full(xbrl_full: Dict) -> Dict[str, str]:
    mapping: Dict[str, str] = {}
    if not isinstance(xbrl_full, dict):
        return mapping

    def deep_scan(obj: Any, key_hint: Optional[str] = None):
        if isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, dict):
                    if "label" in v and isinstance(v["label"], str):
                        mapping.setdefault(k, v["label"])
                    if "name" in v and isinstance(v["name"], str):
                        mapping.setdefault(k, v["name"])
                deep_scan(v, k)
        elif isinstance(obj, list):
            for it in obj:
                deep_scan(it, key_hint)

    deep_scan(xbrl_full)
    return mapping


# ---------------- Core logic using provenance paths ----------------

# Regex helpers
RE_INDEX = re.compile(r'\[(\d+)\]')  # find [123]
RE_SEGMENT_SUFFIX = re.compile(r'(/segment(?:/.*)?)$')  # identify /segment or /segment/[i] suffix


def parent_base_from_prov_path(prov_path: str) -> str:
    """
    For: .../SomeTableDetails/[6]                       -> return .../[6]
        .../SomeTableDetails/[6]/segment                 -> return .../[6]
        .../SomeTableDetails/[6]/segment/[1]             -> return .../[6]
    """
    if not prov_path:
        return prov_path
    # If ends with '/segment...' remove that part
    m = RE_SEGMENT_SUFFIX.search(prov_path)
    if m:
        parent = prov_path[:m.start(1)]
        return parent
    return prov_path


def last_index_from_path(prov_path: str) -> Optional[int]:
    """
    Return final bracket index if present, else None.
    """
    if not prov_path:
        return None
    m = list(RE_INDEX.finditer(prov_path))
    if not m:
        return None
    last = m[-1]
    try:
        return int(last.group(1))
    except Exception:
        return None


def is_segment_fact(fact: Dict[str, Any]) -> bool:
    # common heuristics: concept == 'segment' OR provenance_path contains '/segment'
    c = fact.get("concept") or fact.get("name") or ""
    if isinstance(c, str) and c.lower() == "segment":
        return True
    p = fact.get("provenance_path") or fact.get("provenancePath") or ""
    if isinstance(p, str) and "/segment" in p:
        return True
    # sometimes in nested original_fact.raw there is segment field - treat as segment when concept equals 'segment'
    return False


def extract_facts_list(x: Any) -> List[Dict[str, Any]]:
    # Accept either list or dict with keys like 'facts' etc.
    if isinstance(x, list):
        return x
    if isinstance(x, dict):
        for poss in ("facts", "items", "data", "factsList"):
            if poss in x and isinstance(x[poss], list):
                return x[poss]
        # otherwise build list of dict values heuristically
        collected = []
        for v in x.values():
            if isinstance(v, list):
                for it in v:
                    if isinstance(it, dict):
                        collected.append(it)
        if collected:
            return collected
    return []


def build_sentences_from_facts(facts_list: List[Dict[str, Any]], label_map: Dict[str, str], filing_meta: Dict[str, Any]):
    # Build mappings by parent base
    segments_by_parent: Dict[str, List[Dict[str, Any]]] = {}
    facts_by_parent: Dict[str, List[Dict[str, Any]]] = {}

    # Preserve original order index for provenance
    for idx, fact in enumerate(facts_list):
        # ensure provenance_path exists
        prov = fact.get("provenance_path") or fact.get("provenancePath") or fact.get("provenance") or ""
        fact["_original_index"] = idx
        fact["_provenance_path"] = prov
        parent = parent_base_from_prov_path(str(prov))
        if is_segment_fact(fact):
            segments_by_parent.setdefault(parent, []).append(fact)
        else:
            facts_by_parent.setdefault(parent, []).append(fact)

    flat_output: List[Dict[str, Any]] = []
    # For grouping, build a map keyed by (concept,label,period_string,unit,parent)
    groups_map: Dict[Tuple[str,str,str,str,str], List[Dict[str, Any]]] = {}

    # iterate parents in original order to keep natural order; take keys from facts_by_parent
    for parent in facts_by_parent.keys():
        parent_facts = facts_by_parent.get(parent, [])
        parent_segments = segments_by_parent.get(parent, [])

        # Helper: find segment(s) matching fact by index if possible
        # Build index->segment list mapping
        segs_index_map: Dict[Optional[int], List[Dict[str, Any]]] = {}
        for s in parent_segments:
            s_path = s.get("provenance_path") or s.get("_provenance_path") or ""
            s_idx = last_index_from_path(str(s_path))
            segs_index_map.setdefault(s_idx, []).append(s)

        # If segment count equals fact count and both have indices missing, we may want position mapping
        use_position_fallback = False
        # Check if none of parent_segments have explicit indices but counts equal -> pair by position
        if parent_segments and all(last_index_from_path(str(s.get("provenance_path") or "")) is None for s in parent_segments):
            if len(parent_segments) == len(parent_facts):
                use_position_fallback = True

        for pf_idx, fact in enumerate(parent_facts):
            orig_idx = fact.get("_original_index")
            fact_prov = fact.get("_provenance_path") or ""
            fact_index = last_index_from_path(str(fact_prov))

            # determine label/concept text
            concept = fact.get("concept") or fact.get("name") or fact.get("conceptName") or fact.get("label") or ""
            label = ""
            # prefer label_map lookup by concept key or by stripped key
            if isinstance(concept, str) and concept in label_map and label_map[concept]:
                label = strip_row_and_bracket_markers(label_map[concept])
            else:
                # try removing namespace
                label = pick_label_from_fact_or_map(concept, fact, label_map)

            label = label or strip_row_and_bracket_markers(fact.get("label") or fact.get("title") or "")
            if not label:
                label = humanize_concept_name(concept)

            # period detection
            p_field = fact.get("period") or fact.get("context") or fact.get("periods") or fact.get("period_raw") or fact.get("original_fact",{}).get("raw",{}).get("period")
            period_type, period_string = period_to_type_and_string(p_field)

            # raw value resolution
            raw_value = None
            if "value" in fact:
                raw_value = fact.get("value")
            elif "val" in fact:
                raw_value = fact.get("val")
            elif "amount" in fact:
                raw_value = fact.get("amount")
            else:
                raw_value = fact.get("original_fact", {}).get("raw", {}).get("value")

            # format number or text and unit
            decimals = fact.get("decimals") or fact.get("original_fact",{}).get("raw",{}).get("decimals")
            unit_raw = fact.get("unit") or fact.get("unitRef") or fact.get("unit_ref") or fact.get("original_fact",{}).get("raw",{}).get("unitRef") or fact.get("original_fact",{}).get("unit")
            unit = canonical_unit(unit_raw)

            formatted_value = format_number(raw_value, decimals) if raw_value is not None else ""

            # find segment(s) for this fact
            matched_segments: List[Dict[str,Any]] = []

            # 1) exact index match
            if fact_index is not None and fact_index in segs_index_map:
                matched_segments = segs_index_map[fact_index]
            # 2) if only one segment exists and fact_index is None -> use it
            elif fact_index is None and len(parent_segments) == 1:
                matched_segments = parent_segments.copy()
            # 3) if position fallback
            elif use_position_fallback:
                # pair by position
                if pf_idx < len(parent_segments):
                    matched_segments = [parent_segments[pf_idx]]
            # 4) if no matched but there are segments, and only one seg with same parent -> use it
            elif not matched_segments and parent_segments and len(parent_segments) == 1:
                matched_segments = parent_segments.copy()
            # 5) else maybe multiple segments apply to this fact (e.g., multiple axis) -> pick all
            elif parent_segments:
                # if length==len(parent_facts) and fact_index is None, pair by order index as fallback
                if len(parent_segments) == len(parent_facts) and fact_index is None:
                    if pf_idx < len(parent_segments):
                        matched_segments = [parent_segments[pf_idx]]
                else:
                    # fallback to an empty list (meaning no segment applied explicitly)
                    matched_segments = []

            # Build the humanized segment text
            seg_texts = []
            seg_key_parts = []
            for sfact in matched_segments:
                # segment value may be in sfact.value or in original_fact.raw.value or sfact.get('value')
                sval = sfact.get("value") or sfact.get("original_fact", {}).get("raw", {}).get("value")
                if sval is None:
                    continue
                sval = str(sval).strip()
                # humanize: remove namespace and Member suffix if present
                if ":" in sval:
                    sval_h = sval.split(":", 1)[1]
                else:
                    sval_h = sval
                sval_h = re.sub(r'(Member|Axis|AxisMember)$', '', sval_h, flags=re.IGNORECASE).strip()
                # replace camel/underscores etc.
                sval_h = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', sval_h)
                sval_h = sval_h.replace("_", " ").replace("-", " ")
                sval_h = re.sub(r'\s+', ' ', sval_h).strip()
                if sval_h:
                    seg_texts.append(sval_h)
                    seg_key_parts.append(sval)

            # Compose the text sentence
            # If label is empty but raw_value text present (cover pages), create sentence from raw_value
            if label:
                # choose natural order: "<segment> <label> for the period <period> is <value> <unit>."
                prefix = ""
                if seg_texts:
                    prefix = " ".join(seg_texts)
                # build the core part
                # If the label already starts with the segment word (rare), don't duplicate
                # Lowercase the label for natural flow if prefix present? Usually label is Title-case; keep it.
                if period_string:
                    core = f"{label} for the period {period_string}"
                else:
                    core = f"{label}"
                # If there is a prefix that is not already contained, put it before core
                if prefix:
                    sentence_main = f"{prefix} {core} is {formatted_value}"
                else:
                    sentence_main = f"{core} is {formatted_value}"
                if unit:
                    sentence_main = f"{sentence_main} {unit}"
                sentence_main = sentence_main.strip()
                if not sentence_main.endswith("."):
                    sentence_main = sentence_main + "."
                embed_text = sentence_main
                display_text = sentence_main
            else:
                # label missing: attempt to form from raw_value and period and maybe segment
                # e.g., "<raw_value> for the period <period>."
                if formatted_value:
                    if period_string:
                        embed_text = f"{formatted_value} for the period {period_string}."
                    else:
                        embed_text = f"{formatted_value}."
                else:
                    # fallback: show concept/provenance
                    cdisplay = concept if concept else parent
                    if period_string:
                        embed_text = f"{cdisplay} ({period_string})."
                    else:
                        embed_text = f"{cdisplay}."
                display_text = embed_text

            # Clean up embed_text: remove double spaces
            embed_text = re.sub(r'\s+', ' ', str(embed_text)).strip()
            display_text = re.sub(r'\s+', ' ', str(display_text)).strip()

            # provenance block
            provenance = {
                "fact_index": orig_idx,
                "concept": concept,
                "label": label,
                "period_type": period_type,
                "period_string": period_string,
                "unit": unit,
                "decimals": decimals,
                "segment": seg_key_parts if seg_key_parts else None,
                "original_provenance_path": fact_prov
            }

            # metadata block (safe get from filing_meta)
            meta_block = {
                "ticker": filing_meta.get("ticker"),
                "company_name": filing_meta.get("company_name"),
                "form_type": filing_meta.get("form_type"),
                "accession": filing_meta.get("accession"),
                "accession_nodash": filing_meta.get("accession_nodash"),
                "cik": filing_meta.get("cik"),
                "period_of_report": filing_meta.get("period_of_report"),
                "fiscal_year_end": filing_meta.get("fiscal_year_end"),
                "file_no": filing_meta.get("file_no"),
                "filed_at_utc": filing_meta.get("filed_at_utc")
            }

            entry = {
                "embed_text": embed_text,
                "display_text": display_text,
                "provenance": provenance,
                "metadata": meta_block,
                "original_fact": fact
            }
            flat_output.append(entry)

            # group key generation
            group_key = (
                concept or "",
                label or "",
                period_string or "",
                unit or "",
                parent or ""
            )
            groups_map.setdefault(group_key, []).append(entry)

    # Build grouped sentences
    group_list = []
    gi = 0
    for (concept, label, period_string, unit, parent), members in groups_map.items():
        # if only one member -> sentence is member embed_text
        if len(members) == 1:
            group_sentence = members[0]["embed_text"]
        else:
            # Build parts: for each member, try to extract segment(s) and value
            parts = []
            for m in members:
                prov = m.get("provenance", {})
                segs = prov.get("segment")
                # original_fact value
                orig_fact = m.get("original_fact", {})
                val = orig_fact.get("value") or orig_fact.get("raw", {}).get("value") or ""
                formatted = format_number(val, orig_fact.get("decimals") or prov.get("decimals"))
                seg_text = ""
                if segs:
                    # segs may be list of original segment strings; humanize them
                    if isinstance(segs, list):
                        segs_h = []
                        for s in segs:
                            s_ = str(s)
                            if ":" in s_:
                                s_h = s_.split(":",1)[1]
                            else:
                                s_h = s_
                            s_h = re.sub(r'(Member|Axis|AxisMember)$', '', s_h, flags=re.IGNORECASE).strip()
                            s_h = s_h.replace("_", " ").replace("-", " ")
                            s_h = re.sub(r'\s+', ' ', s_h).strip()
                            segs_h.append(s_h)
                        seg_text = " ".join(segs_h)
                    else:
                        s_ = str(segs)
                        seg_text = s_
                if seg_text:
                    part = f"{seg_text} — {formatted} {unit or ''}".strip()
                else:
                    part = f"{formatted} {unit or ''}".strip()
                parts.append(part)
            # join parts with '; '
            if label:
                prefix = label
            else:
                prefix = concept or parent
            if period_string:
                group_sentence = f"{prefix} for the period {period_string}: " + "; ".join(parts) + "."
            else:
                group_sentence = f"{prefix}: " + "; ".join(parts) + "."
        group_obj = {
            "group_id": f"group_{gi}",
            "concept": concept,
            "label": label,
            "period_string": period_string,
            "unit": unit,
            "member_count": len(members),
            "sentence": re.sub(r'\s+', ' ', group_sentence).strip(),
            "facts": members,
            "metadata": members[0]["metadata"] if members else {}
        }
        group_list.append(group_obj)
        gi += 1

    return flat_output, group_list


# small helper used earlier to pick label from fact or label_map
def pick_label_from_fact_or_map(concept_key: str, fact: Dict[str,Any], label_map: Dict[str,str]) -> str:
    # Try several keys
    if not concept_key:
        return ""
    # direct map
    if concept_key in label_map and label_map[concept_key]:
        return strip_row_and_bracket_markers(label_map[concept_key])
    # if concept contains colon, try second part
    if ":" in concept_key:
        short = concept_key.split(":",1)[1]
        if short in label_map and label_map[short]:
            return strip_row_and_bracket_markers(label_map[short])
        return humanize_concept_name(short)
    # fallback: if fact contains 'label' or 'title'
    lab = fact.get("label") or fact.get("title")
    if lab:
        return strip_row_and_bracket_markers(lab)
    # else humanize concept_key
    return humanize_concept_name(concept_key)


# ------------------ Runner ------------------

def main():
    print("[info] Loading inputs...")
    if not os.path.exists(INPUT_FACTS_FLAT):
        raise FileNotFoundError(f"Input facts flat file not found: {INPUT_FACTS_FLAT}")
    facts_obj = load_json_safe(INPUT_FACTS_FLAT)
    facts_list = extract_facts_list(facts_obj)
    print(f"[info] loaded {len(facts_list)} facts (flat).")

    xbrl_full = {}
    if os.path.exists(INPUT_FULL):
        try:
            xbrl_full = load_json_safe(INPUT_FULL)
            print("[info] loaded xbrl_full (for labels).")
        except Exception:
            xbrl_full = {}

    # load filing metadata if present
    filing_meta = {}
    if METADATA_FILE and os.path.exists(METADATA_FILE):
        try:
            filing_meta = load_json_safe(METADATA_FILE)
            print("[info] loaded metadata file:", METADATA_FILE)
        except Exception:
            filing_meta = {}
    else:
        # try to get small meta from xbrl_full if possible
        if isinstance(xbrl_full, dict):
            # try common locations
            cp = xbrl_full.get("CoverPage") or xbrl_full.get("cover") or xbrl_full.get("metadata") or {}
            if isinstance(cp, dict):
                filing_meta = {
                    "ticker": cp.get("ticker"),
                    "company_name": cp.get("companyName") or cp.get("entityName"),
                    "form_type": cp.get("formType"),
                    "accession": cp.get("accessionNumber") or cp.get("accession"),
                    "accession_nodash": (cp.get("accessionNumber") or cp.get("accession") or "").replace("-", ""),
                    "cik": cp.get("cik"),
                    "period_of_report": cp.get("periodOfReport"),
                    "fiscal_year_end": cp.get("fiscalYearEnd"),
                    "file_no": cp.get("fileNumber"),
                    "filed_at_utc": cp.get("filingDate")
                }

    # standard meta keys (even if None)
    meta_template_keys = ["ticker","company_name","form_type","accession","accession_nodash","cik","period_of_report","fiscal_year_end","file_no","filed_at_utc"]
    filing_meta = {k: filing_meta.get(k) if filing_meta.get(k) is not None else None for k in meta_template_keys}

    print("[info] building label map (best-effort) from full XBRL ...")
    label_map = build_label_map_from_full(xbrl_full) if xbrl_full else {}
    print(f"[info] label map entries: {len(label_map)}")

    print("[info] building cleaned outputs and grouping ...")
    flat, groups = build_sentences_from_facts(facts_list, label_map, filing_meta)

    out_flat = os.path.join(OUTPUT_DIR, "xbrl_sentences_flat_with_meta_4.json")
    out_group = os.path.join(OUTPUT_DIR, "xbrl_sentences_by_group_with_meta_4.json")

    print(f"[info] writing flat output to: {out_flat} (entries: {len(flat)})")
    with open(out_flat, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "count": len(flat),
            "facts": flat
        }, fh, indent=2, ensure_ascii=False)

    print(f"[info] writing grouped output to: {out_group} (groups: {len(groups)})")
    with open(out_group, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "group_count": len(groups),
            "groups": groups
        }, fh, indent=2, ensure_ascii=False)

    print("[ok] Done. Outputs written to:", OUTPUT_DIR)
    print("[summary] flat facts:", len(flat), "groups:", len(groups))


if __name__ == "__main__":
    main()


[info] Loading inputs...
[info] loaded 2341 facts (flat).
[info] loaded xbrl_full (for labels).
[info] loaded metadata file: /content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json
[info] building label map (best-effort) from full XBRL ...
[info] label map entries: 0
[info] building cleaned outputs and grouping ...
[info] writing flat output to: /content/drive/My Drive/SEC-API/AAPL/xbrl_sentences_output/xbrl_sentences_flat_with_meta_4.json (entries: 1351)


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",
  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


[info] writing grouped output to: /content/drive/My Drive/SEC-API/AAPL/xbrl_sentences_output/xbrl_sentences_by_group_with_meta_4.json (groups: 1351)
[ok] Done. Outputs written to: /content/drive/My Drive/SEC-API/AAPL/xbrl_sentences_output
[summary] flat facts: 1351 groups: 1351


In [None]:
# Jan-style robust XBRL pipeline with fallback to inline-XBRL parsing
# Colab-friendly: installs minimal deps and falls back to parsing HTML ix:* tags
#
# Configure:
#   API_KEY = "<your_sec_api_key>"  # optional
#   FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"
#   OUTPUT_DIR = "./xbrl_jan_output"
#
# Outputs:
#   - xbrl_raw_source.json  (if sec-api provided JSON)
#   - xbrl_facts_flat.json
#   - xbrl_facts_flat.csv
#
import os, sys, json, time, re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
from collections import defaultdict

# ---------------- CONFIG ----------------
API_KEY = "327f556515bcbb181578971950353ecb41012af4b728189fe6d024bee61b6c8c"  # optional, provide your sec-api key if you have it (not mandatory)
FILING_URL = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"
OUTPUT_DIR = "./xbrl_jan_output"
# ----------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------- deps ------------------
def ensure_package(pkg: str):
    try:
        __import__(pkg)
    except Exception:
        import subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

# minimal deps: requests, beautifulsoup4, lxml, pandas (pandas optional)
ensure_package("requests")
ensure_package("beautifulsoup4")
ensure_package("lxml")
ensure_package("pandas")
# sec_api optionally
try:
    import sec_api  # noqa: F401
except Exception:
    # don't force-install sec-api - some users cannot access it; we'll attempt but it's optional
    pass

import requests
from bs4 import BeautifulSoup
import pandas as pd

# -------------- helpers ------------------
def save_json(obj: Any, path: str):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def load_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def try_sec_api_client_xbrl(api_key: str, filing_url: str) -> Optional[dict]:
    """
    Try multiple method names from sec_api.XbrlApi instance.
    Returns dict if successful, else None.
    """
    try:
        from sec_api import XbrlApi  # type: ignore
    except Exception:
        return None
    try:
        x = XbrlApi(api_key=api_key) if api_key else XbrlApi()
    except Exception:
        # Could fail if constructor signature different
        try:
            x = XbrlApi()
        except Exception:
            return None
    # candidate method names (best-effort)
    candidate_methods = [
        "get_xbrl_from_url", "get_xbrl_for_url", "get_xbrl", "xbrl_for_url",
        "get_xbrl_as_json", "get_xbrl_json", "get_xbrl_by_url", "get_xbrl_from_filing"
    ]
    for mn in candidate_methods:
        if hasattr(x, mn):
            try:
                method = getattr(x, mn)
                try:
                    res = method(filing_url)
                except TypeError:
                    # maybe signature expects url + token
                    res = method(filing_url, api_key) if api_key else method(filing_url)
                if isinstance(res, dict):
                    return res
            except Exception:
                continue
    # try any callable with 'url' in name
    for name in dir(x):
        attr = getattr(x, name)
        if callable(attr) and 'url' in name.lower():
            try:
                res = attr(filing_url)
                if isinstance(res, dict):
                    return res
            except Exception:
                pass
    return None

def try_sec_api_http(api_key: str, filing_url: str) -> Optional[dict]:
    """
    Try the sec-api REST endpoint for XBRL. This may fail with 404 if the endpoint is not available.
    Returns JSON dict if success, else None.
    """
    # Known historic endpoint - may not be available for all keys
    base = "https://api.sec-api.io/xbrl"
    try:
        resp = requests.get(base, params={"url": filing_url, "token": api_key}, timeout=60)
        if resp.status_code == 200:
            return resp.json()
        # sometimes API expects header Authorization
        headers = {"Authorization": api_key}
        resp2 = requests.get("https://api.sec-api.io/xbrl", params={"url": filing_url}, headers=headers, timeout=60)
        if resp2.status_code == 200:
            return resp2.json()
    except Exception:
        pass
    return None

# Inline XBRL parsing from filing HTML
def parse_inline_xbrl_from_html(html_text: str) -> List[Dict]:
    """
    Parse inline XBRL (ix:nonNumeric, ix:nonFraction, ix:nonNumeric, ix:nonFraction)
    and return a list of facts dicts with keys:
      - concept (string, e.g., us-gaap:Revenue)
      - value (text)
      - unit (unitRef attribute if present)
      - context (contextRef attribute)
      - decimals (if present)
      - provenance_path (constructed from element tag path)
      - original (raw data)
    This is a best-effort extraction. For more advanced coverage use an XBRL parser on the instance document.
    """
    soup = BeautifulSoup(html_text, "lxml")
    # Inline XBRL uses the ix namespace, elements could be prefixed 'ix:'.
    # We'll find tags by full local-name match for common ix elements.
    facts = []
    # list of ix element local names that typically carry facts
    ix_fact_tags = {"nonfraction","nonNumeric","nonFraction","fraction","nonfraction"}  # include variants
    # iterate all tags and pick ix:* elements by checking prefix or namespace
    # BeautifulSoup doesn't expose namespace easily; rely on tag.name containing ':' typically
    all_tags = soup.find_all()
    for el in all_tags:
        try:
            tag_name = el.name or ""
        except Exception:
            tag_name = ""
        if not tag_name:
            continue
        # Cases:
        #  - 'ix:nonNumeric', 'ix:nonFraction', 'ix:nonFraction'
        #  - or sometimes without prefix but typical for inline xbrl it's prefixed
        parts = tag_name.split(":")
        local = parts[-1].lower()
        if local not in ix_fact_tags:
            continue
        # get name attribute (the concept)
        concept = el.get("name") or el.get("concept") or el.get("fieldName") or el.get("axis") or None
        # text value
        text_val = (el.get_text(" ", strip=True) or "").strip()
        # attributes
        ctx = el.get("contextRef") or el.get("context") or el.get("contextref")
        unit = el.get("unitRef") or el.get("unit") or el.get("unitref")
        dec = el.get("decimals") or el.get("precision")
        # provenance: build simplified path using element's parent tags up to body
        # join tag names and optionally indices for repeated siblings
        node = el
        path_parts = []
        while node and getattr(node, "name", None):
            parent = node.parent
            # find index of node among siblings with same name
            if parent and getattr(parent, "contents", None):
                same = [c for c in parent.contents if getattr(c, "name", None) == getattr(node, "name", None)]
                if len(same) > 1:
                    idx = same.index(node)
                    path_parts.append(f"{node.name}[{idx}]")
                else:
                    path_parts.append(node.name)
            else:
                path_parts.append(node.name)
            node = parent
        path_parts.reverse()
        prov_path = "/".join(path_parts)
        fact = {
            "concept": concept,
            "value": text_val,
            "unit": unit,
            "decimals": dec,
            "period": None,
            "contextRef": ctx,
            "provenance_path": prov_path,
            "tag": tag_name,
            "original_element_attrs": dict(el.attrs)
        }
        facts.append(fact)
    return facts

# helper to fetch filing HTML
def fetch_filing_html(url: str) -> Optional[str]:
    try:
        headers = {
            "User-Agent": "xbrl-extraction-script/1.0 (contact: none)"
        }
        r = requests.get(url, headers=headers, timeout=30)
        if r.status_code == 200:
            return r.text
    except Exception:
        pass
    return None

# canonicalize unit
def canonical_unit(u):
    if not u:
        return None
    s = str(u).strip().lower()
    if "usd" in s or s == "$":
        return "USD"
    return u

# normalize raw results into flat facts (Jan-style simplified)
def normalize_flat_facts(raw_facts: List[Dict]) -> List[Dict]:
    out = []
    for f in raw_facts:
        concept = f.get("concept") or f.get("tag") or ""
        label = concept
        value = f.get("value")
        unit = canonical_unit(f.get("unit"))
        decimals = f.get("decimals")
        period = f.get("period") or f.get("contextRef") or None
        prov = f.get("provenance_path") or None
        entry = {
            "concept": concept,
            "label": label,
            "value": value,
            "unit": unit,
            "decimals": decimals,
            "period": period,
            "provenance_path": prov,
            "original": f
        }
        out.append(entry)
    return out

# high-level runner
def run_pipeline(api_key: str, filing_url: str, out_dir: str):
    # 1) try sec_api client
    xbrl_json = None
    if api_key:
        print("[info] trying sec_api client (if installed)...")
        try:
            res = try_sec_api_client_xbrl(api_key, filing_url)
            if res:
                print("[ok] got xbrl JSON from sec_api client")
                xbrl_json = res
        except Exception as e:
            print("[warn] sec_api client attempt failed:", e)
    else:
        # try even without key (some sec_api installs don't require token)
        try:
            res = try_sec_api_client_xbrl("", filing_url)
            if res:
                print("[ok] got xbrl JSON from sec_api client (no key)")
                xbrl_json = res
        except Exception:
            pass

    # 2) try sec-api HTTP endpoint (best-effort)
    if xbrl_json is None and api_key:
        print("[info] trying sec-api HTTP endpoint (best-effort)...")
        try:
            res = try_sec_api_http(api_key, filing_url)
            if res:
                print("[ok] got xbrl JSON via sec-api HTTP endpoint")
                xbrl_json = res
        except Exception as e:
            print("[warn] sec-api HTTP attempt failed:", e)

    # 3) fallback: fetch filing HTML and parse inline XBRL
    inline_parsed_facts = None
    if xbrl_json is None:
        print("[info] falling back to inline-XBRL parsing of filing HTML ...")
        html = fetch_filing_html(filing_url)
        if not html:
            raise RuntimeError("Failed to download filing HTML for fallback parsing. Check filing URL/network.")
        # Save raw HTML
        html_path = os.path.join(out_dir, "filing_raw.html")
        with open(html_path, "w", encoding="utf-8") as fh:
            fh.write(html)
        print("[ok] saved raw filing HTML to:", html_path)
        # parse inline XBRL
        facts = parse_inline_xbrl_from_html(html)
        print(f"[info] parsed {len(facts)} inline-XBRL fact-like elements from HTML")
        inline_parsed_facts = facts

    # If we obtained an xbrl_json from sec_api, try to extract facts if possible
    flat_facts = []
    raw_source_path = None
    if xbrl_json:
        # Save raw source returned by sec-api client/API for inspection
        raw_source_path = os.path.join(out_dir, "xbrl_raw_source.json")
        save_json(xbrl_json, raw_source_path)
        # Try to extract facts list from many possible shapes
        # this mirrors Jan's "extract_facts_list_from_xbrl" logic
        facts_candidates = []
        if isinstance(xbrl_json, dict):
            for key in ("facts","data","items","factsList","itemsList"):
                if key in xbrl_json and isinstance(xbrl_json[key], list):
                    facts_candidates = xbrl_json[key]
                    break
            # sometimes nested under 'xbrl'
            if not facts_candidates and "xbrl" in xbrl_json and isinstance(xbrl_json["xbrl"], dict):
                for key in ("facts","data","items"):
                    if key in xbrl_json["xbrl"] and isinstance(xbrl_json["xbrl"][key], list):
                        facts_candidates = xbrl_json["xbrl"][key]
                        break
        if not facts_candidates:
            # deep-scan to find first list of dicts having 'concept' or 'value'
            def deep_scan(obj):
                if isinstance(obj, dict):
                    for v in obj.values():
                        res = deep_scan(v)
                        if res:
                            return res
                elif isinstance(obj, list):
                    if len(obj)>0 and isinstance(obj[0], dict) and ("concept" in obj[0] or "value" in obj[0] or "provenance_path" in obj[0]):
                        return obj
                    for it in obj:
                        res = deep_scan(it)
                        if res:
                            return res
                return None
            facts_candidates = deep_scan(xbrl_json) or []
        # normalize candidates (look for keys used by various providers)
        for f in facts_candidates:
            if not isinstance(f, dict):
                continue
            concept = f.get("concept") or f.get("name") or f.get("conceptName") or f.get("label")
            # get value
            value = None
            for vc in ("value","val","amount","text"):
                if vc in f:
                    value = f.get(vc)
                    break
            # unit
            unit = f.get("unit") or f.get("unitRef") or f.get("currency")
            decimals = f.get("decimals") or f.get("dec")
            period = f.get("period") or f.get("context")
            provenance = f.get("provenance_path") or f.get("path") or f.get("provenance") or f.get("provenancePath")
            entry = {
                "concept": concept,
                "label": None,
                "value": value,
                "unit": canonical_unit(unit),
                "decimals": decimals,
                "period": period,
                "provenance_path": provenance,
                "original": f
            }
            flat_facts.append(entry)
        print(f"[info] extracted {len(flat_facts)} facts from sec-api JSON")
    else:
        # use inline parsed facts
        flat_facts = normalize_flat_facts(inline_parsed_facts or [])
        print(f"[info] normalized {len(flat_facts)} inline facts")

    # Save flat facts JSON & CSV
    flat_json_path = os.path.join(out_dir, "xbrl_facts_flat.json")
    save_json(flat_facts, flat_json_path)
    print("[ok] saved flat facts JSON to:", flat_json_path)

    # CSV export
    rows = []
    for i,f in enumerate(flat_facts):
        rows.append({
            "idx": i,
            "concept": f.get("concept"),
            "label": f.get("label"),
            "value": f.get("value"),
            "unit": f.get("unit"),
            "decimals": f.get("decimals"),
            "period": f.get("period"),
            "provenance_path": f.get("provenance_path")
        })
    df = pd.DataFrame(rows)
    csv_path = os.path.join(out_dir, "xbrl_facts_flat.csv")
    df.to_csv(csv_path, index=False)
    print("[ok] saved flat CSV to:", csv_path)

    return {
        "raw_source": raw_source_path,
        "flat_json": flat_json_path,
        "flat_csv": csv_path
    }

# -------------- run --------------
if __name__ == "__main__":
    print("Starting Jan-style robust XBRL pipeline ...")
    t0 = time.time()
    try:
        out = run_pipeline(API_KEY.strip(), FILING_URL, OUTPUT_DIR)
        t1 = time.time()
        print("Done. Outputs written to:", out)
        print("Elapsed seconds:", round(t1 - t0, 2))
    except Exception as e:
        print("Error during pipeline:", e)
        raise


Starting Jan-style robust XBRL pipeline ...
[info] trying sec_api client (if installed)...
[info] trying sec-api HTTP endpoint (best-effort)...
[info] falling back to inline-XBRL parsing of filing HTML ...
Error during pipeline: Failed to download filing HTML for fallback parsing. Check filing URL/network.


RuntimeError: Failed to download filing HTML for fallback parsing. Check filing URL/network.

In [None]:
import requests
import json
import re
from urllib.parse import urljoin, urlparse
from datetime import datetime
import pandas as pd
from typing import Dict, List, Any, Optional

class SECXBRLExtractor:
    def __init__(self, sec_api_key: str = None):
        """
        Initialize the SEC XBRL Facts Extractor

        Args:
            sec_api_key: Optional SEC-API key for sec-api.io service
        """
        self.sec_api_key = sec_api_key
        self.use_sec_api_service = sec_api_key is not None

        if self.use_sec_api_service:
            self.base_headers = {
                'Authorization': f'Bearer {sec_api_key}',
                'Accept': 'application/json',
                'User-Agent': 'SEC-API-Client'
            }
            self.sec_api_base = 'https://api.sec-api.io'
        else:
            self.base_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }

    def extract_filing_metadata(self, filing_url: str) -> Dict[str, Any]:
        """
        Extract basic filing metadata from the HTML filing URL
        """
        try:
            # Add a small delay to be respectful to SEC servers
            import time
            time.sleep(1)

            response = requests.get(filing_url, headers=self.base_headers, timeout=30)
            response.raise_for_status()

            # Extract CIK, accession number, and form type from URL
            url_parts = filing_url.split('/')
            cik = url_parts[-3]  # Extract CIK from URL structure
            accession_raw = url_parts[-2]  # Raw accession number

            # Clean CIK (remove leading zeros for API calls)
            cik_clean = str(int(cik))

            # Format accession number (remove hyphens for XBRL file naming)
            accession_clean = accession_raw.replace('-', '')

            # Extract company name from HTML title if possible
            company_name = self._extract_company_name_from_html(response.text)

            return {
                'cik': cik_clean,
                'cik_padded': cik,
                'accession_number': accession_raw,
                'accession_clean': accession_clean,
                'company_name': company_name,
                'filing_url': filing_url,
                'base_url': '/'.join(url_parts[:-1]) + '/'
            }

        except Exception as e:
            print(f"Error extracting filing metadata: {e}")
            print("Attempting to extract metadata from URL structure...")

            # Fallback: extract what we can from URL structure
            try:
                url_parts = filing_url.split('/')
                cik = url_parts[-3]
                accession_raw = url_parts[-2]
                cik_clean = str(int(cik))
                accession_clean = accession_raw.replace('-', '')

                return {
                    'cik': cik_clean,
                    'cik_padded': cik,
                    'accession_number': accession_raw,
                    'accession_clean': accession_clean,
                    'company_name': 'Apple Inc.',  # Hardcoded for this specific URL
                    'filing_url': filing_url,
                    'base_url': '/'.join(url_parts[:-1]) + '/'
                }
            except:
                return {}

    def _extract_company_name_from_html(self, html_content: str) -> str:
        """Extract company name from HTML content"""
        try:
            # Look for company name in title or header
            title_match = re.search(r'<title[^>]*>([^<]+)</title>', html_content, re.IGNORECASE)
            if title_match:
                title = title_match.group(1)
                # Extract company name (usually before "10-K" or similar)
                company_match = re.search(r'^([^-]+)', title.strip())
                if company_match:
                    return company_match.group(1).strip()
            return "Unknown Company"
        except:
            return "Unknown Company"

    def get_xbrl_instance_url(self, metadata: Dict[str, Any]) -> str:
        """
        Construct the XBRL instance document URL
        """
        base_url = metadata['base_url']
        # XBRL instance files typically follow the pattern: [filename]_htm.xml
        filing_name = metadata['filing_url'].split('/')[-1].replace('.htm', '')
        xbrl_filename = f"{filing_name}_htm.xml"
        return urljoin(base_url, xbrl_filename)

    def extract_xbrl_facts(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract XBRL facts from the instance document
        """
        if self.use_sec_api_service:
            return self._extract_via_sec_api_service(metadata)

        xbrl_url = self.get_xbrl_instance_url(metadata)

        try:
            response = requests.get(xbrl_url, headers=self.base_headers)
            response.raise_for_status()

            # Parse XBRL XML content
            xbrl_content = response.text
            facts = self._parse_xbrl_content(xbrl_content, metadata)

            return {
                'metadata': metadata,
                'xbrl_url': xbrl_url,
                'extraction_timestamp': datetime.now().isoformat(),
                'facts': facts
            }

        except Exception as e:
            print(f"Error extracting XBRL facts from {xbrl_url}: {e}")
            # Fallback to SEC API companyfacts
            return self._fallback_to_companyfacts_api(metadata)

    def _extract_via_sec_api_service(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract XBRL facts using sec-api.io service
        """
        try:
            # Use sec-api.io XBRL extraction endpoint
            api_url = f"{self.sec_api_base}/xbrl-to-json"

            params = {
                'htm-url': metadata['filing_url'],
                'token': self.sec_api_key
            }

            response = requests.get(api_url, params=params, headers=self.base_headers)
            response.raise_for_status()

            xbrl_data = response.json()

            # Process sec-api.io response format
            facts = self._process_sec_api_response(xbrl_data)

            return {
                'metadata': metadata,
                'source': 'sec-api.io service',
                'extraction_timestamp': datetime.now().isoformat(),
                'facts': facts
            }

        except Exception as e:
            print(f"Error using sec-api.io service: {e}")
            # Fallback to direct extraction
            return self._fallback_to_companyfacts_api(metadata)

    def _parse_xbrl_content(self, xbrl_content: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Parse XBRL XML content and extract meaningful facts
        """
        import xml.etree.ElementTree as ET

        try:
            root = ET.fromstring(xbrl_content)
            facts = []

            # Define namespaces commonly used in SEC XBRL
            namespaces = {
                'us-gaap': 'http://fasb.org/us-gaap/2024',
                'dei': 'http://xbrl.sec.gov/dei/2024',
                'xbrli': 'http://www.xbrl.org/2003/instance',
                'xbrl': 'http://www.xbrl.org/2003/instance'
            }

            # Key financial statement concepts to prioritize
            priority_concepts = {
                'Revenues', 'Revenue', 'SalesRevenueNet',
                'NetIncomeLoss', 'NetIncome',
                'Assets', 'AssetsCurrent', 'AssetsNoncurrent',
                'Liabilities', 'LiabilitiesCurrent', 'LiabilitiesNoncurrent',
                'StockholdersEquity', 'RetainedEarnings',
                'CashAndCashEquivalentsAtCarryingValue',
                'OperatingIncomeLoss', 'GrossProfit',
                'ResearchAndDevelopmentExpense',
                'SellingGeneralAndAdministrativeExpenses'
            }

            # Extract facts from all elements
            for elem in root.iter():
                if self._is_fact_element(elem):
                    fact = self._extract_fact_details(elem, namespaces)
                    if fact and self._is_meaningful_fact(fact, priority_concepts):
                        facts.append(fact)

            # Sort facts by importance and date
            facts = sorted(facts, key=lambda x: (
                x.get('is_priority', False),
                x.get('period_end', ''),
                x.get('concept_name', '')
            ), reverse=True)

            return facts

        except Exception as e:
            print(f"Error parsing XBRL content: {e}")
            return []

    def _is_fact_element(self, elem) -> bool:
        """Check if an XML element represents an XBRL fact"""
        return (elem.text and
                elem.text.strip() and
                'contextRef' in elem.attrib and
                not elem.tag.endswith('}context') and
                not elem.tag.endswith('}unit'))

    def _extract_fact_details(self, elem, namespaces: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Extract detailed information from an XBRL fact element"""
        try:
            # Get concept name (remove namespace prefix)
            concept_local = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
            concept_namespace = elem.tag.split('}')[0].replace('{', '') if '}' in elem.tag else ''

            # Extract value and handle numeric conversion
            raw_value = elem.text.strip()

            # Try to convert to numeric value
            numeric_value = self._convert_to_numeric(raw_value)

            fact = {
                'concept_name': concept_local,
                'concept_namespace': concept_namespace,
                'raw_value': raw_value,
                'numeric_value': numeric_value,
                'context_ref': elem.get('contextRef'),
                'unit_ref': elem.get('unitRef'),
                'decimals': elem.get('decimals'),
                'scale': elem.get('scale'),
                'is_numeric': numeric_value is not None
            }

            return fact

        except Exception as e:
            return None

    def _convert_to_numeric(self, value: str) -> Optional[float]:
        """Convert string value to numeric, handling various formats"""
        try:
            # Remove common formatting
            clean_value = value.replace(',', '').replace('$', '').replace('(', '-').replace(')', '').strip()

            # Try to convert to float
            return float(clean_value)
        except:
            return None

    def _is_meaningful_fact(self, fact: Dict[str, Any], priority_concepts: set) -> bool:
        """Determine if a fact is meaningful and should be included"""
        concept_name = fact.get('concept_name', '')

        # Mark as priority if it's in our priority list
        is_priority = any(priority in concept_name for priority in priority_concepts)
        fact['is_priority'] = is_priority

        # Include if it's priority, numeric, or has meaningful text
        return (is_priority or
                fact.get('is_numeric') or
                (len(fact.get('raw_value', '')) > 5 and
                 not fact.get('raw_value', '').isdigit()))

    def _fallback_to_companyfacts_api(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Fallback method using SEC's companyfacts API
        """
        try:
            cik = metadata.get('cik_padded')
            if not cik:
                print("No CIK available for companyfacts API")
                return {'error': 'No CIK available'}

            api_url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"

            # Use more comprehensive headers for SEC API
            api_headers = {
                'User-Agent': 'Mozilla/5.0 (compatible; Research Bot 1.0; +http://www.example.com/bot)',
                'Accept': 'application/json',
                'Host': 'data.sec.gov'
            }

            print(f"Attempting to fetch from SEC API: {api_url}")
            response = requests.get(api_url, headers=api_headers, timeout=30)
            response.raise_for_status()

            company_facts = response.json()

            # Extract relevant facts from the API response
            facts = self._process_companyfacts_data(company_facts)

            return {
                'metadata': metadata,
                'source': 'SEC CompanyFacts API',
                'extraction_timestamp': datetime.now().isoformat(),
                'facts': facts
            }

        except Exception as e:
            print(f"Fallback API extraction failed: {e}")
            return {'error': str(e), 'facts': []}

    def _process_companyfacts_data(self, company_facts: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process data from SEC companyfacts API"""
        facts = []

        # Extract facts from us-gaap section
        us_gaap = company_facts.get('facts', {}).get('us-gaap', {})

        for concept, concept_data in us_gaap.items():
            description = concept_data.get('description', '')
            units = concept_data.get('units', {})

            # Process each unit type (USD, shares, etc.)
            for unit_type, values in units.items():
                # Get the most recent values
                recent_values = sorted(values, key=lambda x: x.get('end', ''), reverse=True)[:5]

                for value_data in recent_values:
                    fact = {
                        'concept_name': concept,
                        'description': description,
                        'value': value_data.get('val'),
                        'unit': unit_type,
                        'period_end': value_data.get('end'),
                        'period_start': value_data.get('start'),
                        'form': value_data.get('form'),
                        'fiscal_year': value_data.get('fy'),
                        'fiscal_period': value_data.get('fp'),
                        'is_priority': self._is_priority_concept(concept)
                    }
                    facts.append(fact)

        return facts

    def _is_priority_concept(self, concept: str) -> bool:
        """Check if a concept is considered high priority"""
        priority_keywords = [
            'Revenue', 'Income', 'Asset', 'Liabilit', 'Equity', 'Cash',
            'Profit', 'Loss', 'Expense', 'Research', 'Development'
        ]
        return any(keyword in concept for keyword in priority_keywords)

    def create_semantic_sentences(self, facts_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Convert XBRL facts into natural language sentences for embeddings
        """
        facts = facts_data.get('facts', [])
        sentences = []

        if not facts:
            print("No facts available to create sentences")
            return []

        for fact in facts:
            sentence_data = self._fact_to_sentence(fact, facts_data.get('metadata', {}))
            if sentence_data:
                sentences.append(sentence_data)

        return sentences

    def _fact_to_sentence(self, fact: Dict[str, Any], metadata: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Convert a single fact to a natural language sentence"""
        try:
            concept = fact.get('concept_name', '')
            value = fact.get('value') or fact.get('numeric_value')
            description = fact.get('description', '')
            period_end = fact.get('period_end', '')
            unit = fact.get('unit') or fact.get('unit_ref', '')
            company_name = metadata.get('company_name', 'The company')

            # Format the value appropriately
            if value is not None and isinstance(value, (int, float)):
                if 'USD' in str(unit) or '$' in str(unit):
                    if abs(value) >= 1e9:
                        formatted_value = f"${value/1e9:.2f} billion"
                    elif abs(value) >= 1e6:
                        formatted_value = f"${value/1e6:.2f} million"
                    else:
                        formatted_value = f"${value:,.0f}"
                else:
                    formatted_value = f"{value:,.0f}"
            else:
                formatted_value = str(fact.get('raw_value', value))

            # Create natural language sentence
            if period_end:
                sentence = f"As of {period_end}, {company_name} reported {self._humanize_concept(concept)} of {formatted_value}."
            else:
                sentence = f"{company_name} reported {self._humanize_concept(concept)} of {formatted_value}."

            # Add description context if available
            if description and description != concept:
                sentence += f" ({description})"

            return {
                'sentence': sentence,
                'concept': concept,
                'value': value,
                'period': period_end,
                'unit': unit,
                'is_priority': fact.get('is_priority', False),
                'metadata': {
                    'original_fact': fact,
                    'company': company_name
                }
            }

        except Exception as e:
            return None

    def _humanize_concept(self, concept: str) -> str:
        """Convert XBRL concept names to human-readable form"""
        # Add spaces before capital letters
        humanized = re.sub(r'([A-Z])', r' \1', concept).strip()

        # Handle common abbreviations and terms
        replacements = {
            'Net Income Loss': 'Net Income',
            'Assets Current': 'Current Assets',
            'Liabilities Current': 'Current Liabilities',
            'Revenue Net': 'Net Revenue',
            'Cash And Cash Equivalents': 'Cash and Cash Equivalents'
        }

        for old, new in replacements.items():
            humanized = humanized.replace(old, new)

        return humanized.lower()

    def _process_sec_api_response(self, xbrl_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process XBRL data from sec-api.io service response
        """
        facts = []

        try:
            # sec-api.io typically returns structured JSON with facts
            # The exact structure depends on their API response format

            if 'BalanceSheet' in xbrl_data:
                facts.extend(self._extract_statement_facts(xbrl_data['BalanceSheet'], 'Balance Sheet'))

            if 'IncomeStatement' in xbrl_data:
                facts.extend(self._extract_statement_facts(xbrl_data['IncomeStatement'], 'Income Statement'))

            if 'CashFlow' in xbrl_data:
                facts.extend(self._extract_statement_facts(xbrl_data['CashFlow'], 'Cash Flow Statement'))

            if 'StatementsOfIncome' in xbrl_data:
                facts.extend(self._extract_statement_facts(xbrl_data['StatementsOfIncome'], 'Income Statement'))

            # Handle other potential structures from sec-api.io
            for key, value in xbrl_data.items():
                if isinstance(value, dict) and key not in ['BalanceSheet', 'IncomeStatement', 'CashFlow', 'StatementsOfIncome']:
                    facts.extend(self._extract_statement_facts(value, key))

            return facts

        except Exception as e:
            print(f"Error processing sec-api.io response: {e}")
            return []

    def _extract_statement_facts(self, statement_data: Dict[str, Any], statement_type: str) -> List[Dict[str, Any]]:
        """
        Extract facts from a financial statement section
        """
        facts = []

        def extract_recursive(data, prefix=""):
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, dict):
                        if 'value' in value and 'concept' in value:
                            # This looks like a fact
                            fact = {
                                'concept_name': value.get('concept', key),
                                'value': value.get('value'),
                                'unit': value.get('unit', 'USD'),
                                'period': value.get('period'),
                                'statement_type': statement_type,
                                'is_priority': self._is_priority_concept(key),
                                'source': 'sec-api.io'
                            }
                            facts.append(fact)
                        else:
                            extract_recursive(value, f"{prefix}{key}_" if prefix else f"{key}_")
                    elif isinstance(value, (int, float, str)) and key not in ['concept', 'unit', 'period']:
                        # Direct value
                        fact = {
                            'concept_name': f"{prefix}{key}" if prefix else key,
                            'value': value,
                            'statement_type': statement_type,
                            'is_priority': self._is_priority_concept(key),
                            'source': 'sec-api.io'
                        }
                        facts.append(fact)

        extract_recursive(statement_data)
        return facts

    def save_to_json(self, data: Dict[str, Any], filename: str):
        """Save extracted data to JSON file"""
        with open(filename, 'w') as f:
            json.dump(data, f, indent=2, default=str)
        print(f"Data saved to {filename}")

# Usage example with API key
def extract_aapl_xbrl_facts_with_api(api_key: str = None):
    """
    Extract XBRL facts from Apple's 2024 10-K filing
    Args:
        api_key: Your sec-api.io API key (optional)
    """
    # Initialize extractor with or without API key
    extractor = SECXBRLExtractor(sec_api_key=api_key)

    # Apple 2024 10-K URL
    filing_url = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"

    if api_key:
        print("Using sec-api.io service with API key")
    else:
        print("Using free SEC endpoints (no API key required)")

    print("Extracting filing metadata...")
    metadata = extractor.extract_filing_metadata(filing_url)
    print(f"Company: {metadata.get('company_name')}")
    print(f"CIK: {metadata.get('cik')}")

    print("\nExtracting XBRL facts...")
    facts_data = extractor.extract_xbrl_facts(metadata)

    if 'error' in facts_data and not facts_data.get('facts'):
        print(f"Failed to extract facts: {facts_data.get('error')}")
        # Return minimal structure to prevent further errors
        return {
            'extraction_info': metadata,
            'facts': [],
            'semantic_sentences': [],
            'summary': {
                'total_facts': 0,
                'priority_facts': 0,
                'semantic_sentences': 0
            },
            'error': facts_data.get('error')
        }

    print(f"Extracted {len(facts_data.get('facts', []))} facts")

    print("\nCreating semantic sentences...")
    sentences = extractor.create_semantic_sentences(facts_data)

    print(f"Created {len(sentences)} semantic sentences")

    # Combine all data
    final_data = {
        'extraction_info': facts_data.get('metadata', {}),
        'facts': facts_data.get('facts', []),
        'semantic_sentences': sentences,
        'summary': {
            'total_facts': len(facts_data.get('facts', [])),
            'priority_facts': len([f for f in facts_data.get('facts', []) if f.get('is_priority')]),
            'semantic_sentences': len(sentences)
        }
    }

    # Save to JSON
    extractor.save_to_json(final_data, 'aapl_2024_xbrl_facts.json')

    return final_data

# Run the extraction
if __name__ == "__main__":
    # Install required packages first
    print("Installing required packages...")
    import subprocess
    import sys

    packages = ['requests', 'pandas']
    for package in packages:
        try:
            __import__(package)
        except ImportError:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # Option 1: Run without API key (uses free SEC endpoints)
    # print("=== RUNNING WITHOUT API KEY (FREE) ===")
    # result = extract_aapl_xbrl_facts_with_api()

    # Option 2: Run with sec-api.io API key (uncomment and add your key)
    YOUR_API_KEY = "327f556515bcbb181578971950353ecb41012af4b728189fe6d024bee61b6c8c"
    print("\n=== RUNNING WITH SEC-API.IO KEY ===")
    result = extract_aapl_xbrl_facts_with_api(api_key=YOUR_API_KEY)

    # Display sample results
    print("\n" + "="*50)
    print("SAMPLE EXTRACTED FACTS:")
    print("="*50)

    priority_sentences = [s for s in result['semantic_sentences'] if s.get('is_priority')]
    for i, sentence in enumerate(priority_sentences[:10]):  # Show top 10 priority sentences
        print(f"{i+1}. {sentence['sentence']}")

    print(f"\nTotal facts extracted: {result['summary']['total_facts']}")
    print(f"Priority facts: {result['summary']['priority_facts']}")
    print(f"Semantic sentences created: {result['summary']['semantic_sentences']}")

Installing required packages...

=== RUNNING WITH SEC-API.IO KEY ===
Using sec-api.io service with API key
Extracting filing metadata...
Error extracting filing metadata: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
Attempting to extract metadata from URL structure...
Company: Apple Inc.
CIK: 320193

Extracting XBRL facts...
Extracted 328 facts

Creating semantic sentences...
Created 328 semantic sentences
Data saved to aapl_2024_xbrl_facts.json

SAMPLE EXTRACTED FACTS:
1. Apple Inc. reported revenue from contract with customer text block of Revenue<div style="margin-top:6pt;text-align:justify"><span style="color:#000000;font-family:'Helvetica',sans-serif;font-size:9pt;font-weight:400;line-height:120%">The Company recognizes revenue at the amount to which it expects to be entitled when control of the products or services is transferred to its customers. Control is generally transferred when the Company has a pr

In [None]:
#7th Method: Jan + My Method
import requests
import json
import pandas as pd

# 10-Q filing URL of Apple
filing_url = "https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm"

# XBRL-to-JSON converter API endpoint
xbrl_converter_api_endpoint = "https://api.sec-api.io/xbrl-to-json"

# get your API key at https://sec-api.io
api_key = "327f556515bcbb181578971950353ecb41012af4b728189fe6d024bee61b6c8c"

final_url = xbrl_converter_api_endpoint + "?htm-url=" + filing_url + "&token=" + api_key

# make request to the API
response = requests.get(final_url)

# load JSON into memory
xbrl_json = json.loads(response.text)

# income statement example
# print(xbrl_json['StatementsOfIncome'])


{'RevenueFromContractWithCustomerExcludingAssessedTax': [{'decimals': '-6', 'unitRef': 'usd', 'period': {'startDate': '2023-10-01', 'endDate': '2024-09-28'}, 'segment': {'dimension': 'srt:ProductOrServiceAxis', 'value': 'us-gaap:ProductMember'}, 'value': '294866000000'}, {'decimals': '-6', 'unitRef': 'usd', 'period': {'startDate': '2022-09-25', 'endDate': '2023-09-30'}, 'segment': {'dimension': 'srt:ProductOrServiceAxis', 'value': 'us-gaap:ProductMember'}, 'value': '298085000000'}, {'decimals': '-6', 'unitRef': 'usd', 'period': {'startDate': '2021-09-26', 'endDate': '2022-09-24'}, 'segment': {'dimension': 'srt:ProductOrServiceAxis', 'value': 'us-gaap:ProductMember'}, 'value': '316199000000'}, {'decimals': '-6', 'unitRef': 'usd', 'period': {'startDate': '2023-10-01', 'endDate': '2024-09-28'}, 'segment': {'dimension': 'srt:ProductOrServiceAxis', 'value': 'us-gaap:ServiceMember'}, 'value': '96169000000'}, {'decimals': '-6', 'unitRef': 'usd', 'period': {'startDate': '2022-09-25', 'endDate'

In [None]:
filename = "test_xbrl.json"
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(xbrl_json, f, ensure_ascii=False, indent=4)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 7th Method A: GPT Approach
# Colab / Python-ready script
# Input: test_xbrl.json (hierarchical: section -> subsection -> facts)
# Output: xbrl_sentences.json (list of normalized fact sentences) and CSV preview

import os
import re
import json
import math
import html
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
import csv

# ------------ CONFIG ----------------
INPUT_JSON = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/XBRL_hierarchical_facts.json"   # adjust if necessary
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN"
OUT_JSON = os.path.join(OUTPUT_DIR, "xbrl_sentences.json")
OUT_CSV = os.path.join(OUTPUT_DIR, "xbrl_sentences_preview.csv")
# ------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Helpers ------------------

MONTH_FULL = {
    1: "January",2:"February",3:"March",4:"April",5:"May",6:"June",
    7:"July",8:"August",9:"September",10:"October",11:"November",12:"December"
}

def try_parse_date_str(date_str: Optional[str]) -> Optional[datetime]:
    if not date_str:
        return None
    s = str(date_str).strip()
    # remove time portion if present
    if "T" in s:
        s = s.split("T")[0]
    # common patterns
    patterns = ["%Y-%m-%d", "%Y%m%d", "%d-%m-%Y", "%m-%d-%Y", "%Y/%m/%d", "%d/%m/%Y"]
    for p in patterns:
        try:
            return datetime.strptime(s, p)
        except Exception:
            continue
    # regex fallback for YYYY-MM-DD
    m = re.search(r"(\d{4})-(\d{2})-(\d{2})", s)
    if m:
        try:
            return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        except:
            pass
    # fallback for dd-mm-yyyy with dashes where month/day ambiguous - skip
    return None

def format_date_full_month(date_str: Optional[str]) -> Optional[str]:
    dt = try_parse_date_str(date_str)
    if not dt:
        return None
    month = MONTH_FULL.get(dt.month, dt.strftime("%B"))
    return f"{month} {dt.day}, {dt.year}"

def canonical_unit(u: Optional[str]) -> Optional[str]:
    if not u:
        return None
    s = str(u).strip()
    if s == "":
        return None
    sl = s.lower()
    if "usd" in sl or s == "$" or ("iso4217" in sl and "usd" in sl):
        return "USD"
    if "shares" in sl or sl.endswith("shares"):
        return "shares"
    if "usdpershare" in sl or "usd_per_share" in sl or 'usdPerShare'.lower() in sl:
        return "USD/share"
    # shorten common forms
    return s.upper()

def is_numeric_like(v: Any) -> bool:
    if v is None:
        return False
    if isinstance(v, (int, float)):
        return True
    s = str(v).strip()
    if s == "":
        return False
    # Remove common punctuation
    s2 = re.sub(r'[,\(\)\s\$£€]', '', s)
    # allow scientific format and percent
    s2 = s2.replace("%", "")
    return bool(re.match(r'^-?\d+(\.\d+)?(e[+\-]?\d+)?$', s2, flags=re.IGNORECASE))

def format_number_for_display(val: Any, decimals: Optional[Any]=None) -> str:
    """
    Heuristic formatting:
     - If decimals is numeric and negative, treat it as a power scaling like XBRL often uses.
       Example: value '123456' with decimals '-6' -> 123456 * 10^-6 -> 0.123456 (but many providers already apply scale)
     - If string contains 'E' or scientific notation, attempt float conversion.
     - If non-numeric text, return raw string (stripped).
    """
    if val is None:
        return ""
    s = str(val).strip()
    if s == "":
        return ""
    # Remove surrounding html if any (we will strip html elsewhere too)
    # If looks like numeric-ish:
    if is_numeric_like(s):
        s_clean = s.replace(",", "").replace(" ", "")
        try:
            num = float(s_clean)
            # apply decimals scaling if present and looks like integer
            d = None
            try:
                if decimals is not None:
                    if isinstance(decimals, str) and re.match(r'^-?\d+$', decimals.strip()):
                        d = int(decimals.strip())
                    elif isinstance(decimals, (int, float)):
                        d = int(decimals)
            except Exception:
                d = None
            # If decimals negative (e.g. '-6') then multiply by 10**d (which reduces value)
            if d is not None and d < 0:
                # Many data providers store value already scaled. This is heuristic:
                # We'll *not* blindly rescale if the result would produce fractional cents for large numbers.
                # We'll do rescaling but present integers with commas if rounding works.
                try:
                    scaled = num * (10 ** d)
                    num = scaled
                except Exception:
                    pass
            # Display nicely
            if abs(num - round(num)) < 1e-6:
                return f"{int(round(num)):,}"
            else:
                s_out = f"{num:,.6f}".rstrip("0").rstrip(".")
                return s_out
        except Exception:
            # fallback to original string
            return s
    else:
        # Non-numeric value (like textual cover page items)
        return re.sub(r'\s+', ' ', strip_html(s)).strip()

# HTML stripping helper
def strip_html(raw: str) -> str:
    if raw is None:
        return ""
    # unescape HTML entities
    raw = html.unescape(str(raw))
    # remove <style>...</style>
    raw = re.sub(r'(?is)<style.*?>.*?</style>', ' ', raw)
    # remove <script>...</script>
    raw = re.sub(r'(?is)<script.*?>.*?</script>', ' ', raw)
    # remove all tags
    text = re.sub(r'(?is)<[^>]*>', ' ', raw)
    # collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def humanize_concept_name(key: Optional[str]) -> str:
    if not key:
        return ""
    s = str(key)
    # If concept like "us-gaap:Revenue", use right side
    if ":" in s:
        s = s.split(":",1)[1]
    # replace underscores/camelCase/ hyphens
    s = re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', s)
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r'\s+', ' ', s).strip()
    # Title case but keep small words lowercase
    small = {"and","or","the","of","in","on","at","for","to","by","with","a","an","is"}
    parts = s.split()
    out = []
    for i,p in enumerate(parts):
        if i>0 and p.lower() in small:
            out.append(p.lower())
        else:
            out.append(p.capitalize())
    return " ".join(out)

# Build period string from period object or string
def period_to_type_and_string(period_field: Any) -> Tuple[str, Optional[str]]:
    if (period_field is None) or (period_field == ""):
        return ("unknown", None)
    # If dict: check for 'instant' or 'startDate'+'endDate'
    if isinstance(period_field, dict):
        if "instant" in period_field:
            p = format_date_full_month(period_field.get("instant")) or period_field.get("instant")
            return ("instant", p)
        if "startDate" in period_field and "endDate" in period_field:
            s = format_date_full_month(period_field.get("startDate")) or str(period_field.get("startDate"))
            e = format_date_full_month(period_field.get("endDate")) or str(period_field.get("endDate"))
            return ("duration", f"{s} to {e}")
        # some providers use other keys:
        for k in ("start","start_date","from"):
            if k in period_field and "end" in period_field:
                s = format_date_full_month(period_field.get(k)) or str(period_field.get(k))
                e = format_date_full_month(period_field.get("end")) or str(period_field.get("end"))
                return ("duration", f"{s} to {e}")
        # fallback
        return ("unknown", json_safe_str(period_field))
    # If string:
    if isinstance(period_field, str):
        s = period_field.strip()
        # look for "/" or "to"
        if " to " in s:
            left,right = s.split(" to ",1)
            ls = format_date_full_month(left) or left
            rs = format_date_full_month(right) or right
            return ("duration", f"{ls} to {rs}")
        if "/" in s:
            # patterns like "2023-10-01/2024-09-28"
            parts = s.split("/")
            if len(parts)>=2:
                ls = format_date_full_month(parts[0]) or parts[0]
                rs = format_date_full_month(parts[1]) or parts[1]
                return ("duration", f"{ls} to {rs}")
        # single date string
        human = format_date_full_month(s) or s
        return ("instant", human)
    # else fallback
    return ("unknown", json_safe_str(period_field))

def json_safe_str(x: Any) -> str:
    try:
        return json.dumps(x, ensure_ascii=False)
    except Exception:
        return str(x)

# ---------- traversal & normalization ----------------

def is_fact_like(obj: Any) -> bool:
    """
    Heuristics to determine if object looks like a fact (has 'value' or 'unitRef' or 'period' or decimals)
    """
    if not isinstance(obj, dict):
        return False
    keys = set(k.lower() for k in obj.keys())
    if "value" in keys or "val" in keys or "amount" in keys or "text" in keys:
        return True
    if "unitref" in keys or "unit" in keys:
        return True
    if "period" in keys or "instant" in keys or "startdate" in keys:
        return True
    if "decimals" in keys or "dec" in keys:
        return True
    return False

def normalize_single_fact(raw_fact: Dict, label_hint: Optional[str], provenance_path: str) -> Dict:
    """
    Turn one raw fact dict into normalized structure and compose embed_text sentence.
    """
    # canonicalize keys (different providers use slightly different names)
    # value
    value = None
    for cand in ("value","val","amount","text","VALUE","Value"):
        if cand in raw_fact:
            value = raw_fact.get(cand)
            break
    # sometimes value is nested inside 'original' or 'raw'
    if value is None and "original_fact" in raw_fact and isinstance(raw_fact["original_fact"], dict):
        v = raw_fact["original_fact"].get("raw", {}).get("value")
        if v is not None:
            value = v

    # unit / decimals / period / segment
    unit = raw_fact.get("unitRef") or raw_fact.get("unit") or raw_fact.get("unit_ref") or raw_fact.get("currency")
    decimals = raw_fact.get("decimals") or raw_fact.get("dec")
    period_field = raw_fact.get("period") or raw_fact.get("context") or raw_fact.get("period_raw")
    # some facts use 'period' directly as string; sometimes 'period' can be nested raw['period']
    segment = raw_fact.get("segment") or raw_fact.get("segmentKey") or raw_fact.get("segment_ref")

    # If value is HTML or contains tags -> strip
    if isinstance(value, str) and bool(re.search(r'<[^>]+>', value)):
        clean_val = strip_html(value)
    else:
        clean_val = value

    # Format value text or numeric
    pretty_val = format_number_for_display(clean_val, decimals)
    canon_unit = canonical_unit(unit)

    # label selection: prefer label_hint > raw_fact label > humanized provenance key
    label = None
    # sometimes facts include a 'label' or 'concept'
    if "label" in raw_fact and raw_fact.get("label"):
        label = raw_fact.get("label")
    elif "concept" in raw_fact and raw_fact.get("concept"):
        label = raw_fact.get("concept")
    elif label_hint:
        label = label_hint
    else:
        # try to infer from provenance path last element
        if provenance_path:
            parts = provenance_path.strip("/").split("/")
            if parts:
                label = parts[-1]
    label_human = humanize_concept_name(label)

    # period
    p_type, p_string = period_to_type_and_string(period_field)

    # segment info as human phrase if present
    seg_phrase = ""
    seg_val = None
    if segment:
        # segment may be dict, list or string
        if isinstance(segment, dict):
            seg_val = segment.get("value") or segment.get("member") or str(segment)
        elif isinstance(segment, list):
            # combine members
            seg_vals = []
            for s in segment:
                if isinstance(s, dict):
                    seg_vals.append(s.get("value") or s.get("member") or json_safe_str(s))
                else:
                    seg_vals.append(str(s))
            seg_val = "|".join(seg_vals)
        else:
            seg_val = str(segment)
        # humanize segment
        if seg_val:
            seg_label = seg_val.split(":")[-1] if ":" in seg_val else seg_val
            seg_label = seg_label.replace("_", " ").replace("-", " ").strip()
            seg_phrase = f" for {seg_label}"

    # Compose embed_text sentence
    embed_text = ""
    # If numeric & we have a label -> "Label as of DATE is VAL UNIT." or "Label for the period X to Y is VAL UNIT."
    if pretty_val:
        val_with_unit = pretty_val + (f" {canon_unit}" if canon_unit and canon_unit not in pretty_val else "")
        if label_human:
            if p_type == "instant" and p_string:
                embed_text = f"{label_human} as of {p_string} is {val_with_unit}{seg_phrase}."
            elif p_type == "duration" and p_string:
                embed_text = f"{label_human} for the period {p_string} is {val_with_unit}{seg_phrase}."
            else:
                embed_text = f"{label_human} is {val_with_unit}{seg_phrase}."
        else:
            if p_string:
                embed_text = f"{val_with_unit} ({p_string}){seg_phrase}."
            else:
                embed_text = f"{val_with_unit}{seg_phrase}."
    else:
        # non-numeric textual value, maybe cover-page or textual block
        if isinstance(clean_val, str) and clean_val.strip():
            text = clean_val.strip()
            # short textual facts -> include the label if exists
            if label_human:
                if p_type == "instant" and p_string:
                    embed_text = f"{label_human} as of {p_string} is {text}{seg_phrase}."
                elif p_type == "duration" and p_string:
                    embed_text = f"{label_human} for the period {p_string} is {text}{seg_phrase}."
                else:
                    embed_text = f"{label_human} is {text}{seg_phrase}."
            else:
                # fallback
                if p_string:
                    embed_text = f"{text} ({p_string})."
                else:
                    embed_text = f"{text}."
        else:
            # no value, produce fallback sentence with concept/label
            if label_human:
                if p_string:
                    embed_text = f"{label_human} ({p_string})."
                else:
                    embed_text = f"{label_human}."
            else:
                embed_text = "Fact."

    embed_text = re.sub(r'\s+', ' ', embed_text).strip()

    provenance = {
        "provenance_path": provenance_path,
        "period_type": p_type,
        "period_string": p_string,
        "unit": canon_unit,
        "decimals": decimals,
        "segment": seg_val
    }

    normalized = {
        "embed_text": embed_text,
        "display_text": embed_text,
        "provenance": provenance,
        "original": raw_fact
    }
    return normalized

def walk_and_extract_facts(obj: Any, path_stack: List[str]=None) -> List[Dict]:
    """
    Walk the hierarchical JSON and return a list of normalized fact dicts.
    - path_stack: list of keys representing the hierarchy path (section/subsection/...)
    """
    if path_stack is None:
        path_stack = []
    results = []

    # If obj is a dict that itself looks like a container of facts (leaf level)
    if isinstance(obj, dict):
        # If this dict looks like single fact (has value/unit/period) -> treat as single fact
        if is_fact_like(obj):
            prov_path = "/".join(path_stack)
            label_hint = path_stack[-1] if path_stack else None
            norm = normalize_single_fact(obj, label_hint, prov_path)
            # also attach a little provenance label for section/subsection
            norm["provenance"]["section_path"] = prov_path
            norm["provenance"]["section_keys"] = list(path_stack)
            results.append(norm)
            return results

        # else iterate over keys
        for k,v in obj.items():
            new_stack = path_stack + [k]
            # If v is list -> likely list of facts or list of items
            if isinstance(v, list):
                # check if list members are fact-like or nested
                for i,elem in enumerate(v):
                    if is_fact_like(elem):
                        # single fact-like item inside list
                        prov_path = "/".join(new_stack + [f"[{i}]"])
                        label_hint = k
                        norm = normalize_single_fact(elem, label_hint, prov_path)
                        norm["provenance"]["section_path"] = prov_path
                        norm["provenance"]["section_keys"] = list(new_stack)
                        results.append(norm)
                    else:
                        # nested structure inside list (rare) - walk it
                        results.extend(walk_and_extract_facts(elem, new_stack + [f"[{i}]"]))
                # done with this v
            elif isinstance(v, dict):
                results.extend(walk_and_extract_facts(v, new_stack))
            else:
                # leaf scalar value (string/number) under a key => treat as a tiny fact
                # Example: "AuditorName": "Ernst & Young LLP"
                val = v
                raw_fact = {"value": val}
                prov_path = "/".join(new_stack)
                norm = normalize_single_fact(raw_fact, k, prov_path)
                norm["provenance"]["section_path"] = prov_path
                norm["provenance"]["section_keys"] = list(new_stack)
                results.append(norm)
    elif isinstance(obj, list):
        for i,elem in enumerate(obj):
            results.extend(walk_and_extract_facts(elem, path_stack + [f"[{i}]"]))
    else:
        # scalar at root (rare)
        raw_fact = {"value": obj}
        prov_path = "/".join(path_stack)
        norm = normalize_single_fact(raw_fact, path_stack[-1] if path_stack else None, prov_path)
        norm["provenance"]["section_path"] = prov_path
        norm["provenance"]["section_keys"] = list(path_stack)
        results.append(norm)

    return results

# ---------- main runner ----------------

def main():
    # load input JSON
    if not os.path.exists(INPUT_JSON):
        raise FileNotFoundError(f"Input JSON not found: {INPUT_JSON}")
    with open(INPUT_JSON, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    print("[info] Input loaded. Starting traversal and extraction...")

    sentences = walk_and_extract_facts(data, [])

    print(f"[info] Extracted {len(sentences)} fact sentences. Cleaning duplicates and finalizing...")

    # Optional: remove exact duplicates in embed_text while preserving provenance
    unique_map = {}
    deduped = []
    for s in sentences:
        key = s.get("embed_text", "").strip()
        if not key:
            continue
        if key not in unique_map:
            unique_map[key] = True
            deduped.append(s)
        else:
            # if duplicate text appears, we still may want to keep provenance entries (optional)
            # For now skip duplicates to reduce embedding redundancy
            pass

    # Save JSON output
    with open(OUT_JSON, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "count": len(deduped),
            "facts": deduped
        }, fh, indent=2, ensure_ascii=False)

    # Write CSV preview (embed_text, concept/label from provenance, period_string, unit)
    with open(OUT_CSV, "w", encoding="utf-8", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["idx", "embed_text", "provenance_path", "period_type", "period_string", "unit", "decimals"])
        for i, s in enumerate(deduped):
            prov = s.get("provenance", {})
            writer.writerow([
                i,
                s.get("embed_text",""),
                prov.get("provenance_path") or prov.get("section_path"),
                prov.get("period_type"),
                prov.get("period_string"),
                prov.get("unit"),
                prov.get("decimals")
            ])

    print("[ok] Wrote outputs:")
    print(" JSON:", OUT_JSON)
    print(" CSV preview:", OUT_CSV)
    print("[done]")

if __name__ == "__main__":
    main()


[info] Input loaded. Starting traversal and extraction...
[info] Extracted 1454 fact sentences. Cleaning duplicates and finalizing...
[ok] Wrote outputs:
 JSON: /content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN/xbrl_sentences.json
 CSV preview: /content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN/xbrl_sentences_preview.csv
[done]


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


In [None]:
# 7th Method A: GPT Approach: Better one
# Colab / Python-ready script
# Input: test_xbrl.json (hierarchical: section -> subsection -> facts)
# Output: xbrl_sentences.json (list of normalized fact sentences) and CSV preview

import os
import re
import json
import math
import html
from decimal import Decimal, InvalidOperation, getcontext
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
import csv

# Increase decimal precision to safely handle large numbers
getcontext().prec = 50

# ------------ CONFIG ----------------
INPUT_JSON = "/content/drive/My Drive/SEC-API/AAPL/XBRL_facts/XBRL_hierarchical_facts.json"   # adjust if necessary
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN"
OUT_JSON = os.path.join(OUTPUT_DIR, "xbrl_sentences1.json")
OUT_CSV = os.path.join(OUTPUT_DIR, "xbrl_sentences_preview1.csv")
# ------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Helpers ------------------

MONTH_FULL = {
    1: "January",2:"February",3:"March",4:"April",5:"May",6:"June",
    7:"July",8:"August",9:"September",10:"October",11:"November",12:"December"
}

def try_parse_date_str(date_str: Optional[str]) -> Optional[datetime]:
    if not date_str:
        return None
    s = str(date_str).strip()
    # remove time portion if present
    if "T" in s:
        s = s.split("T")[0]
    # common patterns
    patterns = ["%Y-%m-%d", "%Y%m%d", "%d-%m-%Y", "%m-%d-%Y", "%Y/%m/%d", "%d/%m/%Y"]
    for p in patterns:
        try:
            return datetime.strptime(s, p)
        except Exception:
            continue
    # regex fallback for YYYY-MM-DD
    m = re.search(r"(\d{4})-(\d{2})-(\d{2})", s)
    if m:
        try:
            return datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        except:
            pass
    # fallback for dd-mm-yyyy with dashes where month/day ambiguous - skip
    return None

def format_date_full_month(date_str: Optional[str]) -> Optional[str]:
    dt = try_parse_date_str(date_str)
    if not dt:
        return None
    month = MONTH_FULL.get(dt.month, dt.strftime("%B"))
    return f"{month} {dt.day}, {dt.year}"

def canonical_unit(u: Optional[str]) -> Optional[str]:
    if not u:
        return None
    s = str(u).strip()
    if s == "":
        return None
    sl = s.lower()
    if "usd" in sl or s == "$" or ("iso4217" in sl and "usd" in sl):
        return "USD"
    if "shares" in sl or sl.endswith("shares"):
        return "shares"
    if "usdpershare" in sl or "usd_per_share" in sl or 'usdPerShare'.lower() in sl:
        return "USD/share"
    # shorten common forms
    return s.upper()

def is_numeric_like(v: Any) -> bool:
    if v is None:
        return False
    if isinstance(v, (int, float, Decimal)):
        return True
    s = str(v).strip()
    if s == "":
        return False
    # Remove common punctuation
    s2 = re.sub(r'[,\(\)\s\$£€]', '', s)
    # allow scientific format and percent
    s2 = s2.replace("%", "")
    return bool(re.match(r'^-?\d+(\.\d+)?(e[+\-]?\d+)?$', s2, flags=re.IGNORECASE))

def rounding_note_for_decimals(decimals: Optional[Any]) -> Optional[str]:
    """
    If decimals is negative, return a human readable rounding note
    e.g. -3 -> 'rounded to thousands', -6 -> 'rounded to millions'
    """
    if decimals is None:
        return None
    try:
        d = int(str(decimals))
    except Exception:
        return None
    if d >= 0:
        return None
    # magnitude = 10 ** (-d)
    mag = -d
    if mag == 3:
        return "rounded to thousands"
    if mag == 6:
        return "rounded to millions"
    if mag == 9:
        return "rounded to billions"
    if mag == 12:
        return "rounded to trillions"
    # generic fallback:
    return f"rounded to 10^{mag}"

def format_number_for_display(val: Any, decimals: Optional[Any]=None) -> str:
    """
    Format numeric values *without removing trailing zeros*.
    - Preserve the full magnitude as present in the 'value' field.
    - Use Decimal for precision.
    - Do NOT rescale the numeric by 10^decimals. `decimals` is only used to create rounding note.
    """
    if val is None:
        return ""
    s = str(val).strip()
    if s == "":
        return ""
    # If non-numeric (textual HTML or text), strip html and return trimmed text
    if not is_numeric_like(s):
        return re.sub(r'\s+', ' ', strip_html(s)).strip()
    # numeric-like: use Decimal to preserve full digits (handles scientific notation)
    s_clean = s.replace(",", "").replace(" ", "")
    try:
        num = Decimal(s_clean)
    except InvalidOperation:
        # fallback to float formatting
        try:
            numf = float(s_clean)
            # integer-like?
            if abs(numf - round(numf)) < 1e-9:
                return f"{int(round(numf)):,}"
            else:
                return f"{numf:,.6f}".rstrip("0").rstrip(".")
        except Exception:
            return s
    # If it's an integer value (Decimal integral)
    if num == num.to_integral():
        # Convert to int for formatting separators
        try:
            intval = int(num)
            return f"{intval:,}"
        except Exception:
            # fallback: use quantize to remove exponent
            return format(num, 'f')
    else:
        # Not integer: format with up to 6 decimal places trimmed
        # Use normalization for decimal to string then trim trailing zeros
        out = format(num.normalize(), 'f')
        # Ensure thousands separators for integer portion:
        if '.' in out:
            int_part, frac_part = out.split('.', 1)
            try:
                int_val = int(int_part)
                int_str = f"{int_val:,}"
                out = int_str + "." + frac_part.rstrip('0').rstrip('.')
                return out
            except Exception:
                return out
        else:
            # no decimal part - treat as integer
            try:
                ival = int(out)
                return f"{ival:,}"
            except Exception:
                return out

# HTML stripping helper
def strip_html(raw: str) -> str:
    if raw is None:
        return ""
    # unescape HTML entities
    raw = html.unescape(str(raw))
    # remove <style>...</style>
    raw = re.sub(r'(?is)<style.*?>.*?</style>', ' ', raw)
    # remove <script>...</script>
    raw = re.sub(r'(?is)<script.*?>.*?</script>', ' ', raw)
    # remove all tags
    text = re.sub(r'(?is)<[^>]*>', ' ', raw)
    # collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def humanize_concept_name(key: Optional[str]) -> str:
    if not key:
        return ""
    s = str(key)
    # If concept like "us-gaap:Revenue", use right side
    if ":" in s:
        s = s.split(":",1)[1]
    # replace underscores/camelCase/ hyphens
    s = re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', s)
    s = s.replace("_", " ").replace("-", " ")
    s = re.sub(r'\s+', ' ', s).strip()
    # Title case but keep small words lowercase
    small = {"and","or","the","of","in","on","at","for","to","by","with","a","an","is"}
    parts = s.split()
    out = []
    for i,p in enumerate(parts):
        if i>0 and p.lower() in small:
            out.append(p.lower())
        else:
            out.append(p.capitalize())
    return " ".join(out)

# Convert main section name into a readable phrase (singularize 'Statements' -> 'Statement' if helpful)
def humanize_section_name(section_key: Optional[str]) -> str:
    if not section_key:
        return ""
    h = humanize_concept_name(section_key)
    # if the humanized begins with 'Statements ' -> change to 'Statement '
    if h.startswith("Statements "):
        h = "Statement " + h[len("Statements "):]
    # common special-case: "Statements Of Income" -> "Statement of Income"
    h = re.sub(r'\bOf\b', 'of', h)  # capitalization fix for Of -> of
    return h

# Build period string from period object or string
def period_to_type_and_string(period_field: Any) -> Tuple[str, Optional[str]]:
    if (period_field is None) or (period_field == ""):
        return ("unknown", None)
    # If dict: check for 'instant' or 'startDate'+'endDate'
    if isinstance(period_field, dict):
        if "instant" in period_field:
            p = format_date_full_month(period_field.get("instant")) or period_field.get("instant")
            return ("instant", p)
        if "startDate" in period_field and "endDate" in period_field:
            s = format_date_full_month(period_field.get("startDate")) or str(period_field.get("startDate"))
            e = format_date_full_month(period_field.get("endDate")) or str(period_field.get("endDate"))
            return ("duration", f"{s} to {e}")
        # some providers use other keys:
        for k in ("start","start_date","from"):
            if k in period_field and "end" in period_field:
                s = format_date_full_month(period_field.get(k)) or str(period_field.get(k))
                e = format_date_full_month(period_field.get("end")) or str(period_field.get("end"))
                return ("duration", f"{s} to {e}")
        # fallback
        return ("unknown", json_safe_str(period_field))
    # If string:
    if isinstance(period_field, str):
        s = period_field.strip()
        # look for "/" or "to"
        if " to " in s:
            left,right = s.split(" to ",1)
            ls = format_date_full_month(left) or left
            rs = format_date_full_month(right) or right
            return ("duration", f"{ls} to {rs}")
        if "/" in s:
            # patterns like "2023-10-01/2024-09-28"
            parts = s.split("/")
            if len(parts)>=2:
                ls = format_date_full_month(parts[0]) or parts[0]
                rs = format_date_full_month(parts[1]) or parts[1]
                return ("duration", f"{ls} to {rs}")
        # single date string
        human = format_date_full_month(s) or s
        return ("instant", human)
    # else fallback
    return ("unknown", json_safe_str(period_field))

def json_safe_str(x: Any) -> str:
    try:
        return json.dumps(x, ensure_ascii=False)
    except Exception:
        return str(x)

# ---------- traversal & normalization ----------------

def is_fact_like(obj: Any) -> bool:
    """
    Heuristics to determine if object looks like a fact (has 'value' or 'unitRef' or 'period' or decimals)
    """
    if not isinstance(obj, dict):
        return False
    keys = set(k.lower() for k in obj.keys())
    if "value" in keys or "val" in keys or "amount" in keys or "text" in keys:
        return True
    if "unitref" in keys or "unit" in keys:
        return True
    if "period" in keys or "instant" in keys or "startdate" in keys:
        return True
    if "decimals" in keys or "dec" in keys:
        return True
    return False

def normalize_single_fact(raw_fact: Dict, label_hint: Optional[str], provenance_path: str, section_keys: List[str]=None) -> Dict:
    """
    Turn one raw fact dict into normalized structure and compose embed_text sentence.
    section_keys: the hierarchical keys path e.g. ['StatementsOfIncome', 'OperatingIncomeLoss']
    """
    if section_keys is None:
        section_keys = []

    # canonicalize keys (different providers use slightly different names)
    # value
    value = None
    for cand in ("value","val","amount","text","VALUE","Value"):
        if cand in raw_fact:
            value = raw_fact.get(cand)
            break
    # sometimes value is nested inside 'original' or 'raw'
    if value is None and "original_fact" in raw_fact and isinstance(raw_fact["original_fact"], dict):
        v = raw_fact["original_fact"].get("raw", {}).get("value")
        if v is not None:
            value = v

    # unit / decimals / period / segment
    unit = raw_fact.get("unitRef") or raw_fact.get("unit") or raw_fact.get("unit_ref") or raw_fact.get("currency")
    decimals = raw_fact.get("decimals") or raw_fact.get("dec")
    period_field = raw_fact.get("period") or raw_fact.get("context") or raw_fact.get("period_raw")
    # some facts use 'period' directly as string; sometimes 'period' can be nested raw['period']
    segment = raw_fact.get("segment") or raw_fact.get("segmentKey") or raw_fact.get("segment_ref")

    # If value is HTML or contains tags -> strip
    if isinstance(value, str) and bool(re.search(r'<[^>]+>', value)):
        clean_val = strip_html(value)
    else:
        clean_val = value

    # Format value text or numeric (do NOT rescale by decimals)
    pretty_val = format_number_for_display(clean_val, decimals)
    canon_unit = canonical_unit(unit)

    # label selection: prefer label_hint > raw_fact label > humanized provenance key
    label = None
    # sometimes facts include a 'label' or 'concept'
    if "label" in raw_fact and raw_fact.get("label"):
        label = raw_fact.get("label")
    elif "concept" in raw_fact and raw_fact.get("concept"):
        label = raw_fact.get("concept")
    elif label_hint:
        label = label_hint
    else:
        # try to infer from provenance path last element
        if provenance_path:
            parts = provenance_path.strip("/").split("/")
            if parts:
                label = parts[-1]
    label_human = humanize_concept_name(label)

    # period
    p_type, p_string = period_to_type_and_string(period_field)

    # We will NOT use the segment dimension text in embed_text (per your instruction).
    seg_val = None
    if segment:
        # preserve in provenance but do not attach to embed_text
        if isinstance(segment, dict):
            seg_val = segment.get("value") or segment.get("member") or str(segment)
        elif isinstance(segment, list):
            seg_vals = []
            for s in segment:
                if isinstance(s, dict):
                    seg_vals.append(s.get("value") or s.get("member") or json_safe_str(s))
                else:
                    seg_vals.append(str(s))
            seg_val = "|".join(seg_vals)
        else:
            seg_val = str(segment)

    # Compose rounding note if decimals negative
    rounding_note = rounding_note_for_decimals(decimals)
    rounding_extra = f" ({rounding_note})" if rounding_note else ""

    # Compose embed_text sentence
    # Build main section prefix if available (humanize the top-level section key)
    main_section = None
    if section_keys and len(section_keys) >= 1:
        main_section = humanize_section_name(section_keys[0])
    section_prefix = f"{main_section} — " if main_section else ""

    embed_text = ""
    # If numeric & we have a label -> "Label as of DATE is VAL UNIT."
    if pretty_val:
        val_with_unit = pretty_val + (f" {canon_unit}" if canon_unit and canon_unit not in pretty_val else "")
        val_with_unit = val_with_unit + rounding_extra
        if label_human:
            if p_type == "instant" and p_string:
                embed_text = f"{section_prefix}{label_human} as of {p_string} is {val_with_unit}."
            elif p_type == "duration" and p_string:
                embed_text = f"{section_prefix}{label_human} for the period {p_string} is {val_with_unit}."
            else:
                embed_text = f"{section_prefix}{label_human} is {val_with_unit}."
        else:
            if p_string:
                embed_text = f"{section_prefix}{val_with_unit} ({p_string})."
            else:
                embed_text = f"{section_prefix}{val_with_unit}."
    else:
        # non-numeric textual value, maybe cover-page or textual block
        if isinstance(clean_val, str) and clean_val.strip():
            text = clean_val.strip()
            # short textual facts -> include the label if exists
            if label_human:
                if p_type == "instant" and p_string:
                    embed_text = f"{section_prefix}{label_human} as of {p_string} is {text}."
                elif p_type == "duration" and p_string:
                    embed_text = f"{section_prefix}{label_human} for the period {p_string} is {text}."
                else:
                    embed_text = f"{section_prefix}{label_human} is {text}."
            else:
                # fallback
                if p_string:
                    embed_text = f"{section_prefix}{text} ({p_string})."
                else:
                    embed_text = f"{section_prefix}{text}."
        else:
            # no value, produce fallback sentence with concept/label
            if label_human:
                if p_string:
                    embed_text = f"{section_prefix}{label_human} ({p_string})."
                else:
                    embed_text = f"{section_prefix}{label_human}."
            else:
                embed_text = f"{section_prefix}Fact."

    embed_text = re.sub(r'\s+', ' ', embed_text).strip()

    provenance = {
        "provenance_path": provenance_path,
        "period_type": p_type,
        "period_string": p_string,
        "unit": canon_unit,
        "decimals": decimals,
        "segment": seg_val
    }

    normalized = {
        "embed_text": embed_text,
        "display_text": embed_text,
        "provenance": provenance,
        "original": raw_fact
    }
    return normalized

def walk_and_extract_facts(obj: Any, path_stack: List[str]=None) -> List[Dict]:
    """
    Walk the hierarchical JSON and return a list of normalized fact dicts.
    - path_stack: list of keys representing the hierarchy path (section/subsection/...)
    """
    if path_stack is None:
        path_stack = []
    results = []

    # If obj is a dict that itself looks like a container of facts (leaf level)
    if isinstance(obj, dict):
        # If this dict looks like single fact (has value/unit/period) -> treat as single fact
        if is_fact_like(obj):
            prov_path = "/".join(path_stack)
            label_hint = path_stack[-1] if path_stack else None
            norm = normalize_single_fact(obj, label_hint, prov_path, section_keys=list(path_stack))
            # also attach a little provenance label for section/subsection
            norm["provenance"]["section_path"] = prov_path
            norm["provenance"]["section_keys"] = list(path_stack)
            results.append(norm)
            return results

        # else iterate over keys
        for k,v in obj.items():
            new_stack = path_stack + [k]
            # If v is list -> likely list of facts or list of items
            if isinstance(v, list):
                # check if list members are fact-like or nested
                for i,elem in enumerate(v):
                    if is_fact_like(elem):
                        # single fact-like item inside list
                        prov_path = "/".join(new_stack + [f"[{i}]"])
                        label_hint = k
                        norm = normalize_single_fact(elem, label_hint, prov_path, section_keys=new_stack)
                        norm["provenance"]["section_path"] = prov_path
                        norm["provenance"]["section_keys"] = list(new_stack)
                        results.append(norm)
                    else:
                        # nested structure inside list (rare) - walk it
                        results.extend(walk_and_extract_facts(elem, new_stack + [f"[{i}]"]))
                # done with this v
            elif isinstance(v, dict):
                results.extend(walk_and_extract_facts(v, new_stack))
            else:
                # leaf scalar value (string/number) under a key => treat as a tiny fact
                # Example: "AuditorName": "Ernst & Young LLP"
                val = v
                raw_fact = {"value": val}
                prov_path = "/".join(new_stack)
                norm = normalize_single_fact(raw_fact, k, prov_path, section_keys=new_stack)
                norm["provenance"]["section_path"] = prov_path
                norm["provenance"]["section_keys"] = list(new_stack)
                results.append(norm)
    elif isinstance(obj, list):
        for i,elem in enumerate(obj):
            results.extend(walk_and_extract_facts(elem, path_stack + [f"[{i}]"]))
    else:
        # scalar at root (rare)
        raw_fact = {"value": obj}
        prov_path = "/".join(path_stack)
        norm = normalize_single_fact(raw_fact, path_stack[-1] if path_stack else None, prov_path, section_keys=path_stack or [])
        norm["provenance"]["section_path"] = prov_path
        norm["provenance"]["section_keys"] = list(path_stack or [])
        results.append(norm)

    return results

# ---------- main runner ----------------

def main():
    # load input JSON
    if not os.path.exists(INPUT_JSON):
        raise FileNotFoundError(f"Input JSON not found: {INPUT_JSON}")
    with open(INPUT_JSON, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    print("[info] Input loaded. Starting traversal and extraction...")

    sentences = walk_and_extract_facts(data, [])

    print(f"[info] Extracted {len(sentences)} fact sentences. Cleaning duplicates and finalizing...")

    # Optional: remove exact duplicates in embed_text while preserving provenance
    unique_map = {}
    deduped = []
    for s in sentences:
        key = s.get("embed_text", "").strip()
        if not key:
            continue
        if key not in unique_map:
            unique_map[key] = True
            deduped.append(s)
        else:
            # skip duplicates to reduce embedding redundancy
            pass

    # Save JSON output
    with open(OUT_JSON, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at_utc": datetime.utcnow().isoformat() + "Z",
            "count": len(deduped),
            "facts": deduped
        }, fh, indent=2, ensure_ascii=False)

    # Write CSV preview (embed_text, concept/label from provenance, period_string, unit)
    with open(OUT_CSV, "w", encoding="utf-8", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["idx", "embed_text", "provenance_path", "period_type", "period_string", "unit", "decimals", "section_path"])
        for i, s in enumerate(deduped):
            prov = s.get("provenance", {})
            writer.writerow([
                i,
                s.get("embed_text",""),
                prov.get("provenance_path") or prov.get("section_path"),
                prov.get("period_type"),
                prov.get("period_string"),
                prov.get("unit"),
                prov.get("decimals"),
                prov.get("section_path")
            ])

    print("[ok] Wrote outputs:")
    print(" JSON:", OUT_JSON)
    print(" CSV preview:", OUT_CSV)
    print("[done]")

if __name__ == "__main__":
    main()


[info] Input loaded. Starting traversal and extraction...
[info] Extracted 1454 fact sentences. Cleaning duplicates and finalizing...
[ok] Wrote outputs:
 JSON: /content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN/xbrl_sentences1.json
 CSV preview: /content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN/xbrl_sentences_preview1.csv
[done]


  "generated_at_utc": datetime.utcnow().isoformat() + "Z",


### Converting the metadata file to a sentence(so that metadata sentence can be appended with each chunk)





In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from datetime import datetime

# Path to your metadata file
META_PATH = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"

def format_date(date_str: str) -> str:
    """Convert YYYY-MM-DD to 'Month Day, Year'."""
    try:
        dt = datetime.strptime(date_str, "%Y-%m-%d")
        return dt.strftime("%B %d, %Y")
    except Exception:
        return date_str

def build_metadata_sentence(meta: dict) -> str:
    """
    Build a concise metadata sentence to prepend to each chunk.
    """
    ticker = meta.get("ticker")
    company = meta.get("company_name")
    form_type = meta.get("form_type")
    cik = meta.get("cik_int")
    accession = meta.get("accession")
    filed_at = meta.get("filed_at_utc")
    report_date = format_date(meta.get("period_of_report"))
    fiscal_year_end = meta.get("fiscal_year_end")

    # Build sentence
    sentence = (
        f"This chunk is from SEC filing {form_type} of {company} (Ticker: {ticker}, CIK: {cik}), "
        f"Accession {accession}, filed on {filed_at}. "
        f"Reporting period ended {report_date}, fiscal year end {fiscal_year_end}."
    )
    return sentence

def main():
    with open(META_PATH, "r", encoding="utf-8") as f:
        meta = json.load(f)

    sentence = build_metadata_sentence(meta)
    print("Metadata Sentence:\n", sentence)

if __name__ == "__main__":
    main()


Metadata Sentence:
 This chunk is from SEC filing 10-K of Apple Inc. (Ticker: AAPL, CIK: 320193), Accession 0000320193-24-000123, filed on 2024-11-01T10:01:36Z. Reporting period ended September 28, 2024, fiscal year end 0928.


In [None]:
from transformers import AutoTokenizer

# Load tokenizer for BAAI/bge-large-en-v1.5
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")

metadata_sentence = (
    "This chunk is from the company Apple Inc.'s (AAPL) 10-K filing for the period ending September 28, 2024 (Accession: 0000320193-24-000123, Filed: 2024-11-01, CIK: 320193)."
)


# Tokenize
tokens = tokenizer.encode(metadata_sentence, add_special_tokens=True)

print("Token IDs:", tokens)
print("Number of tokens:", len(tokens))


Token IDs: [101, 2023, 20000, 2003, 2013, 1996, 2194, 6207, 4297, 1012, 1005, 1055, 1006, 9779, 24759, 1007, 2184, 1011, 1047, 15242, 2005, 1996, 2558, 4566, 2244, 2654, 1010, 16798, 2549, 1006, 16993, 1024, 2199, 2692, 16703, 24096, 2683, 2509, 1011, 2484, 1011, 2199, 12521, 2509, 1010, 6406, 1024, 16798, 2549, 1011, 2340, 1011, 5890, 1010, 25022, 2243, 1024, 13710, 16147, 2509, 1007, 1012, 102]
Number of tokens: 63


### Chunking2 : only the .txt files here
Strategy used: item based -> if item's content > token limit then paragraph based -> if paragraphs's content > token limit(this rarely happens) then sentence based

Same Methodology and methods used, but instead of having the metadata information as a placeholder we will pre append the metadata sentence to each chunk

In [None]:
# Full script: Hybrid chunking with paragraph + sentence fallback, metadata sentence prepended.
# Colab-ready. Set paths below.

# Uncomment to install if running in fresh Colab:
# !pip install --upgrade transformers sentencepiece

import os
import re
import json
from pathlib import Path
from typing import List, Dict, Any, Tuple
from tqdm import tqdm

# ---------------- CONFIG ----------------
INPUT_JSON_DIR = "/content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item"  # folder with per-item normalized JSONs
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence"

MODEL_NAME = "BAAI/bge-large-en-v1.5"  # tokenizer (used for token-counting)
TOKEN_LIMIT = 512                      # embedding model max tokens (including metadata)
PARA_OVERLAP = 3                       # paragraphs overlap when chunking by paragraphs
SENT_OVERLAP = 3                       # sentences overlap when chunking long paragraphs
MIN_TOKS_HEADING = 12                  # used to detect short headings (optional)
# ---------------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- tokenizer ----------
try:
    from transformers import AutoTokenizer
except Exception as e:
    raise RuntimeError("Install transformers (pip install transformers sentencepiece) before running. Error: " + str(e))

print(f"[info] loading tokenizer {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("[ok] tokenizer loaded.")

def count_tokens(text: str) -> int:
    if not text:
        return 0
    # count tokens of the text using the model tokenizer (without adding special tokens)
    return len(tokenizer.encode(text, add_special_tokens=False))

# Simple sentence splitter fallback (keeps punctuation)
SENT_SPLIT_RE = re.compile(r'(?<=[\.\?\!\…])\s+')

def split_into_sentences(paragraph: str) -> List[str]:
    if not paragraph or not paragraph.strip():
        return []
    sents = SENT_SPLIT_RE.split(paragraph.strip())
    # fallback: if no punctuation splitting, split by newlines
    if len(sents) <= 1 and "\n" in paragraph:
        sents = [s.strip() for s in paragraph.split("\n") if s.strip()]
    # strip
    sents = [s.strip() for s in sents if s and s.strip()]
    return sents

# ---------- load metadata ----------
if not os.path.exists(METADATA_FILE):
    raise FileNotFoundError(f"Metadata file not found: {METADATA_FILE}")
with open(METADATA_FILE, "r", encoding="utf-8") as fh:
    filing_metadata = json.load(fh)

# safe subset of metadata fields to attach
META_FIELDS = {
    "ticker": filing_metadata.get("ticker"),
    "company_name": filing_metadata.get("company_name"),
    "form_type": filing_metadata.get("form_type"),
    "accession": filing_metadata.get("accession"),
    "accession_nodash": filing_metadata.get("accession_nodash"),
    "cik": filing_metadata.get("cik_padded") or filing_metadata.get("cik") or filing_metadata.get("cik_raw"),
    "period_of_report": filing_metadata.get("period_of_report"),
    "fiscal_year_end": filing_metadata.get("fiscal_year_end"),
    "file_no": filing_metadata.get("file_no"),
    "filed_at_utc": filing_metadata.get("filed_at_utc"),
}

# Build the metadata sentence to prepend to each chunk (edit wording if you wish)
# Example (user-provided format):
metadata_sentence = (
    f"This chunk is from the company {META_FIELDS.get('company_name')}'s "
    f"({META_FIELDS.get('ticker')}) {META_FIELDS.get('form_type')} filing "
    f"for the period ending {META_FIELDS.get('period_of_report')} "
    f"(Accession: {META_FIELDS.get('accession')}, Filed: {META_FIELDS.get('filed_at_utc')}, "
    f"CIK: {META_FIELDS.get('cik')})."
)

# compute token count of metadata sentence
metadata_token_count = count_tokens(metadata_sentence)
print(f"[info] metadata sentence token count = {metadata_token_count}")

if metadata_token_count >= TOKEN_LIMIT:
    raise RuntimeError(f"Metadata sentence alone uses {metadata_token_count} tokens which is >= TOKEN_LIMIT {TOKEN_LIMIT}. "
                       "Please shorten metadata_sentence or increase TOKEN_LIMIT.")

# effective token budget left for the chunk content (excluding metadata)
EFFECTIVE_TOKEN_LIMIT = TOKEN_LIMIT - metadata_token_count
print(f"[info] effective token budget for content per chunk = {EFFECTIVE_TOKEN_LIMIT} tokens (out of {TOKEN_LIMIT} total)")

# ---------- utility to extract paragraphs from item JSON ----------
def extract_paragraphs_from_item_json(item_json: Dict[str, Any]) -> List[str]:
    paras: List[str] = []
    # prefer explicit paragraphs list
    if "paragraphs" in item_json and isinstance(item_json["paragraphs"], list) and item_json["paragraphs"]:
        for p in item_json["paragraphs"]:
            if isinstance(p, dict):
                # support 'text', 'clean_text'
                if "text" in p and isinstance(p["text"], str):
                    paras.append(p["text"].strip())
                elif "clean_text" in p and isinstance(p["clean_text"], str):
                    paras.append(p["clean_text"].strip())
                elif "raw_text" in p and isinstance(p["raw_text"], str):
                    paras.append(p["raw_text"].strip())
            elif isinstance(p, str):
                paras.append(p.strip())
    # fallback: normalized_text split by blank lines
    elif "normalized_text" in item_json and isinstance(item_json["normalized_text"], str):
        paras = [s.strip() for s in re.split(r'\n\s*\n', item_json["normalized_text"]) if s.strip()]
    elif "raw_text" in item_json and isinstance(item_json["raw_text"], str):
        paras = [s.strip() for s in re.split(r'\n\s*\n', item_json["raw_text"]) if s.strip()]
    return paras

# ---------- sentence-level chunking for a long paragraph ----------
def chunk_long_paragraph(paragraph: str, paragraph_idx: int, token_limit_excl_meta: int, sent_overlap: int):
    """
    Split a single very long paragraph into sentence-level chunks with overlap.
    token_limit_excl_meta : token budget for content excluding metadata sentence.
    Returns list of chunk dicts with fields:
        text, start_paragraph, end_paragraph, paragraph_indices, sentence_indices, token_count (content tokens)
    """
    sents = split_into_sentences(paragraph)
    if not sents:
        # fallback: return the paragraph as a single chunk (content tokens may exceed effective limit)
        return [{
            "text": paragraph,
            "start_paragraph": paragraph_idx,
            "end_paragraph": paragraph_idx,
            "paragraph_indices": [paragraph_idx],
            "sentence_indices": None,
            "token_count": count_tokens(paragraph)
        }]

    s_tok = [count_tokens(s) for s in sents]
    chunks = []
    s_ptr = 0

    while s_ptr < len(sents):
        cur_tokens = 0
        start_s = s_ptr
        included = []
        # add sentences while under token_limit_excl_meta
        while s_ptr < len(sents) and cur_tokens + s_tok[s_ptr] <= token_limit_excl_meta:
            included.append(s_ptr)
            cur_tokens += s_tok[s_ptr]
            s_ptr += 1

        if not included:
            # Single sentence too long to fit; force include single sentence
            included = [s_ptr]
            cur_tokens = s_tok[s_ptr]
            s_ptr += 1

        chunk_text = " ".join(sents[i] for i in included)
        chunks.append({
            "text": chunk_text,
            "start_paragraph": paragraph_idx,
            "end_paragraph": paragraph_idx,
            "paragraph_indices": [paragraph_idx],
            "sentence_indices": included.copy(),
            "token_count": cur_tokens
        })

        # compute next start with sentence-level overlap
        last_sent = included[-1]
        next_start = last_sent - sent_overlap + 1
        if next_start <= start_s:
            next_start = start_s + 1
        if next_start < 0:
            next_start = 0
        s_ptr = next_start

    return chunks

# ---------- paragraph-level chunking main algorithm ----------
def chunk_paragraphs_with_fallback(paragraphs: List[str],
                                   token_limit_excl_meta: int,
                                   para_overlap: int,
                                   sent_overlap: int,
                                   min_heading_tokens: int) -> Tuple[List[Dict[str, Any]], List[int]]:
    """
    Main chunking algorithm using token budget excluding metadata sentence:
      - If whole item (content) <= effective token limit, returns single chunk (content only)
      - Else chunk by paragraphs (never break paragraphs)
      - If a paragraph > token_limit_excl_meta, fallback to sentence-level chunking for that paragraph
    Returns (chunks, paragraph_token_counts)
    Each produced chunk is a dict with keys:
      text, start_paragraph, end_paragraph, paragraph_indices, token_count (content tokens)
    """
    n = len(paragraphs)
    para_tokens = [count_tokens(p) for p in paragraphs]
    chunks: List[Dict[str, Any]] = []

    # if the entire item content fits under the effective token limit -> single chunk
    item_joined = "\n\n".join(paragraphs)
    if count_tokens(item_joined) <= token_limit_excl_meta:
        chunks.append({
            "text": item_joined,
            "start_paragraph": 0,
            "end_paragraph": n - 1,
            "paragraph_indices": list(range(0, n)),
            "token_count": count_tokens(item_joined)
        })
        return chunks, para_tokens

    # otherwise iterate with a greedy window
    start = 0
    while start < n:
        # handle long paragraph via sentence fallback
        if para_tokens[start] > token_limit_excl_meta:
            long_chunks = chunk_long_paragraph(paragraphs[start], start, token_limit_excl_meta, sent_overlap)
            for lc in long_chunks:
                chunks.append(lc)
            start += 1
            continue

        # greedily extend window from 'start' by adding whole paragraphs until budget
        cur_tokens = 0
        included = []
        j = start
        while j < n and para_tokens[j] <= token_limit_excl_meta and cur_tokens + para_tokens[j] <= token_limit_excl_meta:
            cur_tokens += para_tokens[j]
            included.append(j)
            j += 1

        if not included:
            # defensive fallback
            included = [start]
            cur_tokens = para_tokens[start]
            j = start + 1

        # create chunk
        chunk_text = "\n\n".join(paragraphs[i] for i in included)
        chunks.append({
            "text": chunk_text,
            "start_paragraph": included[0],
            "end_paragraph": included[-1],
            "paragraph_indices": included.copy(),
            "token_count": cur_tokens
        })

        # determine paragraph overlap adjustment
        cand_last = included[-para_overlap:] if len(included) >= para_overlap else included[:]
        all_short = all(para_tokens[i] < min_heading_tokens for i in cand_last) if cand_last else False
        overlap_count = para_overlap + 1 if all_short else para_overlap

        # next window start index
        next_start = included[-1] - overlap_count + 1
        if next_start <= included[0]:
            next_start = included[0] + 1

        start = min(max(next_start, 0), n)

    return chunks, para_tokens

# ---------- driver: iterate item files and write per-item chunk outputs ----------
def process_all_items(input_dir: str, output_dir: str, token_limit: int,
                      para_overlap: int, sent_overlap: int, min_heading_tokens: int,
                      metadata_sentence_text: str, metadata_token_count: int):
    p = Path(input_dir)
    files = sorted([f for f in p.glob("*.json")])
    if not files:
        raise RuntimeError(f"No .json files found in {input_dir}")

    effective_limit = token_limit - metadata_token_count
    summary = []
    for fp in tqdm(files, desc="Items"):
        try:
            with open(fp, "r", encoding="utf-8") as fh:
                item_json = json.load(fh)
        except Exception as e:
            print(f"[warn] failed to load {fp}: {e}")
            continue

        paragraphs = extract_paragraphs_from_item_json(item_json)
        if not paragraphs:
            print(f"[warn] no paragraphs found in {fp}; skipping")
            continue

        # chunk with fallback using effective_limit (content-only budget)
        chunks, para_tok_counts = chunk_paragraphs_with_fallback(
            paragraphs,
            effective_limit,
            para_overlap,
            sent_overlap,
            min_heading_tokens
        )

        # enrich with metadata and item id; prepend metadata_sentence in chunk text
        item_id = item_json.get("item_number") or item_json.get("item_title") or fp.stem
        enriched = []
        for idx, c in enumerate(chunks):
            content_tokens = c["token_count"]
            full_text = metadata_sentence_text + " " + c["text"]
            # safety: if full_text somehow exceeds TOKEN_LIMIT due to tokenizer differences, we still store but log
            full_token_count = metadata_token_count + content_tokens
            if full_token_count > token_limit:
                print(f"[warn] combined tokens({full_token_count}) exceed limit({token_limit}) for {fp.stem}_chunk_{idx}.")
            enriched_chunk = {
                "chunk_id": f"{fp.stem}_chunk_{idx}",
                "item_id": item_id,
                "text": full_text,
                "start_paragraph": c.get("start_paragraph"),
                "end_paragraph": c.get("end_paragraph"),
                "paragraph_indices": c.get("paragraph_indices"),
                "sentence_indices": c.get("sentence_indices") if "sentence_indices" in c else None,
                # token_count is content-only tokens; metadata_token_count stored separately
                "token_count": content_tokens,
                "metadata_token_count": metadata_token_count,
                "metadata": META_FIELDS,
                "source_item_file": str(fp)
            }
            enriched.append(enriched_chunk)

        out_obj = {
            "item_file": str(fp),
            "item_basename": fp.stem,
            "model_name": MODEL_NAME,
            "token_limit_total": token_limit,
            "metadata_token_count": metadata_token_count,
            "effective_content_token_limit": effective_limit,
            "para_overlap": para_overlap,
            "sent_overlap": sent_overlap,
            "paragraph_count": len(paragraphs),
            "paragraph_token_counts": para_tok_counts,
            "chunks": enriched
        }

        out_file = Path(output_dir) / f"{fp.stem}.chunks_with_metadata.json"
        with open(out_file, "w", encoding="utf-8") as fh:
            json.dump(out_obj, fh, ensure_ascii=False, indent=2)

        summary.append({
            "item": fp.stem,
            "paragraphs": len(paragraphs),
            "chunks": len(enriched),
            "out_file": str(out_file)
        })

    return summary

# ---------- run ----------
if __name__ == "__main__":
    print("INPUT_JSON_DIR:", INPUT_JSON_DIR)
    print("OUTPUT_DIR:", OUTPUT_DIR)
    print("MODEL:", MODEL_NAME)
    print("TOKEN_LIMIT:", TOKEN_LIMIT, "PARA_OVERLAP:", PARA_OVERLAP, "SENT_OVERLAP:", SENT_OVERLAP)
    print("Metadata sentence (preview):", metadata_sentence[:300], "...")
    summary = process_all_items(INPUT_JSON_DIR, OUTPUT_DIR, TOKEN_LIMIT, PARA_OVERLAP, SENT_OVERLAP, MIN_TOKS_HEADING,
                                metadata_sentence, metadata_token_count)
    print("\nDone. Summary:")
    for s in summary:
        print(" ", s)


[info] loading tokenizer BAAI/bge-large-en-v1.5 ...
[ok] tokenizer loaded.
[info] metadata sentence token count = 72
[info] effective token budget for content per chunk = 440 tokens (out of 512 total)
INPUT_JSON_DIR: /content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item
OUTPUT_DIR: /content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence
MODEL: BAAI/bge-large-en-v1.5
TOKEN_LIMIT: 512 PARA_OVERLAP: 3 SENT_OVERLAP: 3
Metadata sentence (preview): This chunk is from the company Apple Inc.'s (AAPL) 10-K filing for the period ending 2024-09-28 (Accession: 0000320193-24-000123, Filed: 2024-11-01T10:01:36Z, CIK: 0000320193). ...


Items:   0%|          | 0/21 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2849 > 512). Running this sequence through the model will result in indexing errors
Items: 100%|██████████| 21/21 [00:12<00:00,  1.69it/s]


Done. Summary:
  {'item': 'item_1.normalized', 'paragraphs': 66, 'chunks': 15, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence/item_1.normalized.chunks_with_metadata.json'}
  {'item': 'item_10.normalized', 'paragraphs': 3, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence/item_10.normalized.chunks_with_metadata.json'}
  {'item': 'item_11.normalized', 'paragraphs': 2, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence/item_11.normalized.chunks_with_metadata.json'}
  {'item': 'item_12.normalized', 'paragraphs': 2, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence/item_12.normalized.chunks_with_metadata.json'}
  {'item': 'item_13.normalized', 'paragraphs': 2, 'chunks': 1, 'out_file': '/content/drive/My Drive/SEC-API/AAPL/txt_chunks_with_metadata_sentence/item_13.normalized.chunks_with_metadata.json'}
  {'item': 'it




### Chunking2: Converting xbrl facts sentences into chunks

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# One-chunk-per-fact script with metadata prepended
# Colab-ready. Requires: pip install transformers sentencepiece

import os
import re
import json
import csv
from pathlib import Path
from typing import List, Dict, Any, Optional

# ---------- CONFIG ----------
INPUT_FACTS_JSON = "/content/drive/My Drive/SEC-API/AAPL/XBRL_SENTENCES_JAN/xbrl_sentences1.json"   # input facts file (with "facts" array)
METADATA_FILE = "/content/drive/My Drive/SEC-API/AAPL/metadata_json_file/metadata_extracted_nolinks.json"
OUTPUT_DIR = "/content/drive/My Drive/SEC-API/AAPL/fact_chunks_with_metadata_single"
OUT_JSON = os.path.join(OUTPUT_DIR, "xbrl_fact_chunks_with_metadata_single.json")
OUT_CSV = os.path.join(OUTPUT_DIR, "xbrl_fact_chunks_with_metadata_single_preview.csv")

MODEL_NAME = "BAAI/bge-large-en-v1.5"  # tokenizer used for token counting
TOKEN_LIMIT = 512                      # model token limit
# --------------------------------

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- imports & tokenizer ----------
try:
    from transformers import AutoTokenizer
except Exception as e:
    raise RuntimeError("Please install transformers and sentencepiece (pip install transformers sentencepiece). Error: " + str(e))

print(f"[info] Loading tokenizer: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("[ok] Tokenizer loaded.")

def count_tokens(text: Optional[str]) -> int:
    if not text:
        return 0
    # Use add_special_tokens=False to mirror typical embedder counting
    return len(tokenizer.encode(text, add_special_tokens=False))

# ---------- load inputs ----------
if not os.path.exists(INPUT_FACTS_JSON):
    raise FileNotFoundError(f"Facts JSON not found: {INPUT_FACTS_JSON}")
with open(INPUT_FACTS_JSON, "r", encoding="utf-8") as fh:
    facts_obj = json.load(fh)

facts_list = facts_obj.get("facts") or facts_obj.get("items") or facts_obj.get("data") or []
if not isinstance(facts_list, list):
    raise RuntimeError("Input facts JSON does not contain a top-level 'facts' list.")

if not os.path.exists(METADATA_FILE):
    raise FileNotFoundError(f"Metadata file not found: {METADATA_FILE}")
with open(METADATA_FILE, "r", encoding="utf-8") as fh:
    filing_metadata = json.load(fh)

# ---------- build metadata sentence ----------
# Use safe fallbacks; user can customize fields used
ticker = filing_metadata.get("ticker") or filing_metadata.get("symbol") or filing_metadata.get("cik") or ""
company_name = filing_metadata.get("company_name") or filing_metadata.get("company") or ""
form_type = filing_metadata.get("form_type") or filing_metadata.get("form") or ""
accession = filing_metadata.get("accession") or filing_metadata.get("accession_raw") or filing_metadata.get("accession_nodash") or ""
filed_at = filing_metadata.get("filed_at_utc") or filing_metadata.get("filed_at_raw") or filing_metadata.get("filed_at") or ""
period_of_report = filing_metadata.get("period_of_report") or filing_metadata.get("period_of_report")  # keep as-is
cik = filing_metadata.get("cik_int") or filing_metadata.get("cik") or filing_metadata.get("cik_padded") or filing_metadata.get("cik_raw") or ""
fiscal_year_end = filing_metadata.get("fiscal_year_end") or ""

# Format period_of_report to human form if looks like YYYY-MM-DD (optional)
# We'll keep user-provided format to avoid accidental changes.
meta_sentence_parts = []
if company_name:
    meta_sentence_parts.append(company_name)
if ticker and ticker not in company_name:
    meta_sentence_parts.append(f"({ticker})")

# Compose main phrase with form and period if available
main_parts = []
if form_type:
    main_parts.append(str(form_type))
if period_of_report:
    main_parts.append(f"for the period ending {period_of_report}")

main_phrase = " ".join(main_parts).strip()

paren_parts = []
if accession:
    paren_parts.append(f"Accession: {accession}")
if filed_at:
    paren_parts.append(f"Filed: {filed_at}")
if cik:
    # normalize to 10-digit padded CIK if present as integer or padded string
    cik_str = str(cik)
    paren_parts.append(f"CIK: {cik_str}")

# Build final sentence
if meta_sentence_parts:
    meta_sentence = "This chunk is from the company " + " ".join(meta_sentence_parts)
else:
    meta_sentence = "This chunk is from the filing"

if main_phrase:
    meta_sentence += f" {main_phrase}"

if paren_parts:
    meta_sentence += " (" + ", ".join(paren_parts) + ")"

# Ensure punctuation
meta_sentence = re.sub(r'\s+', ' ', meta_sentence).strip()
if not meta_sentence.endswith("."):
    meta_sentence += "."

print("[info] Metadata sentence constructed:")
print(meta_sentence)

meta_token_count = count_tokens(meta_sentence)
print(f"[info] Metadata token count: {meta_token_count} tokens")

if meta_token_count >= TOKEN_LIMIT:
    raise RuntimeError(f"Metadata sentence consumes {meta_token_count} tokens which is >= TOKEN_LIMIT ({TOKEN_LIMIT}). "
                       "Please shorten the metadata sentence or increase TOKEN_LIMIT.")

EFFECTIVE_TOKEN_LIMIT = TOKEN_LIMIT - meta_token_count
print(f"[info] EFFECTIVE_TOKEN_LIMIT for content (excluding metadata): {EFFECTIVE_TOKEN_LIMIT} tokens")

# ---------- prepare and chunk (one chunk per fact) ----------
chunks: List[Dict[str,Any]] = []

for idx, fact in enumerate(facts_list):
    # Extract text for the fact; prefer 'embed_text' or 'display_text'
    if isinstance(fact, dict):
        embed_text = fact.get("embed_text") or fact.get("display_text") or fact.get("text") or fact.get("value") or ""
        provenance = fact.get("provenance") or {}
        original = fact.get("original") or fact.get("original_fact") or fact.get("originalFact") or fact.get("raw") or None
    else:
        embed_text = str(fact)
        provenance = {}
        original = None

    # Normalize whitespace but preserve newlines if user wants them? Facts are typically single-line.
    # We will normalize internal whitespace to single spaces (facts are short).
    embed_text_clean = re.sub(r'\s+', ' ', str(embed_text)).strip()

    # Build full chunk text by prepending metadata sentence and separating with two newlines
    full_text = meta_sentence + "\n\n" + embed_text_clean

    # Token counts
    meta_toks = meta_token_count
    content_toks = count_tokens(embed_text_clean)
    full_toks = count_tokens(full_text)

    over_effective = content_toks > EFFECTIVE_TOKEN_LIMIT

    chunk_obj = {
        "chunk_id": f"fact_chunk_{idx}",
        "chunk_index": idx,
        "fact_index": idx,
        "meta_sentence": meta_sentence,
        "meta_token_count": meta_toks,
        "content_token_count": content_toks,
        "effective_token_limit": EFFECTIVE_TOKEN_LIMIT,
        "over_effective_limit": bool(over_effective),
        "token_count_full": full_toks,
        "text": full_text,
        "content_text": embed_text_clean,
        "provenance": provenance,
        "original": original
    }

    chunks.append(chunk_obj)

print(f"[ok] Created {len(chunks)} chunks (one per fact).")

# ---------- Save outputs ----------
out_obj = {
    "generated_at_utc": __import__("datetime").datetime.utcnow().isoformat() + "Z",
    "model_name": MODEL_NAME,
    "token_limit": TOKEN_LIMIT,
    "meta_sentence": meta_sentence,
    "meta_token_count": meta_token_count,
    "effective_token_limit": EFFECTIVE_TOKEN_LIMIT,
    "chunk_count": len(chunks),
    "chunks": chunks
}

with open(OUT_JSON, "w", encoding="utf-8") as fh:
    json.dump(out_obj, fh, indent=2, ensure_ascii=False)
print("[ok] Wrote chunk JSON to:", OUT_JSON)

# CSV preview
with open(OUT_CSV, "w", encoding="utf-8", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["chunk_id", "chunk_index", "fact_index", "over_effective_limit",
                     "meta_tokens", "content_tokens", "token_count_full", "text_preview"])
    for c in chunks:
        preview = c["content_text"][:400].replace("\n", "\\n")
        writer.writerow([c["chunk_id"], c["chunk_index"], c["fact_index"], c["over_effective_limit"],
                         c["meta_token_count"], c["content_token_count"], c["token_count_full"], preview])
print("[ok] Wrote CSV preview to:", OUT_CSV)

print("Done. Summary:")
print("  input facts:", len(facts_list))
print("  chunks created:", len(chunks))
print("  TOKEN_LIMIT:", TOKEN_LIMIT, "meta tokens:", meta_token_count, "effective:", EFFECTIVE_TOKEN_LIMIT)


[info] Loading tokenizer: BAAI/bge-large-en-v1.5 ...
[ok] Tokenizer loaded.
[info] Metadata sentence constructed:
This chunk is from the company Apple Inc. (AAPL) 10-K for the period ending 2024-09-28 (Accession: 0000320193-24-000123, Filed: 2024-11-01T10:01:36Z, CIK: 320193).
[info] Metadata token count: 66 tokens
[info] EFFECTIVE_TOKEN_LIMIT for content (excluding metadata): 446 tokens


Token indices sequence length is longer than the specified maximum sequence length for this model (1152 > 512). Running this sequence through the model will result in indexing errors


[ok] Created 1320 chunks (one per fact).
[ok] Wrote chunk JSON to: /content/drive/My Drive/SEC-API/AAPL/fact_chunks_with_metadata_single/xbrl_fact_chunks_with_metadata_single.json
[ok] Wrote CSV preview to: /content/drive/My Drive/SEC-API/AAPL/fact_chunks_with_metadata_single/xbrl_fact_chunks_with_metadata_single_preview.csv
Done. Summary:
  input facts: 1320
  chunks created: 1320
  TOKEN_LIMIT: 512 meta tokens: 66 effective: 446


  "generated_at_utc": __import__("datetime").datetime.utcnow().isoformat() + "Z",


## Embeddings: Uisng BAAI(Beijing Academy of Artificial Intelligence) bge(BAAI general embeddings) v1.5 large model from sentence transformers

#### Method 2: Created the whole text instead of just text preview from the chunks in the sec embedding's metadata file

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#### Method 1: Created only the text preview instead of whole text from the chunks in the sec embedding's metadata file
# 0) Install dependencies
!pip install -q sentence-transformers ftfy



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sentence_transformers import SentenceTransformer
import torch
print("torch device:", "cuda" if torch.cuda.is_available() else "cpu")

torch device: cuda


In [None]:
# Colab-ready: Batch-embed 21 item JSON files + 1 XBRL JSON file and store embeddings + metadata to Google Drive.
# This version stores the FULL chunk text in the metadata JSONL ('text' field) instead of only a preview.
# Paste & run in one Colab cell. Edit CONFIG paths below before running.

# 0) Install deps (run in Colab)
!pip install -q sentence-transformers ftfy

# 1) Imports
import os, json, time, glob
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

# 2) Mount Google Drive (Colab)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 3) CONFIG - edit these paths before running
ITEM_FILES_DIR = '/content/drive/MyDrive/Embeddings/AAPL/2024/textual_chunks'   # folder where your 21 item JSONs reside
ITEM_GLOB = 'item_*.normalized.chunks_with_metadata.json'  # glob pattern for item files
XBRL_JSON_PATH = '/content/drive/MyDrive/Embeddings/AAPL/2024/xbrl_facts_chunks/xbrl_fact_chunks_with_metadata_single.json'  # XBRL file path
OUT_DIR = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings-1'   # output folder on Drive
BATCH_SIZE = 64   # tune for GPU memory (reduce if OOM)
MODEL_NAME = "BAAI/bge-large-en-v1.5"  # embedding model name (your chosen model)

# If you want to STORE full text in metadata set this True (we do so)
store_full_text = True

os.makedirs(OUT_DIR, exist_ok=True)
EMBEDDING_FILE = os.path.join(OUT_DIR, 'sec_embeddings.npy')
METADATA_FILE = os.path.join(OUT_DIR, 'sec_embeddings_metadata.jsonl')
INDEX_FILE = os.path.join(OUT_DIR, 'sec_embeddings_index.json')

print("Output directory:", OUT_DIR)
print("Scanning ITEM files in:", ITEM_FILES_DIR, "with glob:", ITEM_GLOB)
print("XBRL JSON path:", XBRL_JSON_PATH)
print("BATCH_SIZE:", BATCH_SIZE, "MODEL:", MODEL_NAME)

# -----------------------------
# 4) Loaders for JSON file formats
# -----------------------------
def load_chunks_from_item_json(path):
    """
    Expects JSON with root['chunks'] list where each chunk dict contains text and metadata.
    Returns list of dicts: {'id','text','payload'}
    The payload will include provenance and (if store_full_text=True) the full 'text' will also be added to metadata later.
    """
    with open(path, 'r', encoding='utf-8') as f:
        root = json.load(f)
    chunks = []
    for idx, c in enumerate(root.get('chunks', [])):
        cid = c.get('chunk_id') or c.get('id') or c.get('chunkId') or f"{Path(path).stem}_chunk_{idx}"
        # get the actual chunk text from common fields
        text = c.get('text') or c.get('content_text') or c.get('content') or c.get('sentence') or ""
        payload = c.get('metadata') or c.get('payload') or {}
        # augment payload with provenance fields
        payload['_source_type'] = 'item_text'
        payload['_source_file'] = path
        payload['_chunk_id'] = cid
        # **DO NOT** truncate text here; we'll write full text into metadata later if requested
        # but keep a preview too for convenience
        payload['_text_preview'] = (text[:500] + '...') if len(text) > 500 else text
        chunks.append({'id': cid, 'text': text, 'payload': payload})
    return chunks

def load_chunks_from_xbrl_json(path):
    """
    Expects root['chunks'] list for XBRL; returns same shape as item loader.
    """
    with open(path, 'r', encoding='utf-8') as f:
        root = json.load(f)
    chunks = []
    for idx, c in enumerate(root.get('chunks', [])):
        cid = c.get('chunk_id') or c.get('id') or c.get('fact_id') or f"{Path(path).stem}_fact_{idx}"
        # xbrl facts often have content_text / text / statement
        text = c.get('content_text') or c.get('text') or c.get('statement') or ""
        provenance = c.get('provenance', {}) or {}
        original = c.get('original', {}) or {}
        payload = {
            '_source_type': 'xbrl_fact',
            '_source_file': path,
            '_chunk_id': cid,
            'provenance_path': provenance.get('provenance_path') or provenance.get('section_path') or provenance.get('path'),
            'period_type': provenance.get('period_type'),
            'unit': provenance.get('unit'),
        }
        if isinstance(original, dict) and 'value' in original:
            payload['original_value'] = original.get('value')
        else:
            payload['original'] = original
        payload['_text_preview'] = (text[:500] + '...') if len(text) > 500 else text
        chunks.append({'id': cid, 'text': text, 'payload': payload})
    return chunks

# -----------------------------
# 5) Discover item files
# -----------------------------
item_pattern = os.path.join(ITEM_FILES_DIR, ITEM_GLOB)
item_files = sorted(glob.glob(item_pattern))
# fallback: include any JSON with 'item' in name if none found
if not item_files:
    fallback = [p for p in glob.glob(os.path.join(ITEM_FILES_DIR, '*.json')) if 'item' in Path(p).name.lower()]
    item_files = sorted(fallback)
print(f"Found {len(item_files)} item JSON files.")

# secondary fallback if none are found (Colab upload area)
if not item_files:
    candidate = sorted(glob.glob('/mnt/data/*.json'))
    candidate = [p for p in candidate if os.path.abspath(p) != os.path.abspath(XBRL_JSON_PATH)]
    if len(candidate) >= 1:
        print("Fallback: using JSON files in /mnt/data (excluding xbrl):", candidate)
        item_files = candidate

# -----------------------------
# 6) Load chunks from all items
# -----------------------------
all_chunks = []
for fpath in item_files:
    try:
        ch = load_chunks_from_item_json(fpath)
        print(f"Loaded {len(ch)} chunks from {Path(fpath).name}")
        # augment ids with filename prefix to guarantee uniqueness across files
        pref = Path(fpath).stem
        for c in ch:
            c['id'] = f"{pref}::{c['id']}"
            all_chunks.append(c)
    except Exception as e:
        print("Error loading", fpath, ":", str(e))

# -----------------------------
# 7) Load xbrl chunks (single file)
# -----------------------------
if os.path.exists(XBRL_JSON_PATH):
    try:
        xbrl_chunks = load_chunks_from_xbrl_json(XBRL_JSON_PATH)
        print(f"Loaded {len(xbrl_chunks)} xbrl chunks from", Path(XBRL_JSON_PATH).name)
        pref = Path(XBRL_JSON_PATH).stem
        for c in xbrl_chunks:
            c['id'] = f"{pref}::{c['id']}"
            all_chunks.append(c)
    except Exception as e:
        print("Error loading XBRL file:", str(e))
else:
    print("XBRL JSON not found at path:", XBRL_JSON_PATH)

TOTAL = len(all_chunks)
print("TOTAL chunks across all item files + xbrl:", TOTAL)
if TOTAL == 0:
    raise SystemExit("No chunks found. Check paths and filenames.")

# -----------------------------
# 8) Load model (sentence-transformers)
# -----------------------------
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:", DEVICE)
print("Loading model:", MODEL_NAME, "(this may take a while)...")

try:
    model = SentenceTransformer(MODEL_NAME, device=DEVICE)
except Exception as e:
    print("Failed to load model", MODEL_NAME, "via SentenceTransformer:", str(e))
    raise

EMB_DIM = model.get_sentence_embedding_dimension()
print("Embedding dimension:", EMB_DIM)

# -----------------------------
# 9) Prepare output files & memmap (backup pre-existing)
# -----------------------------
# Backup existing files if present
if os.path.exists(EMBEDDING_FILE):
    bak = EMBEDDING_FILE + ".bak_" + time.strftime("%Y%m%d_%H%M%S")
    print("Backing up existing embeddings file to:", bak)
    os.rename(EMBEDDING_FILE, bak)
if os.path.exists(METADATA_FILE):
    bakm = METADATA_FILE + ".bak_" + time.strftime("%Y%m%d_%H%M%S")
    print("Backing up existing metadata file to:", bakm)
    os.rename(METADATA_FILE, bakm)
if os.path.exists(INDEX_FILE):
    baki = INDEX_FILE + ".bak_" + time.strftime("%Y%m%d_%H%M%S")
    print("Backing up existing index file to:", baki)
    os.rename(INDEX_FILE, baki)

# Create memmap file for embeddings
print("Creating memmap:", EMBEDDING_FILE, "shape=({}, {})".format(TOTAL, EMB_DIM))
emb_memmap = np.memmap(EMBEDDING_FILE, dtype='float32', mode='w+', shape=(TOTAL, EMB_DIM))
meta_f = open(METADATA_FILE, 'w', encoding='utf-8')

# WARNING: storing full text for every point can create a very large metadata file and large Qdrant payloads.
# Make sure you have enough Drive space and that you really want full-text inside the payloads.
if store_full_text:
    print("NOTE: full text WILL be written into metadata JSONL. This will increase file size significantly.")

# -----------------------------
# 10) Encode in batches and write metadata with full text
# -----------------------------
row_index = 0
start_time = time.time()
for batch_start in tqdm(range(0, TOTAL, BATCH_SIZE), desc="Embedding batches"):
    batch_end = min(TOTAL, batch_start + BATCH_SIZE)
    batch = all_chunks[batch_start:batch_end]
    texts = [c['text'] for c in batch]
    ids = [c['id'] for c in batch]
    # replace empty texts with placeholder
    texts = [t if (t and t.strip()) else "[EMPTY_CHUNK]" for t in texts]
    # Encode to numpy (convert_to_numpy True)
    embs = model.encode(texts, show_progress_bar=False, convert_to_numpy=True, batch_size=len(texts))
    embs = embs.astype('float32')
    # L2-normalize for cosine similarity
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    embs = embs / norms
    # write into memmap
    emb_memmap[row_index: row_index + embs.shape[0], :] = embs
    # write metadata lines (with full 'text' when store_full_text True)
    for i, c in enumerate(batch):
        payload = c.get('payload', {})
        full_text = c.get('text') if store_full_text else (payload.get('_text_preview') or c.get('text', ''))
        # Build metadata record - include 'text' (full) rather than only preview
        metadata_record = {
            'row_index': row_index + i,
            'id': c['id'],
            'source_type': payload.get('_source_type'),
            'source_file': payload.get('_source_file'),
            'chunk_id': payload.get('_chunk_id'),
            'text': full_text,                      # <-- full text included here
            'text_preview': payload.get('_text_preview'),
            # common XBRL fields if present
            'provenance_path': payload.get('provenance_path'),
            'period_type': payload.get('period_type'),
            'unit': payload.get('unit'),
            'original_value': payload.get('original_value') if 'original_value' in payload else payload.get('original'),
            'embedding_model': MODEL_NAME,
            'created_at_utc': time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        }
        # Write the JSON line
        meta_f.write(json.dumps(metadata_record, ensure_ascii=False) + '\n')
    row_index += embs.shape[0]

# flush & close
emb_memmap.flush()
meta_f.close()
elapsed = time.time() - start_time
print(f"Finished: wrote {TOTAL} embeddings (dim={EMB_DIM}) to {EMBEDDING_FILE}")
print("Metadata JSONL with full text:", METADATA_FILE)
print("Elapsed time: {:.2f} sec (avg {:.4f} sec per chunk)".format(elapsed, elapsed / max(1, TOTAL)))

# -----------------------------
# 11) Write index file
# -----------------------------
index_info = {
    'total_vectors': TOTAL,
    'embedding_dim': EMB_DIM,
    'embedding_file': EMBEDDING_FILE,
    'metadata_file': METADATA_FILE,
    'model_name': MODEL_NAME,
    'generated_at_utc': time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    'item_files': item_files,
    'xbrl_file': XBRL_JSON_PATH
}
with open(INDEX_FILE, 'w', encoding='utf-8') as f:
    json.dump(index_info, f, indent=2)
print("Index info written to:", INDEX_FILE)

# -----------------------------
# 12) Quick verification: compute cosine similarity of first 3 (if present)
# -----------------------------
print("Verifying first 3 embeddings (cosine sim):")
mm = np.memmap(EMBEDDING_FILE, dtype='float32', mode='r', shape=(TOTAL, EMB_DIM))
sample_count = min(3, TOTAL)
if sample_count > 0:
    sample = mm[:sample_count]
    sim = sample @ sample.T  # dot because normalized
    print(sim)
else:
    print("No embeddings to verify.")


Mounted at /content/drive
Output directory: /content/drive/MyDrive/Embeddings/SEC-API-embeddings-1
Scanning ITEM files in: /content/drive/MyDrive/Embeddings/AAPL/2024/textual_chunks with glob: item_*.normalized.chunks_with_metadata.json
XBRL JSON path: /content/drive/MyDrive/Embeddings/AAPL/2024/xbrl_facts_chunks/xbrl_fact_chunks_with_metadata_single.json
BATCH_SIZE: 64 MODEL: BAAI/bge-large-en-v1.5
Found 21 item JSON files.
Loaded 15 chunks from item_1.normalized.chunks_with_metadata.json
Loaded 1 chunks from item_10.normalized.chunks_with_metadata.json
Loaded 1 chunks from item_11.normalized.chunks_with_metadata.json
Loaded 1 chunks from item_12.normalized.chunks_with_metadata.json
Loaded 1 chunks from item_13.normalized.chunks_with_metadata.json
Loaded 1 chunks from item_14.normalized.chunks_with_metadata.json
Loaded 5 chunks from item_15.normalized.chunks_with_metadata.json
Loaded 89 chunks from item_1A.normalized.chunks_with_metadata.json
Loaded 1 chunks from item_1B.normalized.ch

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Embedding dimension: 1024
Creating memmap: /content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings.npy shape=(1517, 1024)
NOTE: full text WILL be written into metadata JSONL. This will increase file size significantly.


Embedding batches:   0%|          | 0/24 [00:00<?, ?it/s]

Finished: wrote 1517 embeddings (dim=1024) to /content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings.npy
Metadata JSONL with full text: /content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings_metadata.jsonl
Elapsed time: 40.79 sec (avg 0.0269 sec per chunk)
Index info written to: /content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings_index.json
Verifying first 3 embeddings (cosine sim):
[[0.9999997  0.80346596 0.7471311 ]
 [0.80346596 1.0000002  0.8342905 ]
 [0.7471311  0.8342905  1.0000002 ]]


#### Cleaning and Visualizing the Embeddings

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import files
uploaded = files.upload()


Saving xbrl_fact_chunks_with_metadata_single.json to xbrl_fact_chunks_with_metadata_single.json


In [None]:
import json

# Open and load JSON file
with open("xbrl_fact_chunks_with_metadata_single.json", "r") as f:
    data = json.load(f)


In [None]:
print(type(data))

<class 'dict'>


In [None]:
data

{'generated_at_utc': '2025-10-01T12:09:47.045922Z',
 'model_name': 'BAAI/bge-large-en-v1.5',
 'token_limit': 512,
 'meta_sentence': 'This chunk is from the company Apple Inc. (AAPL) 10-K for the period ending 2024-09-28 (Accession: 0000320193-24-000123, Filed: 2024-11-01T10:01:36Z, CIK: 320193).',
 'meta_token_count': 66,
 'effective_token_limit': 446,
 'chunk_count': 1320,
 'chunks': [{'chunk_id': 'fact_chunk_0',
   'chunk_index': 0,
   'fact_index': 0,
   'meta_sentence': 'This chunk is from the company Apple Inc. (AAPL) 10-K for the period ending 2024-09-28 (Accession: 0000320193-24-000123, Filed: 2024-11-01T10:01:36Z, CIK: 320193).',
   'meta_token_count': 66,
   'content_token_count': 10,
   'effective_token_limit': 446,
   'over_effective_limit': False,
   'token_count_full': 76,
   'text': 'This chunk is from the company Apple Inc. (AAPL) 10-K for the period ending 2024-09-28 (Accession: 0000320193-24-000123, Filed: 2024-11-01T10:01:36Z, CIK: 320193).\n\nCover Page — Document Ty

In [None]:
len(data)

8

In [None]:
from google.colab import files
uploaded = files.upload()


Saving item_1.normalized.chunks_with_metadata.json to item_1.normalized.chunks_with_metadata.json


In [None]:
import json

# Open and load JSON file
with open("item_1.normalized.chunks_with_metadata.json", "r") as f:
    data1 = json.load(f)


In [None]:
data1

{'item_file': '/content/drive/My Drive/SEC-API/AAPL/normalized_files_per_item/item_1.normalized.json',
 'item_basename': 'item_1.normalized',
 'model_name': 'BAAI/bge-large-en-v1.5',
 'token_limit_total': 512,
 'metadata_token_count': 72,
 'effective_content_token_limit': 440,
 'para_overlap': 3,
 'sent_overlap': 3,
 'paragraph_count': 66,
 'paragraph_token_counts': [4,
  2,
  53,
  1,
  1,
  39,
  1,
  57,
  1,
  40,
  7,
  101,
  41,
  11,
  12,
  1,
  1,
  21,
  2,
  69,
  2,
  28,
  2,
  37,
  78,
  2,
  29,
  1,
  137,
  3,
  120,
  1,
  113,
  12,
  114,
  201,
  3,
  89,
  101,
  40,
  3,
  94,
  2,
  107,
  99,
  95,
  12,
  6,
  115,
  2,
  80,
  3,
  85,
  3,
  68,
  4,
  23,
  3,
  55,
  1,
  70,
  3,
  86,
  12,
  2,
  238],
 'chunks': [{'chunk_id': 'item_1.normalized_chunk_0',
   'item_id': 'item_1.normalized',
   'text': "This chunk is from the company Apple Inc.'s (AAPL) 10-K filing for the period ending 2024-09-28 (Accession: 0000320193-24-000123, Filed: 2024-11-01T10:0

#### Cleaning and Visualizing Embeddings Created

In [None]:
# Cleaning from Method 1
from numpy.lib.format import open_memmap
import numpy as np
import json

EMBED_FILE = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings/sec_embeddings.npy'   # raw file
OUT_CLEAN = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings/sec_embeddings_clean.npy'
INDEX_FILE = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings/sec_embeddings_index.json'

with open(INDEX_FILE,'r') as f:
    idx = json.load(f)
N = idx['total_vectors']; D = idx['embedding_dim']

# source memmap (raw)
src = np.memmap(EMBED_FILE, dtype='float32', mode='r', shape=(N, D))

# create output .npy memmap with header (open_memmap)
dst = open_memmap(OUT_CLEAN, dtype='float32', shape=(N, D), mode='w+')

chunk = 10000   # rows per copy, tune to avoid RAM issues
for i in range(0, N, chunk):
    j = min(N, i + chunk)
    dst[i:j] = src[i:j]   # copy slice
    dst.flush()

print("Wrote clean .npy to:", OUT_CLEAN)


Wrote clean .npy to: /content/drive/MyDrive/Embeddings/SEC-API-embeddings/sec_embeddings_clean.npy


In [None]:
arr = np.load(OUT_CLEAN, mmap_mode='r')  # this will work without allow_pickle


In [None]:
arr

memmap([[ 0.00840656, -0.02438282, -0.00719392, ..., -0.02528281,
         -0.02430285,  0.00802166],
        [-0.01045125,  0.01484788, -0.01530072, ..., -0.02855879,
         -0.02964094,  0.00546957],
        [ 0.00871433,  0.03161473, -0.02306499, ..., -0.04339588,
         -0.02668651, -0.00517998],
        ...,
        [ 0.02388813,  0.04406147, -0.00868293, ..., -0.02377439,
         -0.01509905, -0.04820295],
        [ 0.0399663 ,  0.01229513, -0.01085633, ..., -0.02877697,
         -0.00490306, -0.04422011],
        [ 0.01681141, -0.01721017,  0.01822152, ...,  0.00790824,
         -0.00484863, -0.02153864]], dtype=float32)

In [None]:
arr.shape

(1517, 1024)

In [None]:
arr[0]

memmap([ 0.00840656, -0.02438282, -0.00719392, ..., -0.02528281,
        -0.02430285,  0.00802166], dtype=float32)

In [None]:
# Cleaning from Method 2
from numpy.lib.format import open_memmap
import numpy as np
import json

EMBED_FILE = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings.npy'   # raw file
OUT_CLEAN = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings_clean.npy'
INDEX_FILE = '/content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings_index.json'

with open(INDEX_FILE,'r') as f:
    idx = json.load(f)
N = idx['total_vectors']; D = idx['embedding_dim']

# source memmap (raw)
src = np.memmap(EMBED_FILE, dtype='float32', mode='r', shape=(N, D))

# create output .npy memmap with header (open_memmap)
dst = open_memmap(OUT_CLEAN, dtype='float32', shape=(N, D), mode='w+')

chunk = 10000   # rows per copy, tune to avoid RAM issues
for i in range(0, N, chunk):
    j = min(N, i + chunk)
    dst[i:j] = src[i:j]   # copy slice
    dst.flush()

print("Wrote clean .npy to:", OUT_CLEAN)


Wrote clean .npy to: /content/drive/MyDrive/Embeddings/SEC-API-embeddings-1/sec_embeddings_clean.npy
