In [1]:
import csv
import time
import requests

DATASET = "c3po-ai/edgar-corpus"
CONFIG  = "full"
SPLIT   = "train"

#generator on filenames
def chunked(xs, n):
    for i in range(0, len(xs), n):
        yield xs[i:i+n]

def fetch_all_by_filename(filenames, chunk_size=20, page_size=100, max_retries=8):
    url = "https://datasets-server.huggingface.co/filter"
    out = []

    for fn_chunk in chunked(list(filenames), chunk_size):
        where = "(" + " OR ".join([f"\"filename\"='{fn}'" for fn in fn_chunk]) + ")"
        params = dict(
            dataset=DATASET,
            config=CONFIG,
            split=SPLIT,
            where=where,
            offset=0,
            length=page_size,
        )

        while True:
            delay = 1.0
            for attempt in range(max_retries):
                resp = requests.get(url, params=params, timeout=60)
                if resp.status_code == 414:
                    raise RuntimeError(
                        "HTTP 414 URI Too Long. Reduce chunk_size (try 10–15)."
                    )
                try:
                    r = resp.json()
                except Exception:
                    raise RuntimeError(
                        f"Non-JSON response (status {resp.status_code}): {resp.text[:300]}"
                    )
                if "error" in r:
                    msg = str(r["error"])
                    print(f"[filter error] attempt={attempt+1} offset={params['offset']} msg={msg}")
                    # Retry transient errors (index warming, temporary backend hiccups)
                    time.sleep(delay)
                    delay = min(delay * 2, 30)
                    continue

                batch = [x["row"] for x in r.get("rows", [])]
                out.extend(batch)
                print('successful chunk fetch')
                break  # success, exit retry loop
            # paginate within this chunk
            if len(batch) < page_size:
                break
            params["offset"] += page_size

    return out


filenames = ['92116_1993.txt', '103730_1993.txt', '100240_1993.txt', '46207_1993.txt', '60041_1993.txt', '40878_1993.txt', '800287_1993.txt', '50178_1993.txt', '725625_1993.txt', '66479_1993.txt', '854094_1993.txt', '846972_1993.txt', '59478_1993.txt', '7383_1993.txt', '711404_1993.txt', '75042_1993.txt', '18497_1993.txt', '93469_1993.txt', '354869_1993.txt', '36966_1993.txt', '799036_1993.txt', '846902_1993.txt', '717605_1993.txt', '92487_1993.txt', '101320_1993.txt', '64605_1993.txt', '17797_1993.txt', '216228_1993.txt', '791445_1993.txt', '719264_1993.txt', '92050_1993.txt', '702163_1993.txt', '75252_1993.txt', '65660_1993.txt', '37748_1993.txt', '701345_1993.txt', '310431_1993.txt', '764037_1993.txt', '201461_1993.txt', '741612_1993.txt', '104669_1993.txt', '797463_1993.txt', '51720_1993.txt', '100826_1993.txt', '40533_1993.txt', '101830_1993.txt', '215419_1993.txt', '40454_1993.txt', '77227_1993.txt', '65358_1993.txt', '54502_1993.txt', '893486_1993.txt', '91576_1993.txt', '94601_1993.txt', '832427_1993.txt', '893928_1993.txt', '10456_1993.txt', '53347_1993.txt', '74208_1993.txt', '66904_1993.txt', '315189_1993.txt', '93675_1993.txt', '836400_1993.txt', '20164_1993.txt', '36270_1993.txt', '40874_1993.txt', '81061_1993.txt', '92195_1993.txt', '67646_1993.txt', '49423_1993.txt', '814153_1993.txt', '20947_1993.txt', '2024_1993.txt', '33798_1993.txt', '90185_1993.txt', '36377_1993.txt', '81100_1993.txt', '100783_1993.txt', '766829_1993.txt', '16906_1993.txt', '766701_1993.txt', '73124_1993.txt', '46080_1993.txt', '740868_1993.txt', '708823_1993.txt', '49071_1993.txt', '67686_1993.txt', '52466_1993.txt', '799319_1993.txt', '60026_1993.txt', '72909_1993.txt', '801124_1993.txt', '105839_1993.txt', '25445_1993.txt', '47129_1993.txt', '205402_1993.txt', '36326_1993.txt', '79879_1993.txt', '42293_1993.txt', '3153_1993.txt', '311871_1993.txt', '728586_1993.txt', '96638_1993.txt', '99193_1993.txt', '277821_1993.txt', '72903_1993.txt', '30554_1993.txt', '774203_1993.txt', '7323_1993.txt', '55785_1993.txt', '79732_1993.txt', '732714_1993.txt', '740582_1993.txt', '72971_1993.txt', '103392_1993.txt', '50548_1993.txt', '101382_1993.txt', '840216_1993.txt', '107832_1993.txt', '36995_1993.txt', '35527_1993.txt', '814677_1993.txt', '100923_1993.txt', '66740_1993.txt', '92236_1993.txt', '200245_1993.txt', '310433_1993.txt', '754737_1993.txt', '74783_1993.txt', '1800_1993.txt', '105418_1993.txt', '711513_1993.txt', '48305_1993.txt', '9659_1993.txt', '351145_1993.txt', '15840_1993.txt', '71180_1993.txt', '34501_1993.txt', '7431_1993.txt', '7649_1993.txt', '25600_1993.txt', '34285_1993.txt', '4310_1993.txt', '49588_1993.txt', '732712_1993.txt', '722573_1993.txt', '64803_1993.txt', '817473_1993.txt', '310569_1993.txt', '859119_1993.txt', '277509_1993.txt', '351825_1993.txt', '64782_1993.txt', '75527_1993.txt', '36672_1993.txt', '705752_1993.txt']

rows = fetch_all_by_filename(filenames, chunk_size=15)
print("rows fetched:", len(rows))

if rows:
    with open("edgar_subset.csv", "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=rows[0].keys())
        w.writeheader()
        w.writerows(rows)

[filter error] attempt=1 offset=0 msg=the dataset index is loading, this can take a minute
[filter error] attempt=2 offset=0 msg=Unexpected error.
[filter error] attempt=3 offset=0 msg=Unexpected error.
[filter error] attempt=4 offset=0 msg=the dataset index is loading, this can take a minute
[filter error] attempt=5 offset=0 msg=Unexpected error.
[filter error] attempt=6 offset=0 msg=the dataset index is loading, this can take a minute
successful chunk fetch
[filter error] attempt=1 offset=0 msg=Unexpected error.
[filter error] attempt=2 offset=0 msg=Unexpected error.
[filter error] attempt=3 offset=0 msg=Unexpected error.
successful chunk fetch
successful chunk fetch
successful chunk fetch
[filter error] attempt=1 offset=0 msg=Unexpected error.
successful chunk fetch
successful chunk fetch
successful chunk fetch
successful chunk fetch
successful chunk fetch
successful chunk fetch
successful chunk fetch
rows fetched: 156


In [3]:
import pandas as pd

df = pd.DataFrame(rows)
df.to_csv("edgar_subset_rows.csv", index=False)
state_df = pd.read_csv("data/cleaned_data.csv")
df_merged = df.merge(state_df, on="filename", how="left")
df_merged
df_merged.to_csv("156_edgar_merged.csv", index=False)

In [4]:
import pandas as pd

df = pd.read_csv("edgar_subset.csv")
df.head()

Unnamed: 0,filename,cik,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,...,section_8,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15
0,732712_1993.txt,732712,1993,Item 1. Business GENERAL\nBell Atlantic Corpor...,,,Item 2. Properties\nThe principal properties o...,Item 3. Legal Proceedings\nPre-Divestiture Con...,Item 4. Submission of Matters to a Vote of Sec...,Item 5. Market for Registrant's Common Equity ...,...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,,,Item 10. Directors and Executive Officers of R...,Item 11. Executive Compensation\nFor informati...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,"Item 14. Exhibits, Financial Statement Schedul...",
1,722573_1993.txt,722573,1993,Item 1. Business --------\nGeneral -------\nMa...,,,Item 2. Properties ----------\nThe Company's o...,Item 3. Legal Proceedings -----------------\na...,Item 4. Submission of Matters to a Vote of Sec...,Item 5. Market for the Registrant's Common Sto...,...,Item 8. Financial Statements and Supplementary...,Item 9. Changes in and Disagreements with Acco...,,,"Item 10. Directors, Executive Officers, Promot...",Item 11. Executive Compensation --------------...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,"Item 14. Exhibits, Financial Statement Schedul...",
2,75527_1993.txt,75527,1993,ITEM 1. BUSINESS\nPACIFIC ENTERPRISES\nPacific...,,,"ITEM 2. PROPERTIES\nPacific Library Tower, a w...",ITEM 3. LEGAL PROCEEDINGS\nExcept for the matt...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY ...,...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nInformation r...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",
3,812427_1993.txt,812427,1993,ITEM 1. BUSINESS\nTHE COMPANY\nThe Company was...,,,ITEM 2. PROPERTIES\nThe Company owns and opera...,ITEM 3. LEGAL PROCEEDINGS\nThe Company is a de...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY ...,...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThe following...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",
4,912025_1993.txt,912025,1993,ITEM 1. BUSINESS.\nGeneral Development of Busi...,,,ITEM 2. PROPERTIES\nGeneral. Green Acres Mall ...,ITEM 3. LEGAL PROCEEDINGS.\nNone.\nITEM 4.,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY ...,...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION.\nEXECUTIVE CO...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",


In [16]:
df['section_1'] = df['section_1'].str.split('Business GENERAL\n', expand=True)[1]
df.head()

Unnamed: 0,filename,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,section_6,...,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15,hq_state
0,103730_1993.txt,1993,,,,Item 2. PROPERTIES - - - ------- ----------\nT...,Item 3. LEGAL PROCEEDINGS - - - ------- ------...,Item 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,Item 5. MARKET FOR REGISTRANT'S COMMON STOCK A...,Item 6. SELECTED FINANCIAL DATA - - - ------- ...,...,Item 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,,,,,"Item 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,Pennsylvania
1,50178_1993.txt,1993,,,,ITEM 2. PROPERTIES\nThe properties of the Comp...,ITEM 3. LEGAL PROCEEDINGS\nPre-divestiture Con...,,,,...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,,,,,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,Illinois
2,799036_1993.txt,1993,,,,Item 2. Properties\nThe principal executive of...,Item 3. Legal Proceedings\nThe Company and its...,Item 4. Submission of Matters to a Vote of Sec...,Item 5. Market for Registrant's Common Equity ...,Item 6. Selected Consolidated Financial Data\n...,...,Item 9. Changes in and Disagreements With Acco...,,,Item 10. Directors and Executive Officers of t...,Item 11. Executive Compensation\nThe section o...,Item 12. Security Ownership of Certain Benefic...,Item 13. Certain Relationships and Related Tra...,"Item 14. Exhibits, Financial Statement Schedul...",,Illinois
3,17797_1993.txt,1993,,,,ITEM 2. PROPERTIES _______ __________\nIn addi...,ITEM 3. LEGAL PROCEEDINGS ______ _____________...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON EQU...,,...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION _______ ______...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,North Carolina
4,92050_1993.txt,1993,,,,Item 2. Properties.\nCertain information about...,"Item 3. Legal Proceedings.\nIn December 1990, ...",Item 4. Submission of Matters to a Vote of Sec...,Item 5. Market for Registrant's Common Equity ...,Item 6. Selected Financial Data\nPAGE\nItem 7.,...,Item 9. Change in and Disagreements with Accou...,,,,,,,"Item 14. Exhibits, Financial Statement Schedul...",,Florida
