In [12]:
# Cell 1: Imports and import-path setup
# Imports ('../scripts/' is appended to sys.path before the project-local imports)
import sys
sys.path.append('../scripts/')

from pathlib import Path
import json
import pandas as pd
from parse_utils import pdf_to_pages, html_to_text
from llm_client import extract_block

In [13]:
# Cell 2: Function to parse a directory of 497K filings (PDF and HTML)

def parse_497k_directory(directory: str, model: str = 'o4-mini') -> pd.DataFrame:
    """
    Parse every 497K filing found directly inside `directory`.

    Regular files are visited in sorted path order and dispatched on their
    lower-cased extension:

    * ``.pdf``             -> `pdf_to_pages`, pages joined with newlines
    * ``.htm`` / ``.html`` -> `html_to_text`

    Everything else (other extensions, sub-directories) is skipped. The text
    of each filing is passed to `extract_block`, and the file name is stored
    on the returned record under the ``'source_file'`` key.

    Args:
        directory: Path of the folder to scan. The scan is not recursive.
        model: Model identifier forwarded unchanged to `extract_block`.

    Returns:
        A DataFrame with one row per parsed filing. It is empty when the
        folder contains no matching files.

    Raises:
        OSError: Propagated from `Path.iterdir` when `directory` does not
            exist or is not a directory.
    """
    results = []
    data_dir = Path(directory)

    for path in sorted(data_dir.iterdir()):
        # A sub-directory can carry a matching suffix (e.g. "archive.pdf/");
        # only regular files should ever reach the parsers.
        if not path.is_file():
            continue

        suffix = path.suffix.lower()
        if suffix == '.pdf':
            # PDF path → pages → text
            pages = pdf_to_pages(path)
            text = "\n".join(pages)
        elif suffix in ('.htm', '.html'):
            # HTML path → text
            text = html_to_text(path)
        else:
            # skip anything else
            continue

        # Send the plain text to the LLM extractor.
        # NOTE(review): presumably returns a dict of extracted fields — confirm
        # against llm_client.extract_block.
        record = extract_block(text, model=model)
        # Keep track of which filing each row came from.
        record['source_file'] = path.name
        results.append(record)

    return pd.DataFrame(results)

In [14]:
# Cell 3: Parse the test filings, then preview the first rows

df = parse_497k_directory(
    directory='../data/raw/a5filings_test',
    model='o4-mini',
)
df.head()

Unnamed: 0,fund_name,ticker,underlying_type,underlying_asset,fund_basis,leveraged_etf,leverage_multiple,rebalancing_timescale,inception_date,management_fee,expense_fee,total_operating_fee,net_total_after_waiver,investment_objective,principal_strategies,source_file
0,ProShares Ultra S&P500,SSO,index,SPX,swaps,True,2.0,daily,,0.75,0.14,0.89,0.89,"ProShares Ultra S&P500 (the ""Fund"") seeks dail...",The Fund invests in financial instruments that...,1174610_0001683863-25-003318_f41376d1.htm
1,ProShares UltraPro Short S&P500,SPXU,index,SPX,swaps,True,-3.0,daily,,0.75,0.14,0.89,0.89,"The Fund seeks daily investment results, befor...",The Fund invests principally in swap agreement...,1174610_0001683863-25-003320_f41378d1.htm
2,Direxion Daily S&P 500 High Beta Bull 3X Shares,HIBL,index,S&P 500 High Beta Index,swaps,True,3.0,daily,11/07/2019,0.75,0.23,0.98,,"The Fund seeks daily investment results, befor...",The Fund invests at least 80% of its net asset...,1424958_0001193125-25-039969_d882427d497k.htm
3,Direxion Daily Travel & Vacation Bull 2X Shares,OOTO,index,BlueStar Travel and Vacation Index,swaps,True,2.0,daily,06/10/2021,0.75,0.37,1.12,0.98,"The Fund seeks daily investment results, befor...",The Fund invests at least 80% of its net asset...,1424958_0001193125-25-039976_d889800d497k.htm
4,KraneShares 2x Long BABA Daily ETF,KBAB,single_stock,BABA,swaps,True,2.0,daily,03/12/2025,1.25,0.01,1.26,0.99,"The Fund seeks daily investment results, befor...","Under normal circumstances, the Fund invests a...",1547576_0001829126-25-001710_kraneshares_497k.htm


In [15]:
# Cell 4: Save to JSONL for downstream use (one JSON object per line)
out_path = '../data/extracted/'
out_file = 'a5filings_test.jsonl'
output_path = Path(out_path) / out_file

# Create the output folder (and any missing parents) before writing.
output_path.parent.mkdir(exist_ok=True, parents=True)
with output_path.open('w', encoding='utf-8') as f:
    for rec in df.to_dict(orient='records'):
        # Missing values come out of the DataFrame as float NaN, which
        # json.dumps would write as the bare token `NaN` — not valid JSON.
        # Write them as null instead (`v != v` is true only for NaN).
        rec = {
            key: (None if isinstance(value, float) and value != value else value)
            for key, value in rec.items()
        }
        # ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file.
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')

print(f"Saved parsed data to {output_path}")

Saved parsed data to ../data/extracted/a5filings_test.jsonl
