In [1]:
# Cell 1: Imports
# sys.path is extended below so that modules located in ../scripts/ can be imported.
import sys
sys.path.append('../scripts/')

from pathlib import Path
import json
import pandas as pd
from pdf_utils import pdf_to_pages
from llm_client import extract_block

In [2]:
# Cell 2: Function to parse a directory of 497K PDFs

def parse_497k_directory(directory: str, model: str = 'o4-mini') -> pd.DataFrame:
    """
    Parse all 497K PDFs in the given directory and return a DataFrame of extracted metadata.

    Every ``*.pdf`` entry directly inside ``directory`` (non-recursive) is
    processed in sorted path order: the file is passed to ``pdf_to_pages``, the
    returned pages are joined with newlines, and the combined text is handed to
    ``extract_block`` together with ``model``.

    Args:
        directory: Path of the folder that contains the PDF files.
        model: Model identifier forwarded unchanged to ``extract_block``.

    Returns:
        A DataFrame with one row per PDF, holding the fields returned by
        ``extract_block`` plus a ``source_file`` column with the PDF's file
        name. The frame is empty when the directory contains no ``*.pdf`` files.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        NotADirectoryError: If ``directory`` exists but is not a directory.
    """
    pdf_dir = Path(directory)
    # Path.glob() yields nothing for a missing path, so without these checks a
    # mistyped directory would silently produce an empty DataFrame.
    if not pdf_dir.exists():
        raise FileNotFoundError(f"PDF directory not found: {pdf_dir}")
    if not pdf_dir.is_dir():
        raise NotADirectoryError(f"Not a directory: {pdf_dir}")

    results = []
    for pdf_path in sorted(pdf_dir.glob('*.pdf')):
        pages = pdf_to_pages(pdf_path)
        text = "\n".join(pages)
        data = extract_block(text, model=model)
        # Build a fresh record rather than mutating the mapping returned by
        # extract_block (it may be shared or cached by the helper — not
        # verifiable from this notebook).
        results.append({**data, 'source_file': pdf_path.name})
    return pd.DataFrame(results)

In [3]:
# Cell 3: Parse the test PDFs and preview the first rows of the result

RAW_PDF_DIR = '../data/raw/test'  # folder scanned for *.pdf files
LLM_MODEL = 'o4-mini'             # model name forwarded to the extractor

df = parse_497k_directory(RAW_PDF_DIR, model=LLM_MODEL)
df.head()

Unnamed: 0,fund_name,ticker,underlying_type,underlying_asset,fund_basis,leveraged_etf,leverage_multiple,rebalancing_timescale,inception_date,management_fee,expense_fee,total_operating_fee,net_total_after_waiver,investment_objective,principal_strategies,source_file
0,Direxion Daily AAPL Bull 2X Shares,AAPU,single-stock,AAPL,swaps,True,2.0,daily,08/09/2022,0.75,0.22,0.97,,"The Fund seeks daily investment results, befor...","Under normal circumstances, the Fund invests a...",aapu_497k.pdf
1,iShares Inflation Hedged U.S. Aggregate Bond ETF,AGIH,bond,AGG,swaps,False,,daily,06/17/2022,0.13,0.0,0.16,0.13,The iShares Inflation Hedged U.S. Aggregate Bo...,The Fund seeks to track the investment results...,agih_497k.pdf
2,GraniteShares 2x Short COIN Daily ETF,CONI,single-stock,COIN,swaps,True,-2.0,daily,,0.99,0.44,1.43,1.15,"The Fund seeks daily investment results, befor...",The Fund uses primarily swap agreements and ma...,coni_497k.pdf
3,iShares Gold Strategy ETF,IAUF,commodity,,swaps,False,,,,0.25,0.06,0.31,0.25,"The Fund seeks to provide exposure, on a total...",The Fund seeks to achieve its investment objec...,iauf_497k.pdf
4,Defiance Daily Target 2X Long MSTR ETF,MSTX,single-stock,MSTR,swaps,True,2.0,daily,10/30/2024,1.29,0.0,1.29,,"The Fund seeks daily investment results, befor...",The Fund uses swap agreements referencing MSTR...,mstx_497k.pdf


In [4]:
# Cell 4: Save to JSONL for downstream use (one JSON object per line, UTF-8)

out_path = '../data/extracted/'
out_file = 'parsed_497k_results.jsonl'
output_path = Path(out_path) / out_file
# Create the output folder if needed; open('w') fails when the parent is missing.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('w', encoding='utf-8') as f:
    for rec in df.to_dict(orient='records'):
        # json.dumps would write float NaN (pandas' usual missing-value marker)
        # as the bare token NaN, which is not valid JSON. NaN is the only float
        # that compares unequal to itself, so map exactly those values to None
        # so they are written as null.
        clean = {
            key: (None if isinstance(val, float) and val != val else val)
            for key, val in rec.items()
        }
        f.write(json.dumps(clean, ensure_ascii=False) + '\n')

print(f"Saved parsed data to {output_path}")

Saved parsed data to ../data/extracted/parsed_497k_results.jsonl
