In [1]:
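# Cell 1: Imports (standard library, pandas, and project helper functions)
# ../scripts/ is appended to sys.path before the pdf_utils / llm_client imports below.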
import sys
sys.path.append('../scripts/')

from pathlib import Path
import json
import pandas as pd
from pdf_utils import pdf_to_pages
from llm_client import extract_block

In [2]:
# Cell 2: Function to parse a directory of 497K PDFs

def parse_497k_directory(directory: str, model: str = 'o4-mini') -> pd.DataFrame:
    """
    Parse every ``*.pdf`` file directly inside ``directory`` into one DataFrame.

    Each PDF is converted to a list of page texts with ``pdf_to_pages``; the
    pages are joined with newlines and the combined text is passed to
    ``extract_block``. The mapping it returns becomes one row, extended with a
    ``source_file`` entry holding the PDF's file name.

    Args:
        directory: Folder to scan. The scan is non-recursive and only picks up
            names matching ``*.pdf``.
        model: Model identifier forwarded unchanged to ``extract_block``.

    Returns:
        A DataFrame with one row per PDF, in sorted path order. It is empty
        when the directory exists but contains no matching files.

    Raises:
        NotADirectoryError: If ``directory`` does not exist or is not a
            directory.
    """
    pdf_dir = Path(directory)
    # Path.glob() on a missing directory simply yields nothing, which would
    # silently produce an empty frame; fail loudly on a bad path instead.
    if not pdf_dir.is_dir():
        raise NotADirectoryError(f"PDF directory not found: {pdf_dir}")

    results = []
    for pdf_path in sorted(pdf_dir.glob('*.pdf')):
        pages = pdf_to_pages(pdf_path)
        text = "\n".join(pages)
        data = extract_block(text, model=model)
        # Build a new dict instead of calling update() in place, so an object
        # that extract_block might cache or share is never mutated, and rows
        # can never alias one another.
        results.append({**data, 'source_file': pdf_path.name})
    return pd.DataFrame(results)

In [3]:
# Cell 3: Run parsing and display results

# Extract metadata from the PDFs in the test folder, then preview the first rows.
df = parse_497k_directory(directory='../data/raw/test', model='o4-mini')
df.head(n=5)

Unnamed: 0,fund_name,ticker,underlying_theme,primary_basis,benchmark_underlying,leverage_percent,rebalancing_timescale,inception_date,management_fee,expense_fee,total_operating_fee,net_total_after_waiver,distribution_frequency,tax_status,investment_objective,principal_strategies,source_file
0,Direxion Daily AAPL Bull 2X Shares,AAPU,single-stock,swaps,AAPL (Apple Inc),200%,daily,08/09/2022,0.75%,0.14%,0.97%,,,,"The Fund seeks daily investment results, befor...","Under normal circumstances, invests at least 8...",aapu_497k.pdf
1,iShares Inflation Hedged U.S. Aggregate Bond ETF,AGIH,bond,swaps,BlackRock Inflation Hedged U.S. Aggregate Bond...,,daily,2022-06-23,0.13%,0.03%,0.16%,0.13%,,taxable,The Fund seeks to track the investment results...,Under normal circumstances the Fund invests at...,agih_497k.pdf
2,GraniteShares 2x Short COIN Daily ETF,CONI,single-stock,swaps,"Coinbase Global, Inc. Class A (NASDAQ: COIN)",-200%,Daily,,0.99%,0.44%,1.43%,1.15%,,Taxable as ordinary income,"The Fund seeks daily investment results, befor...",The Fund attempts to replicate -200% of the da...,coni_497k.pdf
3,iShares Gold Strategy ETF,IAUF,commodity,swaps,Bloomberg Composite Gold Index,,,,0.25%,0.06%,0.31%,0.25%,,,"The Fund seeks to provide exposure, on a total...",The Fund invests primarily in (i) exchange-tra...,iauf_497k.pdf
4,Defiance Daily Target 2X Long MSTR ETF,MSTX,single-stock,swaps,MicroStrategy Incorporated (Nasdaq: MSTR),200%,daily,,1.29%,0.00%,1.29%,1.29%,,,"The Fund seeks daily investment results, befor...",The Fund is an actively managed ETF that seeks...,mstx_497k.pdf


In [4]:
# Cell 4: Save to JSONL for downstream use

# Destination of the JSON Lines export: one JSON object per DataFrame row.
out_path = '../data/extracted/'
out_file = 'parsed_497k_results.jsonl'
output_path = Path(out_path) / out_file
# The output folder may not exist yet (e.g. on a fresh checkout); without this
# the open('w') below raises FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('w', encoding='utf-8') as f:
    for rec in df.to_dict(orient='records'):
        # Missing values in the frame can be float NaN, which json.dumps would
        # write as the bare token NaN (not valid JSON). NaN is the only float
        # that compares unequal to itself, so map it to None -> null.
        clean = {
            key: None if isinstance(value, float) and value != value else value
            for key, value in rec.items()
        }
        f.write(json.dumps(clean, ensure_ascii=False) + '\n')

print(f"Saved parsed data to {output_path}")

Saved parsed data to ../data/extracted/parsed_497k_results.jsonl
