In [7]:
import json
from pydantic import TypeAdapter
from typing import List
from pii_detection.pydantic_models import Essay


def get_data(path: str = "../data/train_shard.json") -> List[Essay]:
    """Load a JSON shard of essays and validate it into ``Essay`` models.

    Args:
        path: Location of the JSON file to read. Defaults to the training
            shard, resolved relative to the current working directory.

    Returns:
        The result of validating the decoded JSON payload as ``List[Essay]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        pydantic.ValidationError: If the payload does not validate as
            ``List[Essay]``.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the platform locale and can mis-decode non-ASCII text.
    with open(path, encoding="utf-8") as f:
        raw_essays = json.load(f)
    return TypeAdapter(List[Essay]).validate_python(raw_essays)

# Load and validate the shard once; the generation cell below reads this
# module-level `essays` list.
essays = get_data()

In [None]:
# INTERESTING_DOCUMENTS = [9854, 6243, 4777, 3202, 379, 6611, 3709, 8642, 7308, 6853, 4438, 6849, 9665, 1798, 4227, 6900, 4465, 6999, 4394, 4381, 5397, 12184, 9543, 6591, 6577, 5653, 2769, 6537, 5662, 2926, 6450, 2672, 3565, 472, 609, 12043, 6457, 12456, 7680, 4913, 5263, 10609, 7156, 3894, 12267, 7786, 11900, 12483, 9801, 9138, 10693]
# essays_with_multiple_pii = [essay for essay in essays if essay.document in INTERESTING_DOCUMENTS]

In [17]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from time import sleep

from tqdm import tqdm

from pii_detection.pydantic_models import Essay


# Output directory for rewritten essays: <parent of cwd>/data/generated.
# It is created up front (including missing parents) so jobs can write into it.
GEN_DIR = Path.cwd().parent.joinpath("data", "generated")
GEN_DIR.mkdir(exist_ok=True, parents=True)

# Number of worker threads used by the generation pool.
N_WORKERS = 10


def job_fn(essay: Essay):
    """Rewrite one essay and persist the result as JSON under ``GEN_DIR``.

    The output file is named after ``essay.document`` zero-padded to six
    characters. If that file already exists the essay is skipped, so the job
    can be re-submitted after an interrupted run.

    The JSON is first written to a sibling ``<name>.json.tmp`` file and then
    renamed onto the final path. A run that is killed mid-write therefore
    never leaves a truncated ``.json`` behind that the existence check would
    later mistake for a finished result.

    Args:
        essay: The essay to rewrite; must expose ``document`` and
            ``rewrite_essay()``.
    """
    dst_json = GEN_DIR / f"{str(essay.document).zfill(6)}.json"
    if dst_json.exists():
        return

    new_essay = essay.rewrite_essay()

    # NOTE(review): the temp name is derived from the document id, so this
    # assumes no two concurrently running jobs share the same `document`.
    tmp_json = dst_json.with_name(dst_json.name + ".tmp")
    # Explicit UTF-8: JSON is UTF-8 by specification, and the platform default
    # encoding may be unable to encode characters present in the essay text.
    tmp_json.write_text(new_essay.model_dump_json(indent=2), encoding="utf-8")
    # Path.replace overwrites the destination; on POSIX the rename is atomic.
    tmp_json.replace(dst_json)

# Keep only essays whose output file is missing, so an interrupted run can be
# resumed. The module-level names `essays`, `pool` and `threads` are kept as
# in the original cell in case later cells refer to them.
essays = [essay for essay in essays if not (GEN_DIR / f"{str(essay.document).zfill(6)}.json").exists()]

pool = ThreadPoolExecutor(max_workers=N_WORKERS)
threads = [pool.submit(job_fn, essay) for essay in essays]  # Future objects, one per essay
document_by_future = {future: essay.document for future, essay in zip(threads, essays)}
failures = []  # (document, exception) for every job that raised

try:
    with tqdm(total=len(threads)) as pbar:
        # as_completed yields each future exactly once, when it has finished.
        # Counting `Future.running()` instead would treat still-queued futures
        # as done, because running() is False until a worker picks them up.
        for future in as_completed(threads):
            error = future.exception()
            if error is not None:
                failures.append((document_by_future[future], error))
            pbar.update(1)
except KeyboardInterrupt:
    # Drop everything still queued so shutdown only has to wait for the jobs
    # that are already executing (at most N_WORKERS of them).
    for future in threads:
        future.cancel()
    raise
finally:
    pool.shutdown(wait=True)

# Surface worker errors; without this they stay hidden inside the futures.
if failures:
    failed_document, first_error = failures[0]
    print(
        f"{len(failures)} of {len(threads)} essays failed; "
        f"first failure: document {failed_document}: {first_error!r}"
    )


 51%|█████     | 2373/4644 [58:10<55:40,  1.47s/it]  


KeyboardInterrupt: 