In [None]:
import spacy
from spacy.tokens import DocBin, Doc, Span
from tqdm import tqdm
import traceback
import pandas as pd
import pickle
import random
import numpy as np
from pathlib import Path
import json
# Require a GPU for spaCy up front, before any pipeline is created.
# NOTE(review): the cells visible in this review only tokenize text with a blank
# pipeline — confirm that the GPU requirement is really needed for this notebook.
spacy.require_gpu()

### This notebook assumes that you have already annotated your archive with entities using LabelStudio.
### See this section's README for more information.
### It creates 10 training-validation sets of each sample size (100 to 500, in steps of 100), each with a holdout set for evaluation.

In [None]:
# Load the annotated archive from LabelStudio (JSONL: one JSON object per line).
# Annotated for person name, mailing address, email address,
# ID_num (identifiers that look like credit card numbers or social security numbers),
# and organization name.
#
# The file is decoded as UTF-8 explicitly instead of relying on the platform's
# default encoding, and blank lines (e.g. a trailing newline at the end of the
# file) are skipped because they are not valid JSON documents.
with open("archive.jsonl", "r", encoding="utf-8") as f:
    corpus = [json.loads(line) for line in f if line.strip()]

In [None]:
# Shallow copy: a separate list object holding the same record dicts, so
# list-level changes made to `original` leave the `corpus` list untouched.
original = list(corpus)

In [None]:
# Number of annotated records available for splitting.
len(original)

In [None]:
# Inspect the first record to check its structure.
# NOTE(review): the archive is annotated for names, addresses and ID numbers, so
# this output may contain PII — clear the cell output before sharing the notebook.
original[0]

In [None]:

# convert JSONL to spaCy doc format
# taken from https://github.com/explosion/spaCy/discussions/10202
def jsonl2doc(jsonl_dict, nlp) -> Doc:
    """Turn one annotated JSONL record into a spaCy ``Doc`` with entities set.

    Args:
        jsonl_dict: A record with a ``"text"`` entry and, optionally, a
            ``"spans"`` list. Each span entry must provide ``"token_start"``,
            ``"token_end"`` and ``"label"``.
        nlp: The callable applied to the record's text to produce the ``Doc``.

    Returns:
        The ``Doc`` built from the text, whose entities are the record's spans.
        A record without a ``"spans"`` entry yields a ``Doc`` with no entities.
    """
    parsed = nlp(jsonl_dict["text"])
    annotations = jsonl_dict.get("spans", [])
    # 1 is added to "token_end", i.e. the annotation's end index is treated as
    # inclusive when building each Span.
    entities = [
        Span(parsed, item["token_start"], item["token_end"] + 1, item["label"])
        for item in annotations
    ]
    parsed.set_ents(entities)
    return parsed

In [None]:
# Create the learning-curve datasets: for each of `n_splits` independent shuffles
# of the corpus, write one holdout set of `n` documents plus one training sample
# for every size in `train_sizes` (100 to 500), drawn from the non-holdout rest.
nlp = spacy.blank("en")
n = 100  # size of the evaluation (holdout) set
n_splits = 10  # number of independent train/holdout splits
train_sizes = range(100, 501, 100)  # 100, 200, ..., 500 (plain Python ints)
out_dir = Path("data_curve")
seed = 42  # fixed seed so that Restart & Run All regenerates the same splits

# Fail before writing anything if the corpus cannot supply a holdout set plus the
# largest training sample (random sampling would otherwise raise midway through).
if len(original) < n + max(train_sizes):
    raise ValueError(
        f"corpus has {len(original)} documents but at least "
        f"{n + max(train_sizes)} are needed ({n} holdout + {max(train_sizes)} training)"
    )

# A dedicated generator keeps the splits reproducible without touching the
# global `random` state.
rng = random.Random(seed)

for k in tqdm(range(n_splits), desc="splits"):
    # Shuffle into a new list instead of in place, so `original` keeps its order
    # and re-running this cell produces the same files.
    shuffled = rng.sample(original, len(original))
    holdout = shuffled[:n]
    train = shuffled[n:]

    for i in tqdm(train_sizes, desc="train sizes", leave=False):
        doc_bin = DocBin()
        for record in rng.sample(train, i):
            doc_bin.add(jsonl2doc(record, nlp))
        assert len(doc_bin) == i

        sample_dir = out_dir / f"{i}_samples"
        sample_dir.mkdir(parents=True, exist_ok=True)
        doc_bin.to_disk(sample_dir / f"train_{k}.spacy")

    # The holdout documents are never part of `train`, so they stay unseen by
    # every training sample of this split.
    doc_bin = DocBin()
    for record in holdout:
        doc_bin.add(jsonl2doc(record, nlp))
    assert len(doc_bin) == n
    out_dir.mkdir(parents=True, exist_ok=True)
    doc_bin.to_disk(out_dir / f"holdout_{k}.spacy")


In [None]:
# After the loop above, `doc_bin` holds the holdout set of the last split,
# so this shows the number of documents in that final holdout set.
len(doc_bin)

In [None]:
# Count the distinct texts in the corpus that do not occur in the most recent
# holdout set (`holdout` is whatever the last split left behind).
corpus_texts = {record['text'] for record in corpus}
holdout_texts = {record['text'] for record in holdout}
len(corpus_texts - holdout_texts)