In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably so the project package imported in later cells resolves).
# NOTE(review): the variable name is misspelled ("workind"); it is kept
# unchanged in case other cells reference it.
parent_dir = Path.cwd().parent
workind_directory = str(parent_dir)
if workind_directory not in sys.path:
    sys.path.append(workind_directory)

In [2]:
import json
import pandas as pd
import numpy as np


# Load the raw training set. JSON interchange files are UTF-8, so the encoding
# is stated explicitly instead of relying on the platform's locale default.
with open('../data/raw/train.json', encoding="utf-8") as f:
    data = json.load(f)

N = len(data)

# Check uniqueness first and report it afterwards, so the success message is
# never printed for a dataset that actually fails the check.
document_ids = {essay["document"] for essay in data}
assert len(document_ids) == N, "duplicate document ids found in train.json"
print("All essays have unique document id.")

All essays have unique document id.


In [3]:
# Quick orientation: how many essays were loaded and which fields the first
# record carries.
first_record_keys = [*data[0].keys()]
print(f"len(data): {N}")
print(f"keys: {first_record_keys}")

len(data): 6807
keys: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels']


In [4]:
from pii_detection.data_split_utils import shuffle_and_split

print("Shuffle data and split into train/val/test.")
# shuffle_and_split is expected to also write the three shards into save_dir;
# the round-trip check at the end of this cell relies on that.
train, val, test = shuffle_and_split(data, save_dir="../data")

# Explicit name -> split mapping, used both for reporting and for the
# round-trip check (instead of looking the variables up through locals()).
splits = {"train": train, "val": val, "test": test}

for name, split in splits.items():
    print(f"len({name}):", len(split))
assert sum(len(split) for split in splits.values()) == N, (
    "split sizes do not add up to the size of the full dataset"
)

# Read back ../data/{train,val,test}_shard.json and make sure each file holds
# exactly the split that was returned in memory.
for name, split in splits.items():
    with open(f"../data/{name}_shard.json", "r") as f:
        dataset = json.load(f)
    assert dataset == split, f"{name}_shard.json does not match the in-memory {name} split"

Shuffle data and split into train/val/test.
len(train): 4764
len(val): 1362
len(test): 681


In [5]:
from pii_detection.data_split_utils import get_labels_count

# Tabulate how often each label occurs in the train, val and test splits.
# In the output captured below, several labels (e.g. I-PHONE_NUM, B-PHONE_NUM)
# have a count of 0 in val and/or test.
df = get_labels_count(train, val, test)
df

Unnamed: 0,train,val,test
O,3490479,994158.0,505157.0
B-NAME_STUDENT,939,316.0,110.0
I-NAME_STUDENT,767,229.0,100.0
B-URL_PERSONAL,89,14.0,7.0
B-ID_NUM,56,13.0,9.0
B-EMAIL,31,5.0,3.0
I-PHONE_NUM,15,0.0,0.0
I-STREET_ADDRESS,10,10.0,0.0
B-PHONE_NUM,6,0.0,0.0
B-USERNAME,5,0.0,1.0


In [6]:
from pii_detection.data_split_utils import map_labels_to_essays

# Build, from the full (unsplit) dataset, a mapping from each label to the
# essays it occurs in. The captured output is a Series named "essay_ids" whose
# values are lists of ids — presumably the essays' "document" ids; verify
# against map_labels_to_essays.
labels_to_essays = map_labels_to_essays(data)
labels_to_essays

B-PHONE_NUM                                  [4381, 4777, 6243, 9854]
I-URL_PERSONAL                                                 [3202]
B-URL_PERSONAL      [317, 379, 472, 1309, 1798, 2672, 2700, 3202, ...
I-NAME_STUDENT      [7, 10, 16, 20, 56, 86, 93, 104, 112, 123, 166...
I-PHONE_NUM                                        [4381, 4777, 6243]
O                   [7, 10, 16, 20, 56, 86, 93, 104, 112, 123, 136...
B-NAME_STUDENT      [7, 10, 16, 20, 56, 86, 93, 104, 112, 123, 136...
I-ID_NUM                                                      [19280]
B-STREET_ADDRESS                                        [9854, 11442]
B-USERNAME                             [3351, 4462, 5716, 7786, 8642]
B-ID_NUM            [609, 2926, 3565, 4717, 4913, 4971, 5023, 5069...
I-STREET_ADDRESS                                        [9854, 11442]
B-EMAIL             [379, 2769, 3709, 4227, 4381, 4438, 4465, 4777...
Name: essay_ids, dtype: object

In [7]:
from pii_detection.data_split_utils import get_essays_with_rare_labels

# Restrict the label -> essays mapping to the labels the helper treats as rare
# (the rarity criterion lives in get_essays_with_rare_labels). The intermediate
# result gets its own name so that `rare_essay_ids` only ever refers to the
# final set of ids.
rare_labels_to_essays = get_essays_with_rare_labels(labels_to_essays)
print(rare_labels_to_essays)

# Flatten the per-label id lists into one set; an essay that carries several
# rare labels appears only once.
rare_essay_ids = {
    essay_id
    for essay_ids in rare_labels_to_essays
    for essay_id in essay_ids
}
rare_essay_ids

B-PHONE_NUM               [4381, 4777, 6243, 9854]
I-URL_PERSONAL                              [3202]
I-PHONE_NUM                     [4381, 4777, 6243]
I-ID_NUM                                   [19280]
B-STREET_ADDRESS                     [9854, 11442]
B-USERNAME          [3351, 4462, 5716, 7786, 8642]
I-STREET_ADDRESS                     [9854, 11442]
Name: essay_ids, dtype: object


{3202, 3351, 4381, 4462, 4777, 5716, 6243, 7786, 8642, 9854, 11442, 19280}

In [8]:
from pii_detection.data_split_utils import _save_split_shards

# Partition the dataset in a single pass: essays holding at least one rare
# label versus all the others (original order is preserved in both lists).
rare_essays, non_rare_essays = [], []
for essay in data:
    if essay["document"] in rare_essay_ids:
        rare_essays.append(essay)
    else:
        non_rare_essays.append(essay)

# Only the non-rare essays are shuffled and split; every rare-label essay is
# then forced into the training split, so val/test contain none of them.
# NOTE(review): the rare essays end up appended after the shuffled training
# essays — confirm that downstream training does not depend on order.
train, val, test = shuffle_and_split(non_rare_essays)
train.extend(rare_essays)

assert N == len(train) + len(val) + len(test)
# Persist the new split to ../data.
_save_split_shards(train, val, test, save_dir="../data")


# Label counts for the new split, displayed as the cell's output.
get_labels_count(train, val, test)

Unnamed: 0,train,val,test
O,3493006,997734.0,499054.0
B-NAME_STUDENT,978,276.0,111.0
I-NAME_STUDENT,740,258.0,98.0
B-URL_PERSONAL,84,17.0,9.0
B-ID_NUM,50,24.0,4.0
B-EMAIL,28,10.0,1.0
I-STREET_ADDRESS,20,0.0,0.0
I-PHONE_NUM,15,0.0,0.0
B-USERNAME,6,0.0,0.0
B-PHONE_NUM,6,0.0,0.0
