## Data Preview

In [1]:
import pandas as pd

# Load the training set into a DataFrame; every row is one essay.
train_json_path = '/kaggle/input/pii-detection-removal-from-educational-data/train.json'
df = pd.read_json(train_json_path, orient="records")

print(f"There are {len(df)} essays in the train dataset")
# Preview the first five essays.
df.head(n=5)

There are 6807 essays in the train dataset


Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


All essays have a **unique** document id:

In [2]:
# True when no document id occurs more than once.
not df["document"].duplicated().any()

True

In [3]:
# Index the essays by their document id.
# Rebinding `df` (rather than mutating it with `inplace=True`) and guarding on
# the index name keeps this cell safe to re-run: once `document` has become the
# index it is no longer a column, so a second `set_index` would raise KeyError.
# `verify_integrity=True` makes pandas raise if any document id is duplicated.
if df.index.name != "document":
    df = df.set_index("document", verify_integrity=True)

## Stratified Sampling

In [4]:
# Per-essay lists of token labels, keyed by the frame's index.
labels = df.loc[:, "labels"]

labels

document
7        [O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...
10       [B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...
16       [O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...
20       [O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...
56       [O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...
                               ...                        
22678    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
22679    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
22681    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
22684    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
22687    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
Name: labels, Length: 6807, dtype: object

#### Some labels are less prevalent than others:

In [5]:
# Flatten the per-essay label lists to one row per token label,
# then tally how often each label occurs across the whole dataset.
(
    labels
    .explode()
    .value_counts()
    .to_frame()
)

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
O,4989794
B-NAME_STUDENT,1365
I-NAME_STUDENT,1096
B-URL_PERSONAL,110
B-ID_NUM,78
B-EMAIL,39
I-STREET_ADDRESS,20
I-PHONE_NUM,15
B-USERNAME,6
B-PHONE_NUM,6


#### Some documents contain multiple PII labels. How many **unique** documents contain each label?

In [6]:
# For every label, count the distinct documents in which it occurs,
# ordered from the most widespread label down to the rarest.
labels_uniqueness = (
    labels
    .explode()                    # one row per (document, token label)
    .reset_index()                # bring `document` back as a column
    .groupby("labels")[["document"]]
    .nunique()
    .rename(columns={"document": "n_unique_documents"})
    .sort_values(by="n_unique_documents", ascending=False)
)

labels_uniqueness

Unnamed: 0_level_0,n_unique_documents
labels,Unnamed: 1_level_1
O,6807
B-NAME_STUDENT,891
I-NAME_STUDENT,814
B-URL_PERSONAL,72
B-ID_NUM,33
B-EMAIL,24
B-USERNAME,5
B-PHONE_NUM,4
I-PHONE_NUM,3
B-STREET_ADDRESS,2


In [7]:
RARE_LABELS_THRESHOLD = 2

# Labels that occur in fewer than RARE_LABELS_THRESHOLD distinct documents.
RARE_LABELS = (
    labels_uniqueness
    .loc[labels_uniqueness["n_unique_documents"] < RARE_LABELS_THRESHOLD]
    .index
    .tolist()
)

RARE_LABELS

['I-ID_NUM', 'I-URL_PERSONAL']

In [8]:
def _has_rare_label(essay_labels):
    """Return True when the essay's labels include at least one entry of RARE_LABELS."""
    return not set(RARE_LABELS).isdisjoint(essay_labels)


# Boolean mask over essays: True where the essay holds a rare label.
essays_with_rare = df["labels"].apply(_has_rare_label)

# Split the essays into those with and those without a rare label.
df_rare = df[essays_with_rare]
df_nonrare = df[~essays_with_rare]

print(f"There are {len(df_rare)} essays with rare labels, and {len(df_nonrare)} essays with non rare labels.")

There are 2 essays with rare labels, and 6805 essays with non rare labels.


In [9]:
# Map each label to its position in `labels_uniqueness`: 0 for the first row,
# increasing by one per row (the frame is sorted by descending document count,
# so a larger rank means the label is found in fewer documents).
labels_uniqueness_map = pd.Series(
    range(len(labels_uniqueness)),
    index=labels_uniqueness.index,
    name="uniqueness",
)

labels_uniqueness_map

labels
O                    0
B-NAME_STUDENT       1
I-NAME_STUDENT       2
B-URL_PERSONAL       3
B-ID_NUM             4
B-EMAIL              5
B-USERNAME           6
B-PHONE_NUM          7
I-PHONE_NUM          8
B-STREET_ADDRESS     9
I-STREET_ADDRESS    10
I-ID_NUM            11
I-URL_PERSONAL      12
Name: uniqueness, dtype: int64

Label each essay with its rarest label (the label found in the fewest documents); this value is the essay's stratification category:

In [10]:
def _highest_uniqueness_rank(essay_labels):
    """Return the largest `labels_uniqueness_map` rank among the essay's distinct labels."""
    ranks = [labels_uniqueness_map.loc[label] for label in set(essay_labels)]
    return max(ranks)


# One category per non-rare essay: the rank of its highest-ranked label.
stratified_sampling_categories = df_nonrare["labels"].apply(_highest_uniqueness_rank)

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the non-rare essays for validation, stratified by each
# essay's sampling category; the fixed seed makes the split repeatable.
df_train, df_val = train_test_split(
    df_nonrare,
    test_size=0.2,
    random_state=123,
    stratify=stratified_sampling_categories,
)

# Essays carrying rare labels were excluded from the split; append them all to train.
df_train = pd.concat((df_train, df_rare), axis=0)

#### Label counts in each split after stratified sampling:

In [12]:
# Token-level label counts for each split, shown in `labels_uniqueness_map` order.
# A label absent from one split comes out of the alignment as NaN, which turns
# that column into floats. After filling with 0, cast *both* columns back to
# int (previously only `val` was cast, so `train` would have stayed float if a
# label were ever missing from it).
(
    pd.DataFrame(
        {
            "train": df_train["labels"].explode().value_counts(),
            "val": df_val["labels"].explode().value_counts(),
        }
    )
    .fillna(0)
    .astype(int)
    .loc[labels_uniqueness_map.index]
)

Unnamed: 0_level_0,train,val
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
O,4008002,981792
B-NAME_STUDENT,1085,280
I-NAME_STUDENT,869,227
B-URL_PERSONAL,90,20
B-ID_NUM,61,17
B-EMAIL,29,10
B-USERNAME,4,2
B-PHONE_NUM,5,1
I-PHONE_NUM,12,3
B-STREET_ADDRESS,2,0


## Save

In [13]:
from pathlib import Path

# Create the output directory at the exact location written to below. The
# previous `!mkdir -p data_split` was relative to the current working
# directory, which is not guaranteed to be /kaggle/working.
output_dir = Path("/kaggle/working/data_split")
output_dir.mkdir(parents=True, exist_ok=True)

# `orient="records"` does not serialise the index, and `document` is the index
# of both splits, so restore it as a regular column first. Otherwise the saved
# files lose the essay ids.
df_train.reset_index().to_json(output_dir / "train.json", orient="records")
df_val.reset_index().to_json(output_dir / "val.json", orient="records")