### DEPENDENCIES

In [17]:
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

from datasets import load_dataset
from datasets import Dataset
import pandas as pd

### IMPORT THE DATASET

In [3]:
# Load the ai4privacy PII-masking dataset by its Hugging Face Hub id.
# The repr printed in the next cell shows a DatasetDict with "train" and
# "validation" splits.
ds_raw = load_dataset("ai4privacy/pii-masking-400k")

In [4]:
ds_raw

DatasetDict({
    train: Dataset({
        features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 325517
    })
    validation: Dataset({
        features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 81379
    })
})

In [5]:
ds_raw["train"][0]

{'source_text': '<p>My child faozzsd379223 (DOB: May/58) will undergo treatment with Dr. faozzsd379223, office at Hill Road. Our ZIP code is 28170-6392. Consult policy M.UE.227995. Contact number: 0070.606.322.6244. Handle transactions with 6225427220412963. Queries? Email: faozzsd379223@outlook.com.</p>',
 'locale': 'US',
 'language': 'en',
 'split': 'train',
 'privacy_mask': [{'label': 'USERNAME',
   'start': 12,
   'end': 25,
   'value': 'faozzsd379223',
   'label_index': 2},
  {'label': 'DATEOFBIRTH',
   'start': 32,
   'end': 38,
   'value': 'May/58',
   'label_index': 1},
  {'label': 'USERNAME',
   'start': 72,
   'end': 85,
   'value': 'faozzsd379223',
   'label_index': 1},
  {'label': 'STREET',
   'start': 97,
   'end': 106,
   'value': 'Hill Road',
   'label_index': 1},
  {'label': 'ZIPCODE',
   'start': 124,
   'end': 134,
   'value': '28170-6392',
   'label_index': 1},
  {'label': 'TELEPHONENUM',
   'start': 180,
   'end': 197,
   'value': '0070.606.322.6244',
   'label_inde

### EXTRACTING ENGLISH DATA POINTS 

In [6]:
ds_t = ds_raw["train"]
ds_v = ds_raw["validation"]

cores = multiprocessing.cpu_count()


def is_english(language):
    """Return True when a row's ``language`` value is exactly ``"en"``."""
    return language == "en"


# Keep only the English rows of each split. ``input_columns`` hands the
# predicate just the "language" value instead of the whole row, and the same
# named predicate is shared by both splits. Each call already spreads its work
# over ``cores`` worker processes via ``num_proc``, so the two splits do not
# need to be run concurrently on top of that.
ds_t = ds_t.filter(is_english, input_columns="language", num_proc=cores)
ds_v = ds_v.filter(is_english, input_columns="language", num_proc=cores)

### VIEWING AVAILABLE CLASSES

In [7]:
# Collect every distinct token label in the English training split, in order
# of first appearance. Only the label column is selected for iteration, and
# dict keys give order-preserving de-duplication without scanning a list for
# every single token.
ner_classes = list(
    dict.fromkeys(
        token_class
        for row in ds_t.select_columns(["mbert_token_classes"])
        for token_class in row["mbert_token_classes"]
    )
)

In [8]:
ner_classes

['O',
 'B-USERNAME',
 'I-USERNAME',
 'B-DATEOFBIRTH',
 'I-DATEOFBIRTH',
 'B-STREET',
 'I-STREET',
 'B-ZIPCODE',
 'I-ZIPCODE',
 'B-TELEPHONENUM',
 'I-TELEPHONENUM',
 'B-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'B-EMAIL',
 'I-EMAIL',
 'B-CITY',
 'I-CITY',
 'B-BUILDINGNUM',
 'B-GIVENNAME',
 'I-GIVENNAME',
 'B-SURNAME',
 'I-SURNAME',
 'I-BUILDINGNUM',
 'B-IDCARDNUM',
 'I-IDCARDNUM',
 'B-PASSWORD',
 'I-PASSWORD',
 'B-DRIVERLICENSENUM',
 'I-DRIVERLICENSENUM',
 'B-SOCIALNUM',
 'I-SOCIALNUM',
 'B-ACCOUNTNUM',
 'I-ACCOUNTNUM',
 'B-TAXNUM',
 'I-TAXNUM']

In [9]:
ds_t

Dataset({
    features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
    num_rows: 68275
})

### DATA PROCESSING

In [10]:
# Drop the locale / language / split / uid columns, then expose the raw text
# column under the name "tokens". The two calls are chained and the resulting
# dataset is bound back to ds_t.
ds_t = (
    ds_t
    .remove_columns(["locale", "language", "split", "uid"])
    .rename_column("source_text", "tokens")
)

In [11]:
ds_t

Dataset({
    features: ['tokens', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
    num_rows: 68275
})

In [12]:
def add_special_tokens(example):
    """Wrap an example's ``mbert_tokens`` list in ``[CLS]`` ... ``[SEP]``.

    The list stored under ``example["mbert_tokens"]`` is modified in place and
    the same ``example`` mapping is returned, so the function can be passed
    straight to ``Dataset.map``.

    Each marker is only added when it is not already present at its end of the
    list, so applying the function again (for instance by re-running the
    notebook cell) does not stack duplicate markers.

    NOTE(review): only ``mbert_tokens`` is touched. If ``mbert_token_classes``
    is meant to stay aligned one-to-one with it, the labels will be two
    entries short after this step - confirm how downstream code handles that.

    Args:
        example: mapping with a ``"mbert_tokens"`` list of token strings.

    Returns:
        The same ``example`` object, with the markers in place.
    """
    tokens = example["mbert_tokens"]
    # Guard each end separately so a partially wrapped list is completed
    # rather than wrapped twice.
    if not tokens or tokens[0] != "[CLS]":
        tokens.insert(0, "[CLS]")
    # After the insert above the list is never empty, so tokens[-1] is safe.
    if tokens[-1] != "[SEP]":
        tokens.append("[SEP]")
    return example

# Run add_special_tokens over every row of the training split and rebind ds_t
# to the dataset that map returns.
ds_t = ds_t.map(add_special_tokens)

In [13]:
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [14]:
# Checkpoint name whose tokenizer is loaded on the next line.
# NOTE(review): the dataset's precomputed columns are named mbert_*; presumably
# they were produced with a multilingual BERT tokenizer, so they may not line
# up with this checkpoint's word pieces - confirm before pairing
# mbert_token_classes with this tokenizer's output.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [15]:
tokenizer.is_fast

True

In [18]:
ds_t[0]

{'tokens': '<p>My child faozzsd379223 (DOB: May/58) will undergo treatment with Dr. faozzsd379223, office at Hill Road. Our ZIP code is 28170-6392. Consult policy M.UE.227995. Contact number: 0070.606.322.6244. Handle transactions with 6225427220412963. Queries? Email: faozzsd379223@outlook.com.</p>',
 'privacy_mask': [{'label': 'USERNAME',
   'start': 12,
   'end': 25,
   'value': 'faozzsd379223',
   'label_index': 2},
  {'label': 'DATEOFBIRTH',
   'start': 32,
   'end': 38,
   'value': 'May/58',
   'label_index': 1},
  {'label': 'USERNAME',
   'start': 72,
   'end': 85,
   'value': 'faozzsd379223',
   'label_index': 1},
  {'label': 'STREET',
   'start': 97,
   'end': 106,
   'value': 'Hill Road',
   'label_index': 1},
  {'label': 'ZIPCODE',
   'start': 124,
   'end': 134,
   'value': '28170-6392',
   'label_index': 1},
  {'label': 'TELEPHONENUM',
   'start': 180,
   'end': 197,
   'value': '0070.606.322.6244',
   'label_index': 1},
  {'label': 'CREDITCARDNUMBER',
   'start': 224,
   

In [22]:
# Tokenize the first training row. Despite its name, the "tokens" column holds
# the raw, untokenized text (it is the renamed "source_text" column).
inputs = tokenizer(ds_t[0]["tokens"])

In [24]:
inputs.tokens()

['[CLS]',
 '<',
 'p',
 '>',
 'My',
 'child',
 'f',
 '##ao',
 '##zz',
 '##s',
 '##d',
 '##37',
 '##9',
 '##22',
 '##3',
 '(',
 'D',
 '##O',
 '##B',
 ':',
 'May',
 '/',
 '58',
 ')',
 'will',
 'undergo',
 'treatment',
 'with',
 'Dr',
 '.',
 'f',
 '##ao',
 '##zz',
 '##s',
 '##d',
 '##37',
 '##9',
 '##22',
 '##3',
 ',',
 'office',
 'at',
 'Hill',
 'Road',
 '.',
 'Our',
 'ZIP',
 'code',
 'is',
 '281',
 '##70',
 '-',
 '63',
 '##9',
 '##2',
 '.',
 'Consul',
 '##t',
 'policy',
 'M',
 '.',
 'U',
 '##E',
 '.',
 '227',
 '##9',
 '##9',
 '##5',
 '.',
 'Contact',
 'number',
 ':',
 '00',
 '##70',
 '.',
 '60',
 '##6',
 '.',
 '322',
 '.',
 '62',
 '##44',
 '.',
 'Hand',
 '##le',
 'transactions',
 'with',
 '62',
 '##25',
 '##42',
 '##7',
 '##22',
 '##0',
 '##41',
 '##29',
 '##6',
 '##3',
 '.',
 'Que',
 '##ries',
 '?',
 'Em',
 '##ail',
 ':',
 'f',
 '##ao',
 '##zz',
 '##s',
 '##d',
 '##37',
 '##9',
 '##22',
 '##3',
 '@',
 'outlook',
 '.',
 'com',
 '.',
 '<',
 '/',
 'p',
 '>',
 '[SEP]']