### DEPENDENCIES

In [81]:
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

from datasets import load_dataset
from datasets import Dataset
import pandas as pd

### Pandas configuration

In [82]:
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

### Data preprocessing

In [84]:
# loading the dataset
PII = load_dataset("ai4privacy/pii-masking-400k")

# splitting the dataset into train and validation
#ds_t = ds_raw["train"]
#ds_v = ds_raw["validation"]

# extracting English data points
cores = multiprocessing.cpu_count()

# how to run them concurrently?
PII = PII.filter(lambda example: example["language"] == "en", num_proc = cores)

# remove unnecessary columns
PII = PII.remove_columns(["locale", "language", "split", "uid"])

Filter (num_proc=6): 100%|██████████| 325517/325517 [00:06<00:00, 47294.24 examples/s]
Filter (num_proc=6): 100%|██████████| 81379/81379 [00:01<00:00, 43256.13 examples/s]


In [85]:
PII

DatasetDict({
    train: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 68275
    })
    validation: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 17046
    })
})

In [87]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

### Adding special labels(for [CLS] and [SEP])

In [88]:
def add_special_tokens(example):
    example["mbert_tokens"].insert(0, "[CLS]")
    example["mbert_tokens"].append("[SEP]")
    return example

PII = PII.map(add_special_tokens)

Map: 100%|██████████| 68275/68275 [00:10<00:00, 6352.42 examples/s]
Map: 100%|██████████| 17046/17046 [00:02<00:00, 6378.34 examples/s]


In [89]:
def create_ids(example):
    example["input_ids"] = tokenizer.convert_tokens_to_ids(example["mbert_tokens"])
    return example

PII = PII.map(create_ids)

Map: 100%|██████████| 68275/68275 [00:11<00:00, 5842.32 examples/s]
Map: 100%|██████████| 17046/17046 [00:02<00:00, 6016.02 examples/s]


In [90]:
PII

DatasetDict({
    train: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes', 'input_ids'],
        num_rows: 68275
    })
    validation: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes', 'input_ids'],
        num_rows: 17046
    })
})

In [91]:
def create_attention_masks(example):
    example["attention_mask"] = [1] * (len(example["mbert_tokens"]) - 1)
    return example

PII = PII.map(create_attention_masks)

Map: 100%|██████████| 68275/68275 [00:09<00:00, 7251.74 examples/s]
Map: 100%|██████████| 17046/17046 [00:02<00:00, 7303.72 examples/s]


In [92]:
PII

DatasetDict({
    train: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes', 'input_ids', 'attention_mask'],
        num_rows: 68275
    })
    validation: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes', 'input_ids', 'attention_mask'],
        num_rows: 17046
    })
})

### Getting available NER classes and assigning unique NER labels

In [95]:
ner_classes = []
ner_ids = {}
ID = 0

# getting available NER classes
for row in PII["train"]:
   for token_class in row["mbert_token_classes"]:
      if token_class not in ner_classes:
         ner_classes.append(token_class)

# assigning unique IDs
for ner_class in ner_classes:
   ner_ids[ner_class] = None

for ner_class in ner_ids.keys():
   ner_ids[ner_class] = ID
   ID+=1

In [97]:
print(ner_classes)
print(ner_ids)

['O', 'B-USERNAME', 'I-USERNAME', 'B-DATEOFBIRTH', 'I-DATEOFBIRTH', 'B-STREET', 'I-STREET', 'B-ZIPCODE', 'I-ZIPCODE', 'B-TELEPHONENUM', 'I-TELEPHONENUM', 'B-CREDITCARDNUMBER', 'I-CREDITCARDNUMBER', 'B-EMAIL', 'I-EMAIL', 'B-CITY', 'I-CITY', 'B-BUILDINGNUM', 'B-GIVENNAME', 'I-GIVENNAME', 'B-SURNAME', 'I-SURNAME', 'I-BUILDINGNUM', 'B-IDCARDNUM', 'I-IDCARDNUM', 'B-PASSWORD', 'I-PASSWORD', 'B-DRIVERLICENSENUM', 'I-DRIVERLICENSENUM', 'B-SOCIALNUM', 'I-SOCIALNUM', 'B-ACCOUNTNUM', 'I-ACCOUNTNUM', 'B-TAXNUM', 'I-TAXNUM']
{'O': 0, 'B-USERNAME': 1, 'I-USERNAME': 2, 'B-DATEOFBIRTH': 3, 'I-DATEOFBIRTH': 4, 'B-STREET': 5, 'I-STREET': 6, 'B-ZIPCODE': 7, 'I-ZIPCODE': 8, 'B-TELEPHONENUM': 9, 'I-TELEPHONENUM': 10, 'B-CREDITCARDNUMBER': 11, 'I-CREDITCARDNUMBER': 12, 'B-EMAIL': 13, 'I-EMAIL': 14, 'B-CITY': 15, 'I-CITY': 16, 'B-BUILDINGNUM': 17, 'B-GIVENNAME': 18, 'I-GIVENNAME': 19, 'B-SURNAME': 20, 'I-SURNAME': 21, 'I-BUILDINGNUM': 22, 'B-IDCARDNUM': 23, 'I-IDCARDNUM': 24, 'B-PASSWORD': 25, 'I-PASSWORD': 

### Creating labels

In [98]:
def create_label_list(token_classes):
    labels = []
    for token_class in token_classes:
        labels.append(ner_ids[token_class])
    return labels

def create_labels(example):
    example["labels"] = create_label_list(example["mbert_token_classes"])
    return example

PII = PII.map(create_labels)

Map: 100%|██████████| 68275/68275 [00:09<00:00, 7032.78 examples/s]
Map: 100%|██████████| 17046/17046 [00:02<00:00, 7142.93 examples/s]


In [99]:
PII

DatasetDict({
    train: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 68275
    })
    validation: Dataset({
        features: ['source_text', 'privacy_mask', 'masked_text', 'mbert_tokens', 'mbert_token_classes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17046
    })
})

In [100]:
PII["train"]["labels"]

[[0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  5,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  8,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  9,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  10,
  0,
  0,
  0,
  0,
  0,
  0,
  11,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  13,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  15,
  16,
  16,
  16,
  16,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
 

In [203]:
# display how NER labels are related to tokens
def display_label_token_relation(data):
    tokens = data["mbert_tokens"]
    labels = data["labels"]
    i = 0
    d = {}
    for i in range(len(labels)):
        d[tokens[i]] = [labels[i]]
    dt = pd.DataFrame(d)
    print(dt)

In [204]:
display_label_token_relation(ds_t[0])

   <  p  >  My  child  fa  ##oz  ##zs  ##d  ##3  ##7  ##9  ##22  (  DO  ##B  \
0  0  0  0   0      0  13    14    14   14   14   14   14    14  0   0    0   

   :  May  /  58  )  will  under  ##go  treatment  with  Dr  .  ,  office  at  \
0  0    3  0   4  0     0      0     0          0     0   0  0  0       0   0   

   Hill  Road  Our  Z  ##IP  code  is  281  ##70  -  639  ##2  Con  ##sul  \
0     5     6    0  0     0     0   0    7     8  8    8    8    0      0   

   ##t  policy  M  UE  227  ##99  ##5  Contact  number  007  ##0  606  322  \
0    0       0  0   0    0     0   12        0       0    9   10   10   10   

   624  ##4  Hand  ##le  transaction  ##s  622  ##42  ##04  ##12  ##6  Que  \
0   10   10     0     0            0    0   11    12    12    12   12    0   

   ##ries  ?  Em  ##ail   @  out  ##lo  ##ok  com  
0       0  0   0      0  14   14    14    14   14  


In [205]:
print(ds_t[0]["mbert_tokens"])
print(ds_t[0]["labels"])

['<', 'p', '>', 'My', 'child', 'fa', '##oz', '##zs', '##d', '##3', '##7', '##9', '##22', '##3', '(', 'DO', '##B', ':', 'May', '/', '58', ')', 'will', 'under', '##go', 'treatment', 'with', 'Dr', '.', 'fa', '##oz', '##zs', '##d', '##3', '##7', '##9', '##22', '##3', ',', 'office', 'at', 'Hill', 'Road', '.', 'Our', 'Z', '##IP', 'code', 'is', '281', '##70', '-', '639', '##2', '.', 'Con', '##sul', '##t', 'policy', 'M', '.', 'UE', '.', '227', '##99', '##5', '.', 'Contact', 'number', ':', '007', '##0', '.', '606', '.', '322', '.', '624', '##4', '.', 'Hand', '##le', 'transaction', '##s', 'with', '622', '##5', '##42', '##7', '##22', '##04', '##12', '##9', '##6', '##3', '.', 'Que', '##ries', '?', 'Em', '##ail', ':', 'fa', '##oz', '##zs', '##d', '##3', '##7', '##9', '##22', '##3', '@', 'out', '##lo', '##ok', '.', 'com', '.', '<', '/', 'p', '>']
[0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 7, 8, 8

### Displaying how token classes are related to BERT tokens

In [207]:
def display_token_classes_relationship():
    for i in range(len(ds_t[0]["mbert_token_classes"])):
        print(f"{ds_t[0]["mbert_tokens"][i]} - {ds_t[0]["mbert_token_classes"][i]}")

display_token_classes_relationship()

< - O
p - O
> - O
My - O
child - O
fa - B-USERNAME
##oz - I-USERNAME
##zs - I-USERNAME
##d - I-USERNAME
##3 - I-USERNAME
##7 - I-USERNAME
##9 - I-USERNAME
##22 - I-USERNAME
##3 - I-USERNAME
( - O
DO - O
##B - O
: - O
May - B-DATEOFBIRTH
/ - I-DATEOFBIRTH
58 - I-DATEOFBIRTH
) - O
will - O
under - O
##go - O
treatment - O
with - O
Dr - O
. - O
fa - B-USERNAME
##oz - I-USERNAME
##zs - I-USERNAME
##d - I-USERNAME
##3 - I-USERNAME
##7 - I-USERNAME
##9 - I-USERNAME
##22 - I-USERNAME
##3 - I-USERNAME
, - O
office - O
at - O
Hill - B-STREET
Road - I-STREET
. - O
Our - O
Z - O
##IP - O
code - O
is - O
281 - B-ZIPCODE
##70 - I-ZIPCODE
- - I-ZIPCODE
639 - I-ZIPCODE
##2 - I-ZIPCODE
. - O
Con - O
##sul - O
##t - O
policy - O
M - O
. - O
UE - O
. - O
227 - O
##99 - O
##5 - O
. - O
Contact - O
number - O
: - O
007 - B-TELEPHONENUM
##0 - I-TELEPHONENUM
. - I-TELEPHONENUM
606 - I-TELEPHONENUM
. - I-TELEPHONENUM
322 - I-TELEPHONENUM
. - I-TELEPHONENUM
624 - I-TELEPHONENUM
##4 - I-TELEPHONENUM
. - O
Hand

In [209]:
def add_special_labels(example):
    example["labels"].insert(0, -100)
    example["labels"].append(-100)
    return example

ds_t = ds_t.map(add_special_labels)

In [210]:
display_label_token_relation(ds_t[0])

   [CLS]  <  p  >  My  child  fa  ##oz  ##zs  ##d  ##3  ##7  ##9  ##22  (  DO  \
0   -100  0  0  0   0      0  13    14    14   14   14   14   14    14  0   0   

   ##B  :  May  /  58  )  will  under  ##go  treatment  with  Dr  .  ,  \
0    0  0    3  0   4  0     0      0     0          0     0   0  0  0   

   office  at  Hill  Road  Our  Z  ##IP  code  is  281  ##70  -  639  ##2  \
0       0   0     5     6    0  0     0     0   0    7     8  8    8    8   

   Con  ##sul  ##t  policy  M  UE  227  ##99  ##5  Contact  number  007  ##0  \
0    0      0    0       0  0   0    0     0   12        0       0    9   10   

   606  322  624  ##4  Hand  ##le  transaction  ##s  622  ##42  ##04  ##12  \
0   10   10   10   10     0     0            0    0   11    12    12    12   

   ##6  Que  ##ries  ?  Em  ##ail   @  out  ##lo  ##ok  com  [SEP]  
0   12    0       0  0   0      0  14   14    14    14   14   -100  


In [3]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer

In [4]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [219]:
def tokenize(example):
    example = tokenizer(example["source_text"])
    return example
data = ds_t.map(tokenize)

Map:  37%|███▋      | 25269/68275 [00:06<00:09, 4414.47 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 68275/68275 [00:15<00:00, 4325.49 examples/s]


In [228]:
inputs = tokenizer(ds_t[0]["source_text"])

In [233]:
inputs.word_ids(batch_index=0)

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 7,
 7,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 30,
 31,
 32,
 32,
 32,
 33,
 34,
 34,
 35,
 36,
 37,
 38,
 38,
 39,
 40,
 40,
 40,
 40,
 41,
 42,
 43,
 44,
 45,
 45,
 46,
 47,
 47,
 48,
 49,
 50,
 51,
 51,
 52,
 53,
 53,
 54,
 55,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 57,
 58,
 58,
 59,
 60,
 60,
 61,
 62,
 62,
 62,
 62,
 62,
 62,
 62,
 62,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 None]

In [187]:
print(inputs.tokens() == ds_t[0]["mbert_tokens"])

False


In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [1]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased" 
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [194]:
from transformers import TrainingArguments, Trainer

In [6]:
text = "hello bro"
encoded_input = tokenizer(text, return_tensors="pt")

In [7]:
encoded_input

{'input_ids': tensor([[  101, 19082,  9304,  1186,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [9]:
output = model(**encoded_input)

In [10]:
output

TokenClassifierOutput(loss=None, logits=tensor([[[-0.0092,  0.1742],
         [ 0.1868,  0.5005],
         [ 0.1918,  0.1713],
         [ 0.2635,  0.0566],
         [ 0.3261, -0.3607]]], grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)