In [None]:
import multiprocessing
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from model_checkpoint import model_checkpoint

### Utility functions

In [None]:
def print_length_correlation(data):
    """Print the lengths of the three model-input columns of one example.

    Args:
        data: Mapping with "input_ids", "attention_mask" and "labels"
            entries, each supporting ``len()``.

    One line is printed per column, in that order, so mismatched lengths
    are easy to spot.
    """
    for column in ("input_ids", "attention_mask", "labels"):
        print(f"{column} -", len(data[column]))

### Data preprocessing

In [None]:
# Worker count for the filter step: one process per CPU core reported by the OS.
cores = multiprocessing.cpu_count()

# Load the dataset, keep only rows whose "language" is "en", then drop the
# metadata columns that are not used later in this notebook.
PII = (
    load_dataset("ai4privacy/pii-masking-400k")
    .filter(lambda example: example["language"] == "en", num_proc=cores)
    .remove_columns(["locale", "language", "split", "uid"])
)

In [None]:
# Inspect the dataset object after filtering and column removal.
PII

### Exploratory data analysis

In [None]:
def count_classes(rows=None):
    """Count how often each privacy-mask label occurs in a collection of rows.

    Args:
        rows: Iterable of rows; each row must have a "privacy_mask" entry that
            is an iterable of mappings with a "label" key. When omitted, the
            rows of ``PII["train"]`` are used (looked up at call time), which
            matches the original zero-argument behaviour.

    Returns:
        dict mapping each label to its number of occurrences, with keys in
        first-seen order.
    """
    if rows is None:
        rows = PII["train"]

    classes = {}
    for row in rows:
        for entry in row["privacy_mask"]:
            label = entry["label"]
            classes[label] = classes.get(label, 0) + 1

    return classes

In [None]:
def show_class_distribution(classes):
    """Draw a bar chart with one bar per class.

    Args:
        classes: Mapping of class name -> count. Keys become the x positions
            and values the bar heights.

    The figure is 35x10 so that many class names fit along the x axis.
    """
    names, heights = classes.keys(), classes.values()

    plt.figure(figsize=(35, 10))
    plt.bar(names, heights, color="black")

    plt.title('Class Distribution')
    plt.xlabel('Classes')
    plt.ylabel('Counts')

    plt.show()

In [None]:
# Plot label frequencies counted over PII["train"] before the subsampling below.
show_class_distribution(count_classes())

In [None]:
# Shuffle with a fixed seed so the subsets taken below are reproducible.
# shuffle() is called on the whole PII object, not on a single split.
PII = PII.shuffle(seed=33)

# Keep the first 40% of the train rows and the first 70% of the validation
# rows (per the original note: approx. 27000 training and approx. 12000
# validation data points).
# NOTE: re-running this cell shrinks the splits again, because PII is
# overwritten from itself.
train_split, validation_split = PII["train"], PII["validation"]
select_train = int(len(train_split) * 0.40)
select_validation = int(len(validation_split) * 0.70)

PII["train"] = train_split.select(range(select_train))
PII["validation"] = validation_split.select(range(select_validation))

In [None]:
# Inspect the dataset object after shuffling and subsetting.
PII

In [None]:
# Re-plot label frequencies over the reduced PII["train"] to compare with the earlier chart.
show_class_distribution(count_classes())

In [None]:
# Load the tokenizer for the checkpoint named by `model_checkpoint`
# (imported from the local model_checkpoint module at the top of the notebook).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### Adding special tokens ([CLS] and [SEP]) to "mbert_tokens"

In [None]:
def add_special_tokens(example):
    """Wrap an example's token list in "[CLS]" ... "[SEP]".

    Args:
        example: Mapping whose "mbert_tokens" entry is a mutable list of
            token strings.

    Returns:
        The same ``example`` object; its "mbert_tokens" list is modified in
        place.
    """
    tokens = example["mbert_tokens"]
    tokens[:0] = ["[CLS]"]      # prepend
    tokens.extend(["[SEP]"])    # append
    return example

# Apply add_special_tokens to PII one example at a time (map is called without batched=True).
PII = PII.map(add_special_tokens)

In [None]:
# Spot-check: the first training example's tokens should now start with "[CLS]" and end with "[SEP]".
print(PII["train"][0]["mbert_tokens"])

### Creating "input_ids" for "mbert_tokens"

In [None]:
def create_ids(example):
    """Add an "input_ids" entry holding the vocabulary id of each token.

    Args:
        example: Mapping with an "mbert_tokens" list of token strings.

    Returns:
        The same ``example`` with "input_ids" set to the result of the
        module-level tokenizer's ``convert_tokens_to_ids`` on those tokens.
    """
    token_ids = tokenizer.convert_tokens_to_ids(example["mbert_tokens"])
    example["input_ids"] = token_ids
    return example

# Add the "input_ids" column to PII, one example at a time.
PII = PII.map(create_ids)

In [None]:
# Spot-check: print ids and tokens of the first training example for a side-by-side comparison.
print(PII["train"][0]["input_ids"])
print(PII["train"][0]["mbert_tokens"])

### Creating attention masks

In [None]:
def create_attention_masks(example):
    """Add an all-ones "attention_mask" with one entry per token.

    Args:
        example: Mapping with an "mbert_tokens" sequence.

    Returns:
        The same ``example`` with "attention_mask" set to a list of 1s whose
        length equals ``len(example["mbert_tokens"])``.
    """
    token_count = len(example["mbert_tokens"])
    example["attention_mask"] = [1 for _ in range(token_count)]
    return example

# Add the "attention_mask" column to PII, one example at a time.
PII = PII.map(create_attention_masks)

In [None]:
# Inspect the dataset object; "input_ids" and "attention_mask" were added by the cells above.
PII

### Getting available NER classes and assigning unique NER class IDs

In [None]:
# Distinct token classes of the train split, in first-seen order.
# dict.fromkeys de-duplicates while preserving insertion order.
# NOTE(review): only PII["train"] is scanned; a class that appears only in
# another split would have no id — confirm this cannot happen.
ner_classes = list(
    dict.fromkeys(
        token_class
        for row in PII["train"]
        for token_class in row["mbert_token_classes"]
    )
)

# Assign consecutive integer ids (0, 1, 2, ...) in that same order.
ner_ids = {ner_class: index for index, ner_class in enumerate(ner_classes)}

# ID keeps its original final value: the next unused id, i.e. the class count.
ID = len(ner_classes)

In [None]:
# Show the discovered classes and the class -> id mapping built above.
print(ner_classes)
print(ner_ids)

### Creating labels for NER classes

In [None]:
def create_label_list(token_classes, class_to_id=None):
    """Translate a sequence of NER class names into their integer ids.

    Args:
        token_classes: Iterable of class names.
        class_to_id: Optional mapping of class name -> id. When omitted, the
            module-level ``ner_ids`` mapping is used (looked up at call
            time), which matches the original one-argument behaviour.

    Returns:
        list with one id per entry of ``token_classes``, in the same order.

    Raises:
        KeyError: if a class name is not present in the mapping.
    """
    if class_to_id is None:
        class_to_id = ner_ids
    return [class_to_id[token_class] for token_class in token_classes]

def create_labels(example):
    """Add a "labels" entry derived from the example's token classes.

    Args:
        example: Mapping with an "mbert_token_classes" sequence of class names.

    Returns:
        The same ``example`` with "labels" set to the output of
        ``create_label_list`` for those class names.
    """
    token_classes = example["mbert_token_classes"]
    example["labels"] = create_label_list(token_classes)
    return example

# Add the "labels" column to PII, one example at a time.
PII = PII.map(create_labels)

In [None]:
# Inspect the dataset object; the "labels" column was added by the cell above.
PII

In [None]:
# Compare column lengths for the first training example. "labels" was built from
# "mbert_token_classes", which did not receive the [CLS]/[SEP] additions, so it is
# presumably 2 shorter than "input_ids" at this point — confirm from the output.
print_length_correlation(PII["train"][0])

In [None]:
def add_special_labels(example):
    """Put a -100 label at both ends of an example's label list.

    Args:
        example: Mapping whose "labels" entry is a mutable list.

    Returns:
        The same ``example`` object; its "labels" list is modified in place.
    """
    # -100 is presumably the "ignore" label for the special tokens at either
    # end of the sequence — confirm against the training code.
    labels = example["labels"]
    labels[:0] = [-100]      # prepend
    labels.extend([-100])    # append
    return example

# Add the boundary -100 labels to every example in PII.
PII = PII.map(add_special_labels)

In [None]:
# Re-check column lengths for the first training example after adding the boundary labels.
print_length_correlation(PII["train"][0])

In [None]:
# find the longest input_id

def find_longest_id(data):
    """Return the length of the longest row in ``data["input_ids"]``.

    Args:
        data: Object whose "input_ids" entry is an iterable of sized rows.

    Returns:
        The maximum row length, or 0 when there are no rows.
    """
    return max((len(row) for row in data["input_ids"]), default=0)

In [None]:
# The padding target is the longest "input_ids" row across the train and
# validation splits.
longest_validation_id = find_longest_id(PII["validation"])
longest_id = max(find_longest_id(PII["train"]), longest_validation_id)

In [None]:
def pad(examples):
    """Pad a batch of examples to the notebook-wide maximum length.

    Args:
        examples: Batch passed through unchanged to ``tokenizer.pad``.

    Returns:
        Whatever ``tokenizer.pad`` returns when called with
        ``padding="max_length"`` and ``max_length=longest_id``; both
        ``tokenizer`` and ``longest_id`` are module-level names read at call
        time.
    """
    return tokenizer.pad(examples, max_length=longest_id, padding="max_length")

# Pad every split; batched=True makes map hand `pad` batches of examples rather than single rows.
PII = PII.map(pad, batched=True)

In [None]:
# Spot-check the padded "input_ids" and "attention_mask" of the first training example.
print("input_ids -", PII["train"][0]["input_ids"])
print("attention_mask -", PII["train"][0]["attention_mask"])

In [None]:
# Re-check column lengths after padding; "labels" is only padded by the next step (pad_labels).
print_length_correlation(PII["train"][0])

In [None]:
def pad_labels(example):
    """Extend "labels" with -100 until it is as long as "input_ids".

    Args:
        example: Mapping with an "input_ids" sequence and a mutable
            "labels" list.

    Returns:
        The same ``example`` object. "labels" is extended in place when it is
        shorter than "input_ids"; otherwise it is left untouched.
    """
    labels = example["labels"]
    shortfall = len(example["input_ids"]) - len(labels)
    if shortfall > 0:
        labels.extend([-100] * shortfall)
    return example

# Pad the "labels" column of every example so it matches the padded "input_ids" length.
PII = PII.map(pad_labels)

In [None]:
# Spot-check the three model-input columns of the first training example after all padding.
print("labels -", PII["train"][0]["labels"])
print("input_ids -", PII["train"][0]["input_ids"])
print("attention_mask -", PII["train"][0]["attention_mask"])

In [None]:
# Final length check for the first training example after labels have been padded.
print_length_correlation(PII["train"][0])

In [None]:
# label2id is another name for the same dict object as ner_ids (no copy is made),
# so a later mutation through either name is visible through both.
label2id = ner_ids
print(label2id)

In [None]:
# Invert the class -> id mapping into id -> class. Built from the stored ids
# themselves rather than from enumeration order, so it stays correct even if
# the ids in ner_ids are ever not 0..n-1 in insertion order.
id2label = {class_id: ner_class for ner_class, class_id in ner_ids.items()}

print(id2label)

In [None]:
# Save the tokenizer under ./preprocessing_tokenizer (path is relative to the working directory).
tokenizer.save_pretrained("./preprocessing_tokenizer")

In [None]:
# Persist the fully preprocessed dataset under ./dataset (path is relative to the working directory).
PII.save_to_disk("./dataset")