In [77]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [78]:
dataset_name = "ai4privacy/pii-masking-300k"

In [79]:
samples = 1000

In [80]:
model_name = "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2"

In [81]:
def load_data(dataset_name):
    dataset = load_dataset(dataset_name)
    filtered_dataset = dataset.filter(lambda example: example['language'] == 'English')
    df = filtered_dataset['train'].to_pandas() 
    df.head()
    return df

In [82]:
df = load_data(dataset_name=dataset_name)

In [83]:
len(df)

29908

In [84]:
df.head(10)

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set
0,Subject: Group Messaging for Admissions Proces...,Subject: Group Messaging for Admissions Proces...,"[{'value': 'wynqvrh053', 'start': 287, 'end': ...","[[440, 453, ""USERNAME""], [430, 437, ""TIME""], [...","[Sub, ##ject, :, Group, Mess, ##aging, for, Ad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40767A,English,train
1,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...,- Meeting at [TIME]\n- [USERNAME] - Meeting at...,"[{'value': '2:33 PM', 'start': 13, 'end': 20, ...","[[74, 81, ""TIME""], [50, 60, ""USERNAME""], [40, ...","[-, Meeting, at, 2, :, 33, PM, -, N, ##23, -, ...","[O, O, O, B-TIME, I-TIME, I-TIME, I-TIME, O, O...",40767B,English,train
2,Subject: Admission Notification - Great Britai...,Subject: Admission Notification - Great Britai...,"[{'value': '5:24am', 'start': 263, 'end': 269,...","[[395, 407, ""SOCIALNUMBER""], [358, 375, ""EMAIL...","[Sub, ##ject, :, Ad, ##mission, Not, ##ificati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40768A,English,train
3,Card: KB90324ER\n Country: GB\n Building: ...,Card: [IDCARD]\n Country: [COUNTRY]\n Buil...,"[{'value': 'KB90324ER', 'start': 6, 'end': 15,...","[[390, 393, ""STATE""], [368, 378, ""CITY""], [346...","[Card, :, KB, ##90, ##32, ##4, ##ER, \, n, Cou...","[O, O, B-IDCARD, I-IDCARD, I-IDCARD, I-IDCARD,...",40768B,English,train
4,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so...","N, WA14 5RW\n Password: [PASS]\n\n...and so ...","[{'value': 'r]iD1#8', 'start': 26, 'end': 33, ...","[[336, 352, ""DATE""], [26, 33, ""PASS""]]","[N, ,, W, ##A, ##14, 5, ##R, ##W, \, n, Pass, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PASS...",40768C,English,train
5,Subject: Admission Application Attachments Con...,Subject: Admission Application Attachments Con...,"[{'value': '301025226', 'start': 311, 'end': 3...","[[311, 320, ""PASSPORT""]]","[Sub, ##ject, :, Ad, ##mission, Application, A...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40769A,English,train
6,981\n- Social Security Number: 554.575.9355\n-...,981\n- Social Security Number: [SOCIALNUMBER]\...,"[{'value': '554.575.9355', 'start': 30, 'end':...","[[263, 279, ""TEL""], [226, 249, ""EMAIL""], [206,...","[981, -, Social, Security, Number, :, 554, ., ...","[O, O, O, O, O, O, O, O, B-SOCIALNUMBER, I-SOC...",40769B,English,train
7,s carefully and inform us immediately if there...,s carefully and inform us immediately if there...,"[{'value': 'Rue des Écoles', 'start': 320, 'en...","[[342, 354, ""DATE""], [338, 340, ""TIME""], [320,...","[s, care, ##fully, and, info, ##rm, us, immedi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40769C,English,train
8,"- id_1:\n Feb 8, 1986\n iloweintögl\n 4929-...",- id_1:\n [BOD]\n [USERNAME]\n [TEL]\n Det...,"[{'value': 'Feb 8, 1986', 'start': 10, 'end': ...","[[287, 300, ""TEL""], [280, 284, ""USERNAME""], [2...","[-, id, _, 1, :, Feb, 8, ,, 1986, il, ##owe, #...","[O, O, O, O, O, O, B-BOD, I-BOD, I-BOD, I-BOD,...",40772A,English,train
9,y involved in community service initiatives.\n...,y involved in community service initiatives.\n...,"[{'value': 'Apr 29, 1973', 'start': 55, 'end':...","[[339, 355, ""TEL""], [314, 336, ""USERNAME""], [2...","[y, involved, in, community, service, initiati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-BOD,...",40772B,English,train


In [85]:
def filtered_dataset(rows, df):
    return df.head(rows)

In [86]:
df_filtered = filtered_dataset(samples, df)

In [87]:
len(df_filtered)

1000

In [88]:
predicted_sensitive_values = []
true_sensitive_values = []
all_words =[]

In [89]:
import re
def clean_text(text):
    text = text.replace('\\n', ' ')  # Replace literal '\n' with space
    text = re.sub(r'\s+', ' ', text).strip()  # Replace any extra spaces with a single space
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = text.replace("**", ' ')
    return text

In [90]:
def get_all_words(source_texts):
    for text in source_texts:
        words = []
        sentence = clean_text(text)
        words = sentence.split()
        all_words.append(words)

In [91]:
def get_true_labels(df_filtered):
    for i in range(len(df_filtered)):
        label = [item["value"] for item in df_filtered['privacy_mask'][i]]
        label = ["".join(value.split()) for value in label]
        true_sensitive_values.append(label)

In [92]:
predicted_sensitive_values = []
true_sensitive_values = []
all_words =[]

get_all_words(df_filtered['source_text'])
get_true_labels(df_filtered)

Prediction

In [93]:
def load_model(model_name):
    # Load pre-trained BERT tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    return tokenizer, model

In [94]:
tokenizer, model = load_model(model_name)

In [95]:
def run_predictions(source_texts):
    
    for text in source_texts:
        prediction = []
        ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average")
        sentence = clean_text(text)

        results = ner_pipeline(sentence)

        # Print results
        for entity in results:
            # print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")
            prediction.append(entity['word'])
        
        predicted_sensitive_values.append(prediction)

        

In [96]:
predicted_sensitive_values = []
run_predictions(df_filtered['source_text'])

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
D

In [97]:
def get_measures(true_sensitive_values, predicted_sensitive_values, all_words):
    metrics_mapping = []
    for true_list, pred_list, all_word_list in zip(true_sensitive_values, predicted_sensitive_values, all_words):
        tp = 0
        tn=0
        fp = 0 
        fn =0
        for true_value in true_list:
            # if true_value in pred_list:
            if any(true_value in pred_value for pred_value in pred_list):
                tp += 1  # Correctly identified (True Positive)
            else:
                print(f"Missed entity (False Negative): {true_value}")
                fn += 1  # Missed entity

        # Check for False Positives
        for predicted_value in pred_list:
            # if predicted_value not in true_list:
            if not any(true_value in predicted_value for true_value in true_list):
                print(f"Incorrectly identified (False Positive): {predicted_value}")
                fp += 1  # Incorrectly identified

        tn = len(all_word_list) - (tp + fp + fn)
        metrics_mapping.append({"tp":tp, "fp":fp, "fn":fn, "tn":tn})
    return metrics_mapping


In [98]:
metrics_mapping = get_measures(true_sensitive_values, predicted_sensitive_values, all_words)

Missed entity (False Negative): wynqvrh053
Missed entity (False Negative): 10:20am
Missed entity (False Negative): pdmjrsyoz1460
Incorrectly identified (False Positive): 
Incorrectly identified (False Positive): -Meeting
Missed entity (False Negative): N23
Missed entity (False Negative): 11:29pm
Incorrectly identified (False Positive): GreatBritain
Incorrectly identified (False Positive): GreatBritain
Missed entity (False Negative): GB
Missed entity (False Negative): ENG
Missed entity (False Negative): BS347HU,BS347HZ
Missed entity (False Negative): ENG
Incorrectly identified (False Positive): BS347HU,
Incorrectly identified (False Positive): BS347HZ
Incorrectly identified (False Positive): WA145R
Incorrectly identified (False Positive): WA145RW
Incorrectly identified (False Positive): GreatBritain
Incorrectly identified (False Positive): Great
Incorrectly identified (False Positive): Britain
Incorrectly identified (False Positive): ROSAL
Incorrectly identified (False Positive): 955306

In [99]:
def evaluate_model(metrics_mapping):
    # Initialize totals
    total_tp = 0
    total_fp = 0
    total_fn = 0
    total_tn = 0

    # Sum up the tp, fp, fn, tn for all sentences
    for metrics in metrics_mapping:
        total_tp += metrics["tp"]
        total_fp += metrics["fp"]
        total_fn += metrics["fn"]
        total_tn += metrics["tn"]

    # Compute the metrics
    accuracy = (total_tp + total_tn) / (total_tp + total_fp + total_fn + total_tn)
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Print the metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")


In [100]:
evaluate_model(metrics_mapping)

Accuracy: 0.9278
Precision: 0.7608
Recall: 0.5916
F1 Score: 0.6656


In [101]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="average")
sentence = clean_text(df_filtered['source_text'][9])

results = ner_pipeline(sentence)

# Print results
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")

Device set to use cpu


Entity: Apr29,1973, Label: DOB, Score: 0.9389
Entity: 29summikota, Label: DATE, Score: 0.4776
Entity: +51-063-367.7939, Label: PHONENUMBER, Score: 0.9291
Entity: May26,1993, Label: DOB, Score: 0.8120
Entity: npvhxlrgvjdhzjaf439498, Label: PASSWORD, Score: 0.8108
Entity: +20-914.834.1296, Label: PHONENUMBER, Score: 0.7936


In [102]:
true_sensitive_values[9]

['Apr29,1973',
 '29summikota',
 '+51-063-367.7939',
 'May26,1993',
 'npvhxlrgvjdhzjaf439498',
 '+20-914.834.1296']

Samples = 10 

Accuracy: 0.9495, 
Precision: 0.7647,
Recall: 0.8125,
F1 Score: 0.7879


Samples = 100

Accuracy: 0.9329,
Precision: 0.7618,
Recall: 0.6232,
F1 Score: 0.6856


Samples = 1000

Accuracy: 0.9278,
Precision: 0.7608,
Recall: 0.5916,
F1 Score: 0.6656