In [3]:
from datasets import load_dataset
from transformers import AutoConfig
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


#### Load the dataset 

In [41]:
dataset_name = "ai4privacy/pii-masking-300k"


# ai4privacy/pii-masking-200k

In [42]:
samples = 10

In [6]:
model_name = "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2"

In [53]:
def load_data(dataset_name):
    dataset = load_dataset(dataset_name)
    filtered_dataset = dataset.filter(lambda example: example['language'] == 'English')
    df = filtered_dataset['train'].to_pandas() 
    df.head()
    return df

In [54]:
df = load_data(dataset_name=dataset_name)

Filter: 100%|██████████| 177677/177677 [01:00<00:00, 2928.59 examples/s]
Filter: 100%|██████████| 47728/47728 [00:16<00:00, 2959.57 examples/s]


In [55]:
df.head(10)

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set
0,Subject: Group Messaging for Admissions Proces...,Subject: Group Messaging for Admissions Proces...,"[{'value': 'wynqvrh053', 'start': 287, 'end': ...","[[440, 453, ""USERNAME""], [430, 437, ""TIME""], [...","[Sub, ##ject, :, Group, Mess, ##aging, for, Ad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40767A,English,train
1,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...,- Meeting at [TIME]\n- [USERNAME] - Meeting at...,"[{'value': '2:33 PM', 'start': 13, 'end': 20, ...","[[74, 81, ""TIME""], [50, 60, ""USERNAME""], [40, ...","[-, Meeting, at, 2, :, 33, PM, -, N, ##23, -, ...","[O, O, O, B-TIME, I-TIME, I-TIME, I-TIME, O, O...",40767B,English,train
2,Subject: Admission Notification - Great Britai...,Subject: Admission Notification - Great Britai...,"[{'value': '5:24am', 'start': 263, 'end': 269,...","[[395, 407, ""SOCIALNUMBER""], [358, 375, ""EMAIL...","[Sub, ##ject, :, Ad, ##mission, Not, ##ificati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40768A,English,train
3,Card: KB90324ER\n Country: GB\n Building: ...,Card: [IDCARD]\n Country: [COUNTRY]\n Buil...,"[{'value': 'KB90324ER', 'start': 6, 'end': 15,...","[[390, 393, ""STATE""], [368, 378, ""CITY""], [346...","[Card, :, KB, ##90, ##32, ##4, ##ER, \, n, Cou...","[O, O, B-IDCARD, I-IDCARD, I-IDCARD, I-IDCARD,...",40768B,English,train
4,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so...","N, WA14 5RW\n Password: [PASS]\n\n...and so ...","[{'value': 'r]iD1#8', 'start': 26, 'end': 33, ...","[[336, 352, ""DATE""], [26, 33, ""PASS""]]","[N, ,, W, ##A, ##14, 5, ##R, ##W, \, n, Pass, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PASS...",40768C,English,train
5,Subject: Admission Application Attachments Con...,Subject: Admission Application Attachments Con...,"[{'value': '301025226', 'start': 311, 'end': 3...","[[311, 320, ""PASSPORT""]]","[Sub, ##ject, :, Ad, ##mission, Application, A...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40769A,English,train
6,981\n- Social Security Number: 554.575.9355\n-...,981\n- Social Security Number: [SOCIALNUMBER]\...,"[{'value': '554.575.9355', 'start': 30, 'end':...","[[263, 279, ""TEL""], [226, 249, ""EMAIL""], [206,...","[981, -, Social, Security, Number, :, 554, ., ...","[O, O, O, O, O, O, O, O, B-SOCIALNUMBER, I-SOC...",40769B,English,train
7,s carefully and inform us immediately if there...,s carefully and inform us immediately if there...,"[{'value': 'Rue des Écoles', 'start': 320, 'en...","[[342, 354, ""DATE""], [338, 340, ""TIME""], [320,...","[s, care, ##fully, and, info, ##rm, us, immedi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40769C,English,train
8,"- id_1:\n Feb 8, 1986\n iloweintögl\n 4929-...",- id_1:\n [BOD]\n [USERNAME]\n [TEL]\n Det...,"[{'value': 'Feb 8, 1986', 'start': 10, 'end': ...","[[287, 300, ""TEL""], [280, 284, ""USERNAME""], [2...","[-, id, _, 1, :, Feb, 8, ,, 1986, il, ##owe, #...","[O, O, O, O, O, O, B-BOD, I-BOD, I-BOD, I-BOD,...",40772A,English,train
9,y involved in community service initiatives.\n...,y involved in community service initiatives.\n...,"[{'value': 'Apr 29, 1973', 'start': 55, 'end':...","[[339, 355, ""TEL""], [314, 336, ""USERNAME""], [2...","[y, involved, in, community, service, initiati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-BOD,...",40772B,English,train


In [56]:
def filtered_dataset(rows, df):
    return df.head(rows)

In [57]:
df_filtered = filtered_dataset(samples, df)

In [58]:
df_filtered.head(5)

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set
0,Subject: Group Messaging for Admissions Proces...,Subject: Group Messaging for Admissions Proces...,"[{'value': 'wynqvrh053', 'start': 287, 'end': ...","[[440, 453, ""USERNAME""], [430, 437, ""TIME""], [...","[Sub, ##ject, :, Group, Mess, ##aging, for, Ad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40767A,English,train
1,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...,- Meeting at [TIME]\n- [USERNAME] - Meeting at...,"[{'value': '2:33 PM', 'start': 13, 'end': 20, ...","[[74, 81, ""TIME""], [50, 60, ""USERNAME""], [40, ...","[-, Meeting, at, 2, :, 33, PM, -, N, ##23, -, ...","[O, O, O, B-TIME, I-TIME, I-TIME, I-TIME, O, O...",40767B,English,train
2,Subject: Admission Notification - Great Britai...,Subject: Admission Notification - Great Britai...,"[{'value': '5:24am', 'start': 263, 'end': 269,...","[[395, 407, ""SOCIALNUMBER""], [358, 375, ""EMAIL...","[Sub, ##ject, :, Ad, ##mission, Not, ##ificati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40768A,English,train
3,Card: KB90324ER\n Country: GB\n Building: ...,Card: [IDCARD]\n Country: [COUNTRY]\n Buil...,"[{'value': 'KB90324ER', 'start': 6, 'end': 15,...","[[390, 393, ""STATE""], [368, 378, ""CITY""], [346...","[Card, :, KB, ##90, ##32, ##4, ##ER, \, n, Cou...","[O, O, B-IDCARD, I-IDCARD, I-IDCARD, I-IDCARD,...",40768B,English,train
4,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so...","N, WA14 5RW\n Password: [PASS]\n\n...and so ...","[{'value': 'r]iD1#8', 'start': 26, 'end': 33, ...","[[336, 352, ""DATE""], [26, 33, ""PASS""]]","[N, ,, W, ##A, ##14, 5, ##R, ##W, \, n, Pass, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PASS...",40768C,English,train


getting the label list the model is trained on 

In [47]:
config = AutoConfig.from_pretrained(model_name)
label_list = list(config.id2label.values())
print(label_list)

['O', 'B-CITY', 'I-CITY', 'B-FIRSTNAME', 'I-FIRSTNAME', 'B-USERNAME', 'I-USERNAME', 'B-JOBTYPE', 'B-PREFIX', 'I-PREFIX', 'B-LASTNAME', 'B-EMAIL', 'I-EMAIL', 'B-NEARBYGPSCOORDINATE', 'I-NEARBYGPSCOORDINATE', 'B-ACCOUNTNUMBER', 'I-ACCOUNTNUMBER', 'B-ACCOUNTNAME', 'I-ACCOUNTNAME', 'B-MIDDLENAME', 'I-MIDDLENAME', 'B-COUNTY', 'I-COUNTY', 'B-AGE', 'B-CREDITCARDCVV', 'B-DOB', 'I-DOB', 'B-MASKEDNUMBER', 'I-MASKEDNUMBER', 'B-PASSWORD', 'I-PASSWORD', 'B-SEX', 'B-STATE', 'B-COMPANYNAME', 'I-COMPANYNAME', 'B-PHONEIMEI', 'I-PHONEIMEI', 'B-STREET', 'I-STREET', 'B-SSN', 'I-SSN', 'B-IPV4', 'I-IPV4', 'B-USERAGENT', 'I-USERAGENT', 'B-MAC', 'I-MAC', 'B-PIN', 'I-PIN', 'B-IP', 'I-IP', 'B-URL', 'I-URL', 'B-CURRENCYSYMBOL', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-VEHICLEVRM', 'I-VEHICLEVRM', 'I-AMOUNT', 'B-ETHEREUMADDRESS', 'I-ETHEREUMADDRESS', 'B-BITCOINADDRESS', 'I-BITCOINADDRESS', 'B-LITECOINADDRESS', 'I-LITECOINADDRESS', 'I-JOBTYPE', 'B-CREDITCARDNUMBER', 'I-CREDITCARDNUMBER', 'B-IPV6', 'I-IPV6', 'I-L

#### Load the model and tokenizer

In [13]:
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [59]:
df_filtered['source_text']  #viewing the source text to test 

0    Subject: Group Messaging for Admissions Proces...
1    - Meeting at 2:33 PM\n- N23 - Meeting at 11:29...
2    Subject: Admission Notification - Great Britai...
3    Card: KB90324ER\n   Country: GB\n   Building: ...
4    N, WA14 5RW\n   Password: r]iD1#8\n\n...and so...
5    Subject: Admission Application Attachments Con...
6    981\n- Social Security Number: 554.575.9355\n-...
7    s carefully and inform us immediately if there...
8    - id_1:\n  Feb 8, 1986\n  iloweintögl\n  4929-...
9    y involved in community service initiatives.\n...
Name: source_text, dtype: object

In [60]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [61]:
#get the tokens of the test data sentence, since it is present in the dataset 
test_texts = [list(arr) for arr in df_filtered['mbert_text_tokens'].tolist()]


In [62]:
type(test_texts)

list

In [63]:
type(test_texts[0])

list

In [64]:
true_labels = df_filtered['mbert_bio_labels'].tolist() #ground truth of label list for the filtered dataset

In [65]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'O', 'B-USERNAME',
       'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-USERNAME',
       'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME',
       'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'O', 'B-TIME', 'I-TIME',
       'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME',
       'I-TIME', 'I-TIME', 'O', 'B-USERNAME', 'I-USERNAME'], dtype=object)

In [66]:
type(true_labels[0])

numpy.ndarray

In [22]:

encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
    is_split_into_words=True, 
    return_offsets_mapping=True
)

model_inputs = {key: val for key, val in encodings.items() if key != "offset_mapping"}



In [23]:
type(df_filtered['mbert_text_tokens'][1].tolist())

list

In [67]:
all_aligned_predictions = []  # Store predictions for all sentences
all_aligned_tokens = []

for i in range(len(df_filtered)):  # Loop through all sentences
    
    encodings = tokenizer(
        df_filtered['mbert_text_tokens'][i].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
        is_split_into_words=True, 
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])  
    word_ids = encodings.word_ids()
    
    # Filter tokens to only keep the first subword of each word
    filtered_tokens = []
    filtered_word_ids = set()

    for token, word_id in zip(tokens, word_ids):
        if word_id is not None and word_id not in filtered_word_ids:
            filtered_tokens.append(token)
            filtered_word_ids.add(word_id)

    model_inputs = {key: val for key, val in encodings.items() if key != "offset_mapping"}

    # Get model predictions
    with torch.no_grad():
        outputs = model(**model_inputs)

    logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted class indices

    # Map predictions to original tokens
    aligned_predictions = []
    aligned_tokens = []
    previous_word_id = None

    # Align predictions with words
    for token, word_id, pred in zip(tokens, word_ids, predictions[0]):  
        if word_id is None:  
            continue  # Skip special tokens ([CLS], [SEP], etc.)

        if word_id != previous_word_id:  # Only take the first subword's prediction
            aligned_predictions.append(label_list[pred])
            aligned_tokens.append(token)

        previous_word_id = word_id 

    # Store predictions for this sentence
    all_aligned_predictions.append(aligned_predictions)
    all_aligned_tokens.append(aligned_tokens)




In [68]:
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'][i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    print(f"Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: Subject: Group Messaging for Admissions Process

Good morning, everyone,

I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:

- wynqvrh053 - Meeting at 10:20am
- luka.burg - Meeting at 21
- qahil.wittauer - Meeting at quarter past 13
- gholamhossein.ruschke - Meeting at 9:47 PM
- pdmjrsyoz1460 
Length of actual label list = 
 121
Actual Tokens ['Sub' '##ject' ':' 'Group' 'Mess' '##aging' 'for' 'Ad' '##mission' '##s'
 'Process' 'Good' 'morning' ',' 'everyone' ',' 'I' 'hope' 'this' 'message'
 'finds' 'you' 'well' '.' 'As' 'we' 'continue' 'our' 'admission' '##s'
 'processes' ',' 'I' 'would' 'like' 'to' 'update' 'you' 'on' 'the'
 'latest' 'developments' 'and' 'key' 'information' '.' 'Please' 'find'
 'below' 'the' 'time' '##line' 'for' 'our' 'upcoming' 'meetings' ':' '-'
 'w' '##yn' '##q' '##vr' '##h' '##0' '##53' '-' 'Mee

In [25]:
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'][i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    print(f"Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?
Length of actual label list = 
 48
Actual Tokens ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 'in' 'our' 'Op' '##timi' '##zation' 'curriculum' '.' 'Can'
 'you' 'pl' '##eas' '##e' 'collect' 'it' '?']

Predicted Aligned Tokens: ['▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '▁#', '▁:', '▁06', '▁-', '▁1847', '▁#', '▁-', '▁866', '▁#', '▁#', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '▁#', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '▁#', '▁collect', '▁it', '▁?']

In [26]:
# this is the code to get the predicted result for one sentence of the dataset, use this to understand how the tokens 
# made, and compare token wise the predicted labels and original labels 
index = 0
original_tokens = df_filtered['mbert_text_tokens'][index]  #ground truth of label list for the first sentence

encodings = tokenizer(
        df_filtered['mbert_text_tokens'][index].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
        is_split_into_words=True, 
        return_offsets_mapping=True
    )

# Maps subwords (which are broken down further by the tokenizer) to original words present in the token list originally
tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])
offsets = encodings["offset_mapping"]
word_ids = encodings.word_ids()

# Select only the first subword per word to match dataset tokens
filtered_tokens = []
filtered_word_ids = set()

for token, word_id in zip(tokens, word_ids):
    if word_id is not None and word_id not in filtered_word_ids:
        filtered_tokens.append(token)
        filtered_word_ids.add(word_id)

# Print result
print("Filtered Tokens:", filtered_tokens)
print("Original Tokens:", original_tokens)

# Get model predictions
with torch.no_grad():
    outputs = model(**model_inputs)

logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted class indices

# Map predictions to original tokens
aligned_predictions = []
aligned_tokens = []
previous_word_id = None

    # Align predictions with words
for token, word_id, pred in zip(tokens, word_ids, predictions[0]):  
    if word_id is None:  
        continue  # Skip special tokens ([CLS], [SEP], etc.)

    if word_id != previous_word_id:  # Only take the first subword's prediction
        aligned_predictions.append(label_list[pred])
        aligned_tokens.append(token)

    previous_word_id = word_id 

# Print final mapped predictions
print("Original Tokens:", original_tokens)
print("Predicted Labels:", aligned_predictions)

print(f"True labels list size = {len(true_labels[index])}")
print(f"True token list size = {len(df_filtered['mbert_text_tokens'][index])}")
print(f"Predicted labels list size = {len(aligned_predictions)}")

print(f"Filtered tokens = \n {filtered_tokens}")

print(f"True label = \n {true_labels[index]} \n")
print(f"Predicted label = \n {aligned_predictions} \n")

accuracy = accuracy_score(true_labels[index], aligned_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels[index], aligned_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Filtered Tokens: ['▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '▁#', '▁:', '▁06', '▁-', '▁1847', '▁#', '▁-', '▁866', '▁#', '▁#', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '▁#', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '▁#', '▁collect', '▁it', '▁?']
Original Tokens: ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 'in' 'our' 'Op' '##timi' '##zation' 'curriculum' '.' 'Can'
 'you' 'pl' '##eas' '##e' 'collect' 'it' '?']
Original Tokens: ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [181]:
len(true_labels[0])

48

In [201]:
len(aligned_predictions)

48

In [108]:
len(true_labels[0])

48

In [27]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'B-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-JOBAREA', 'I-JOBAREA', 'I-JOBAREA', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

In [30]:
len(all_aligned_predictions)

10

In [69]:
flat_predictions = [label for sentence in all_aligned_predictions for label in sentence]

In [70]:
flat_true_labels = [label for sentence in true_labels for label in sentence]

In [71]:
# Compute Accuracy, Precision, Recall, and F1-score
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(flat_true_labels, flat_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Samples: {samples}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Samples: 10
Accuracy: 0.6891
Precision: 0.7578
Recall: 0.6891
F1-score: 0.7195


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
type(true_labels)

list

In [34]:
type(all_aligned_predictions)

list

In [266]:
len(all_aligned_predictions)

10

In [267]:
all_aligned_predictions[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-JOBAREA',
 'I-JOBAREA',
 'I-JOBAREA',
 'O',
 'O',
 'O',
 'O',
 'B-FIRSTNAME',
 'I-FIRSTNAME',
 'I-FIRSTNAME',
 'O',
 'O',
 'O']

'E'

In [281]:
true_labels

'O'

In [72]:
# Compare aligned predictions with actual predictions to find the 

prediction_analysis = [] #store the count of TP, FP, FN, TN

for predictions, label_list in zip(all_aligned_predictions, true_labels):
    tp =0
    tn = 0
    fp = 0
    fn =0
    miss_classified = 0
    count = 0
    print(f"Prediction = {predictions}, \n True label = {label_list}")
    true_labels_list = label_list.tolist()
    for pred, true_label in zip(predictions, true_labels_list):
        count +=1
        print(f"Pred = {pred}, True label = {true_label}")
        if pred == true_label:
            if pred != 'O':  # Correctly predicted an entity
                tp +=1
            else:
                tn +=1 # Correctly predicted a non-entity
        else:
            if pred!='O' and true_label == 'O':
             # Model falsely predicted an entity when it should be "O"
                fp +=1
            elif pred=='O' and true_label!='O':
            # Model failed to detect an entity (missed it)
                fn +=1
            else:
            # Model predicted the wrong entity (e.g., "B-ORG" instead of "B-PER")
                tp +=1
                miss_classified += 1  # Counts as a wrongly identified entity
        print(f"tp = {tp}, fp = {fp}, tn ={tn}, fn={fn}\n")
    prediction_analysis.append({"tp":tp, "fp":fp, "fn":fn, "tn":tn, "miss_classified":miss_classified, "tokens":count })



Prediction = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-BIC', 'I-PASSWORD', 'I-PASSWORD', 'I-PASSWORD', 'I-BIC', 'O', 'I-BIC', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'O', 'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'O', 'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'O', 'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'O', 'B-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN'], 
 True label 

In [73]:
print(prediction_analysis)

[{'tp': 23, 'fp': 23, 'fn': 6, 'tn': 69, 'miss_classified': 12, 'tokens': 121}, {'tp': 10, 'fp': 5, 'fn': 7, 'tn': 80, 'miss_classified': 3, 'tokens': 102}, {'tp': 28, 'fp': 9, 'fn': 0, 'tn': 95, 'miss_classified': 10, 'tokens': 132}, {'tp': 64, 'fp': 10, 'fn': 4, 'tn': 77, 'miss_classified': 44, 'tokens': 155}, {'tp': 12, 'fp': 16, 'fn': 0, 'tn': 90, 'miss_classified': 8, 'tokens': 118}, {'tp': 2, 'fp': 5, 'fn': 1, 'tn': 76, 'miss_classified': 2, 'tokens': 84}, {'tp': 36, 'fp': 40, 'fn': 22, 'tn': 35, 'miss_classified': 25, 'tokens': 133}, {'tp': 2, 'fp': 6, 'fn': 0, 'tn': 71, 'miss_classified': 2, 'tokens': 79}, {'tp': 14, 'fp': 18, 'fn': 10, 'tn': 63, 'miss_classified': 14, 'tokens': 105}, {'tp': 24, 'fp': 18, 'fn': 11, 'tn': 60, 'miss_classified': 24, 'tokens': 113}]


In [79]:
total_tp = sum(entry['tp'] for entry in prediction_analysis)
total_fp = sum(entry['fp'] for entry in prediction_analysis)
total_fn = sum(entry['fn'] for entry in prediction_analysis)
total_tn = sum(entry['tn'] for entry in prediction_analysis)

total_missclassified = sum(entry['miss_classified'] for entry in prediction_analysis)
total_tokens = sum(entry['tokens'] for entry in prediction_analysis)

In [80]:
print(f"Misclassified count = {total_missclassified}")
print(f"Total number of tokens = {total_tokens}")

Misclassified count = 144
Total number of tokens = 1142


In [75]:
# Compute metrics
accuracy = (total_tp + total_tn) / (total_tp + total_fp + total_fn + total_tn)
precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) != 0 else 0
recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) != 0 else 0
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

# Output results
accuracy, precision, recall, f1_score

(0.8152364273204904, 0.589041095890411, 0.7789855072463768, 0.6708268330733229)

In [38]:
all_aligned_predictions[3]

['O',
 'O',
 'O',
 'B-ZIPCODE',
 'I-ZIPCODE',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'O',
 'O',
 'O',
 'O',
 'O']

In [37]:
true_labels[3]

array(['O', 'O', 'O', 'B-BUILDINGNUMBER', 'I-BUILDINGNUMBER', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'I-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'I-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'O', 'O', 'O', 'O', 'O'], dtype=object)