In [None]:
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoConfig
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

#### Load the dataset 

In [2]:
# Identifier of the dataset that is handed to `load_dataset` (via `load_data`).
dataset_name = "ai4privacy/pii-masking-200k"

In [3]:
# Number of leading rows of the loaded frame to evaluate (passed to `filtered_dataset`).
samples = 10

In [4]:
# Checkpoint id used for the config, the tokenizer and the token-classification model.
model_name = "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2"

In [5]:
def load_data(dataset_name, language="en", split="train"):
    """Load one split of a dataset, keep a single language, and return it as a DataFrame.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier passed to ``load_dataset``.
    language : str, default "en"
        Only rows whose ``language`` field equals this value are kept.
    split : str, default "train"
        Name of the split to return.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of the requested split.
    """
    dataset = load_dataset(dataset_name)
    # Filter only the split that is returned instead of every split in the dataset dict.
    split_rows = dataset[split].filter(lambda example: example["language"] == language)
    return split_rows.to_pandas()

In [6]:
# Load the dataset rows into a DataFrame (see `load_data` for the filtering applied).
df = load_data(dataset_name=dataset_name)

In [7]:
def filtered_dataset(rows, df):
    """Return the leading ``rows`` rows of ``df``.

    Thin wrapper around ``DataFrame.head``; ``rows`` is forwarded unchanged as ``n``.
    """
    leading_rows = df.head(n=rows)
    return leading_rows

In [8]:
# Keep only the first `samples` rows so the evaluation below runs quickly.
df_filtered = filtered_dataset(samples, df)

Get the list of labels the model was trained on.

In [9]:
config = AutoConfig.from_pretrained(model_name)
# Build the list in class-id order so that label_list[class_id] is the right label
# regardless of the insertion order of the id2label mapping.
label_list = [config.id2label[class_id] for class_id in sorted(config.id2label)]
print(label_list)

['O', 'B-CITY', 'I-CITY', 'B-FIRSTNAME', 'I-FIRSTNAME', 'B-USERNAME', 'I-USERNAME', 'B-JOBTYPE', 'B-PREFIX', 'I-PREFIX', 'B-LASTNAME', 'B-EMAIL', 'I-EMAIL', 'B-NEARBYGPSCOORDINATE', 'I-NEARBYGPSCOORDINATE', 'B-ACCOUNTNUMBER', 'I-ACCOUNTNUMBER', 'B-ACCOUNTNAME', 'I-ACCOUNTNAME', 'B-MIDDLENAME', 'I-MIDDLENAME', 'B-COUNTY', 'I-COUNTY', 'B-AGE', 'B-CREDITCARDCVV', 'B-DOB', 'I-DOB', 'B-MASKEDNUMBER', 'I-MASKEDNUMBER', 'B-PASSWORD', 'I-PASSWORD', 'B-SEX', 'B-STATE', 'B-COMPANYNAME', 'I-COMPANYNAME', 'B-PHONEIMEI', 'I-PHONEIMEI', 'B-STREET', 'I-STREET', 'B-SSN', 'I-SSN', 'B-IPV4', 'I-IPV4', 'B-USERAGENT', 'I-USERAGENT', 'B-MAC', 'I-MAC', 'B-PIN', 'I-PIN', 'B-IP', 'I-IP', 'B-URL', 'I-URL', 'B-CURRENCYSYMBOL', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-VEHICLEVRM', 'I-VEHICLEVRM', 'I-AMOUNT', 'B-ETHEREUMADDRESS', 'I-ETHEREUMADDRESS', 'B-BITCOINADDRESS', 'I-BITCOINADDRESS', 'B-LITECOINADDRESS', 'I-LITECOINADDRESS', 'I-JOBTYPE', 'B-CREDITCARDNUMBER', 'I-CREDITCARDNUMBER', 'B-IPV6', 'I-IPV6', 'I-L

#### Load the model and tokenizer

In [None]:
# Both objects are loaded from the same checkpoint (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


In [None]:
df_filtered['source_text']  #viewing the source text to test 

0    A student's assessment was found on device bea...
1    Dear Omer, as per our records, your license 78...
2    Kattie could you please share your recomndatio...
3    Emergency supplies in 16356 need a refill. Use...
4    The 88 old child at 5862, has showcased an unu...
5    Your recent hospital data recorded on 29/12/19...
6    Dear Trans male, Let's clear this misunderstan...
7    The wellness portal is accessible at [-71.6702...
8    Carleton, the new interactive educational tool...
9    1. Customer query received at 10:18 PM from Hu...
Name: source_text, dtype: object

In [135]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [None]:
# The dataset already provides the tokens of every sentence; turn each row's
# token sequence into a plain Python list.
test_texts = list(map(list, df_filtered['mbert_text_tokens'].tolist()))


In [167]:
type(test_texts)

list

In [168]:
type(test_texts[0])

list

In [282]:
true_labels = df_filtered['mbert_bio_labels'].tolist() # ground truth: one sequence of label strings per sentence of the filtered dataset

In [276]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'B-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-JOBAREA', 'I-JOBAREA', 'I-JOBAREA', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

In [277]:
type(true_labels[0])

numpy.ndarray

In [211]:

# Encode every pre-tokenised sentence in a single padded batch of PyTorch tensors.
encodings = tokenizer(
    test_texts,
    is_split_into_words=True,
    return_offsets_mapping=True,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Everything except the offset mapping is kept as model input.
model_inputs = {
    name: tensor
    for name, tensor in encodings.items()
    if name != "offset_mapping"
}



In [241]:
type(df_filtered['mbert_text_tokens'][1].tolist())

list

In [244]:
all_aligned_predictions = []  # per sentence: one predicted label per dataset token
all_aligned_tokens = []       # per sentence: first sub-token of every dataset token

# Read the labels from the loaded model itself so this cell does not depend on the
# module-level `label_list`, which other cells may rebind.
id2label = model.config.id2label

for i in range(len(df_filtered)):  # Loop through all sentences
    # .iloc is positional, so this also works when the frame's index is not 0..n-1.
    words = df_filtered['mbert_text_tokens'].iloc[i].tolist()

    encodings = tokenizer(
        words,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
        is_split_into_words=True,
        return_offsets_mapping=True,
    )

    tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])
    word_ids = encodings.word_ids()

    # Drop the offset mapping and move every tensor to the model's device; feeding
    # CPU tensors to a model that was moved to CUDA raises a device-mismatch error.
    model_inputs = {
        key: val.to(model.device)
        for key, val in encodings.items()
        if key != "offset_mapping"
    }

    # Get model predictions
    with torch.no_grad():
        outputs = model(**model_inputs)

    logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Predicted class indices

    # Map sub-token predictions back to the dataset tokens.
    # NOTE(review): with truncation=True a sentence longer than max_length yields fewer
    # aligned predictions than ground-truth labels — confirm no evaluated row is that long.
    aligned_predictions = []
    aligned_tokens = []
    previous_word_id = None

    for token, word_id, pred in zip(tokens, word_ids, predictions[0]):
        if word_id is None:
            continue  # Skip special tokens ([CLS], [SEP], etc.)

        if word_id != previous_word_id:  # Only take the first sub-token's prediction
            aligned_predictions.append(id2label[int(pred)])
            aligned_tokens.append(token)

        previous_word_id = word_id

    # Store predictions for this sentence
    all_aligned_predictions.append(aligned_predictions)
    all_aligned_tokens.append(aligned_tokens)




In [245]:
# Per-sentence dump of the dataset tokens/labels next to the aligned model output.
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    # .iloc is positional, so this also works when the frame's index is not 0..n-1.
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'].iloc[i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    # This line reports a count, so it is labelled as one (it used to reuse the label above).
    print(f"Number of Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?
Length of actual label list = 
 48
Actual Tokens ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 'in' 'our' 'Op' '##timi' '##zation' 'curriculum' '.' 'Can'
 'you' 'pl' '##eas' '##e' 'collect' 'it' '?']

Predicted Aligned Tokens: ['▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '▁#', '▁:', '▁06', '▁-', '▁1847', '▁#', '▁-', '▁866', '▁#', '▁#', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '▁#', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '▁#', '▁collect', '▁it', '▁?']

In [255]:
# Predict a single sentence of the dataset. Use this to understand how the tokens are
# built and to compare, token by token, the predicted labels with the original labels.
index = 0
# .iloc is positional, matching how `true_labels[index]` is addressed below.
original_tokens = df_filtered['mbert_text_tokens'].iloc[index]  # dataset tokens of the sentence

encodings = tokenizer(
    original_tokens.tolist(),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
    is_split_into_words=True,
    return_offsets_mapping=True,
)

# word_ids maps every sub-token produced by the tokenizer back to the dataset token it came from.
tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])
offsets = encodings["offset_mapping"]
word_ids = encodings.word_ids()

# Select only the first sub-token per dataset token so the lists line up with the dataset.
filtered_tokens = []
filtered_word_ids = set()

for token, word_id in zip(tokens, word_ids):
    if word_id is not None and word_id not in filtered_word_ids:
        filtered_tokens.append(token)
        filtered_word_ids.add(word_id)

# Print result
print("Filtered Tokens:", filtered_tokens)
print("Original Tokens:", original_tokens)

# Build the model inputs from THIS sentence's encodings. Reusing a `model_inputs` left
# over from an earlier cell would score a different sentence. Tensors are moved to the
# model's device so the forward pass also works when the model is on the GPU.
model_inputs = {
    key: val.to(model.device)
    for key, val in encodings.items()
    if key != "offset_mapping"
}

# Get model predictions
with torch.no_grad():
    outputs = model(**model_inputs)

logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Predicted class indices

# Map predictions to original tokens
aligned_predictions = []
aligned_tokens = []
previous_word_id = None

# Read labels from the loaded model so this cell does not depend on the module-level
# `label_list`, which other cells may rebind.
id2label = model.config.id2label

# Align predictions with words
for token, word_id, pred in zip(tokens, word_ids, predictions[0]):
    if word_id is None:
        continue  # Skip special tokens ([CLS], [SEP], etc.)

    if word_id != previous_word_id:  # Only take the first sub-token's prediction
        aligned_predictions.append(id2label[int(pred)])
        aligned_tokens.append(token)

    previous_word_id = word_id

# Print final mapped predictions
print("Original Tokens:", original_tokens)
print("Predicted Labels:", aligned_predictions)

print(f"True labels list size = {len(true_labels[index])}")
print(f"True token list size = {len(df_filtered['mbert_text_tokens'].iloc[index])}")
print(f"Predicted labels list size = {len(aligned_predictions)}")

print(f"Filtered tokens = \n {filtered_tokens}")

print(f"True label = \n {true_labels[index]} \n")
print(f"Predicted label = \n {aligned_predictions} \n")

accuracy = accuracy_score(true_labels[index], aligned_predictions)
# zero_division=0 yields the same numbers as the default but without the
# undefined-metric warnings for labels that never occur in one of the two lists.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels[index], aligned_predictions, average="weighted", zero_division=0
)

# Print Evaluation Metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Filtered Tokens: ['▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '▁#', '▁:', '▁06', '▁-', '▁1847', '▁#', '▁-', '▁866', '▁#', '▁#', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '▁#', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '▁#', '▁collect', '▁it', '▁?']
Original Tokens: ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 'in' 'our' 'Op' '##timi' '##zation' 'curriculum' '.' 'Can'
 'you' 'pl' '##eas' '##e' 'collect' 'it' '?']
Original Tokens: ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [183]:
# Show the sub-token strings for the first sequence of the current `encodings`.
tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])
print(tokens)

['[CLS]', '▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '#', 'E', '▁#', '#', 'I', '▁:', '▁06', '▁-', '▁1847', '▁#', '#', '55', '▁-', '▁866', '▁#', '#', '85', '▁#', '#', '1', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '#', 't', 'imi', '▁#', '#', 'zation', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '#', 'ea', 's', '▁#', '#', 'e', '▁collect', '▁it', '▁?', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [181]:
len(true_labels[0])

48

In [201]:
len(aligned_predictions)

48

In [108]:
len(true_labels[0])

48

In [203]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'B-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-JOBAREA', 'I-JOBAREA', 'I-JOBAREA', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

In [202]:
aligned_predictions

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-JOBAREA',
 'I-JOBAREA',
 'I-JOBAREA',
 'O',
 'O',
 'O',
 'O',
 'B-FIRSTNAME',
 'I-FIRSTNAME',
 'I-FIRSTNAME',
 'O',
 'O',
 'O']

In [225]:
len(all_aligned_predictions)

10

In [246]:
# Flatten the per-sentence predicted labels into a single list for the metric functions.
flat_predictions = [label for sentence in all_aligned_predictions for label in sentence]

In [247]:
# Flatten the per-sentence ground-truth labels the same way, keeping sentence order.
flat_true_labels = [label for sentence in true_labels for label in sentence]

In [249]:
# Compute Accuracy, Precision, Recall, and F1-score over all tokens of all sentences.
# Both flat lists must be position-aligned; fail early with a clear message otherwise.
assert len(flat_true_labels) == len(flat_predictions), (
    f"label/prediction count mismatch: {len(flat_true_labels)} vs {len(flat_predictions)}"
)
accuracy = accuracy_score(flat_true_labels, flat_predictions)
# zero_division=0 yields the same numbers as the default but without the
# undefined-metric warnings for labels that never occur in one of the two lists.
precision, recall, f1, _ = precision_recall_fscore_support(
    flat_true_labels, flat_predictions, average="weighted", zero_division=0
)

# Print Evaluation Metrics
print(f"Samples: {samples}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Samples: 10
Accuracy: 0.9730
Precision: 0.9799
Recall: 0.9730
F1-score: 0.9754


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [259]:
type(true_labels)

list

In [260]:
type(all_aligned_predictions)

list

In [266]:
len(all_aligned_predictions)

10

In [267]:
all_aligned_predictions[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-JOBAREA',
 'I-JOBAREA',
 'I-JOBAREA',
 'O',
 'O',
 'O',
 'O',
 'B-FIRSTNAME',
 'I-FIRSTNAME',
 'I-FIRSTNAME',
 'O',
 'O',
 'O']

'E'

In [281]:
true_labels

'O'

In [285]:
# Compare the aligned predictions with the true labels to count, per sentence,
# the token-level TP / FP / FN / TN.

prediction_analysis = []  # one dict of counts per sentence

# The loop variables are deliberately not called `predictions` / `label_list`: those
# names are used by the inference cells, and rebinding them here would break a re-run.
for sentence_preds, sentence_gold in zip(all_aligned_predictions, true_labels):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    count = 0
    print(f"Prediction = {sentence_preds}, \n True label = {sentence_gold}")
    # list() accepts both numpy arrays and plain lists of labels.
    true_labels_list = list(sentence_gold)
    for pred, true_label in zip(sentence_preds, true_labels_list):
        count += 1
        print(f"Pred = {pred}, True label = {true_label}")
        if pred == true_label:
            if pred != 'O':
                tp += 1  # Correctly predicted an entity
            else:
                tn += 1  # Correctly predicted a non-entity
        else:
            if pred != 'O' and true_label == 'O':
                # Model falsely predicted an entity when it should be "O"
                fp += 1
            elif pred == 'O' and true_label != 'O':
                # Model failed to detect an entity (missed it)
                fn += 1
            else:
                # Model predicted the wrong entity (e.g., "B-ORG" instead of "B-PER").
                # Counted once, as a false positive, so tp + fp + fn + tn == tokens.
                fp += 1
        print(f"tp = {tp}, fp = {fp}, tn ={tn}, fn={fn}\n")
    prediction_analysis.append({"tp":tp, "fp":fp, "fn":fn, "tn":tn, "tokens":count })



Prediction = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-JOBAREA', 'I-JOBAREA', 'I-JOBAREA', 'O', 'O', 'O', 'O', 'B-FIRSTNAME', 'I-FIRSTNAME', 'I-FIRSTNAME', 'O', 'O', 'O'], 
 True label = ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B-PHONEIMEI'
 'I-PHONEIMEI' 'I-PHONEIMEI' 'I-PHONEIMEI' 'I-PHONEIMEI' 'I-PHONEIMEI'
 'I-PHONEIMEI' 'I-PHONEIMEI' 'I-PHONEIMEI' 'I-PHONEIMEI' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B-JOBAREA' 'I-JOBAREA' 'I-JOBAREA' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']
Pred = O, True label = O
tp = 0, fp = 0, tn =1, fn=0

Pred = O, True label = O
tp = 0, fp = 0, tn =2, fn=0

Pred = O, True label = O
tp = 0, fp = 0, tn =3, fn=0

Pred = O, True label = O
tp = 0, fp = 0, tn =4, fn=0

Pred = O, True label = O
tp = 0, fp = 

In [286]:
print(prediction_analysis)

[{'tp': 13, 'fp': 3, 'fn': 0, 'tn': 32, 'tokens': 48}, {'tp': 14, 'fp': 0, 'fn': 0, 'tn': 31, 'tokens': 45}, {'tp': 11, 'fp': 0, 'fn': 0, 'tn': 20, 'tokens': 31}, {'tp': 0, 'fp': 11, 'fn': 0, 'tn': 14, 'tokens': 25}, {'tp': 14, 'fp': 0, 'fn': 0, 'tn': 26, 'tokens': 40}, {'tp': 33, 'fp': 0, 'fn': 0, 'tn': 25, 'tokens': 58}, {'tp': 14, 'fp': 0, 'fn': 0, 'tn': 52, 'tokens': 66}, {'tp': 52, 'fp': 0, 'fn': 0, 'tn': 31, 'tokens': 83}, {'tp': 24, 'fp': 0, 'fn': 0, 'tn': 22, 'tokens': 46}, {'tp': 28, 'fp': 0, 'fn': 0, 'tn': 49, 'tokens': 77}]


In [290]:
all_aligned_predictions[3]

['O',
 'O',
 'O',
 'B-ZIPCODE',
 'I-ZIPCODE',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'O',
 'O',
 'O',
 'O',
 'O']

In [291]:
true_labels[3]

array(['O', 'O', 'O', 'B-BUILDINGNUMBER', 'I-BUILDINGNUMBER', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'I-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'I-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'O', 'O', 'O', 'O', 'O'], dtype=object)