In [9]:
from datasets import load_dataset
from transformers import AutoConfig
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


#### Load the dataset 

In [1]:
dataset_name = "ai4privacy/pii-masking-200k"


# ai4privacy/pii-masking-200k

In [2]:
samples = 10

In [78]:
model_name = "Isotonic/deberta-v3-base_finetuned_ai4privacy_v2"

In [11]:
def load_data(dataset_name):
    dataset = load_dataset(dataset_name)
    filtered_dataset = dataset.filter(lambda example: example['language'] == 'en')
    df = filtered_dataset['train'].to_pandas() 
    df.head()
    return df

In [12]:
df = load_data(dataset_name=dataset_name)

In [13]:
df.head(10)

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"[{'value': '06-184755-866851-3', 'start': 57, ...","[[0, 57, ""O""], [57, 75, ""PHONEIMEI""], [75, 138...","[A, student, ', s, assessment, was, found, on,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...",165761,en,train
1,"Dear Omer, as per our records, your license 78...","Dear [FIRSTNAME], as per our records, your lic...","[{'value': 'Omer', 'start': 5, 'end': 9, 'labe...","[[0, 5, ""O""], [5, 9, ""FIRSTNAME""], [9, 44, ""O""...","[Dear, Omer, ,, as, per, our, records, ,, your...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-VEH...",165762,en,train
2,Kattie could you please share your recomndatio...,[FIRSTNAME] could you please share your recomn...,"[{'value': 'Kattie', 'start': 0, 'end': 6, 'la...","[[0, 6, ""FIRSTNAME""], [6, 75, ""O""], [75, 77, ""...","[Kat, ##tie, could, you, pl, ##eas, ##e, share...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...",165763,en,train
3,Emergency supplies in 16356 need a refill. Use...,Emergency supplies in [BUILDINGNUMBER] need a ...,"[{'value': '16356', 'start': 22, 'end': 27, 'l...","[[0, 22, ""O""], [22, 27, ""BUILDINGNUMBER""], [27...","[Emergency, supplies, in, 1635, ##6, need, a, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...",165764,en,train
4,"The 88 old child at 5862, has showcased an unu...","The [AGE] old child at [BUILDINGNUMBER], has s...","[{'value': '88', 'start': 4, 'end': 6, 'label'...","[[0, 4, ""O""], [4, 6, ""AGE""], [6, 20, ""O""], [20...","[The, 88, old, child, at, 586, ##2, ,, has, sh...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...",165765,en,train
5,Your recent hospital data recorded on 29/12/19...,Your recent hospital data recorded on [DOB] re...,"[{'value': '29/12/1957', 'start': 38, 'end': 4...","[[0, 38, ""O""], [38, 48, ""DOB""], [48, 115, ""O""]...","[Your, recent, hospital, data, recorded, on, 2...","[O, O, O, O, O, O, B-DOB, I-DOB, I-DOB, I-DOB,...",165766,en,train
6,"Dear Trans male, Let's clear this misunderstan...","Dear [GENDER], Let's clear this misunderstandi...","[{'value': 'Trans male', 'start': 5, 'end': 15...","[[0, 5, ""O""], [5, 15, ""GENDER""], [15, 91, ""O""]...","[Dear, Trans, male, ,, Let, ', s, clear, this,...","[O, B-GENDER, I-GENDER, O, O, O, O, O, O, O, O...",165767,en,train
7,The wellness portal is accessible at [-71.6702...,The wellness portal is accessible at [NEARBYGP...,"[{'value': '[-71.6702,-107.6572]', 'start': 37...","[[0, 37, ""O""], [37, 57, ""NEARBYGPSCOORDINATE""]...","[The, well, ##ness, portal, is, accessible, at...","[O, O, O, O, O, O, O, B-NEARBYGPSCOORDINATE, I...",165768,en,train
8,"Carleton, the new interactive educational tool...","[FIRSTNAME], the new interactive educational t...","[{'value': 'Carleton', 'start': 0, 'end': 8, '...","[[0, 8, ""FIRSTNAME""], [8, 69, ""O""], [69, 88, ""...","[Carleton, ,, the, new, interactive, education...","[B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O, O,...",165769,en,train
9,1. Customer query received at 10:18 PM from Hu...,1. Customer query received at [TIME] from [JOB...,"[{'value': '10:18 PM', 'start': 30, 'end': 38,...","[[0, 30, ""O""], [30, 38, ""TIME""], [38, 44, ""O""]...","[1, ., Custom, ##er, quer, ##y, received, at, ...","[O, O, O, O, O, O, O, O, B-TIME, I-TIME, I-TIM...",165770,en,train


In [14]:
def filtered_dataset(rows, df):
    return df.head(rows)

In [15]:
df_filtered = filtered_dataset(samples, df)

In [16]:
df_filtered.head(5)

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"[{'value': '06-184755-866851-3', 'start': 57, ...","[[0, 57, ""O""], [57, 75, ""PHONEIMEI""], [75, 138...","[A, student, ', s, assessment, was, found, on,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...",165761,en,train
1,"Dear Omer, as per our records, your license 78...","Dear [FIRSTNAME], as per our records, your lic...","[{'value': 'Omer', 'start': 5, 'end': 9, 'labe...","[[0, 5, ""O""], [5, 9, ""FIRSTNAME""], [9, 44, ""O""...","[Dear, Omer, ,, as, per, our, records, ,, your...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-VEH...",165762,en,train
2,Kattie could you please share your recomndatio...,[FIRSTNAME] could you please share your recomn...,"[{'value': 'Kattie', 'start': 0, 'end': 6, 'la...","[[0, 6, ""FIRSTNAME""], [6, 75, ""O""], [75, 77, ""...","[Kat, ##tie, could, you, pl, ##eas, ##e, share...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...",165763,en,train
3,Emergency supplies in 16356 need a refill. Use...,Emergency supplies in [BUILDINGNUMBER] need a ...,"[{'value': '16356', 'start': 22, 'end': 27, 'l...","[[0, 22, ""O""], [22, 27, ""BUILDINGNUMBER""], [27...","[Emergency, supplies, in, 1635, ##6, need, a, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...",165764,en,train
4,"The 88 old child at 5862, has showcased an unu...","The [AGE] old child at [BUILDINGNUMBER], has s...","[{'value': '88', 'start': 4, 'end': 6, 'label'...","[[0, 4, ""O""], [4, 6, ""AGE""], [6, 20, ""O""], [20...","[The, 88, old, child, at, 586, ##2, ,, has, sh...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...",165765,en,train


getting the label list the model is trained on 

In [29]:
config = AutoConfig.from_pretrained(model_name)
label_list = list(config.id2label.values())
print(label_list)

['O', 'B-CITY', 'I-CITY', 'B-FIRSTNAME', 'I-FIRSTNAME', 'B-USERNAME', 'I-USERNAME', 'B-JOBTYPE', 'B-PREFIX', 'I-PREFIX', 'B-LASTNAME', 'B-EMAIL', 'I-EMAIL', 'B-NEARBYGPSCOORDINATE', 'I-NEARBYGPSCOORDINATE', 'B-ACCOUNTNUMBER', 'I-ACCOUNTNUMBER', 'B-ACCOUNTNAME', 'I-ACCOUNTNAME', 'B-MIDDLENAME', 'I-MIDDLENAME', 'B-COUNTY', 'I-COUNTY', 'B-AGE', 'B-CREDITCARDCVV', 'B-DOB', 'I-DOB', 'B-MASKEDNUMBER', 'I-MASKEDNUMBER', 'B-PASSWORD', 'I-PASSWORD', 'B-SEX', 'B-STATE', 'B-COMPANYNAME', 'I-COMPANYNAME', 'B-PHONEIMEI', 'I-PHONEIMEI', 'B-STREET', 'I-STREET', 'B-SSN', 'I-SSN', 'B-IPV4', 'I-IPV4', 'B-USERAGENT', 'I-USERAGENT', 'B-MAC', 'I-MAC', 'B-PIN', 'I-PIN', 'B-IP', 'I-IP', 'B-URL', 'I-URL', 'B-CURRENCYSYMBOL', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-VEHICLEVRM', 'I-VEHICLEVRM', 'I-AMOUNT', 'B-ETHEREUMADDRESS', 'I-ETHEREUMADDRESS', 'B-BITCOINADDRESS', 'I-BITCOINADDRESS', 'B-LITECOINADDRESS', 'I-LITECOINADDRESS', 'I-JOBTYPE', 'B-CREDITCARDNUMBER', 'I-CREDITCARDNUMBER', 'B-IPV6', 'I-IPV6', 'I-L

#### Load the model and tokenizer

In [24]:
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [25]:
df_filtered['source_text']  #viewing the source text to test 

0    A student's assessment was found on device bea...
1    Dear Omer, as per our records, your license 78...
2    Kattie could you please share your recomndatio...
3    Emergency supplies in 16356 need a refill. Use...
4    The 88 old child at 5862, has showcased an unu...
5    Your recent hospital data recorded on 29/12/19...
6    Dear Trans male, Let's clear this misunderstan...
7    The wellness portal is accessible at [-71.6702...
8    Carleton, the new interactive educational tool...
9    1. Customer query received at 10:18 PM from Hu...
Name: source_text, dtype: object

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [17]:
#get the tokens of the test data sentence, since it is present in the dataset 
test_texts = [list(arr) for arr in df_filtered['mbert_text_tokens'].tolist()]


In [18]:
type(test_texts)

list

In [19]:
type(test_texts[0])

list

In [20]:
true_labels = df_filtered['mbert_bio_labels'].tolist() #ground truth of label list for the filtered dataset

In [21]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'B-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-JOBAREA', 'I-JOBAREA', 'I-JOBAREA', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

In [22]:
type(true_labels[0])

numpy.ndarray

In [27]:

encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
    is_split_into_words=True, 
    return_offsets_mapping=True
)

model_inputs = {key: val for key, val in encodings.items() if key != "offset_mapping"}



In [23]:
type(df_filtered['mbert_text_tokens'][1].tolist())

list

In [59]:
all_aligned_predictions = []  # Store predictions for all sentences
all_aligned_tokens = []

for i in range(len(df_filtered)):  # Loop through all sentences
    
    encodings = tokenizer(
        df_filtered['source_text'][i],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
        is_split_into_words=False, 
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])  
    word_ids = encodings.word_ids()
    
    # Filter tokens to only keep the first subword of each word
    filtered_tokens = []
    filtered_word_ids = set()

    for token, word_id in zip(tokens, word_ids):
        if word_id is not None and word_id not in filtered_word_ids:
            filtered_tokens.append(token)
            filtered_word_ids.add(word_id)

    model_inputs = {key: val for key, val in encodings.items() if key != "offset_mapping"}

    # Get model predictions
    with torch.no_grad():
        outputs = model(**model_inputs)

    logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted class indices

    # Map predictions to original tokens
    aligned_predictions = []
    aligned_tokens = []
    previous_word_id = None

    # Align predictions with words
    for token, word_id, pred in zip(tokens, word_ids, predictions[0]):  
        if word_id is None:  
            continue  # Skip special tokens ([CLS], [SEP], etc.)

        if word_id != previous_word_id:  # Only take the first subword's prediction
            aligned_predictions.append(label_list[pred])
            aligned_tokens.append(token)

        previous_word_id = word_id 

    # Store predictions for this sentence
    all_aligned_predictions.append(aligned_predictions)
    all_aligned_tokens.append(aligned_tokens)



In [76]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
print(f"Original sentence - {df_filtered['source_text'][3]}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

sentence = df_filtered['source_text'][3]

results = ner_pipeline(sentence)

for entity in results:
    print(f"Token: {entity['word']}, Entity: {entity['entity_group']}, Score: {entity['score']:.4f}")


Original sentence - Emergency supplies in 16356 need a refill. Use 5890724654311332 to pay for them.


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Token: 16, Entity: BUILDINGNUMBER, Score: 0.9657
Token: 356, Entity: BUILDINGNUMBER, Score: 0.9792
Token: 58, Entity: ACCOUNTNUMBER, Score: 0.9988
Token: 90724654311332, Entity: ACCOUNTNUMBER, Score: 0.9994


### Evaluation method without tokenizing!

In [89]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load pre-trained BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Initialize NER pipeline without automatic grouping
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example sentence
sentence = df_filtered['source_text'][2]
print("\n")
print(f"Original sentence - {sentence}")

# Run token classification
results = ner_pipeline(sentence)

# Manual entity grouping
grouped_entities = []
current_entity = {"word": "", "entity": "", "score": []}

for entity in results:
    word = entity["word"]
    label = entity["entity_group"]
    score = entity["score"]

    # If the word is a subword or part of the same entity, merge it
    if current_entity and (current_entity["entity"] == label or word in "-"):
        current_entity["word"] += word  # Concatenate words
        current_entity["score"].append(score)  # Store confidence scores
    else:
        if current_entity:  # Save previous entity
            if current_entity["score"]:  # Ensure the score list is not empty
                current_entity["score"] = sum(current_entity["score"]) / len(current_entity["score"])  # Average score
            else:
                current_entity["score"] = 0  # Fallback in case of an empty score list
            
            if current_entity['score']>0:
                grouped_entities.append(current_entity)

        # Start a new entity
        current_entity = {"word": word, "entity": label, "score": [score]}

# Append last entity if valid and score > 0
if current_entity and current_entity["word"]:
    if current_entity["score"]:  
        current_entity["score"] = sum(current_entity["score"]) / len(current_entity["score"])
    
    if current_entity["score"] > 0:  # **Skip zero-score entities**
        grouped_entities.append(current_entity)

# Print grouped entities
for entity in grouped_entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']:.4f}")


Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.




Original sentence - Kattie could you please share your recomndations about vegetarian diet for 72 old Intersex person with 158centimeters?
Entity: Kattie, Label: FIRSTNAME, Score: 0.9994
Entity: 72, Label: AGE, Score: 0.9995
Entity: Intersex person, Label: GENDER, Score: 0.8549
Entity: 158centimeters, Label: HEIGHT, Score: 0.9997


In [63]:
len(true_labels[0])

48

In [61]:
len(all_aligned_predictions[0])

27

In [60]:
all_aligned_tokens[0]

['▁A',
 '▁student',
 '▁assessment',
 '▁was',
 '▁found',
 '▁on',
 '▁device',
 '▁bearing',
 '▁IMEI',
 '▁06',
 '▁The',
 '▁document',
 '▁falls',
 '▁under',
 '▁the',
 '▁various',
 '▁topics',
 '▁discussed',
 '▁in',
 '▁our',
 '▁Optimization',
 '▁curriculum',
 '▁Can',
 '▁you',
 '▁please',
 '▁collect',
 '▁it']

In [119]:
len(all_aligned_predictions)

1000

In [120]:
#for 1000 samples 
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'][i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    print(f"Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: Subject: Group Messaging for Admissions Process

Good morning, everyone,

I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:

- wynqvrh053 - Meeting at 10:20am
- luka.burg - Meeting at 21
- qahil.wittauer - Meeting at quarter past 13
- gholamhossein.ruschke - Meeting at 9:47 PM
- pdmjrsyoz1460 
Length of actual label list = 
 121
Actual Tokens ['Sub' '##ject' ':' 'Group' 'Mess' '##aging' 'for' 'Ad' '##mission' '##s'
 'Process' 'Good' 'morning' ',' 'everyone' ',' 'I' 'hope' 'this' 'message'
 'finds' 'you' 'well' '.' 'As' 'we' 'continue' 'our' 'admission' '##s'
 'processes' ',' 'I' 'would' 'like' 'to' 'update' 'you' 'on' 'the'
 'latest' 'developments' 'and' 'key' 'information' '.' 'Please' 'find'
 'below' 'the' 'time' '##line' 'for' 'our' 'upcoming' 'meetings' ':' '-'
 'w' '##yn' '##q' '##vr' '##h' '##0' '##53' '-' 'Mee

In [109]:
#for 100 samples 
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'][i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    print(f"Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: Subject: Group Messaging for Admissions Process

Good morning, everyone,

I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:

- wynqvrh053 - Meeting at 10:20am
- luka.burg - Meeting at 21
- qahil.wittauer - Meeting at quarter past 13
- gholamhossein.ruschke - Meeting at 9:47 PM
- pdmjrsyoz1460 
Length of actual label list = 
 121
Actual Tokens ['Sub' '##ject' ':' 'Group' 'Mess' '##aging' 'for' 'Ad' '##mission' '##s'
 'Process' 'Good' 'morning' ',' 'everyone' ',' 'I' 'hope' 'this' 'message'
 'finds' 'you' 'well' '.' 'As' 'we' 'continue' 'our' 'admission' '##s'
 'processes' ',' 'I' 'would' 'like' 'to' 'update' 'you' 'on' 'the'
 'latest' 'developments' 'and' 'key' 'information' '.' 'Please' 'find'
 'below' 'the' 'time' '##line' 'for' 'our' 'upcoming' 'meetings' ':' '-'
 'w' '##yn' '##q' '##vr' '##h' '##0' '##53' '-' 'Mee

In [None]:
#for 10 samples
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'][i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    print(f"Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: Subject: Group Messaging for Admissions Process

Good morning, everyone,

I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:

- wynqvrh053 - Meeting at 10:20am
- luka.burg - Meeting at 21
- qahil.wittauer - Meeting at quarter past 13
- gholamhossein.ruschke - Meeting at 9:47 PM
- pdmjrsyoz1460 
Length of actual label list = 
 121
Actual Tokens ['Sub' '##ject' ':' 'Group' 'Mess' '##aging' 'for' 'Ad' '##mission' '##s'
 'Process' 'Good' 'morning' ',' 'everyone' ',' 'I' 'hope' 'this' 'message'
 'finds' 'you' 'well' '.' 'As' 'we' 'continue' 'our' 'admission' '##s'
 'processes' ',' 'I' 'would' 'like' 'to' 'update' 'you' 'on' 'the'
 'latest' 'developments' 'and' 'key' 'information' '.' 'Please' 'find'
 'below' 'the' 'time' '##line' 'for' 'our' 'upcoming' 'meetings' ':' '-'
 'w' '##yn' '##q' '##vr' '##h' '##0' '##53' '-' 'Mee

In [25]:
for i, (sentence, preds) in enumerate(zip(df_filtered['source_text'], all_aligned_predictions)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Length of actual label list = \n {len(true_labels[i])}")
    print(f"Actual Tokens {df_filtered['mbert_text_tokens'][i]}\n")
    print(f"Predicted Aligned Tokens: {all_aligned_tokens[i]}\n")
    print(f"Predicted Aligned Tokens: {len(all_aligned_tokens[i])}\n")
    print(f"Length of predicted label list = \n {len(preds)}")
    print(f"Actual Labels: {true_labels[i]}")
    print(f"Predicted Labels: {preds}")
    print("-" * 50)

Sentence 1: A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?
Length of actual label list = 
 48
Actual Tokens ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 'in' 'our' 'Op' '##timi' '##zation' 'curriculum' '.' 'Can'
 'you' 'pl' '##eas' '##e' 'collect' 'it' '?']

Predicted Aligned Tokens: ['▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '▁#', '▁:', '▁06', '▁-', '▁1847', '▁#', '▁-', '▁866', '▁#', '▁#', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '▁#', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '▁#', '▁collect', '▁it', '▁?']

In [26]:
# this is the code to get the predicted result for one sentence of the dataset, use this to understand how the tokens 
# made, and compare token wise the predicted labels and original labels 
index = 0
original_tokens = df_filtered['mbert_text_tokens'][index]  #ground truth of label list for the first sentence

encodings = tokenizer(
        df_filtered['mbert_text_tokens'][index].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
        is_split_into_words=True, 
        return_offsets_mapping=True
    )

# Maps subwords (which are broken down further by the tokenizer) to original words present in the token list originally
tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][0])
offsets = encodings["offset_mapping"]
word_ids = encodings.word_ids()

# Select only the first subword per word to match dataset tokens
filtered_tokens = []
filtered_word_ids = set()

for token, word_id in zip(tokens, word_ids):
    if word_id is not None and word_id not in filtered_word_ids:
        filtered_tokens.append(token)
        filtered_word_ids.add(word_id)

# Print result
print("Filtered Tokens:", filtered_tokens)
print("Original Tokens:", original_tokens)

# Get model predictions
with torch.no_grad():
    outputs = model(**model_inputs)

logits = outputs.logits  # Shape: (batch_size, seq_length, num_labels)
predictions = torch.argmax(logits, dim=-1).cpu().numpy()  # Get predicted class indices

# Map predictions to original tokens
aligned_predictions = []
aligned_tokens = []
previous_word_id = None

    # Align predictions with words
for token, word_id, pred in zip(tokens, word_ids, predictions[0]):  
    if word_id is None:  
        continue  # Skip special tokens ([CLS], [SEP], etc.)

    if word_id != previous_word_id:  # Only take the first subword's prediction
        aligned_predictions.append(label_list[pred])
        aligned_tokens.append(token)

    previous_word_id = word_id 

# Print final mapped predictions
print("Original Tokens:", original_tokens)
print("Predicted Labels:", aligned_predictions)

print(f"True labels list size = {len(true_labels[index])}")
print(f"True token list size = {len(df_filtered['mbert_text_tokens'][index])}")
print(f"Predicted labels list size = {len(aligned_predictions)}")

print(f"Filtered tokens = \n {filtered_tokens}")

print(f"True label = \n {true_labels[index]} \n")
print(f"Predicted label = \n {aligned_predictions} \n")

accuracy = accuracy_score(true_labels[index], aligned_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels[index], aligned_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Filtered Tokens: ['▁A', '▁student', "▁'", '▁s', '▁assessment', '▁was', '▁found', '▁on', '▁device', '▁bearing', '▁IM', '▁#', '▁#', '▁:', '▁06', '▁-', '▁1847', '▁#', '▁-', '▁866', '▁#', '▁#', '▁-', '▁3', '▁.', '▁The', '▁document', '▁falls', '▁under', '▁the', '▁various', '▁topics', '▁discussed', '▁in', '▁our', '▁Op', '▁#', '▁#', '▁curriculum', '▁.', '▁Can', '▁you', '▁pl', '▁#', '▁#', '▁collect', '▁it', '▁?']
Original Tokens: ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 'in' 'our' 'Op' '##timi' '##zation' 'curriculum' '.' 'Can'
 'you' 'pl' '##eas' '##e' 'collect' 'it' '?']
Original Tokens: ['A' 'student' "'" 's' 'assessment' 'was' 'found' 'on' 'device' 'bearing'
 'IM' '##E' '##I' ':' '06' '-' '1847' '##55' '-' '866' '##85' '##1' '-'
 '3' '.' 'The' 'document' 'falls' 'under' 'the' 'various' 'topics'
 'discussed' 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [181]:
len(true_labels[0])

48

In [201]:
len(aligned_predictions)

48

In [108]:
len(true_labels[0])

48

In [27]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'B-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI', 'I-PHONEIMEI',
       'I-PHONEIMEI', 'I-PHONEIMEI', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-JOBAREA', 'I-JOBAREA', 'I-JOBAREA', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

In [121]:
len(all_aligned_predictions)

1000

In [122]:
flat_predictions = [label for sentence in all_aligned_predictions for label in sentence]

In [123]:
flat_true_labels = [label for sentence in true_labels for label in sentence]

In [124]:
#for 1000 samples
# Compute Accuracy, Precision, Recall, and F1-score
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(flat_true_labels, flat_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Samples: {samples}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Samples: 1000
Accuracy: 0.7304
Precision: 0.5906
Recall: 0.7304
F1-score: 0.6530


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [112]:
#for 100 samples
# Compute Accuracy, Precision, Recall, and F1-score
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(flat_true_labels, flat_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Samples: {samples}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Samples: 100
Accuracy: 0.7321
Precision: 0.5975
Recall: 0.7321
F1-score: 0.6580


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [71]:
# Compute Accuracy, Precision, Recall, and F1-score
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(flat_true_labels, flat_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Samples: {samples}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Samples: 10
Accuracy: 0.6891
Precision: 0.7578
Recall: 0.6891
F1-score: 0.7195


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Compute Accuracy, Precision, Recall, and F1-score
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(flat_true_labels, flat_predictions, average="weighted")

# Print Evaluation Metrics
print(f"Samples: {samples}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

In [33]:
type(true_labels)

list

In [34]:
type(all_aligned_predictions)

list

In [266]:
len(all_aligned_predictions)

10

In [267]:
all_aligned_predictions[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'I-PHONEIMEI',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-JOBAREA',
 'I-JOBAREA',
 'I-JOBAREA',
 'O',
 'O',
 'O',
 'O',
 'B-FIRSTNAME',
 'I-FIRSTNAME',
 'I-FIRSTNAME',
 'O',
 'O',
 'O']

'E'

In [281]:
true_labels

'O'

In [86]:
true_labels[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'O', 'B-USERNAME',
       'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-USERNAME',
       'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME',
       'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'O', 'B-TIME', 'I-TIME',
       'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME',
       'I-TIME', 'I-TIME', 'O', 'B-USERNAME', 'I-USERNAME'], dtype=object)

In [85]:
all_aligned_predictions[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-BIC',
 'I-PASSWORD',
 'I-PASSWORD',
 'I-PASSWORD',
 'I-BIC',
 'O',
 'I-BIC',
 'O',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'I-TIME',
 'I-TIME',
 'O',
 'B-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'O',
 'O',
 'O',
 'B-TIME',
 'O',
 'B-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'O',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'I-TIME',
 'O',
 'B-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'O',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'I-TIME',
 'I-TIME',
 'O',
 'B-VEHICLEVIN',
 'I-VEHICLEVIN',
 'I-VEHIC

In [89]:
# Compare aligned predictions with actual predictions to find the 

prediction_analysis = [] #store the count of TP, FP, FN, TN

for tokens_list, predictions, label_list in zip(all_aligned_tokens, all_aligned_predictions, true_labels):
    tp =0
    tn = 0
    fp = 0
    fn =0
    miss_classified = 0
    count = 0
    print(f"Prediction = {predictions}, \n True label = {label_list}")
    true_labels_list = label_list.tolist()
    for token, pred, true_label in zip(tokens_list, predictions, true_labels_list):
        count +=1
        print(f"Token = {token}, Pred = {pred}, True label = {true_label}")
        if pred == true_label:
            if pred != 'O':  # Correctly predicted an entity
                tp +=1
            else:
                tn +=1 # Correctly predicted a non-entity
        else:
            if pred!='O' and true_label == 'O':
             # Model falsely predicted an entity when it should be "O"
                fp +=1
            elif pred=='O' and true_label!='O':
            # Model failed to detect an entity (missed it)
                fn +=1
            else:
            # Model predicted the wrong entity (e.g., "B-ORG" instead of "B-PER")
                tp +=1
                miss_classified += 1  # Counts as a wrongly identified entity
        print(f"tp = {tp}, fp = {fp}, tn ={tn}, fn={fn}\n")
    prediction_analysis.append({"tp":tp, "fp":fp, "fn":fn, "tn":tn, "miss_classified":miss_classified, "tokens":count })



Prediction = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-BIC', 'I-PASSWORD', 'I-PASSWORD', 'I-PASSWORD', 'I-BIC', 'O', 'I-BIC', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'O', 'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'O', 'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'O', 'B-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'O', 'B-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN', 'I-VEHICLEVIN'], 
 True label 

In [73]:
print(prediction_analysis)

[{'tp': 23, 'fp': 23, 'fn': 6, 'tn': 69, 'miss_classified': 12, 'tokens': 121}, {'tp': 10, 'fp': 5, 'fn': 7, 'tn': 80, 'miss_classified': 3, 'tokens': 102}, {'tp': 28, 'fp': 9, 'fn': 0, 'tn': 95, 'miss_classified': 10, 'tokens': 132}, {'tp': 64, 'fp': 10, 'fn': 4, 'tn': 77, 'miss_classified': 44, 'tokens': 155}, {'tp': 12, 'fp': 16, 'fn': 0, 'tn': 90, 'miss_classified': 8, 'tokens': 118}, {'tp': 2, 'fp': 5, 'fn': 1, 'tn': 76, 'miss_classified': 2, 'tokens': 84}, {'tp': 36, 'fp': 40, 'fn': 22, 'tn': 35, 'miss_classified': 25, 'tokens': 133}, {'tp': 2, 'fp': 6, 'fn': 0, 'tn': 71, 'miss_classified': 2, 'tokens': 79}, {'tp': 14, 'fp': 18, 'fn': 10, 'tn': 63, 'miss_classified': 14, 'tokens': 105}, {'tp': 24, 'fp': 18, 'fn': 11, 'tn': 60, 'miss_classified': 24, 'tokens': 113}]


In [91]:
df_filtered['mbert_bio_labels'][1]

array(['O', 'O', 'O', 'B-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'O', 'O',
       'B-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'B-TIME', 'I-TIME',
       'I-TIME', 'I-TIME', 'I-TIME', 'B-USERNAME', 'I-USERNAME',
       'I-USERNAME', 'I-USERNAME', 'O', 'O', 'O', 'O', 'O', 'B-TIME',
       'I-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

In [92]:
all_aligned_predictions[1]

['O',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'I-TIME',
 'I-TIME',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'I-TIME',
 'I-TIME',
 'O',
 'B-USERNAME',
 'I-USERNAME',
 'I-USERNAME',
 'O',
 'O',
 'O',
 'B-TIME',
 'I-TIME',
 'I-TIME',
 'I-TIME',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

Below code to find the number of sensitive and non-sensitive labels in the sentence of the dataset

In [96]:
import numpy as np
print(np.sum(df_filtered['mbert_bio_labels'][3] == 'O'))
print(np.sum(df_filtered['mbert_bio_labels'][3] != 'O'))

87
68


In [79]:
total_tp = sum(entry['tp'] for entry in prediction_analysis)
total_fp = sum(entry['fp'] for entry in prediction_analysis)
total_fn = sum(entry['fn'] for entry in prediction_analysis)
total_tn = sum(entry['tn'] for entry in prediction_analysis)

total_missclassified = sum(entry['miss_classified'] for entry in prediction_analysis)
total_tokens = sum(entry['tokens'] for entry in prediction_analysis)

In [80]:
print(f"Misclassified count = {total_missclassified}")
print(f"Total number of tokens = {total_tokens}")

Misclassified count = 144
Total number of tokens = 1142


In [75]:
# Compute metrics
accuracy = (total_tp + total_tn) / (total_tp + total_fp + total_fn + total_tn)
precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) != 0 else 0
recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) != 0 else 0
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

# Output results
accuracy, precision, recall, f1_score

(0.8152364273204904, 0.589041095890411, 0.7789855072463768, 0.6708268330733229)

In [38]:
all_aligned_predictions[3]

['O',
 'O',
 'O',
 'B-ZIPCODE',
 'I-ZIPCODE',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'I-CREDITCARDNUMBER',
 'O',
 'O',
 'O',
 'O',
 'O']

In [37]:
true_labels[3]

array(['O', 'O', 'O', 'B-BUILDINGNUMBER', 'I-BUILDINGNUMBER', 'O', 'O',
       'O', 'O', 'O', 'O', 'B-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'I-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'I-MASKEDNUMBER', 'I-MASKEDNUMBER',
       'I-MASKEDNUMBER', 'O', 'O', 'O', 'O', 'O'], dtype=object)