# Simple PII Model Test

Evaluate the fine-tuned GLiNER PII detection model on the first 50 samples of the gold test dataset, reporting entity-level precision, recall, and F1.

In [5]:
import json
import pandas as pd
from gliner import GLiNER
from tqdm import tqdm

## Load Gold Test Dataset (50 samples)

In [6]:
# Load the first 50 samples from the gold test dataset.
# Each non-empty line is parsed as one JSON object. A trailing comma on a line
# is stripped before parsing, since json.loads would reject it.
samples = []
# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open('../../Data/gold_testdataset_27labels.ndjson', 'r', encoding='utf-8') as f:
    for line in f:
        if len(samples) >= 50:  # Only take 50 samples
            break
        line = line.strip()
        if line.endswith(','):
            line = line[:-1]  # Remove trailing comma
        if not line:
            continue  # Blank lines would make json.loads raise; skip them
        samples.append(json.loads(line))

print(f"Loaded {len(samples)} samples")
if samples:
    # Preview the first record; guarded so an empty file does not raise IndexError.
    print(f"\nExample sample:")
    print(f"Text: {samples[0]['text'][:100]}...")
    print(f"Entities: {samples[0]['normalized_entities']}")
else:
    print("\nNo samples were loaded - check the dataset path and contents.")

Loaded 50 samples

Example sample:
Text: Survey Date: January 26th, 1985 
City: Oak Grove 
How often do you encounter the following stressors...
Entities: [{'text': 'January 26th, 1985', 'label': 'date', 'start': 13, 'end': 31}, {'text': '660-03-8442', 'label': 'tax identification number', 'start': 137, 'end': 148}, {'text': '6290812888615710', 'label': 'credit card number', 'start': 193, 'end': 209}]


## Load Model

In [7]:
# Restore the fine-tuned GLiNER checkpoint from its local directory
model_dir = "../../finetuned_gliner"
model = GLiNER.from_pretrained(model_dir)
print("Model loaded successfully")

Model loaded successfully


## Define Labels to Detect

In [8]:
# Entity types passed to the model as candidate labels, one per line.
# NOTE(review): the dataset file name mentions 27 labels but only 23 are listed
# here - confirm whether the remaining labels were left out on purpose.
labels = [
    "full name",
    "date",
    "social security number",
    "tax identification number",
    "credit card number",
    "phone number",
    "email address",
    "address",
    "passport number",
    "driver's license number",
    "bank account number",
    "credit score",
    "health insurance number",
    "medical condition",
    "medication",
    "ip address",
    "username",
    "organization",
    "identification number",
    "iban",
    "fax number",
    "medical treatment",
    "bank account balance",
]

label_count = len(labels)
print(f"Using {label_count} labels")

Using 23 labels


## Run Predictions

In [9]:
# Score every loaded sample and keep its gold annotations next to the model output.
# Each entry of `results` holds the raw text, the gold entities, and the
# entities predicted at a 0.3 score threshold.
results = []

for sample in tqdm(samples):
    results.append(
        dict(
            text=sample['text'],
            ground_truth=sample['normalized_entities'],
            predictions=model.predict_entities(sample['text'], labels, threshold=0.3),
        )
    )

print(f"\nCompleted predictions on {len(results)} samples")

  0%|          | 0/50 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 50/50 [00:06<00:00,  7.15it/s]


Completed predictions on 50 samples





## Calculate Simple Metrics

In [10]:
# Entity-level precision / recall / F1 over all evaluated samples.
#
# A prediction counts as correct when a ground-truth entity in the same sample
# has identical text and the same label (labels are compared case-insensitively).
# Matching is one-to-one: once a ground-truth entity has been matched it cannot
# be matched again. Without this, repeated predictions of the same text/label
# would all count as correct against a single ground-truth entity, inflating
# the correct count and allowing recall to exceed 100%.
total_ground_truth = 0
total_predictions = 0
correct_predictions = 0

for result in results:
    gt = result['ground_truth']
    pred = result['predictions']

    total_ground_truth += len(gt)
    total_predictions += len(pred)

    # Ground-truth (text, label) pairs that no prediction has claimed yet
    unmatched_gt = [(g['text'], g['label'].lower()) for g in gt]

    # Simple matching: same text and label, each ground-truth entity used once
    for p in pred:
        key = (p['text'], p['label'].lower())
        if key in unmatched_gt:
            unmatched_gt.remove(key)  # consume so a duplicate prediction cannot reuse it
            correct_predictions += 1

# Calculate metrics (each falls back to 0 when its denominator is 0)
precision = correct_predictions / total_predictions if total_predictions > 0 else 0
recall = correct_predictions / total_ground_truth if total_ground_truth > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"\nResults:")
print(f"Ground Truth Entities: {total_ground_truth}")
print(f"Predicted Entities: {total_predictions}")
print(f"Correct Predictions: {correct_predictions}")
print(f"\nPrecision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1 Score: {f1:.2%}")


Results:
Ground Truth Entities: 179
Predicted Entities: 195
Correct Predictions: 109

Precision: 55.90%
Recall: 60.89%
F1 Score: 58.29%


## Show Examples

In [11]:
# Print gold vs. predicted entities for the first 3 results
separator = '=' * 80

for example_no, result in enumerate(results[:3], start=1):
    # Header with the example number and a 200-character text preview
    print(f"\n{separator}")
    print(f"Example {example_no}")
    print(separator)
    print(f"Text: {result['text'][:200]}...")

    # Gold annotations
    print(f"\nGround Truth ({len(result['ground_truth'])} entities):")
    for entity in result['ground_truth']:
        print(f"  - {entity['text']}: {entity['label']}")

    # Model output, with the confidence score of each entity
    print(f"\nPredictions ({len(result['predictions'])} entities):")
    for entity in result['predictions']:
        print(f"  - {entity['text']}: {entity['label']} (score: {entity['score']:.2f})")


Example 1
Text: Survey Date: January 26th, 1985 
City: Oak Grove 
How often do you encounter the following stressors? 
- Taxes and paperwork: Tax number 660-03-8442 
- Financial management: Credit Card Number 6290812...

Ground Truth (3 entities):
  - January 26th, 1985: date
  - 660-03-8442: tax identification number
  - 6290812888615710: credit card number

Predictions (4 entities):
  - January 26th, 1985: date (score: 1.00)
  - Oak Grove: address (score: 0.93)
  - 660-03-8442: tax identification number (score: 1.00)
  - 6290812888615710: credit card number (score: 1.00)

Example 2
Text: John Doe applied for a mortgage at Western Credit Union. His social security number, 987-65-4321, was needed for the application. John also provided his passport, with the number A123456789....

Ground Truth (4 entities):
  - John Doe: full name
  - Western Credit Union: organization
  - 987-65-4321: social security number
  - A123456789: passport number

Predictions (4 entities):
  - John Doe: full