# Combine Positive and Negative PII Examples

This notebook combines:
- Positive examples (data with PII) from balanced_augmented_relabeled.ndjson
- Negative examples (data without PII) from verified_negative_examples.ndjson

Output: Combined dataset for model evaluation

In [1]:
import json
import pandas as pd
from pathlib import Path
import sys

# Add parent directory to path to import utils
sys.path.insert(0, str(Path('../utils').resolve()))
from models_config import SIMPLIFIED_24_LABELS, LABEL_CONSOLIDATION_MAP

## 1. Load Positive Examples (with PII)

In [2]:
# Load positive examples
positive_file = '../data/balanced_augmented_relabeled.ndjson'

positive_examples = []
with open(positive_file, 'r') as f:
    for line in f:
        line = line.strip()
        if line.endswith(','):
            line = line[:-1]
        if line:
            positive_examples.append(json.loads(line))

print(f"Loaded {len(positive_examples)} positive examples (with PII)")
print(f"Sample: {positive_examples[0]['text'][:100]}...")

Loaded 1827 positive examples (with PII)
Sample: Survey Date: January 26th, 1985 
City: Oak Grove 
How often do you encounter the following stressors...


## 2. Load Negative Examples (without PII)

In [3]:
# Load negative examples
negative_file = '../data/verified_negative_examples.ndjson'

negative_examples = []
with open(negative_file, 'r') as f:
    for line in f:
        line = line.strip()
        if line.endswith(','):
            line = line[:-1]
        if line:
            negative_examples.append(json.loads(line))

print(f"Loaded {len(negative_examples)} negative examples (without PII)")
print(f"Sample: {negative_examples[0]['text'][:100]}...")

Loaded 469 negative examples (without PII)
Sample: Write a CSS code to style a paragraph with a custom font from Google Fonts....


## 3. Check the Format

In [4]:
# Check positive example format
print("Positive example structure:")
print(json.dumps(positive_examples[0], indent=2))
print()

# Check negative example format
print("Negative example structure:")
print(json.dumps(negative_examples[0], indent=2))

Positive example structure:
{
  "text": "Survey Date: January 26th, 1985 \nCity: Oak Grove \nHow often do you encounter the following stressors? \n- Taxes and paperwork: Tax number 660-03-8442 \n- Financial management: Credit Card Number 6290812888615710 \nDo you notice stress symptoms through eye color changes? Hazel",
  "entities": [
    {
      "text": "January 26th, 1985",
      "label": "date",
      "start": 13,
      "end": 31
    },
    {
      "text": "660-03-8442",
      "label": "tax identification number",
      "start": 137,
      "end": 148
    },
    {
      "text": "6290812888615710",
      "label": "credit card number",
      "start": 193,
      "end": 209
    }
  ],
  "source_dataset": "ai4privacy-400k",
  "sample_idx": 0,
  "original_entities": [
    {
      "entity": "January 26th, 1985",
      "types": [
        "date"
      ],
      "start": 13,
      "end": 31,
      "original_type": "dateofbirth",
      "canonical_type": "dateofbirth"
    },
    {
      "entity"

## 4. Standardize Format

Make sure both have the same structure: `text` and `entities`

In [5]:
# Standardize positive examples
standardized_positive = []
for item in positive_examples:
    standardized = {
        'text': item['text'],
        'entities': item.get('normalized_entities', item.get('entities', [])),
        'has_pii': True
    }
    standardized_positive.append(standardized)

# Standardize negative examples
standardized_negative = []
for item in negative_examples:
    standardized = {
        'text': item['text'],
        'entities': [],  # Negative examples have no entities
        'has_pii': False
    }
    standardized_negative.append(standardized)

print(f"Standardized {len(standardized_positive)} positive examples")
print(f"Standardized {len(standardized_negative)} negative examples")

Standardized 1827 positive examples
Standardized 469 negative examples


## 4.5. Consolidate Similar Labels

Merge similar entity types to simplify the label set

In [6]:
# Use centralized label mapping from models_config
print(f"Using label consolidation mapping with {len(LABEL_CONSOLIDATION_MAP)} mappings")
print(f"Target: {len(SIMPLIFIED_24_LABELS)} simplified labels")

# Apply mapping to positive examples
consolidated_count = 0
consolidation_stats = {}

for item in positive_examples:
    entities = item.get('entities', [])
    for entity in entities:
        old_label = entity.get('label', '')
        if old_label in LABEL_CONSOLIDATION_MAP:
            new_label = LABEL_CONSOLIDATION_MAP[old_label]
            entity['label'] = new_label
            consolidated_count += 1
            
            # Track stats
            if old_label not in consolidation_stats:
                consolidation_stats[old_label] = 0
            consolidation_stats[old_label] += 1

print(f"\nConsolidated {consolidated_count} entity labels")
print(f"\nLabel mappings applied:")
for old, new in sorted(consolidation_stats.items(), key=lambda x: x[1], reverse=True):
    print(f"  {old} → {LABEL_CONSOLIDATION_MAP[old]} ({consolidation_stats[old]} entities)")

Using label consolidation mapping with 42 mappings
Target: 23 simplified labels

Consolidated 678 entity labels

Label mappings applied:
  drivers license number → driver's license number (190 entities)
  health insurance id number → insurance number (164 entities)
  identity card number → identification number (158 entities)
  mobile phone number → phone number (99 entities)
  insurance plan number → insurance number (42 entities)
  student id number → identification number (19 entities)
  birth certificate number → identification number (4 entities)
  national health insurance number → insurance number (2 entities)


## 4.7. Remove Duplicate Entities

Remove duplicate entities from ground truth (same text appearing multiple times)

In [7]:
# Remove duplicate entities from each sample
duplicate_count = 0
total_before = 0

for item in positive_examples:
    entities = item.get('entities', [])
    total_before += len(entities)
    
    # Track seen entities (case-insensitive, by text and label)
    seen = set()
    unique_entities = []
    
    for entity in entities:
        # Create unique key from text and label
        key = (entity.get('text', '').lower().strip(), entity.get('label', ''))
        
        if key not in seen:
            seen.add(key)
            unique_entities.append(entity)
        else:
            duplicate_count += 1
    
    item['entities'] = unique_entities

total_after = sum(len(item.get('entities', [])) for item in positive_examples)

print(f"Deduplication complete:")
print(f"  Total entities before: {total_before}")
print(f"  Total entities after: {total_after}")
print(f"  Duplicates removed: {duplicate_count}")

Deduplication complete:
  Total entities before: 7603
  Total entities after: 7026
  Duplicates removed: 577


## 5. Combine and Shuffle

In [8]:
import random

# Combine all examples
combined_data = standardized_positive + standardized_negative

# Shuffle the data
random.seed(42)  # For reproducibility
random.shuffle(combined_data)

print(f"Total combined examples: {len(combined_data)}")
print(f"  - Positive (with PII): {len(standardized_positive)}")
print(f"  - Negative (no PII): {len(standardized_negative)}")

Total combined examples: 2296
  - Positive (with PII): 1827
  - Negative (no PII): 469


## 6. Save Combined Dataset

In [9]:
# Save as NDJSON
output_file = '../data/gold_test_24labels_2300sampels.ndjson'

with open(output_file, 'w') as f:
    for item in combined_data:
        f.write(json.dumps(item) + '\n')

print(f"Saved combined dataset to: {output_file}")
print(f"Total samples: {len(combined_data)}")

Saved combined dataset to: ../data/gold_test_24labels_2300sampels.ndjson
Total samples: 2296


## 7. Quick Statistics

In [10]:
# Calculate statistics
num_positive = sum(1 for item in combined_data if item['has_pii'])
num_negative = sum(1 for item in combined_data if not item['has_pii'])
total_entities = sum(len(item['entities']) for item in combined_data)

print("Dataset Statistics:")
print(f"  Total samples: {len(combined_data)}")
print(f"  Positive (with PII): {num_positive} ({num_positive/len(combined_data)*100:.1f}%)")
print(f"  Negative (no PII): {num_negative} ({num_negative/len(combined_data)*100:.1f}%)")
print(f"  Total entities: {total_entities}")
print(f"  Avg entities per positive sample: {total_entities/num_positive:.1f}")

Dataset Statistics:
  Total samples: 2296
  Positive (with PII): 1827 (79.6%)
  Negative (no PII): 469 (20.4%)
  Total entities: 7603
  Avg entities per positive sample: 4.2


## 8. Entity Type Distribution

In [11]:
# Count entities by label
from collections import Counter

entity_counts = Counter()

for item in combined_data:
    for entity in item['entities']:
        label = entity.get('label', entity.get('types', ['unknown'])[0] if 'types' in entity else 'unknown')
        entity_counts[label] += 1

# Sort by count (descending)
sorted_counts = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)

print("Entity Type Distribution:")
print(f"{'Label':<35} {'Count':>8} {'%':>8}")
print("-" * 53)

for label, count in sorted_counts:
    percentage = (count / total_entities * 100) if total_entities > 0 else 0
    print(f"{label:<35} {count:>8} {percentage:>7.1f}%")

print("-" * 53)
print(f"{'Total':<35} {total_entities:>8} {'100.0%':>8}")

Entity Type Distribution:
Label                                  Count        %
-----------------------------------------------------
full name                               1195    15.7%
date                                     950    12.5%
email address                            601     7.9%
organization                             461     6.1%
address                                  452     5.9%
phone number                             436     5.7%
medical condition                        415     5.5%
username                                 390     5.1%
amount                                   283     3.7%
bank account number                      247     3.2%
credit card number                       236     3.1%
tax identification number                222     2.9%
insurance number                         208     2.7%
medication                               207     2.7%
social security number                   203     2.7%
driver's license number                  190     2.5%
me