# Create Per-Label Datasets

Map all dataset labels to 24 simplified labels and extract up to 10k samples per label.

In [1]:
import json
import os
import random
from collections import defaultdict, Counter

# Paths
UNIFIED_DIR = "/Users/sravan/Documents/Experiments/fintuning_PII/Data/additional_datasets/unified"
OUTPUT_DIR = "/Users/sravan/Documents/Experiments/fintuning_PII/Data/training_by_label"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MAX_SAMPLES_PER_LABEL = 10000

## 1. Define the 24 Simplified Labels

In [2]:
SIMPLIFIED_24_LABELS = [
    # Personal
    "date",
    "full name",
    "username",
    
    # Government/Official IDs
    "social security number",
    "tax identification number",
    "passport number",
    "driver's license number",
    "identification number",
    
    # Contact
    "phone number",
    "address",
    "email address",
    "ip address",
    "fax number",
    
    # Financial
    "credit card number",
    "credit score",
    "bank account number",
    "amount",
    "iban",
    "insurance number",
    
    # Medical
    "medical condition",
    "medication",
    "medical treatment",
    
    # Organization
    "organization",
    
    # URL
    "url",
]

print(f"Total labels: {len(SIMPLIFIED_24_LABELS)}")
for i, label in enumerate(SIMPLIFIED_24_LABELS, 1):
    print(f"  {i:2d}. {label}")

Total labels: 24
   1. date
   2. full name
   3. username
   4. social security number
   5. tax identification number
   6. passport number
   7. driver's license number
   8. identification number
   9. phone number
  10. address
  11. email address
  12. ip address
  13. fax number
  14. credit card number
  15. credit score
  16. bank account number
  17. amount
  18. iban
  19. insurance number
  20. medical condition
  21. medication
  22. medical treatment
  23. organization
  24. url


## 2. Create Comprehensive Label Mapping

Map all variations to the 24 simplified labels.

In [3]:
# Comprehensive mapping from various label formats to SIMPLIFIED_24_LABELS
# Keys are lowercase for case-insensitive matching

LABEL_MAP = {
    # ==================== DATE ====================
    "date": "date",
    "dob": "date",
    "date of birth": "date",
    "date_of_birth": "date",
    "dateofbirth": "date",
    "birthday": "date",
    "birth_date": "date",
    "birth date": "date",
    "datum": "date",  # German/Dutch
    "fecha": "date",  # Spanish
    "data": "date",   # Italian/Portuguese
    "date_time": "date",
    "datetime": "date",
    "time": "date",
    "expiry date": "date",
    "expiry_date": "date",
    "issue date": "date",
    "hire date": "date",
    
    # ==================== FULL NAME ====================
    "full name": "full name",
    "fullname": "full name",
    "full_name": "full name",
    "name": "full name",
    "person": "full name",
    "person name": "full name",
    "person_name": "full name",
    "per": "full name",  # beki_privy
    "firstname": "full name",
    "first_name": "full name",
    "first name": "full name",
    "lastname": "full name",
    "last_name": "full name",
    "last name": "full name",
    "givenname": "full name",
    "given_name": "full name",
    "surname": "full name",
    "middlename": "full name",
    "middle_name": "full name",
    "prefix": "full name",
    "suffix": "full name",
    "title": "full name",
    # Multilingual
    "nom": "full name",        # French
    "nome": "full name",       # Italian/Portuguese
    "nombre": "full name",     # Spanish
    "όνομα": "full name",      # Greek
    "osebno ime": "full name", # Slovenian
    "persoon": "full name",    # Dutch
    "imię": "full name",       # Polish
    "ime": "full name",        # Slovenian/Croatian
    
    # ==================== USERNAME ====================
    "username": "username",
    "user_name": "username",
    "user name": "username",
    "userid": "username",
    "user_id": "username",
    "user id": "username",
    "login": "username",
    "handle": "username",
    "nickname": "username",
    "password": "username",  # Group with username for account security
    
    # ==================== SOCIAL SECURITY NUMBER ====================
    "social security number": "social security number",
    "social_security_number": "social security number",
    "ssn": "social security number",
    "socialsecuritynumber": "social security number",
    "us_ssn": "social security number",
    "social insurance number": "social security number",
    "sin": "social security number",  # Canada
    "nino": "social security number", # UK National Insurance
    
    # ==================== TAX ID ====================
    "tax identification number": "tax identification number",
    "tax_identification_number": "tax identification number",
    "tax id": "tax identification number",
    "tax_id": "tax identification number",
    "taxid": "tax identification number",
    "tin": "tax identification number",
    "us_itin": "tax identification number",
    "itin": "tax identification number",
    "ein": "tax identification number",  # Employer ID
    "vat number": "tax identification number",
    "vat_number": "tax identification number",
    
    # ==================== PASSPORT ====================
    "passport number": "passport number",
    "passport_number": "passport number",
    "passportnumber": "passport number",
    "passport": "passport number",
    
    # ==================== DRIVER'S LICENSE ====================
    "driver's license number": "driver's license number",
    "drivers license number": "driver's license number",
    "drivers_license_number": "driver's license number",
    "driver_license": "driver's license number",
    "driverslicense": "driver's license number",
    "driving license": "driver's license number",
    "driving_license": "driver's license number",
    
    # ==================== IDENTIFICATION NUMBER (catch-all) ====================
    "identification number": "identification number",
    "identification_number": "identification number",
    "id number": "identification number",
    "id_number": "identification number",
    "idnumber": "identification number",
    "national id": "identification number",
    "national_id": "identification number",
    "nationalid": "identification number",
    "identity card number": "identification number",
    "identity_card_number": "identification number",
    "idcardnum": "identification number",
    "student id": "identification number",
    "student_id": "identification number",
    "student id number": "identification number",
    "student_id_number": "identification number",
    "employee id": "identification number",
    "employee_id": "identification number",
    "customer id": "identification number",
    "customer_id": "identification number",
    "medical record number": "identification number",
    "medical_record_number": "identification number",
    "mrn": "identification number",
    "birth certificate number": "identification number",
    "birth_certificate_number": "identification number",
    "id card number": "identification number",
    "voter id": "identification number",
    "license plate": "identification number",
    "license_plate": "identification number",
    "vehicle registration": "identification number",
    
    # ==================== PHONE NUMBER ====================
    "phone number": "phone number",
    "phone_number": "phone number",
    "phonenumber": "phone number",
    "phone": "phone number",
    "telephone": "phone number",
    "telephonenum": "phone number",
    "telephone number": "phone number",
    "telephone_number": "phone number",
    "mobile": "phone number",
    "mobile phone": "phone number",
    "mobile_phone": "phone number",
    "mobile phone number": "phone number",
    "mobile_phone_number": "phone number",
    "cell phone": "phone number",
    "cell_phone": "phone number",
    # Multilingual
    "telefonska številka": "phone number",  # Slovenian
    "stacionarna številka": "phone number", # Slovenian landline
    "telefon": "phone number",
    "numéro de téléphone": "phone number",
    
    # ==================== ADDRESS ====================
    "address": "address",
    "street address": "address",
    "street_address": "address",
    "streetaddress": "address",
    "home address": "address",
    "mailing address": "address",
    "postal address": "address",
    "city": "address",
    "state": "address",
    "country": "address",
    "zipcode": "address",
    "zip_code": "address",
    "zip code": "address",
    "postal code": "address",
    "postal_code": "address",
    "postalcode": "address",
    "county": "address",
    "buildingnumber": "address",
    "building_number": "address",
    "buildingnum": "address",
    "street": "address",
    "street_name": "address",
    "loc": "address",  # beki_privy location
    "location": "address",
    "geo": "address",
    "coordinates": "address",
    "latitude": "address",
    "longitude": "address",
    # Multilingual
    "adresse": "address",  # French/German
    "indirizzo": "address", # Italian
    "dirección": "address", # Spanish
    "naslov": "address",    # Slovenian
    
    # ==================== EMAIL ====================
    "email address": "email address",
    "email_address": "email address",
    "emailaddress": "email address",
    "email": "email address",
    "e-mail": "email address",
    "mail": "email address",
    
    # ==================== IP ADDRESS ====================
    "ip address": "ip address",
    "ip_address": "ip address",
    "ipaddress": "ip address",
    "ip": "ip address",
    "ipv4": "ip address",
    "ipv6": "ip address",
    "mac address": "ip address",
    "mac_address": "ip address",
    
    # ==================== FAX ====================
    "fax number": "fax number",
    "fax_number": "fax number",
    "faxnumber": "fax number",
    "fax": "fax number",
    
    # ==================== CREDIT CARD ====================
    "credit card number": "credit card number",
    "credit_card_number": "credit card number",
    "creditcardnumber": "credit card number",
    "credit card": "credit card number",
    "credit_card": "credit card number",
    "creditcard": "credit card number",
    "card number": "credit card number",
    "card_number": "credit card number",
    "debit card": "credit card number",
    "debit_card": "credit card number",
    "cvv": "credit card number",
    "cvc": "credit card number",
    
    # ==================== CREDIT SCORE ====================
    "credit score": "credit score",
    "credit_score": "credit score",
    "creditscore": "credit score",
    "fico score": "credit score",
    
    # ==================== BANK ACCOUNT ====================
    "bank account number": "bank account number",
    "bank_account_number": "bank account number",
    "bankaccountnumber": "bank account number",
    "bank account": "bank account number",
    "bank_account": "bank account number",
    "account number": "bank account number",
    "account_number": "bank account number",
    "accountnum": "bank account number",
    "routing number": "bank account number",
    "routing_number": "bank account number",
    "swift": "bank account number",
    "swift code": "bank account number",
    "swift_code": "bank account number",
    "swift_bic_code": "bank account number",
    "bic": "bank account number",
    
    # ==================== AMOUNT ====================
    "amount": "amount",
    "bank account balance": "amount",
    "bank_account_balance": "amount",
    "balance": "amount",
    "transaction amount": "amount",
    "transaction_amount": "amount",
    "salary": "amount",
    "income": "amount",
    "price": "amount",
    "cost": "amount",
    "payment": "amount",
    "financial": "amount",  # beki_privy
    "money": "amount",
    "currency": "amount",
    
    # ==================== IBAN ====================
    "iban": "iban",
    "international bank account number": "iban",
    
    # ==================== INSURANCE ====================
    "insurance number": "insurance number",
    "insurance_number": "insurance number",
    "health insurance number": "insurance number",
    "health_insurance_number": "insurance number",
    "health insurance id": "insurance number",
    "health_insurance_id": "insurance number",
    "health insurance id number": "insurance number",
    "national health insurance number": "insurance number",
    "national_health_insurance_number": "insurance number",
    "insurance plan number": "insurance number",
    "insurance_plan_number": "insurance number",
    "policy number": "insurance number",
    "policy_number": "insurance number",
    
    # ==================== MEDICAL CONDITION ====================
    "medical condition": "medical condition",
    "medical_condition": "medical condition",
    "medicalcondition": "medical condition",
    "condition": "medical condition",
    "diagnosis": "medical condition",
    "disease": "medical condition",
    "illness": "medical condition",
    "disorder": "medical condition",
    "syndrome": "medical condition",
    "symptoms": "medical condition",
    
    # ==================== MEDICATION ====================
    "medication": "medication",
    "medicine": "medication",
    "drug": "medication",
    "prescription": "medication",
    "drug name": "medication",
    
    # ==================== MEDICAL TREATMENT ====================
    "medical treatment": "medical treatment",
    "medical_treatment": "medical treatment",
    "treatment": "medical treatment",
    "procedure": "medical treatment",
    "surgery": "medical treatment",
    "therapy": "medical treatment",
    
    # ==================== ORGANIZATION ====================
    "organization": "organization",
    "organisation": "organization",
    "org": "organization",
    "company": "organization",
    "company name": "organization",
    "company_name": "organization",
    "employer": "organization",
    "business": "organization",
    "corporation": "organization",
    "institution": "organization",
    "agency": "organization",
    "bank name": "organization",
    "hospital": "organization",
    "school": "organization",
    "university": "organization",
    # Multilingual
    "organizacija": "organization",  # Slovenian
    "klinika": "organization",       # Slovenian clinic
    "entreprise": "organization",    # French
    
    # ==================== URL ====================
    "url": "url",
    "website": "url",
    "web address": "url",
    "link": "url",
    "uri": "url",
    "domain": "url",
}

print(f"Total mappings defined: {len(LABEL_MAP)}")

Total mappings defined: 291


## 3. Load All Unified Datasets

In [4]:
# Load all unified datasets
all_data = []

for filename in sorted(os.listdir(UNIFIED_DIR)):
    if filename.endswith('_unified.json'):
        path = os.path.join(UNIFIED_DIR, filename)
        with open(path) as f:
            data = json.load(f)
        all_data.extend(data)
        print(f"Loaded {filename}: {len(data):,} samples")

print(f"\nTotal samples: {len(all_data):,}")

Loaded ai4privacy_200k_unified.json: 209,261 samples
Loaded ai4privacy_400k_unified.json: 50,000 samples
Loaded beki_privy_unified.json: 100,951 samples
Loaded e3jsi_unified.json: 2,971 samples
Loaded gliner_pii_unified.json: 3,764 samples
Loaded gretel_finance_unified.json: 5,594 samples
Loaded gretel_pii_en_unified.json: 5,000 samples
Loaded nvidia_nemotron_unified.json: 100,000 samples
Loaded urchade_unified.json: 19,635 samples

Total samples: 497,176


## 4. Map Labels and Collect Samples

In [5]:
def normalize_label(label):
    """Normalize a label to lowercase and map to simplified label."""
    label_lower = label.lower().strip()
    return LABEL_MAP.get(label_lower, None)

# Collect samples for each simplified label
# Each sample is (source_text, language, source, entity_info)
samples_by_label = defaultdict(list)
unmapped_labels = Counter()

for item in all_data:
    source_text = item.get('source_text', '')
    language = item.get('language', 'en')
    source = item.get('source', 'unknown')
    
    for entity in item.get('privacy_mask', []):
        original_label = entity.get('label', '')
        simplified_label = normalize_label(original_label)
        
        if simplified_label:
            # Store the sample with entity info
            sample = {
                'source_text': source_text,
                'language': language,
                'source': source,
                'original_label': original_label,
                'entity': {
                    'label': simplified_label,
                    'start': entity.get('start', 0),
                    'end': entity.get('end', 0),
                    'value': entity.get('value', '')
                }
            }
            samples_by_label[simplified_label].append(sample)
        else:
            unmapped_labels[original_label] += 1

print("Samples collected per label:")
print("=" * 60)
for label in SIMPLIFIED_24_LABELS:
    count = len(samples_by_label[label])
    print(f"  {label:30s} {count:>10,}")

print(f"\nTotal unmapped labels: {len(unmapped_labels)}")
print(f"Total unmapped entities: {sum(unmapped_labels.values()):,}")

Samples collected per label:
  date                              226,560
  full name                         371,165
  username                           62,113
  social security number             23,929
  tax identification number           5,478
  passport number                     8,720
  driver's license number             1,123
  identification number              59,373
  phone number                       53,749
  address                           250,862
  email address                      88,086
  ip address                         51,446
  fax number                          6,522
  credit card number                 28,655
  credit score                            2
  bank account number                26,656
  amount                             30,997
  iban                               10,658
  insurance number                    5,904
  medical condition                   1,624
  medication                          1,816
  medical treatment                     141
  o

## 5. Show Top Unmapped Labels

In [6]:
print("Top 30 unmapped labels (consider adding to mapping):")
print("=" * 60)
for label, count in unmapped_labels.most_common(30):
    print(f"  {label:40s} {count:>8,}")

Top 30 unmapped labels (consider adding to mapping):
  occupation                                 37,099
  AGE                                        15,733
  NRP                                        13,789
  SEX                                        13,528
  JOBTYPE                                    13,433
  CURRENCYSYMBOL                             13,147
  credit_debit_card                          12,867
  GENDER                                     12,847
  JOBTITLE                                   12,828
  JOBAREA                                    12,681
  ACCOUNTNAME                                12,533
  ACCOUNTNUMBER                              12,473
  COMPANYNAME                                12,167
  SECONDARYADDRESS                           12,008
  BITCOINADDRESS                             11,682
  biometric_identifier                       11,520
  employment_status                          11,018
  MASKEDNUMBER                               10,893
  health_pl

## 6. Save Per-Label Datasets (max 10k samples each)

In [7]:
# Save each label to a separate file
print("Saving per-label datasets...")
print("=" * 60)

for label in SIMPLIFIED_24_LABELS:
    samples = samples_by_label[label]
    
    if not samples:
        print(f"  {label:30s} SKIPPED (no samples)")
        continue
    
    # Shuffle and take up to MAX_SAMPLES
    random.shuffle(samples)
    selected = samples[:MAX_SAMPLES_PER_LABEL]
    
    # Convert to unified format for training
    output_samples = []
    for s in selected:
        output_samples.append({
            'source_text': s['source_text'],
            'language': s['language'],
            'source': s['source'],
            'privacy_mask': [s['entity']]
        })
    
    # Save to file
    filename = label.replace("'", "").replace(" ", "_") + ".json"
    output_path = os.path.join(OUTPUT_DIR, filename)
    
    with open(output_path, 'w') as f:
        json.dump(output_samples, f)
    
    size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print(f"  {label:30s} {len(selected):>6,} samples ({size_mb:.1f} MB)")

print("\nDone!")

Saving per-label datasets...
  date                           10,000 samples (9.3 MB)
  full name                      10,000 samples (6.8 MB)
  username                       10,000 samples (7.4 MB)
  social security number         10,000 samples (5.9 MB)
  tax identification number       5,478 samples (3.7 MB)
  passport number                 8,720 samples (6.6 MB)
  driver's license number         1,123 samples (0.8 MB)
  identification number          10,000 samples (10.2 MB)
  phone number                   10,000 samples (8.2 MB)
  address                        10,000 samples (7.4 MB)
  email address                  10,000 samples (10.1 MB)
  ip address                     10,000 samples (6.5 MB)
  fax number                      6,522 samples (8.4 MB)
  credit card number             10,000 samples (5.9 MB)
  credit score                        2 samples (0.0 MB)
  bank account number            10,000 samples (8.7 MB)
  amount                         10,000 samples (4.0 MB)


## 7. Summary

In [8]:
print(f"\nOutput directory: {OUTPUT_DIR}")
print("\nFiles created:")
print("=" * 60)

total_samples = 0
for filename in sorted(os.listdir(OUTPUT_DIR)):
    if filename.endswith('.json'):
        path = os.path.join(OUTPUT_DIR, filename)
        with open(path) as f:
            data = json.load(f)
        size_mb = os.path.getsize(path) / (1024 * 1024)
        total_samples += len(data)
        print(f"  {filename:40s} {len(data):>6,} samples ({size_mb:.1f} MB)")

print(f"\nTotal: {total_samples:,} samples")


Output directory: /Users/sravan/Documents/Experiments/fintuning_PII/Data/training_by_label

Files created:
  address.json                             10,000 samples (7.4 MB)
  amount.json                              10,000 samples (4.0 MB)
  bank_account_number.json                 10,000 samples (8.7 MB)
  credit_card_number.json                  10,000 samples (5.9 MB)
  credit_score.json                             2 samples (0.0 MB)
  date.json                                10,000 samples (9.3 MB)
  drivers_license_number.json               1,123 samples (0.8 MB)
  email_address.json                       10,000 samples (10.1 MB)
  fax_number.json                           6,522 samples (8.4 MB)
  full_name.json                           10,000 samples (6.8 MB)
  iban.json                                10,000 samples (3.9 MB)
  identification_number.json               10,000 samples (10.2 MB)
  insurance_number.json                     5,904 samples (4.4 MB)
  ip_address.json  