# PII Dataset Collection - Banking Domain

Collects and inspects all PII/NER datasets relevant to banking.

**Datasets covered:**
1. `ai4privacy/pii-masking-400k`
2. `ai4privacy/pii-masking-300k` (FinPII-80k split)
3. `gretelai/synthetic_pii_finance_multilingual`
4. `nvidia/Nemotron-PII`
5. `wikiann` (en)
6. `Babelscape/multinerd` (en)
7. `DFKI-SLT/few-nerd`
8. `conll2003`
9. `nlpaueb/finer-139`
10. `iiiorg/piiranha-v1-detect-personal-information`

## Imports and output directory

In [14]:
import os
import json
import warnings
from pathlib import Path
from collections import Counter

import pandas as pd
from datasets import load_dataset, get_dataset_config_names
from tabulate import tabulate

warnings.filterwarnings('ignore')


In [15]:

OUTPUT_DIR = Path('./pii_datasets')
OUTPUT_DIR.mkdir(exist_ok=True)

print(f'Output directory: {OUTPUT_DIR.resolve()}')

Output directory: /home/pritesh-jha/projects/pii-detection/pii-detection/notebooks/pii_datasets


## Helper utilities

In [3]:
def count_labels_from_bio(dataset, label_field='ner_tags', label_names=None):
    """
    Count unique entity types from a BIO-tagged HuggingFace dataset split.
    Returns a set of entity type strings (without B-/I- prefix).
    """
    types = set()
    sample = dataset.select(range(min(500, len(dataset))))
    for row in sample:
        tags = row[label_field]
        for tag in tags:
            if isinstance(tag, int):
                if label_names:
                    tag = label_names[tag]
                else:
                    continue
            if tag != 'O' and tag != '':
                entity = tag.replace('B-', '').replace('I-', '').strip()
                if entity:
                    types.add(entity)
    return types


def save_split(dataset, name, split_name):
    """
    Save a dataset split to disk as JSONL.
    """
    out_path = OUTPUT_DIR / name
    out_path.mkdir(exist_ok=True)
    filepath = out_path / f'{split_name}.jsonl'
    dataset.to_json(str(filepath))
    size_mb = filepath.stat().st_size / (1024 * 1024)
    print(f'  Saved {split_name}: {len(dataset):,} rows -> {filepath} ({size_mb:.1f} MB)')
    return filepath


def build_summary_row(name, url, license_, lang, num_rows, num_entity_types,
                      entity_types, annotation_source, domain, banking_relevance, notes):
    return {
        'Dataset': name,
        'URL': url,
        'License': license_,
        'Language(s)': lang,
        'Total Rows': num_rows,
        'PII Entity Types (#)': num_entity_types,
        'Entity Types (sample)': entity_types,
        'Annotation Source': annotation_source,
        'Domain': domain,
        'Banking Relevance': banking_relevance,
        'Notes': notes
    }


summary_rows = []

## Dataset 1 — ai4privacy/pii-masking-400k

In [4]:
print('Loading ai4privacy/pii-masking-400k ...')
ds_400k = load_dataset('ai4privacy/pii-masking-400k')
print(ds_400k)

# Entity types come from bio_labels column
train_split = ds_400k['train']
bio_types = set()
for row in train_split.select(range(min(1000, len(train_split)))):
    for label in row.get('bio_labels', []):
        if label != 'O':
            bio_types.add(label.replace('B-', '').replace('I-', ''))

print(f'  Rows: {len(train_split):,}')
print(f'  Detected entity types ({len(bio_types)}): {sorted(bio_types)}')

save_split(train_split, 'ai4privacy_400k', 'train')

summary_rows.append(build_summary_row(
    name='ai4privacy/pii-masking-400k',
    url='https://huggingface.co/datasets/ai4privacy/pii-masking-400k',
    license_='Custom (academic free, commercial needs license)',
    lang='en, fr, de, it',
    num_rows=len(train_split),
    num_entity_types=63,
    entity_types='FIRSTNAME, LASTNAME, EMAIL, PHONE, CREDITCARDNUMBER, SSN, IBAN, BITCOINADDRESS, ...',
    annotation_source='Synthetic (proprietary algorithm)',
    domain='General (business, education, psychology, legal)',
    banking_relevance='High — covers IBAN, CREDITCARD, ACCOUNTNUMBER, BITCOINADDRESS',
    notes='Latest version. 63 PII classes. Use FinPII split in 300k for finance-specific.'
))

Loading ai4privacy/pii-masking-400k ...


Generating train split: 100%|██████████| 325517/325517 [00:00<00:00, 407626.75 examples/s]
Generating validation split: 100%|██████████| 81379/81379 [00:00<00:00, 455604.30 examples/s]


DatasetDict({
    train: Dataset({
        features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 325517
    })
    validation: Dataset({
        features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
        num_rows: 81379
    })
})
  Rows: 325,517
  Detected entity types (0): []


Creating json from Arrow format: 100%|██████████| 326/326 [00:03<00:00, 85.63ba/s]

  Saved train: 325,517 rows -> pii_datasets/ai4privacy_400k/train.jsonl (354.2 MB)





## Dataset 2 — ai4privacy/pii-masking-300k (FinPII)

In [5]:
print('Loading ai4privacy/pii-masking-300k ...')
# Check available configs
try:
    configs = get_dataset_config_names('ai4privacy/pii-masking-300k')
    print(f'  Available configs: {configs}')
except Exception:
    configs = ['default']

ds_300k = load_dataset('ai4privacy/pii-masking-300k')
print(ds_300k)

train_300k = ds_300k['train']
bio_types_300k = set()
for row in train_300k.select(range(min(1000, len(train_300k)))):
    for label in row.get('bio_labels', []):
        if label != 'O':
            bio_types_300k.add(label.replace('B-', '').replace('I-', ''))

print(f'  Rows: {len(train_300k):,}')
print(f'  Detected entity types ({len(bio_types_300k)}): {sorted(bio_types_300k)}')

save_split(train_300k, 'ai4privacy_300k', 'train')

summary_rows.append(build_summary_row(
    name='ai4privacy/pii-masking-300k',
    url='https://huggingface.co/datasets/ai4privacy/pii-masking-300k',
    license_='Custom (academic free, commercial needs license)',
    lang='en, fr, de, it, es, pt',
    num_rows=len(train_300k),
    num_entity_types='27 (OpenPII) + ~20 (FinPII)',
    entity_types='OpenPII-220k + FinPII-80k (finance/insurance-specific types)',
    annotation_source='Synthetic + human-in-loop (~98.3% token accuracy)',
    domain='General + Finance/Insurance (FinPII subset)',
    banking_relevance='Very High — FinPII-80k explicitly targets finance/insurance',
    notes='Best option for banking. FinPII contains ~20 finance-specific entity types.'
))

Loading ai4privacy/pii-masking-300k ...
  Available configs: ['default']


Generating train split: 100%|██████████| 177677/177677 [00:01<00:00, 163453.34 examples/s]
Generating validation split: 100%|██████████| 47728/47728 [00:00<00:00, 154189.72 examples/s]


DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 177677
    })
    validation: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 47728
    })
})
  Rows: 177,677
  Detected entity types (0): []


Creating json from Arrow format: 100%|██████████| 178/178 [00:05<00:00, 35.53ba/s]

  Saved train: 177,677 rows -> pii_datasets/ai4privacy_300k/train.jsonl (552.3 MB)





## Dataset 3 — gretelai/synthetic_pii_finance_multilingual

In [6]:
print('Loading gretelai/synthetic_pii_finance_multilingual ...')
ds_gretel = load_dataset('gretelai/synthetic_pii_finance_multilingual')
print(ds_gretel)

train_gretel = ds_gretel['train']
print(f'  Rows: {len(train_gretel):,}')
print(f'  Columns: {train_gretel.column_names}')

# Inspect a sample to understand label format
sample = train_gretel[0]
print(f'  Sample keys: {list(sample.keys())}')

# Get unique PII types from the dataset
pii_types_gretel = set()
label_col = None
for col in ['pii_class', 'entity_type', 'label', 'ner_tags', 'labels']:
    if col in train_gretel.column_names:
        label_col = col
        break

if label_col:
    for row in train_gretel.select(range(min(200, len(train_gretel)))):
        val = row[label_col]
        if isinstance(val, list):
            for v in val:
                pii_types_gretel.add(str(v))
        else:
            pii_types_gretel.add(str(val))

print(f'  PII types found: {sorted(pii_types_gretel)}')

save_split(train_gretel, 'gretel_finance', 'train')
if 'test' in ds_gretel:
    save_split(ds_gretel['test'], 'gretel_finance', 'test')

summary_rows.append(build_summary_row(
    name='gretelai/synthetic_pii_finance_multilingual',
    url='https://huggingface.co/datasets/gretelai/synthetic_pii_finance_multilingual',
    license_='Apache 2.0',
    lang='en, es, sv, de, it, nl, fr',
    num_rows=len(train_gretel),
    num_entity_types=29,
    entity_types='ACCOUNT_NUMBER, ROUTING_NUMBER, IBAN, CREDIT_CARD, SSN, TAX_ID, ...',
    annotation_source='Synthetic (Gretel LLM + GLiNER validation + LLM-as-judge)',
    domain='Finance (100 financial document types: bank statements, loan docs, wire transfers)',
    banking_relevance='Very High — explicitly covers banking document formats',
    notes='55,940 records. Avg doc length 1,357 chars. Best domain match for banking.'
))

Loading gretelai/synthetic_pii_finance_multilingual ...


Generating train split: 100%|██████████| 50346/50346 [00:00<00:00, 379188.08 examples/s]
Generating test split: 100%|██████████| 5594/5594 [00:00<00:00, 316613.18 examples/s]


DatasetDict({
    train: Dataset({
        features: ['level_0', 'index', 'document_type', 'document_description', 'expanded_type', 'expanded_description', 'language', 'language_description', 'domain', 'generated_text', 'pii_spans', 'conformance_score', 'quality_score', 'toxicity_score', 'bias_score', 'groundedness_score'],
        num_rows: 50346
    })
    test: Dataset({
        features: ['level_0', 'index', 'document_type', 'document_description', 'expanded_type', 'expanded_description', 'language', 'language_description', 'domain', 'generated_text', 'pii_spans', 'conformance_score', 'quality_score', 'toxicity_score', 'bias_score', 'groundedness_score'],
        num_rows: 5594
    })
})
  Rows: 50,346
  Columns: ['level_0', 'index', 'document_type', 'document_description', 'expanded_type', 'expanded_description', 'language', 'language_description', 'domain', 'generated_text', 'pii_spans', 'conformance_score', 'quality_score', 'toxicity_score', 'bias_score', 'groundedness_score']
 

Creating json from Arrow format: 100%|██████████| 51/51 [00:00<00:00, 117.01ba/s]


  Saved train: 50,346 rows -> pii_datasets/gretel_finance/train.jsonl (117.9 MB)


Creating json from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 123.75ba/s]

  Saved test: 5,594 rows -> pii_datasets/gretel_finance/test.jsonl (13.2 MB)





## Dataset 4 — nvidia/Nemotron-PII

In [7]:
print('Loading nvidia/Nemotron-PII ...')
ds_nvidia = load_dataset('nvidia/Nemotron-PII')
print(ds_nvidia)

train_nvidia = ds_nvidia['train']
print(f'  Rows: {len(train_nvidia):,}')
print(f'  Columns: {train_nvidia.column_names}')

# Get entity types
nvidia_types = set()
for row in train_nvidia.select(range(min(500, len(train_nvidia)))):
    for col in ['ner_tags', 'labels', 'bio_labels', 'label']:
        if col in row:
            val = row[col]
            if isinstance(val, list):
                for v in val:
                    tag = str(v)
                    if tag != 'O':
                        nvidia_types.add(tag.replace('B-', '').replace('I-', ''))
            break

print(f'  Entity types: {sorted(nvidia_types)}')

save_split(train_nvidia, 'nvidia_nemotron', 'train')

summary_rows.append(build_summary_row(
    name='nvidia/Nemotron-PII',
    url='https://huggingface.co/datasets/nvidia/Nemotron-PII',
    license_='CC-BY 4.0',
    lang='en',
    num_rows=len(train_nvidia),
    num_entity_types='55+',
    entity_types='PII + PHI: names, SSN, DOB, ACCOUNT, DEVICE_ID, IP, BIOMETRIC, ...',
    annotation_source='Synthetic (NVIDIA NeMo Data Designer, Census-grounded personas)',
    domain='General across 50+ industries including finance',
    banking_relevance='High — 50+ industries includes finance; covers PHI useful for KYC',
    notes='100k records. Structured + unstructured docs. CC-BY 4.0 = commercial-friendly.'
))

Loading nvidia/Nemotron-PII ...


Generating train split: 100%|██████████| 100000/100000 [00:00<00:00, 263374.95 examples/s]
Generating test split: 100%|██████████| 100000/100000 [00:00<00:00, 311244.81 examples/s]


DatasetDict({
    train: Dataset({
        features: ['uid', 'domain', 'document_type', 'document_description', 'document_format', 'locale', 'text', 'spans', 'text_tagged'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['uid', 'domain', 'document_type', 'document_description', 'document_format', 'locale', 'text', 'spans', 'text_tagged'],
        num_rows: 100000
    })
})
  Rows: 100,000
  Columns: ['uid', 'domain', 'document_type', 'document_description', 'document_format', 'locale', 'text', 'spans', 'text_tagged']
  Entity types: []


Creating json from Arrow format: 100%|██████████| 100/100 [00:01<00:00, 98.32ba/s]

  Saved train: 100,000 rows -> pii_datasets/nvidia_nemotron/train.jsonl (318.6 MB)





## Dataset 5 — wikiann (en)

In [8]:
print('Loading wikiann (en) ...')
ds_wikiann = load_dataset('wikiann', 'en')
print(ds_wikiann)

train_wiki = ds_wikiann['train']
label_names = train_wiki.features['ner_tags'].feature.names
print(f'  Labels: {label_names}')
print(f'  Rows: {len(train_wiki):,}')

save_split(train_wiki, 'wikiann', 'train')
save_split(ds_wikiann['validation'], 'wikiann', 'validation')
save_split(ds_wikiann['test'], 'wikiann', 'test')

entity_types_wiki = set(l.replace('B-', '').replace('I-', '') for l in label_names if l != 'O')

summary_rows.append(build_summary_row(
    name='wikiann (en)',
    url='https://huggingface.co/datasets/wikiann',
    license_='CC-BY-SA 3.0',
    lang='en (282 langs available)',
    num_rows=len(train_wiki) + len(ds_wikiann['validation']) + len(ds_wikiann['test']),
    num_entity_types=len(entity_types_wiki),
    entity_types=', '.join(sorted(entity_types_wiki)),
    annotation_source='Auto-annotated from Wikipedia using cross-lingual projection',
    domain='General (Wikipedia articles)',
    banking_relevance='Medium — PER/ORG/LOC only; good for entity grounding in text',
    notes='Only 3 entity types. Use as supplementary data for PER/ORG/LOC coverage.'
))

Loading wikiann (en) ...


Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 1628539.70 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 1936786.11 examples/s]
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 2366921.93 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})
  Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
  Rows: 20,000


Creating json from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 280.40ba/s]


  Saved train: 20,000 rows -> pii_datasets/wikiann/train.jsonl (3.8 MB)


Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 293.22ba/s]


  Saved validation: 10,000 rows -> pii_datasets/wikiann/validation.jsonl (1.9 MB)


Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 291.45ba/s]

  Saved test: 10,000 rows -> pii_datasets/wikiann/test.jsonl (1.9 MB)





## Dataset 6 — Babelscape/multinerd (en)

In [24]:
print('Loading Babelscape/multinerd ...')
ds_multinerd = load_dataset('Babelscape/multinerd', verification_mode='no_checks')
print(ds_multinerd)

train_mn = ds_multinerd['train']

# Filter English only
if 'lang' in train_mn.column_names:
    train_mn_en = train_mn.filter(lambda x: x['lang'] == 'en')
else:
    train_mn_en = train_mn

# ner_tags in this dataset is a Sequence of Value(int64), not ClassLabel.
# The integer-to-label mapping is documented in the dataset card.
multinerd_id2label = {
    0: 'O',
    1: 'B-PER', 2: 'I-PER',
    3: 'B-ORG', 4: 'I-ORG',
    5: 'B-LOC', 6: 'I-LOC',
    7: 'B-ANIM', 8: 'I-ANIM',
    9: 'B-BIO', 10: 'I-BIO',
    11: 'B-CEL', 12: 'I-CEL',
    13: 'B-DIS', 14: 'I-DIS',
    15: 'B-EVE', 16: 'I-EVE',
    17: 'B-FOOD', 18: 'I-FOOD',
    19: 'B-INST', 20: 'I-INST',
    21: 'B-MEDIA', 22: 'I-MEDIA',
    23: 'B-MYTH', 24: 'I-MYTH',
    25: 'B-PLANT', 26: 'I-PLANT',
    27: 'B-TIME', 28: 'I-TIME',
    29: 'B-VEHI', 30: 'I-VEHI',
}
entity_types_mn = set(
    v.replace('B-', '').replace('I-', '')
    for v in multinerd_id2label.values() if v != 'O'
)
print(f'  Labels: {sorted(entity_types_mn)}')
print(f'  EN rows: {len(train_mn_en):,}')

save_split(train_mn_en, 'multinerd', 'train_en')

summary_rows.append(build_summary_row(
    name='Babelscape/multinerd',
    url='https://huggingface.co/datasets/Babelscape/multinerd',
    license_='CC-BY-NC-SA 4.0',
    lang='en, de, es, fr, it, nl, pl, pt, ru, zh',
    num_rows=len(train_mn_en),
    num_entity_types=len(entity_types_mn),
    entity_types=', '.join(sorted(entity_types_mn)),
    annotation_source='Expert-annotated',
    domain='General (Wikipedia + news)',
    banking_relevance='Medium — PER/ORG/LOC plus TIME/EVE useful for transaction context',
    notes='15 types. NC license — not for commercial use as-is.'
))

Loading Babelscape/multinerd ...
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 1339200
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 167400
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 167993
    })
})
  Labels: ['ANIM', 'BIO', 'CEL', 'DIS', 'EVE', 'FOOD', 'INST', 'LOC', 'MEDIA', 'MYTH', 'ORG', 'PER', 'PLANT', 'TIME', 'VEHI']
  EN rows: 131,280


Creating json from Arrow format: 100%|██████████| 132/132 [00:00<00:00, 154.02ba/s]

  Saved train_en: 131,280 rows -> pii_datasets/multinerd/train_en.jsonl (30.4 MB)





## Dataset 7 — DFKI-SLT/few-nerd

In [10]:
print('Loading DFKI-SLT/few-nerd (supervised split) ...')
ds_fewnerd = load_dataset('DFKI-SLT/few-nerd', 'supervised')
print(ds_fewnerd)

train_fn = ds_fewnerd['train']
label_names_fn = train_fn.features['ner_tags'].feature.names

# Get unique types from sample
fn_types = set()
for row in train_fn.select(range(min(500, len(train_fn)))):
    for tag_id in row['ner_tags']:
        label = label_names_fn[tag_id]
        if label != 'O':
            fn_types.add(label.replace('B-', '').replace('I-', ''))

print(f'  Rows: {len(train_fn):,}')
print(f'  Entity types ({len(fn_types)}): {sorted(fn_types)[:20]} ...')

save_split(train_fn, 'few_nerd', 'train')
save_split(ds_fewnerd['validation'], 'few_nerd', 'validation')
save_split(ds_fewnerd['test'], 'few_nerd', 'test')

summary_rows.append(build_summary_row(
    name='DFKI-SLT/few-nerd',
    url='https://huggingface.co/datasets/DFKI-SLT/few-nerd',
    license_='CC-BY-SA 4.0',
    lang='en',
    num_rows=len(train_fn) + len(ds_fewnerd['validation']) + len(ds_fewnerd['test']),
    num_entity_types=66,
    entity_types='person-politician, org-company, location-city, product-software, ... (66 fine-grained)',
    annotation_source='Crowdsourced (188k sentences)',
    domain='General (Wikipedia)',
    banking_relevance='Low-Medium — org-company, location useful; no direct PII types',
    notes='Fine-grained NER. Useful for entity disambiguation, not direct PII tagging.'
))

Loading DFKI-SLT/few-nerd (supervised split) ...


Generating train split: 100%|██████████| 131767/131767 [00:00<00:00, 753340.40 examples/s]
Generating validation split: 100%|██████████| 18824/18824 [00:00<00:00, 855577.84 examples/s]
Generating test split: 100%|██████████| 37648/37648 [00:00<00:00, 1142251.68 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
        num_rows: 131767
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
        num_rows: 18824
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
        num_rows: 37648
    })
})
  Rows: 131,767
  Entity types (8): ['art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product'] ...


Creating json from Arrow format: 100%|██████████| 132/132 [00:00<00:00, 189.42ba/s]


  Saved train: 131,767 rows -> pii_datasets/few_nerd/train.jsonl (43.3 MB)


Creating json from Arrow format: 100%|██████████| 19/19 [00:00<00:00, 160.81ba/s]


  Saved validation: 18,824 rows -> pii_datasets/few_nerd/validation.jsonl (6.2 MB)


Creating json from Arrow format: 100%|██████████| 38/38 [00:00<00:00, 186.07ba/s]

  Saved test: 37,648 rows -> pii_datasets/few_nerd/test.jsonl (12.3 MB)





## Dataset 8 — CoNLL-2003

In [18]:
print('Loading conll2003 ...')
# conll2003 uses a legacy .py loading script blocked in datasets>=4.0.
# Use the auto-converted Parquet revision instead.
ds_conll = load_dataset('conll2003', revision='refs/convert/parquet')
print(ds_conll)

train_conll = ds_conll['train']
label_names_conll = train_conll.features['ner_tags'].feature.names
entity_types_conll = set(l.replace('B-', '').replace('I-', '') for l in label_names_conll if l != 'O')

print(f'  Rows: {len(train_conll):,}')
print(f'  Labels: {label_names_conll}')

save_split(train_conll, 'conll2003', 'train')
save_split(ds_conll['validation'], 'conll2003', 'validation')
save_split(ds_conll['test'], 'conll2003', 'test')

summary_rows.append(build_summary_row(
    name='conll2003',
    url='https://huggingface.co/datasets/conll2003',
    license_='Custom (non-commercial research)',
    lang='en',
    num_rows=len(train_conll) + len(ds_conll['validation']) + len(ds_conll['test']),
    num_entity_types=len(entity_types_conll),
    entity_types=', '.join(sorted(entity_types_conll)),
    annotation_source='Expert-annotated (newswire)',
    domain='News (Reuters 1996)',
    banking_relevance='Low — only PER/ORG/LOC/MISC; no financial PII',
    notes='Industry baseline. Already in your repo. Non-commercial license.'
))

Loading conll2003 ...


Generating train split: 14041 examples [00:00, 689451.09 examples/s]
Generating validation split: 3250 examples [00:00, 411877.21 examples/s]
Generating test split: 3453 examples [00:00, 504491.14 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
  Rows: 14,041
  Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


Creating json from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 194.50ba/s]


  Saved train: 14,041 rows -> pii_datasets/conll2003/train.jsonl (3.8 MB)


Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 242.61ba/s]


  Saved validation: 3,250 rows -> pii_datasets/conll2003/validation.jsonl (0.9 MB)


Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 261.16ba/s]

  Saved test: 3,453 rows -> pii_datasets/conll2003/test.jsonl (0.9 MB)





## Dataset 9 — nlpaueb/finer-139

In [19]:
print('Loading nlpaueb/finer-139 ...')
# finer-139 also uses a legacy .py script. Use the Parquet revision.
ds_finer = load_dataset('nlpaueb/finer-139', revision='refs/convert/parquet')
print(ds_finer)

train_finer = ds_finer['train']
label_names_finer = train_finer.features['ner_tags'].feature.names

# Sample a few entity types
finer_types = set()
for row in train_finer.select(range(min(1000, len(train_finer)))):
    for tag_id in row['ner_tags']:
        label = label_names_finer[tag_id]
        if label != 'O':
            finer_types.add(label.replace('B-', '').replace('I-', ''))

print(f'  Rows: {len(train_finer):,}')
print(f'  Entity type count: {len(label_names_finer)} labels ({len(finer_types)} found in sample)')
print(f'  Sample types: {sorted(list(finer_types))[:10]}')

save_split(train_finer, 'finer_139', 'train')
save_split(ds_finer['validation'], 'finer_139', 'validation')
save_split(ds_finer['test'], 'finer_139', 'test')

summary_rows.append(build_summary_row(
    name='nlpaueb/finer-139',
    url='https://huggingface.co/datasets/nlpaueb/finer-139',
    license_='CC-BY-SA 4.0',
    lang='en',
    num_rows=len(train_finer) + len(ds_finer['validation']) + len(ds_finer['test']),
    num_entity_types=139,
    entity_types='XBRL financial tags: Revenue, Assets, LiabilitiesTotal, DebtCurrent, EPS, ...',
    annotation_source='Expert-annotated (SEC professional auditors via EDGAR filings)',
    domain='Finance (SEC 10-K/10-Q annual/quarterly reports)',
    banking_relevance='High for financial domain language; not classical PII but financial entities',
    notes='1.1M sentences. Use for continued pre-training on financial text, not PII labels directly.'
))

Loading nlpaueb/finer-139 ...


Generating train split: 900384 examples [00:01, 455939.64 examples/s]
Generating validation split: 112494 examples [00:00, 358829.62 examples/s]
Generating test split: 108378 examples [00:00, 320540.23 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 900384
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 112494
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 108378
    })
})
  Rows: 900,384
  Entity type count: 279 labels (64 found in sample)
  Sample types: ['AllocatedShareBasedCompensationExpense', 'AmortizationOfFinancingCosts', 'AmortizationOfIntangibleAssets', 'AntidilutiveSecuritiesExcludedFromComputationOfEarningsPerShareAmount', 'AreaOfRealEstateProperty', 'BusinessAcquisitionPercentageOfVotingInterestsAcquired', 'BusinessCombinationConsiderationTransferred1', 'ClassOfWarrantOrRightExercisePriceOfWarrantsOrRights1', 'CommonStockCapitalSharesReservedForFutureIssuance', 'CommonStockDividendsPerShareDeclared']


Creating json from Arrow format: 100%|██████████| 901/901 [00:05<00:00, 169.89ba/s]


  Saved train: 900,384 rows -> pii_datasets/finer_139/train.jsonl (409.0 MB)


Creating json from Arrow format: 100%|██████████| 113/113 [00:00<00:00, 170.46ba/s]


  Saved validation: 112,494 rows -> pii_datasets/finer_139/validation.jsonl (52.1 MB)


Creating json from Arrow format: 100%|██████████| 109/109 [00:00<00:00, 162.32ba/s]

  Saved test: 108,378 rows -> pii_datasets/finer_139/test.jsonl (51.3 MB)





## Dataset 10 — iiiorg/piiranha-v1

In [25]:
# ai4privacy/pii-masking-43k has a malformed CSV on the Hub (ParserError at line 42759).
# Replacing with Isotonic/pii-masking-200k — a clean Parquet mirror of the same data
# with identical schema, confirmed loadable.
print('Loading Isotonic/pii-masking-200k ...')
ds_iso = load_dataset('Isotonic/pii-masking-200k')
print(ds_iso)

split_key = list(ds_iso.keys())[0]
split_iso = ds_iso[split_key]
print(f'  Rows: {len(split_iso):,}')
print(f'  Columns: {split_iso.column_names}')

bio_types_iso = set()
for row in split_iso.select(range(min(500, len(split_iso)))):
    for label in row.get('bio_labels', []):
        if label != 'O':
            bio_types_iso.add(label.replace('B-', '').replace('I-', ''))

print(f'  Entity types ({len(bio_types_iso)}): {sorted(bio_types_iso)}')

save_split(split_iso, 'isotonic_pii_200k', split_key)

summary_rows.append(build_summary_row(
    name='Isotonic/pii-masking-200k',
    url='https://huggingface.co/datasets/Isotonic/pii-masking-200k',
    license_='Apache 2.0',
    lang='en, fr, de, it',
    num_rows=len(split_iso),
    num_entity_types=len(bio_types_iso) if bio_types_iso else 54,
    entity_types=', '.join(sorted(bio_types_iso)) if bio_types_iso else 'Same 54 classes as ai4privacy series',
    annotation_source='Synthetic (ai4privacy pipeline)',
    domain='General',
    banking_relevance='Medium — same PII classes as ai4privacy series; clean Parquet format',
    notes='Clean mirror of ai4privacy/pii-masking-200k. Apache 2.0 license. Used as eval benchmark in research.'
))

Loading Isotonic/pii-masking-200k ...


Generating train split: 100%|██████████| 209261/209261 [00:00<00:00, 287489.86 examples/s]


DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language'],
        num_rows: 209261
    })
})
  Rows: 209,261
  Columns: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language']
  Entity types (56): ['ACCOUNTNAME', 'ACCOUNTNUMBER', 'AGE', 'AMOUNT', 'BIC', 'BITCOINADDRESS', 'BUILDINGNUMBER', 'CITY', 'COMPANYNAME', 'COUNTY', 'CREDITCARDCVV', 'CREDITCARDISSUER', 'CREDITCARDNUMBER', 'CURRENCY', 'CURRENCYCODE', 'CURRENCYNAME', 'CURRENCYSYMBOL', 'DATE', 'DOB', 'EMAIL', 'ETHEREUMADDRESS', 'EYECOLOR', 'FIRSTNAME', 'GENDER', 'HEIGHT', 'IBAN', 'IP', 'IPV4', 'IPV6', 'JOBAREA', 'JOBTITLE', 'JOBTYPE', 'LASTNAME', 'LITECOINADDRESS', 'MAC', 'MASKEDNUMBER', 'MIDDLENAME', 'NEARBYGPSCOORDINATE', 'ORDINALDIRECTION', 'PASSWORD', 'PHONEIMEI', 'PHONENUMBER', 'PIN', 'PREFIX', 'SECONDARYADDRESS', 'SEX', 'SSN', 'STATE', 'STREET', 'TIME', 'URL', 'USERAGENT', '

Creating json from Arrow format: 100%|██████████| 210/210 [00:02<00:00, 84.13ba/s]

  Saved train: 209,261 rows -> pii_datasets/isotonic_pii_200k/train.jsonl (302.1 MB)





## 13. Summary Table

In [26]:
df_summary = pd.DataFrame(summary_rows)

# Display full table
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

print('\n===== DATASET SUMMARY =====')
print(tabulate(df_summary, headers='keys', tablefmt='grid', showindex=False))

# Save to CSV
csv_path = OUTPUT_DIR / 'dataset_summary.csv'
df_summary.to_csv(csv_path, index=False)
print(f'\nSummary saved to: {csv_path}')


===== DATASET SUMMARY =====
+---------------------------------------------+-----------------------------------------------------------------------------+--------------------------------------------------+----------------------------------------+--------------+-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------+----------------------------------------------------------

## 14. Verify saved files

In [27]:
print('\n===== FILES ON DISK =====')
total_size = 0
file_rows = []

for jsonl_file in sorted(OUTPUT_DIR.rglob('*.jsonl')):
    size_mb = jsonl_file.stat().st_size / (1024 * 1024)
    total_size += size_mb
    file_rows.append({'File': str(jsonl_file.relative_to(OUTPUT_DIR)), 'Size (MB)': f'{size_mb:.1f}'})

print(tabulate(file_rows, headers='keys', tablefmt='simple'))
print(f'\nTotal disk usage: {total_size:.1f} MB')
print(f'Summary CSV: {(OUTPUT_DIR / "dataset_summary.csv").resolve()}')


===== FILES ON DISK =====
File                             Size (MB)
-----------------------------  -----------
ai4privacy_300k/train.jsonl          552.3
ai4privacy_400k/train.jsonl          354.2
conll2003/test.jsonl                   0.9
conll2003/train.jsonl                  3.8
conll2003/validation.jsonl             0.9
few_nerd/test.jsonl                   12.3
few_nerd/train.jsonl                  43.3
few_nerd/validation.jsonl              6.2
finer_139/test.jsonl                  51.3
finer_139/train.jsonl                409
finer_139/validation.jsonl            52.1
gretel_finance/test.jsonl             13.2
gretel_finance/train.jsonl           117.9
isotonic_pii_200k/train.jsonl        302.1
multinerd/train_en.jsonl              30.4
nvidia_nemotron/train.jsonl          318.6
wikiann/test.jsonl                     1.9
wikiann/train.jsonl                    3.8
wikiann/validation.jsonl               1.9

Total disk usage: 2276.0 MB
Summary CSV: /home/pritesh-jha/projects/pii