# Combine PII Datasets

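This notebook loads every unified dataset (`*.json`) from the `unified/` folder, reports the label distribution per dataset and overall, and saves all samples as a single combined file, `combined_pii_dataset.json`.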

**Prerequisite:** Run `convert_datasets_to_unified_format.py` first to create the unified files.

In [1]:
import json
import os
import random
from collections import Counter

# Paths
# Both locations can be overridden with environment variables so the notebook
# can run on machines other than the one it was written on. The defaults are
# the original hard-coded locations, so existing runs behave the same.
UNIFIED_DIR = os.environ.get(
    "PII_UNIFIED_DIR",
    "/Users/sravan/Documents/Experiments/fintuning_PII/Data/additional_datasets/unified",
)
OUTPUT_DIR = os.environ.get(
    "PII_OUTPUT_DIR",
    "/Users/sravan/Documents/Experiments/fintuning_PII/finetuned-pii-2/data",
)

## 1. List Available Unified Datasets

In [2]:
# Collect every JSON file sitting directly in the unified directory
unified_files = []
for entry in os.listdir(UNIFIED_DIR):
    if entry.endswith('.json'):
        unified_files.append(entry)
print(f"Found {len(unified_files)} unified datasets:\n")

# Report each file with its on-disk size, in alphabetical order
bytes_per_mb = 1024 * 1024
for filename in sorted(unified_files):
    file_bytes = os.path.getsize(os.path.join(UNIFIED_DIR, filename))
    print(f"  {filename:40s} {file_bytes / bytes_per_mb:8.2f} MB")

Found 3 unified datasets:

  beki_privy_unified.json                     54.15 MB
  fewnerd_unified.json                        68.83 MB
  urchade_unified.json                         2.40 MB


## 2. Load All Unified Datasets

In [3]:
# Load all unified datasets
# Maps dataset name (file name with "_unified.json" removed) -> parsed JSON content.
datasets = {}

for filename in sorted(unified_files):
    path = os.path.join(UNIFIED_DIR, filename)
    # Pin the encoding: JSON interchange is UTF-8, and without this the read
    # depends on the platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # The samples are later merged with list.extend(); a non-list top level
    # (e.g. a dict) would be merged as its keys, so fail loudly here instead.
    if not isinstance(data, list):
        raise TypeError(
            f"{filename}: expected a JSON list of samples, got {type(data).__name__}"
        )

    name = filename.replace('_unified.json', '')
    datasets[name] = data
    print(f"Loaded {name}: {len(data):,} samples")

print(f"\nTotal datasets: {len(datasets)}")

Loaded beki_privy: 100,951 samples
Loaded fewnerd: 131,767 samples
Loaded urchade: 19,635 samples

Total datasets: 3


## 3. Check Label Distribution per Dataset

In [4]:
def get_label_counts(data):
    """Count how often each entity label occurs across a dataset.

    Args:
        data: Iterable of sample dicts. A sample may carry a "privacy_mask"
            list whose entries are dicts with a "label" key.

    Returns:
        collections.Counter mapping label -> number of mask entries carrying
        that label. Entries whose label is missing or None are counted under
        "UNKNOWN". Samples whose "privacy_mask" is missing, None or empty
        contribute nothing.
    """
    counts = Counter()
    for item in data:
        # `or []` also covers a "privacy_mask" key that is present but null;
        # a plain .get() default would hand back None and break the loop.
        for mask in item.get("privacy_mask") or []:
            label = mask.get("label")
            # Keep every key a string so callers can format labels with `:30s`.
            counts["UNKNOWN" if label is None else label] += 1
    return counts

# Per-dataset report: most frequent labels first, then a note on how many were left out
max_rows = 15
banner = '=' * 60
for dataset_name, records in datasets.items():
    print(f"\n{banner}")
    print(f"{dataset_name}")
    print(f"{banner}")
    label_counts = get_label_counts(records)
    for entity_label, occurrences in label_counts.most_common(max_rows):
        print(f"  {entity_label:30s} {occurrences:>8,}")
    omitted = len(label_counts) - max_rows
    if omitted > 0:
        print(f"  ... and {omitted} more labels")


beki_privy
  LOC                              46,328
  PER                              28,292
  DATE_TIME                        19,516
  TITLE                            14,027
  NRP                              13,789
  FINANCIAL                         8,205
  ORG                               5,623
  AGE                               2,773
  PASSWORD                          2,762
  CREDIT_CARD                       2,760
  MAC_ADDRESS                       2,748
  US_BANK_NUMBER                    2,744
  US_DRIVER_LICENSE                 2,737
  US_PASSPORT                       2,712
  US_LICENSE_PLATE                  2,712
  ... and 2 more labels

fewnerd
  LOCATION                         95,339
  PERSON                           75,945
  ORGANIZATION                     66,920
  OTHER                            33,611
  PRODUCT                          21,835
  BUILDING                         17,599
  ART                              14,870
  EVENT                        

## 4. Combine All Datasets

In [None]:
# Merge the samples of every dataset into one flat list, keeping dataset order
combined = []

for dataset_name in datasets:
    records = datasets[dataset_name]
    combined += records
    print(f"Added {len(records):>10,} from {dataset_name}")

rule = '=' * 60
print(f"\n{rule}")
print(f"TOTAL: {len(combined):,} samples")
print(f"{rule}")

## 5. Overall Label Distribution

In [None]:
# Label frequencies across the merged dataset (30 most common)
print("Overall Label Distribution:")
print("=" * 60)

all_labels = get_label_counts(combined)
for entity_label, occurrences in all_labels.most_common(30):
    print(f"  {entity_label:30s} {occurrences:>10,}")

unique_label_total = len(all_labels)
entity_total = sum(all_labels.values())
print(f"\nTotal unique labels: {unique_label_total}")
print(f"Total entities: {entity_total:,}")

## 6. Save Combined Dataset

In [None]:
# Write the merged samples to OUTPUT_DIR as one JSON file, then report what was written
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_path = os.path.join(OUTPUT_DIR, "combined_pii_dataset.json")

with open(output_path, 'w') as out_file:
    json.dump(combined, out_file)

size_mb = os.path.getsize(output_path) / (1024 * 1024)
summary_lines = (
    f"Saved to: {output_path}",
    f"File size: {size_mb:.2f} MB",
    f"Total samples: {len(combined):,}",
)
for summary_line in summary_lines:
    print(summary_line)

## 7. Verify Samples

In [None]:
# Show one annotated sample from each source
# (`random` is imported with the other modules in the first code cell.)
SAMPLE_SEED = 42             # fixed so Restart & Run All prints the same samples
CANDIDATES_PER_SOURCE = 100  # choose among the first N annotated samples of a source
PREVIEW_CHARS = 200          # maximum number of text characters to print
MAX_ENTITIES_SHOWN = 5       # maximum number of entities to print per sample

sample_rng = random.Random(SAMPLE_SEED)

# One pass over the combined data: register every source and keep its first
# CANDIDATES_PER_SOURCE samples that have a non-empty privacy_mask.
# Samples without a 'source' key are grouped under 'unknown'; the same default
# is used for grouping and selection so that group is not silently skipped.
candidates_by_source = {}
for s in combined:
    bucket = candidates_by_source.setdefault(s.get('source', 'unknown'), [])
    if s.get('privacy_mask') and len(bucket) < CANDIDATES_PER_SOURCE:
        bucket.append(s)

for source in sorted(candidates_by_source):
    samples = candidates_by_source[source]
    if not samples:
        # This source has no sample with entities; nothing to show.
        continue

    sample = sample_rng.choice(samples)
    text = str(sample.get('source_text') or '')
    # Only add the ellipsis when the text was actually cut.
    preview = text[:PREVIEW_CHARS] + ('...' if len(text) > PREVIEW_CHARS else '')

    print(f"\n{'='*60}")
    print(f"Source: {source}")
    print(f"{'='*60}")
    print(f"Text: {preview}")
    print("\nEntities:")
    for mask in sample['privacy_mask'][:MAX_ENTITIES_SHOWN]:
        # .get() keeps this display-only cell from failing on a partial entry.
        print(
            f"  - {mask.get('label', 'UNKNOWN')}: '{mask.get('value', '?')}' "
            f"[{mask.get('start', '?')}:{mask.get('end', '?')}]"
        )