In [1]:
from huggingface_hub import login
from datasets import load_dataset
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable
# (None when the variable is not set).
hf_token = os.environ.get("HF_TOKEN")

# A global login is not required because the token is passed to load_dataset below.
# login(token=hf_token)

# Load the private dataset. `token=` is the current keyword for the access token;
# the older `use_auth_token=` spelling is deprecated and rejected by newer
# `datasets` releases.
dataset = load_dataset("srirxml/synthetic-pii-pretraining-n600-1x", split="train", token=hf_token)



In [2]:
from collections import Counter

# Pull the label column once; both the frequency table and the set of
# distinct labels are derived from it.
content_type_column = dataset['content-type']
content_type_counts = Counter(content_type_column)
unique_content_types = set(content_type_counts)

# Report the dataset size, then the per-type counts.
print (f"Total number of samples: {len(dataset)}")
print(content_type_counts)

# pandas view of the same rows, used by the sampling step.
df = dataset.to_pandas()

Total number of samples: 600
Counter({'Online Review': 100, 'Forum Post': 100, 'Social Media': 100, 'Blog/News Article Comment': 100, 'Article': 100, 'Online Ad': 100})


In [4]:
import pandas as pd
import random
import uuid

# Assuming `df` is your original DataFrame with columns: ['content-type', 'text']
def generate_sampled_dataset(df, content_types, total_samples=600, random_state=42):
    """Draw a roughly balanced sample of rows across content types.

    ``total_samples`` is split evenly over ``content_types``; when it does not
    divide evenly, the first ``total_samples % len(content_types)`` types each
    receive one extra row. Every sampled row gets a fresh UUID4 string in an
    ``id`` column.

    Args:
        df: DataFrame with at least the columns ``content-type`` and ``text``.
        content_types: Iterable of content-type labels to sample from. A
            ``set``/``frozenset`` is sorted first so the result does not
            depend on set iteration order; any other iterable keeps its order.
        total_samples: Target number of rows across all content types.
        random_state: Seed handed to ``DataFrame.sample`` for every type.

    Returns:
        DataFrame with the columns ``["id", "content-type", "text"]`` and a
        fresh 0..n-1 index. It holds fewer than ``total_samples`` rows when a
        content type has fewer rows than requested (a warning is printed and
        all available rows of that type are used).

    Raises:
        ValueError: If ``content_types`` is empty.
    """
    # Set iteration order for strings can change between interpreter sessions,
    # which would change both the row order and which types get the remainder.
    if isinstance(content_types, (set, frozenset)):
        content_types = sorted(content_types, key=str)
    else:
        content_types = list(content_types)
    if not content_types:
        raise ValueError("content_types must contain at least one content type")

    print(f"Generating a sampled dataset with {total_samples} total samples...")
    samples_per_type, remainder = divmod(total_samples, len(content_types))

    print(f"Samples per content type: {samples_per_type}, Remainder: {remainder}")
    sampled_rows = []

    for i, ctype in enumerate(content_types):
        # The first `remainder` types absorb the rows left over by the integer split.
        n = samples_per_type + (1 if i < remainder else 0)
        print(f"Sampling {n} rows for content type: {ctype}")
        candidates = df[df["content-type"] == ctype]
        if len(candidates) < n:
            print(f"Warning: Not enough samples for content type '{ctype}'. Available: {len(candidates)}, Required: {n}. Using all available rows.")
            # Sampling without replacement cannot exceed the population size,
            # so fall back to every available row instead of raising.
            n = len(candidates)
        # .copy() so adding the id column never touches a view of `df`.
        sampled = candidates.sample(n=n, random_state=random_state).copy()
        sampled["id"] = [str(uuid.uuid4()) for _ in range(len(sampled))]
        sampled_rows.append(sampled)

    final_df = pd.concat(sampled_rows).reset_index(drop=True)
    print(f"Final sampled dataset contains {len(final_df)} rows.")
    return final_df[["id", "content-type", "text"]]

# Pass the labels in sorted order: `unique_content_types` is a set, and set
# iteration order for strings can differ between kernel sessions, which would
# change the order of the sampled rows (and which types get remainder rows).
sampled_df = generate_sampled_dataset(df, sorted(unique_content_types), total_samples=150)


Generating a sampled dataset with 150 total samples...
Samples per content type: 25, Remainder: 0
Sampling 25 rows for content type: Social Media
Sampling 25 rows for content type: Blog/News Article Comment
Sampling 25 rows for content type: Online Review
Sampling 25 rows for content type: Online Ad
Sampling 25 rows for content type: Forum Post
Sampling 25 rows for content type: Article
Final sampled dataset contains 150 rows.


In [5]:
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder
import pandas as pd

# Repetition levels: how many consecutive copies of each row go into a pushed dataset
REPEAT_COUNTS = [25]

# Hub namespace used in the target repo names, and the access token from the environment
HF_USERNAME = "srirxml"
HF_TOKEN = os.getenv("HF_TOKEN")

def repeat_and_push(df, repeat_count):
    """Repeat every row of ``df`` ``repeat_count`` times and push the result to the Hub.

    Rows are repeated in consecutive blocks (row 0 ``repeat_count`` times,
    then row 1, ...). The result is uploaded to
    ``{HF_USERNAME}/synthetic-pii-pretraining-n150-{repeat_count}x``.

    Args:
        df: DataFrame whose rows are repeated.
        repeat_count: Number of consecutive copies of each row.

    Returns:
        Whatever ``Dataset.push_to_hub`` returns for the upload.
    """
    # Work on a fresh 0..n-1 index: label-based selection with .loc would
    # return extra rows if the incoming index contained duplicate labels.
    base_df = df.reset_index(drop=True)

    # Repeat each row consecutively repeat_count times (block repetition)
    repeated_df = base_df.loc[base_df.index.repeat(repeat_count)].reset_index(drop=True)
    dataset = Dataset.from_pandas(repeated_df)

    repo_name = f"{HF_USERNAME}/synthetic-pii-pretraining-n150-{repeat_count}x"

    # Push to hub, authenticating with the token read from the environment.
    # When HF_TOKEN is None the library falls back to locally saved credentials.
    return dataset.push_to_hub(repo_name, token=HF_TOKEN)
    
# Run for each repetition level: every entry in REPEAT_COUNTS triggers one
# repeat_and_push call on `sampled_df`.
for count in REPEAT_COUNTS:
    repeat_and_push(sampled_df, count)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

In [1]:
import json
from datasets import Dataset

def all_str_record(rec):
    """Return a new dict with the keys of ``rec`` and every value as a string.

    ``None`` values become the empty string; every other value is passed
    through ``str()``.
    """
    # None maps to "" here; switch the first branch to None to keep missing values as None.
    return {key: "" if value is None else str(value) for key, value in rec.items()}


# NOTE(review): absolute path on one specific machine; point this at a
# configurable data directory before running the notebook anywhere else.
jsonl_file = '/Users/sriramselvam/Code/PANORAMA-DataGen/data/Azure_Synthetic_Data_10K.processed.jsonl'

# Build one flat, all-string row per JSONL line from its "synthetic_pii_input"
# object, and keep the full original record as a JSON string in "complete_info".
records = []
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON documents; skip them instead of crashing in json.loads.
        if not line.strip():
            continue
        raw = json.loads(line)
        row = all_str_record(raw["synthetic_pii_input"])
        row["complete_info"] = json.dumps(raw, ensure_ascii=False)
        records.append(row)


dataset = Dataset.from_list(records)
print(dataset)



Dataset({
    features: ['Unique ID', 'Locale', 'First Name', 'Last Name', "Father's Name", "Mother's Name", 'Gender', 'Age', 'Nationality', 'Marital Status', 'Spouse Name', 'Children Count', 'National ID', 'Passport Number', "Driver's License", 'Phone Number', 'Work Phone', 'Address', 'Email Address', 'Work Email', 'Birth Date', 'Birth City', 'Education Info', 'Finance Status', 'Net Worth', 'Employer', 'Job Title', 'Annual Salary', 'Credit Score', 'Social Media Handles', 'Blood Type', 'Allergies', 'Disability', 'Emergency Contact Name', 'Emergency Contact Phone', 'complete_info'],
    num_rows: 9674
})


In [4]:
# Spot-check a single record (row index 2) of `dataset`.
print(dataset[2])

{'Unique ID': '176a9d59-cae8-4b80-8e87-7b125b95f93e', 'Locale': 'en_US', 'First Name': 'Rebecca', 'Last Name': 'Flores', "Father's Name": 'John Flores', "Mother's Name": 'Karen Flores', 'Gender': 'Female', 'Age': '29', 'Nationality': 'American', 'Marital Status': 'Single', 'Spouse Name': 'N/A', 'Children Count': '0', 'National ID': '019-42-5061', 'Passport Number': 'N/A', "Driver's License": 'US-DL-39898836644', 'Phone Number': '391-270-7071x21889', 'Work Phone': '(270)849-0436x40583', 'Address': '0667 Jessica Port Suite 462, Smithhaven, HI 04247', 'Email Address': 'rebecca.flores38@aol.com', 'Work Email': 'r.flores@christian.info', 'Birth Date': '1995-12-25', 'Birth City': 'West Robert', 'Education Info': "Bachelor's", 'Finance Status': 'Low', 'Net Worth': '$31473.09', 'Employer': 'Christian LLC', 'Job Title': 'Biologist', 'Annual Salary': '$115004.68', 'Credit Score': '572', 'Social Media Handles': "{'Facebook': '@rebecca959', 'Instagram': '@floresr'}", 'Blood Type': 'B+', 'Allergies

In [6]:
import os
HF_USERNAME = "srirxml"
HF_TOKEN = os.environ.get("HF_TOKEN")
repo_name = f"{HF_USERNAME}/PANORAMA-Plus"

# Push to hub, authenticating with the token read above. When HF_TOKEN is None
# the library falls back to locally saved credentials.
dataset.push_to_hub(repo_name, token=HF_TOKEN)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/srirxml/PANORAMA-Plus/commit/9bbd066011c8d2459519fd7f90c9f8d0113c4467', commit_message='Upload dataset', commit_description='', oid='9bbd066011c8d2459519fd7f90c9f8d0113c4467', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/srirxml/PANORAMA-Plus', endpoint='https://huggingface.co', repo_type='dataset', repo_id='srirxml/PANORAMA-Plus'), pr_revision=None, pr_num=None)