In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import asyncio
from functools import wraps
import openai
from faker import Faker
from scipy.stats import kruskal
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '/u/a/n/anshumaan/phd_work/privacy_prompt_rewriting/universal-ner')
from tqdm.auto import tqdm
from datasets import load_dataset
from concurrent.futures import ProcessPoolExecutor
import json
import ast
import random
random.seed(20)



In [20]:
# Cycle through the data for English, French and German,
# say 10k samples each.
import re

def remove_brackets(text):
    """Return ``text`` with every ``[`` and ``]`` character deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_square_brackets = {ord('['): None, ord(']'): None}
    return text.translate(drop_square_brackets)

def preprocess_sample(sample):
    """
    Turn one PII-masking record into a multi-turn "What describes X?" sample.

    Target entities: [Money, Name, Age, SSN, Credit Card Number, Zipcode, Date].

    Args:
        sample: dict with
            - 'unmasked_text': the passage text.
            - 'privacy_mask': mapping of placeholder key -> surface string,
              given either as a Python-literal string (parsed with
              ``ast.literal_eval``) or as an already-parsed dict.

    Returns:
        dict with keys:
            - 'id': empty string (filled in by the caller).
            - 'conversations': alternating human/gpt turns, one pair per
              label, in order of first appearance in the mask. The first
              human turn is prefixed with the passage. Labels outside the
              target entities get the answer "[]" (they act as negatives).
            - 'labels': the labels asked about, in the same order.
        A record whose mask is empty yields empty 'conversations' and
        'labels' instead of raising.
    """
    mapping = {
        "AMOUNT": "Money",
        "FIRSTNAME": "Name",
        "LASTNAME": "Name",
        "MIDDLENAME": "Name",
        "AGE": "Age",
        "CREDITCARDNUMBER": "Credit Card Number",
        "ZIPCODE": "Zipcode",
        "DATE": "Date",
        "SSN": "SSN",
    }
    target_labels = set(mapping.values())

    text = sample['unmasked_text']
    raw_mask = sample['privacy_mask']
    # literal_eval only accepts strings/AST nodes; pass a parsed dict through.
    privacy_mask = ast.literal_eval(raw_mask) if isinstance(raw_mask, str) else raw_mask

    temp_dict = dict()
    for key, value in privacy_mask.items():
        # "[FIRSTNAME_1]" -> "FIRSTNAME" -> "Name" (unmapped types keep their raw name).
        formatted_key = remove_brackets(key).split('_')[0]
        formatted_key = mapping.get(formatted_key, formatted_key)

        # Every label gets an entry so it is asked about; only target
        # entities keep their values, the rest stay [] as negative examples.
        values = temp_dict.setdefault(formatted_key, [])
        if formatted_key in target_labels:
            values.append(value)

    labels = list(temp_dict.keys())

    conversations = []
    for index, label in enumerate(labels):
        question = f"What describes {label}?"
        if index == 0:
            # Only the first question carries the passage.
            question = f"Passage: {text}\n\n{question}"
        conversations.append({"from": "human", "value": question})
        conversations.append({
            "from": "gpt",
            # json.dumps instead of str(list).replace("'", '"'): the latter
            # corrupts values that contain an apostrophe or a double quote
            # (e.g. "O'Brien" became ["O"Brien"]). ensure_ascii=False keeps
            # accented characters as-is, as the previous formatting did.
            "value": json.dumps(temp_dict[label], ensure_ascii=False),
        })

    # Format for data pipeline.
    sample_input = {
        "id": "",
        "conversations": conversations,
        "labels": labels,
    }

    return sample_input

def read_jsonl(file_path):
    """
    Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: path to a file holding one JSON document per line.

    Returns:
        list of the decoded objects, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 so accented text does not depend on the
    # machine's locale default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

In [21]:
from pathlib import Path

names = [
    "english_pii_43k.jsonl",
    "german_pii_52k.jsonl",
    "french_pii_62k.jsonl",
]
# Entities: [Money, Name, Age, SSN, Credit Card Number, Zipcode, Date]
entities = [
    "Money", "Name", "Age", "SSN", "Credit Card Number", "Zipcode", "Date",
]


def train_test_split(samples, train_fraction=0.7):
    """Split ``samples`` by position: the first ``train_fraction`` of the
    list is the train part, the remainder is the test part.

    No shuffling happens here; the caller shuffles beforehand.
    """
    cut = int(train_fraction * len(samples))
    return samples[:cut], samples[cut:]


train_set = []
test_set = []
target_entities = set(entities)
for name in names:
    file_path = f"/nobackup3/divyam/data/pii-masking-200k/{name}"
    content = read_jsonl(file_path)
    random.shuffle(content)
    with ProcessPoolExecutor() as executor:
        # chunksize batches records per worker task instead of sending them
        # one at a time; map() still yields results in input order.
        processed = list(tqdm(
            executor.map(preprocess_sample, content, chunksize=256),
            total=len(content),
        ))

    # A sample is positive when it asks about at least one target entity.
    positive_samples = []
    negative_samples = []
    for sample in processed:
        if target_entities.isdisjoint(sample['labels']):
            negative_samples.append(sample)
        else:
            positive_samples.append(sample)

    print("#"*50)
    print(len(positive_samples), len(negative_samples))
    # Show one example of each class; guarded so an empty class cannot crash the run.
    if positive_samples: print(positive_samples[-1])
    if negative_samples: print(negative_samples[-1])
    print("#"*50,'\n')

    # Split each class separately so both splits keep the same
    # positive/negative ratio for every language.
    train_positive, test_positive = train_test_split(positive_samples)
    train_negative, test_negative = train_test_split(negative_samples)
    train_set.extend(train_positive + train_negative)
    test_set.extend(test_positive + test_negative)

# Ids are the running index within each split (train and test both start at 0).
for i, sample in enumerate(tqdm(train_set, total=len(train_set))):
    sample["id"] = f"{i}"

for i, sample in enumerate(tqdm(test_set, total=len(test_set))):
    sample["id"] = f"{i}"

train_path = Path('datasets/preempt/pii_masking_200k_en_fr_de_train.json')
test_path = Path('datasets/preempt/pii_masking_200k_en_fr_de_test.json')
# Create the output directory up front so the processing above is not lost
# to a missing-directory error at write time.
train_path.parent.mkdir(parents=True, exist_ok=True)
test_path.parent.mkdir(parents=True, exist_ok=True)

with open(train_path, 'w', encoding='utf-8') as fp:
    json.dump(train_set, fp, indent=2)

with open(test_path, 'w', encoding='utf-8') as fp:
    json.dump(test_set, fp, indent=2)

  0%|          | 0/43501 [00:00<?, ?it/s]

##################################################
25993 17508
{'id': '', 'conversations': [{'from': 'human', 'value': 'Passage: Your proficiency in bankruptcy laws is required to understand the case related to multiple transactions from cards issued by visa and numbered 7023222885014211.\n\nWhat describes CREDITCARDISSUER?'}, {'from': 'gpt', 'value': '[]'}, {'from': 'human', 'value': 'What describes Credit Card Number?'}, {'from': 'gpt', 'value': '["7023222885014211"]'}], 'labels': ['CREDITCARDISSUER', 'Credit Card Number']}
{'id': '', 'conversations': [{'from': 'human', 'value': 'Passage: Cis female, proposal to modify administrative law affects Manager duties. Please analyse the potential consequences for those based in Metz Key.\n\nWhat describes GENDER?'}, {'from': 'gpt', 'value': '[]'}, {'from': 'human', 'value': 'What describes JOBTYPE?'}, {'from': 'gpt', 'value': '[]'}, {'from': 'human', 'value': 'What describes STREET?'}, {'from': 'gpt', 'value': '[]'}], 'labels': ['GENDER', '

  0%|          | 0/52817 [00:00<?, ?it/s]

##################################################
31826 20991
{'id': '', 'conversations': [{'from': 'human', 'value': 'Passage: In der Welt von Home Loan Account ist es wichtig, bestimmte Themen anzusprechen, wie zum Beispiel Sexualaufklärung. Es ist tatsächlich ein wichtiger Aspekt unserer Gesellschaft, insbesondere für Personen unter 80 years.\n\nWhat describes ACCOUNTNAME?'}, {'from': 'gpt', 'value': '[]'}, {'from': 'human', 'value': 'What describes Age?'}, {'from': 'gpt', 'value': '["80 years"]'}], 'labels': ['ACCOUNTNAME', 'Age']}
{'id': '', 'conversations': [{'from': 'human', 'value': 'Passage: Eine sichere Browserpraxis beinhaltet das Nicht-Teilen Ihrer Fahrzeuginformationen DQ03TER HAUY2GV62HF633528 online.\n\nWhat describes VEHICLEVRM?'}, {'from': 'gpt', 'value': '[]'}, {'from': 'human', 'value': 'What describes VEHICLEVIN?'}, {'from': 'gpt', 'value': '[]'}], 'labels': ['VEHICLEVRM', 'VEHICLEVIN']}
################################################## 



  0%|          | 0/61958 [00:00<?, ?it/s]

##################################################
37534 24424
{'id': '', 'conversations': [{'from': 'human', 'value': "Passage: Cher(e) Hunter Smith, veuillez mettre à jour vos coordonnées enregistrées sous Caterina43@gmail.com. À l'avenir, ce sera la principale voie de communication entre les parents et les enseignants.\n\nWhat describes Name?"}, {'from': 'gpt', 'value': '["Hunter", "Smith"]'}, {'from': 'human', 'value': 'What describes EMAIL?'}, {'from': 'gpt', 'value': '[]'}], 'labels': ['Name', 'EMAIL']}
{'id': '', 'conversations': [{'from': 'human', 'value': "Passage: Nous avons remarqué une activité suspecte dans le portail des étudiants en STIM provenant de l'IPV6 : 33fd:d95e:cfb5:6c9e:c3cd:acfe:f220:bebe avec l'agent utilisateur : Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 3.7.54868.6). Si ce n'était pas vous, veuillez réinitialiser votre MOT DE PASSE : O7tr2MC1KcWF et nous en informer immédiatement.\n\nWhat describes IPV6?"}, {'from': 'gpt', 'val

  0%|          | 0/110790 [00:00<?, ?it/s]

  0%|          | 0/47486 [00:00<?, ?it/s]