In [None]:
!pip install datasets pandas
!pip install transformers

Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.11.11-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting 

In [1]:
from datasets import load_dataset
import json
from transformers import pipeline
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


Use the below cell, to change values for the experiment, after changing this run all the cells below 

In [2]:
dataset_name = "ai4privacy/pii-masking-200k"
samples = 10

#parameters for ollama experiment
source_text_file_name = "source_texts.json"
predicted_labels_file_name = "predicted_labels.json"
model_name =  "llama3.2:latest" #"qwen2.5:3b" #"llama3.2:latest"

In [3]:
results_with_labels_file_name = "results_with_labels_compared.json"
results_file_name = "results.json"


In [4]:
def load_data(dataset_name):
    dataset = load_dataset(dataset_name)
    filtered_dataset = dataset.filter(lambda example: example['language'] == 'en')
    df = filtered_dataset['train'].to_pandas() 
    df.head()
    return df


In [5]:

df = load_data(dataset_name=dataset_name)

In [6]:
#length of the dataset
len(df)

43501

In [7]:
def filtered_dataset(rows, df):
    return df.head(rows)

In [8]:
df_filtered = filtered_dataset(samples, df)

In [9]:
#length of filtered dataset
len(df_filtered)

10

In [10]:
index = samples-1
print(df_filtered.iloc[index]['privacy_mask']) 

[{'value': '10:18 PM', 'start': 30, 'end': 38, 'label': 'TIME'}
 {'value': 'Human Group Coordinator', 'start': 44, 'end': 67, 'label': 'JOBTITLE'}
 {'value': 'Cleveland', 'start': 71, 'end': 80, 'label': 'COUNTY'}
 {'value': 'Emilie_Beatty53@hotmail.com', 'start': 117, 'end': 144, 'label': 'EMAIL'}
 {'value': '63652332', 'start': 169, 'end': 177, 'label': 'ACCOUNTNUMBER'}
 {'value': '8824', 'start': 259, 'end': 263, 'label': 'PIN'}
 {'value': 'Eye color: Brown', 'start': 279, 'end': 295, 'label': 'EYECOLOR'}]


In [11]:
def extract_sensitive_data(privacy_mask):
    return {item['label']: item['value'] for item in privacy_mask}

In [12]:
df_filtered['sensitive_data_json'] = df_filtered['privacy_mask'].apply(extract_sensitive_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['sensitive_data_json'] = df_filtered['privacy_mask'].apply(extract_sensitive_data)


In [13]:
df['sensitive_data_json'] = df['privacy_mask'].apply(extract_sensitive_data)

In [14]:
len(df_filtered)

10

In [15]:
index = samples-1
print(df_filtered.iloc[4]['sensitive_data_json'])  #example of the new column generated

{'AGE': '88', 'BUILDINGNUMBER': '5862', 'PASSWORD': 'Y2rWliOhf8Ir'}


In [16]:
def collect_source_texts(data, file_name):
    source_texts = data['source_text'].tolist()
    with open(file_name, "w") as json_file:
        json.dump(source_texts, json_file)

In [17]:
collect_source_texts(df_filtered, source_text_file_name)

In [18]:
len(df_filtered)

10

In [19]:
source_text_file_name

'source_texts.json'

In [20]:
import json

def retrieve_source_texts(file_name):
    with open(file_name, "r") as json_file:
        source_texts = json.load(json_file)
    return source_texts

def count_source_texts(file_name):
    source_texts = retrieve_source_texts(file_name)
    return len(source_texts)

# Example usage
file_name = "source_texts.json"
source_text_count = count_source_texts(file_name)
print(f"Number of source texts: {source_text_count}")


Number of source texts: 10


Experimenting with Ollama, now run 'evaluation.js' to get the labels from the model (predicted labels by the model)

In [21]:
!node evaluation.js {model_name} {source_text_file_name} {predicted_labels_file_name}

processSourceTexts() called
getting text - so this is printed  
Prompt 0 processing time 121.84786539999999 seconds
Prompt 1 processing time 22.276150200000004 seconds
Prompt 2 processing time 16.589166800000008 seconds
Prompt 3 processing time 12.623969300000025 seconds
Prompt 4 processing time 11.368677600000025 seconds
Prompt 5 processing time 24.927886800000007 seconds
Prompt 6 processing time 14.526170599999983 seconds
Prompt 7 processing time 19.131839800000016 seconds
Prompt 8 processing time 11.167882500000006 seconds
Prompt 9 processing time 14.881147599999997 seconds
Prompt processing time for batch = 269.3407566000001
Batch 0 to 9 processed and saved successfully.
Average time 26.93407566000001
All Texts processing time 269.39 seconds
All results processed and saved successfully.


In [22]:
def get_predicted_labels(file_name):
    with open(file_name, 'r', encoding='utf-8') as file:
        data = json.load(file)  # Load the entire JSON object

    # Extract results and average processing time
    raw_results = data.get("results", [])  # Default to empty list if missing
    total_time = float(data.get("processingTimeSeconds", 0))  # Convert string to float
    average_time_per_prompt = float(data.get("averageTimePerPromptSeconds", 0))

    # Convert JSON strings into dictionaries
    parsed_results = []
    for item in raw_results:
        if item is not None:  
            try:
                parsed_results.append(json.loads(item))
            except json.JSONDecodeError:
                print(f"Invalid JSON detected, storing as empty object: {item}")
                parsed_results.append({})
        else:
            print("Null item found, storing as empty object")
            parsed_results.append({})

    return parsed_results, total_time, average_time_per_prompt

In [23]:
predicted_labels_file_name

'predicted_labels.json'

In [24]:
predicted_labels, total_time, average_time_per_prompt = get_predicted_labels(predicted_labels_file_name)

In [25]:
print(predicted_labels)

[{'device': 'IMEI', 'curriculum': 'Optimization'}, {'username': 'Omer', 'identity document': 'license', 'number': '78B5R2MVFAHJ48500', 'identity document type': 'idcard', 'code': '78B5R2MVFAHJ48500'}, {'name': 'Kattie', 'age': '72', 'gender': 'Intersex', 'birthday': '158centimeters', 'height': '158 centimeters'}, {'name': 'Nancy', 'city': 'Boston', "'place": '16356'}, {'child': 'child', 'age': '88', 'zipcode': '5862', 'password': 'Y2rWliOhf8Ir'}, {'name': 'Nancy', 'age': '18', 'city': 'Boston', 'database': 'edaf:fd8f:e1e8:cfec:8bab:1afd:6aad:550c'}, {'gender': 'Trans male', 'database': 'E5_N8G2xWM6D'}, {'place': 'longitude', 'browser': 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10.7.5; rv:12.5) Gecko/20100101 Firefox/12.5.9'}, {'age': '18', 'city': 'Boston'}, {'name': 'Nancy', 'age': '18', 'city': 'Boston'}]


In [26]:
len(predicted_labels)

10

In [27]:
print(total_time)

269.39


In [28]:
print(average_time_per_prompt)

26.93


In [29]:
def get_original_labels(data):
    return data['sensitive_data_json'].tolist()

In [30]:
original_labels = get_original_labels(df_filtered)

In [31]:
len(original_labels)

10

In [32]:
label_comparison = []

In [33]:

# Function to calculate accuracy based on matching labels
def calculate_accuracy_label_comparison(parsed_data, sensitive_data_json_list):
    correct_count = 0
    total_count = 0
    label_comparison = []
    for i, (parsed_item, sensitive_data) in enumerate(zip(parsed_data, sensitive_data_json_list)):
        original_sensitive_values = list(sensitive_data.values())
        print("original values", original_sensitive_values)

        generated_sensitive_values = list(parsed_item.values())  
        print("generated values", generated_sensitive_values)
        label_comparison.append({"actual_values":original_sensitive_values, "predicted_values":generated_sensitive_values})

        label_correct = 0
        for generated_value in generated_sensitive_values:
            for original_value in original_sensitive_values:
                original_value_lower = original_value.lower()
                generated_value_lower = generated_value.lower()
                # original_value_lower = original_value_lower.replace(" ","")
                if original_value_lower in generated_value_lower or generated_value_lower in original_value_lower:
                    label_correct +=1
        if label_correct>0:
            if label_correct>len(original_sensitive_values):
                label_correct = len(original_sensitive_values) #added this for experiment with HF model (generated sensitive values are split)
            print(f"number of correct labels {label_correct}/{len(original_sensitive_values)}")
        else:
            print("0 correct labels found")
            
        correct_count+=label_correct
        total_count += len(original_sensitive_values)
        print()

    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy, label_comparison



In [34]:
predicted_labels[0]

{'device': 'IMEI', 'curriculum': 'Optimization'}

In [35]:
# Accuracy measure 
accuracy, label_comparison= calculate_accuracy_label_comparison(predicted_labels, original_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

original values ['06-184755-866851-3', 'Optimization']
generated values ['IMEI', 'Optimization']
number of correct labels 1/2

original values ['Omer', '78B5R2MVFAHJ48500']
generated values ['Omer', 'license', '78B5R2MVFAHJ48500', 'idcard', '78B5R2MVFAHJ48500']
number of correct labels 2/2

original values ['Kattie', '72', 'Intersex person', '158centimeters']
generated values ['Kattie', '72', 'Intersex', '158centimeters', '158 centimeters']
number of correct labels 4/4

original values ['16356', '5890724654311332']
generated values ['Nancy', 'Boston', '16356']
number of correct labels 1/2

original values ['88', '5862', 'Y2rWliOhf8Ir']
generated values ['child', '88', '5862', 'Y2rWliOhf8Ir']
number of correct labels 3/3

original values ['29/12/1957', 'edaf:fd8f:e1e8:cfec:8bab:1afd:6aad:550c']
generated values ['Nancy', '18', 'Boston', 'edaf:fd8f:e1e8:cfec:8bab:1afd:6aad:550c']
number of correct labels 1/2

original values ['Trans male', 'E5_N8G2xWM6D']
generated values ['Trans male', 

In [36]:
print(label_comparison)

[{'actual_values': ['06-184755-866851-3', 'Optimization'], 'predicted_values': ['IMEI', 'Optimization']}, {'actual_values': ['Omer', '78B5R2MVFAHJ48500'], 'predicted_values': ['Omer', 'license', '78B5R2MVFAHJ48500', 'idcard', '78B5R2MVFAHJ48500']}, {'actual_values': ['Kattie', '72', 'Intersex person', '158centimeters'], 'predicted_values': ['Kattie', '72', 'Intersex', '158centimeters', '158 centimeters']}, {'actual_values': ['16356', '5890724654311332'], 'predicted_values': ['Nancy', 'Boston', '16356']}, {'actual_values': ['88', '5862', 'Y2rWliOhf8Ir'], 'predicted_values': ['child', '88', '5862', 'Y2rWliOhf8Ir']}, {'actual_values': ['29/12/1957', 'edaf:fd8f:e1e8:cfec:8bab:1afd:6aad:550c'], 'predicted_values': ['Nancy', '18', 'Boston', 'edaf:fd8f:e1e8:cfec:8bab:1afd:6aad:550c']}, {'actual_values': ['Trans male', 'E5_N8G2xWM6D'], 'predicted_values': ['Trans male', 'E5_N8G2xWM6D']}, {'actual_values': ['[-71.6702,-107.6572]', 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10.7.5; rv:12.5) Gecko/201

In [37]:
def open_results_file(filename):
    if os.path.exists(filename):
        with open(filename, 'r') as file:
            data = json.load(file)
    else:
        data = []
    return data


In [38]:
data_with_labels = open_results_file(results_with_labels_file_name)

In [39]:
data_only_results = open_results_file(results_file_name)

In [40]:
print(f"accuracy = {accuracy * 100:.2f}, samples = {samples}, model_name = {model_name}, average_time_per_prompt = {average_time_per_prompt}")

accuracy = 55.17, samples = 10, model_name = llama3.2:latest, average_time_per_prompt = 26.93


In [41]:
accuracy_percent = accuracy*100
accuracy_percent

55.172413793103445

In [42]:
model_name

'llama3.2:latest'

In [43]:
def experiment_results_with_labels(labelcomparison, accuracy, samples, model_name, per_prompt_time, total_time):
    return {"label comparison":labelcomparison, "accuracy":accuracy, "number_of_samples":samples, "model_name":model_name, "average_time_per_prompt":per_prompt_time, "total_time":total_time}

In [44]:
def experiment_results(accuracy, samples, model_name, per_prompt_time, totaltime):
    return {"accuracy":accuracy, "number_of_samples":samples, "model_name":model_name, "average_time_per_prompt":per_prompt_time, "total_time":totaltime}

In [45]:
data_with_labels.append(experiment_results_with_labels(label_comparison, accuracy_percent, samples, model_name, average_time_per_prompt, total_time))

In [46]:
data_only_results.append(experiment_results(accuracy_percent, samples, model_name, average_time_per_prompt, total_time))

In [47]:
experiment_results(accuracy=accuracy_percent, samples=samples, model_name=model_name, per_prompt_time=average_time_per_prompt, totaltime=total_time)

{'accuracy': 55.172413793103445,
 'number_of_samples': 10,
 'model_name': 'llama3.2:latest',
 'average_time_per_prompt': 26.93,
 'total_time': 269.39}

In [48]:
def dump_data(data,file_name):
    with open(file_name, 'w') as file:
        json.dump(data, file, indent=4)
    print("Experiment results saved successfully!")

In [49]:
dump_data(data_with_labels, results_with_labels_file_name)

Experiment results saved successfully!


In [50]:
dump_data(data_only_results, results_file_name)

Experiment results saved successfully!
