In [None]:
import json
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import words
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('words')

In [None]:
# Sentence-embedding model passed to avg_cosine_sim() in the evaluation loop;
# loaded once here so the loop does not have to reload it.
model = SentenceTransformer('paraphrase-distilroberta-base-v1') # add device='cuda' for GPU

In [4]:
def get_percentage_of_least_occuring_words(dataset, least_occuring_words):
    """Return the share (in percent) of tokens in `dataset` that are listed words.

    Args:
        dataset: Iterable of strings; each string is tokenised with
            `str.split()` (any run of whitespace separates tokens).
        least_occuring_words: Iterable of words to look for. Duplicate entries
            are ignored so that no token is counted more than once.

    Returns:
        float: `100 * (occurrences of the listed words) / (total tokens)`,
        or `0.0` when `dataset` contains no tokens at all.
    """
    token_counts = Counter(word for text in dataset for word in text.split())
    total_words = sum(token_counts.values())
    if total_words == 0:
        # Empty dataset (or only blank strings): avoid dividing by zero.
        return 0.0
    # A Counter reports 0 for absent keys, so words that never occur in
    # `dataset` need no special handling.
    count_least_occuring_words = sum(
        token_counts[word] for word in set(least_occuring_words)
    )
    return (count_least_occuring_words / total_words) * 100

def get_least_occuring_words(dataset, n_words=1000):
    """Return the `n_words` least frequent tokens of `dataset`, rarest first.

    Args:
        dataset: Iterable of strings; each string is tokenised with
            `str.split()`.
        n_words: Maximum number of words to return. Defaults to 1000.

    Returns:
        list[str]: Distinct tokens ordered by ascending frequency. Tokens with
        the same frequency keep the order in which they were first seen. The
        list is shorter than `n_words` when the dataset has fewer distinct
        tokens, and empty when the dataset has none.
    """
    token_counts = Counter(word for text in dataset for word in text.split())
    # sorted() is stable and Counter preserves insertion order, so ties are
    # resolved by first appearance in the dataset.
    rarest_first = sorted(token_counts.items(), key=lambda item: item[1])
    return [word for word, _ in rarest_first[:n_words]]

def get_pert_count(org, pert):
    """Count the aligned word positions at which two sentences differ.

    Both strings are split on single space characters and compared position
    by position. Comparison stops at the end of the shorter sentence, so any
    surplus words in the longer one are not counted.

    Args:
        org: Original sentence.
        pert: Perturbed sentence.

    Returns:
        int: Number of positions whose words are not equal.
    """
    original_tokens = org.split(' ')
    perturbed_tokens = pert.split(' ')
    mismatches = 0
    for original_token, perturbed_token in zip(original_tokens, perturbed_tokens):
        if original_token != perturbed_token:
            mismatches += 1
    return mismatches

def get_percentage_of_similar_words(dataset_A, dataset_B):
    """Return how much the rare-word lists of two datasets overlap, in percent.

    The rare-word list of each dataset is obtained from
    `get_least_occuring_words`; the result is the number of words present in
    both lists relative to the size of the larger list.

    Args:
        dataset_A: Iterable of strings (first dataset).
        dataset_B: Iterable of strings (second dataset).

    Returns:
        float: Overlap percentage in the range 0-100, or `0.0` when both
        rare-word lists are empty.
    """
    rare_words_a = set(get_least_occuring_words(dataset_A))
    rare_words_b = set(get_least_occuring_words(dataset_B))
    # Normalise by the actual list size instead of a fixed 1000: a dataset
    # with fewer distinct words yields a shorter list, and a fixed divisor
    # would understate the overlap. For two full 1000-word lists the result
    # is unchanged.
    list_size = max(len(rare_words_a), len(rare_words_b))
    if list_size == 0:
        return 0.0
    return (len(rare_words_a & rare_words_b) / list_size) * 100

_ENGLISH_VOCABULARY = None  # lazily-built set of the `words` corpus entries


def _english_vocabulary():
    """Return the `words` corpus as a set, building it on first use only."""
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = set(words.words())
    return _ENGLISH_VOCABULARY


def get_percentage_of_english_words(dataset, english_words=None):
    """Return the share (in percent) of tokens found in an English word list.

    Args:
        dataset: Iterable of strings; each string is tokenised with
            `str.split()`. Matching is exact (case-sensitive).
        english_words: Optional container supporting `in` that holds the
            accepted words. When omitted, the `words` corpus is used; it is
            converted to a set once and reused by later calls.

    Returns:
        float: `100 * (matching tokens) / (total tokens)`, or `0.0` when
        `dataset` contains no tokens at all.
    """
    if english_words is None:
        # Building the set is expensive, so share one copy across calls.
        english_words = _english_vocabulary()

    count_english_words = 0
    total_words = 0
    for text in dataset:
        for word in text.split():
            total_words += 1
            if word in english_words:
                count_english_words += 1

    if total_words == 0:
        # Empty dataset (or only blank strings): avoid dividing by zero.
        return 0.0
    return (count_english_words / total_words) * 100

def avg_cosine_sim(model, sentence1, sentence2):
    """Return the mean cosine similarity between aligned sentence pairs.

    The i-th entry of `sentence1` is compared with the i-th entry of
    `sentence2`. Similarities are computed row by row, so memory use grows
    linearly with the number of sentences rather than quadratically (no
    full pairwise similarity matrix is built).

    Args:
        model: Object with an `encode(sentences)` method returning one
            embedding vector per sentence.
        sentence1: Original sentences.
        sentence2: Perturbed sentences, aligned with `sentence1`.

    Returns:
        NumPy floating scalar: average of the per-pair cosine similarities.
        If the two inputs differ in length, only the leading pairs up to the
        shorter length are used. A pair involving an all-zero embedding
        contributes a similarity of 0.
    """
    # Encode the texts; atleast_2d also accepts a single 1-D embedding.
    original_embeddings = np.atleast_2d(np.asarray(model.encode(sentence1)))
    perturbed_embeddings = np.atleast_2d(np.asarray(model.encode(sentence2)))

    # Keep only positions that exist in both inputs.
    n_pairs = min(len(original_embeddings), len(perturbed_embeddings))
    original_embeddings = original_embeddings[:n_pairs]
    perturbed_embeddings = perturbed_embeddings[:n_pairs]

    # Row-wise cosine similarity: dot(a_i, b_i) / (|a_i| * |b_i|).
    dot_products = np.sum(original_embeddings * perturbed_embeddings, axis=1)
    norm_products = (
        np.linalg.norm(original_embeddings, axis=1)
        * np.linalg.norm(perturbed_embeddings, axis=1)
    )
    # A zero vector has no direction; dividing by 1 yields similarity 0
    # instead of NaN for such pairs.
    norm_products[norm_products == 0] = 1.0

    # Calculate the average similarity
    return np.mean(dot_products / norm_products)

In [None]:
# Compare every perturbed CSV in this folder with its original counterpart and
# collect the resulting statistics in `all_stats`, keyed by file name.
perturbed_data_folder = "Data/perturbed"
# Number of leading words of each text that are kept, per dataset.
max_words_per_dataset = {"imdb": 100, "ag_news": 50}
all_stats = {}
for dir_entry in os.listdir(perturbed_data_folder):
    print(dir_entry)

    perturbed_dataset = os.path.join(perturbed_data_folder, dir_entry)
    if not os.path.isfile(perturbed_dataset):
        continue

    # splitext() strips only the final extension, so a decimal value inside
    # the name (e.g. an epsilon of "0.5") stays intact; cutting at the first
    # "." would truncate it and make different files share one result key.
    file_name = os.path.splitext(dir_entry)[0]
    file_info = file_name.split("_")
    if len(file_info) < 5:
        raise ValueError(
            f"Unexpected file name {dir_entry!r}: expected at least 5 "
            "underscore-separated fields"
        )

    # Fields are read from the end of the name: epsilon, embedding
    # dimensions and train/test split come last, the mechanism sits two
    # fields before the split, and the dataset name comes first.
    epsilon = file_info[-1]
    embedding_dimensions = file_info[-2]
    train_or_test_set = file_info[-3]
    mechanism = file_info[-5]
    dataset = file_info[0]
    if dataset == "ag":
        # "ag_news" itself contains an underscore, so only "ag" is seen here.
        dataset = "ag_news"

    if dataset not in max_words_per_dataset:
        # Fail loudly instead of reusing the limit of a previous file.
        raise ValueError(f"Unknown dataset {dataset!r} in file name {dir_entry!r}")
    num = max_words_per_dataset[dataset]

    og_dataset = os.path.join("Data/", f"{dataset}_preprocessed_{train_or_test_set}.csv")

    dataset_A = pd.read_csv(og_dataset)
    dataset_B = pd.read_csv(perturbed_dataset)
    X_train = dataset_A['text'].values
    Y_train = dataset_B['text'].values
    # Truncate every text to its first `num` space-separated words.
    X_train_new = [" ".join(sent.split(" ")[:num]) for sent in X_train]
    Y_train_new = [" ".join(sent.split(" ")[:num]) for sent in Y_train]

    # Texts are paired by row position; zip() stops at the shorter dataset.
    perturbed_word_count = sum(
        get_pert_count(org, pert) for org, pert in zip(X_train_new, Y_train_new)
    )

    total_words_A = sum(len(text.split()) for text in X_train_new)
    total_words_B = sum(len(text.split()) for text in Y_train_new)
    if total_words_A:
        perturbed_percentage = (perturbed_word_count / total_words_A) * 100
    else:
        # No words in the original data: nothing could have been perturbed.
        perturbed_percentage = 0.0

    percentage_english_words_A = get_percentage_of_english_words(X_train_new)
    percentage_english_words_B = get_percentage_of_english_words(Y_train_new)
    percentage_similar_words = get_percentage_of_similar_words(X_train_new, Y_train_new)
    # The rare-word list comes from the ORIGINAL data and is then looked up in
    # both datasets.
    least_occuring_words = get_least_occuring_words(X_train_new)
    percentage_least_occuring_words_A = get_percentage_of_least_occuring_words(X_train_new, least_occuring_words)
    percentage_least_occuring_words_B = get_percentage_of_least_occuring_words(Y_train_new, least_occuring_words)
    avg_cosine_sim_value = avg_cosine_sim(model, X_train_new, Y_train_new)

    # float() turns NumPy scalars into plain floats for JSON serialisation.
    stats = {
        "perturbed_word_count": perturbed_word_count,
        "total_words": {
            "og": total_words_A,
            "pt": total_words_B,
        },
        "percentage_perturbed": float(perturbed_percentage),
        "percentage_similar_words": float(percentage_similar_words),
        "percentage_english_words": {
            "og": float(percentage_english_words_A),
            "pt": float(percentage_english_words_B)
        },
        "percentage_least_occuring_words_1000": {
            "og": float(percentage_least_occuring_words_A),
            "pt": float(percentage_least_occuring_words_B)
        },
        "avg_cos_sim": float(avg_cosine_sim_value)
    }

    all_stats[file_name] = stats

In [6]:
# Persist the collected per-file statistics as indented JSON.
with open("privacy_stats.json", 'w') as stats_file:
    json.dump(all_stats, fp=stats_file, indent=3)