In [None]:
import random
import pandas as pd
import numpy as np
from collections import Counter
import itertools
import json

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

from util.algorithms import MultivariateCalibrate, SynTF, TEM, TruncatedGumbel, VickreyMechanism, Mahalanobis, SanText
from util.wordvec_load import LoadGlove, get_glove_embeddings, get_glove_embeddings_st

import nltk
nltk.download("wordnet")

In [None]:
# GloVe vector files, listed in the same order as `dim_list` (the two lists are
# zipped together when the experiment grid is built).
# INSERT PATH TO FILES (Download: https://nlp.stanford.edu/projects/glove/)
glove_files = [
    "/path/to/glove.6B.50d.txt",
    "/path/to/glove.6B.100d.txt",
    "/path/to/glove.6B.300d.txt",
]
dim_list = [50, 100, 300]

# Datasets to evaluate; the names are substituted into the CSV file name on load.
dataset_types = ["imdb", "ag_news"]
nclass_list = dict(imdb=2, ag_news=4)

# Privacy budgets to sweep over.
epsilons = [1, 5, 10]

# Names of the supported mechanisms; the experiment loop dispatches on them.
algo_types = [
    "MultivariateCalibrate",
    "SynTF",
    "TEM",
    "Mahalanobis",
    "TruncatedGumbel",
    "VickreyMechanism",
    "SanText",
]

# dataset -> embedding dimension -> (value passed as max_inter_dist,
# value passed as min_inter_dist) when constructing TruncatedGumbel.
max_min = {
    "imdb": {
        50: (15.329304595006077, 0.3513605074973775),
        100: (14.607502691398507, 0.6062098738133386),
        300: (18.082769927871894, 0.788992252232739),
    },
    "ag_news": {
        50: (13.526640006274114, 0.3513605074973775),
        100: (14.319089771129153, 0.6062098738133386),
        300: (17.898080316889402, 0.788992252232739),
    },
}

In [None]:
# Build the full experiment grid: dataset x (GloVe file, dim) x epsilon x algorithm.
# Only the last entry of `algo_types` is selected for this run.
glove_configs = list(zip(glove_files, dim_list))
selected_algos = [algo_types[-1]]
l = [dataset_types, glove_configs, epsilons, selected_algos]
experiments = [combo for combo in itertools.product(*l)]
len(experiments)

18

In [None]:
def plausible_deniability_metrics(obj, sampled_words, num_trials=100):
    """Compute per-word plausible-deniability statistics for a replacement mechanism.

    Each word in ``sampled_words`` is passed to ``obj.replace_word`` exactly
    ``num_trials`` times and the outputs are summarised.

    Args:
        obj: Any object exposing ``replace_word(word)``.
        sampled_words: Iterable of words to probe.
        num_trials: Number of queries issued per word. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        A tuple ``(Nws, Sws)`` of two lists aligned with ``sampled_words``:
        ``Nws[i]`` is the percentage of trials in which the word came back
        unchanged, and ``Sws[i]`` is the number of distinct outputs that
        differed from the word.

    Raises:
        ValueError: If ``num_trials`` is not positive (the percentage would
            otherwise divide by zero).
    """
    if num_trials <= 0:
        raise ValueError("num_trials must be a positive integer")

    Nws = []
    Sws = []
    for word in sampled_words:
        unchanged = 0
        # Only the number of distinct replacements is reported, so a set is enough.
        distinct_replacements = set()
        for _ in range(num_trials):
            candidate = obj.replace_word(word)
            if candidate == word:
                unchanged += 1
            else:
                distinct_replacements.add(candidate)
        Nws.append((unchanged / num_trials) * 100)
        Sws.append(len(distinct_replacements))

    return Nws, Sws

In [None]:
# Results accumulator; the experiment loop stores one entry per experiment,
# keyed by the string form of the experiment tuple.
pd_scores = dict()

In [None]:
# Run the plausible-deniability evaluation for every experiment configuration.
# Each experiment is a tuple: (dataset, (glove_path, dim), epsilon, algorithm name).

# All mechanisms below are built on the GloVe vectors loaded with LoadGlove.
# `embed_type` was previously never defined, so every branch except SanText
# (which passed the literal "glove") failed with a NameError.
# NOTE(review): "glove" mirrors the SanText call — confirm it is the value the
# other mechanisms expect.
embed_type = "glove"

# Dataset-specific seeds used when sampling the probe words.
dataset_seeds = {"imdb": 2759, "ag_news": 19}

for e in experiments:
    print(e)

    dataset, (glove_path, dim), epsilon, curr_algo = e

    if dataset not in dataset_seeds:
        # Fail loudly instead of silently reusing the seed of a previous iteration.
        raise ValueError("No seed configured for dataset: {}".format(dataset))
    SEED = dataset_seeds[dataset]

    # Load the preprocessed training split and fit the vocabulary on it.
    df_pre_train = pd.read_csv('Data/{}_preprocessed_train.csv'.format(dataset))
    X_train = df_pre_train['text'].values
    y_train = df_pre_train['label'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab_size = len(tokenizer.word_index) + 1

    # SanText uses a dedicated loader that also returns the word list it needs.
    wv_model = LoadGlove(glove_path)
    if curr_algo == "SanText":
        embedding_matrix, wordlist = get_glove_embeddings_st(embeddings_index=wv_model, dim=dim, tokenizer=tokenizer)
    else:
        embedding_matrix = get_glove_embeddings(embeddings_index=wv_model, dim=dim, tokenizer=tokenizer)

    # Instantiate the selected mechanism.
    if curr_algo == "MultivariateCalibrate":
        obj = MultivariateCalibrate(vocab_dict=tokenizer.word_index, epsilon=epsilon, embed_type=embed_type,
                                    wv_model=wv_model, embedding_matrix=embedding_matrix, dim=dim)

    elif curr_algo == "SynTF":
        sensitivity = 1.0
        vectorizer = TfidfVectorizer()
        obj = SynTF(epsilon=epsilon, sensitivity=sensitivity, vectorizer=vectorizer, data=X_train)

    elif curr_algo == "TEM":
        obj = TEM(vocab_dict=tokenizer.word_index, epsilon=epsilon, embed_type=embed_type, wv_model=wv_model,
                  embedding_matrix=embedding_matrix, dim=dim, vocab_size=vocab_size)

    elif curr_algo == "Mahalanobis":
        lambd = 0.2
        # Covariance across embedding dimensions, scaled by the overall variance.
        cov_mat = np.cov(embedding_matrix, rowvar=False) / np.var(embedding_matrix)
        identity_mat = np.identity(dim)
        obj = Mahalanobis(vocab_dict=tokenizer.word_index, epsilon=epsilon, embed_type=embed_type,
                          wv_model=wv_model, embedding_matrix=embedding_matrix, cov_mat=cov_mat,
                          identity_mat=identity_mat, lambd=lambd, dim=dim)

    elif curr_algo == "TruncatedGumbel":
        max_inter_dist, min_inter_dist = max_min[dataset][dim]
        obj = TruncatedGumbel(tokenizer=tokenizer,
                              epsilon=epsilon,
                              embed_type=embed_type,
                              wv_model=wv_model,
                              embedding_matrix=embedding_matrix,
                              dim=dim,
                              max_inter_dist=max_inter_dist,
                              min_inter_dist=min_inter_dist)

    elif curr_algo == "VickreyMechanism":
        obj = VickreyMechanism(tokenizer=tokenizer,
                               epsilon=epsilon,
                               embed_type=embed_type,
                               wv_model=wv_model,
                               embedding_matrix=embedding_matrix,
                               dim=dim,
                               k=2, t=[0.5, 0.5])

    elif curr_algo == "SanText":
        obj = SanText(vocab_list=wordlist, epsilon=epsilon, embed_type="glove", wv_model=wv_model,
                      embedding_matrix=embedding_matrix, dim=dim)

    else:
        # Without this guard a stale `obj` from the previous iteration would be evaluated.
        raise ValueError("Unknown algorithm: {}".format(curr_algo))

    temp = {}
    word_list = list(tokenizer.word_index.keys())

    # Seed Python's RNG so the same 25 probe words are drawn for a given dataset.
    random.seed(SEED)
    sampled_words = random.sample(word_list, k=25)
    temp["sampled_words"] = sampled_words
    Nws, Sws = plausible_deniability_metrics(obj, sampled_words)
    temp["n_w"] = np.mean(Nws)
    temp["s_w"] = np.mean(Sws)
    print(temp)
    pd_scores[str(e)] = temp

In [None]:
# Persist the collected scores to disk as indented JSON.
with open("pd_scores.json", "w") as json_file:
    json.dump(pd_scores, json_file, indent=3)