In [None]:
import pandas as pd

from datasets import load_dataset
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tqdm import tqdm

import numpy as np
import pandas as pd

from scipy.linalg import sqrtm
from nltk.corpus import wordnet
import os
import json
import nltk
import itertools
from pathlib import Path
# Download the NLTK resources 'stopwords', 'wordnet' and 'omw-1.4'.
# These calls run every time this cell is executed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

from util.algorithms import MultivariateCalibrate, SynTF, TEM, Mahalanobis, TruncatedGumbel, VickreyMechanism, SanText
from util.wordvec_load import LoadGlove, get_glove_embeddings, get_glove_embeddings_st

In [None]:
# GloVe embedding files, listed in the same order as `dim_list` (the two are zipped together later).
# INSERT PATH TO FILES (Download: https://nlp.stanford.edu/projects/glove/)
glove_files = [
    "/path/to/glove.6B.50d.txt",
    "/path/to/glove.6B.100d.txt",
    "/path/to/glove.6B.300d.txt",
]
dim_list = [50, 100, 300]

# Datasets to process, and for each one its [train, test] preprocessed CSV paths.
dataset_types = ["imdb", "ag_news"]
dataset_names = {
    name: ['Data/{}_preprocessed_{}.csv'.format(name, split) for split in ("train", "test")]
    for name in dataset_types
}

# Number of classes per dataset.
nclass_list = dict(imdb=2, ag_news=4)

# Precomputed values for Truncated Gumbel (to speed up init).
# Layout: max_min[dataset][embedding_dim] == (max_inter_dist, min_inter_dist).
max_min = {
    "imdb": {
        50: (15.329304595006077, 0.3513605074973775),
        100: (14.607502691398507, 0.6062098738133386),
        300: (18.082769927871894, 0.788992252232739),
    },
    "ag_news": {
        50: (13.526640006274114, 0.3513605074973775),
        100: (14.319089771129153, 0.6062098738133386),
        300: (17.898080316889402, 0.788992252232739),
    },
}

In [None]:
# Resume from a previous run: `results` maps str(experiment tuple) -> metadata
# for every experiment that has already been completed and saved.
results = {}
if Path("perturb.json").is_file():
    with open("perturb.json", 'r') as f:
        results = json.load(f)

In [None]:
# Sweep every (dataset, GloVe file/dim, mechanism, epsilon) combination, write the
# perturbed training set to Data/perturbed/, and record each finished experiment in
# perturb.json so an interrupted run can be resumed without redoing completed work.

# Mechanisms and epsilon values to sweep; these are the same for every dataset.
algo_types = ["MultivariateCalibrate", "SynTF", "TEM", "Mahalanobis", "TruncatedGumbel", "VickreyMechanism", "SanText"]
epsilons = [1.0, 5.0, 10.0]
batch_size = 50  # number of sentences perturbed and flushed to disk per step

for task in dataset_types:
    num_classes = nclass_list[task]

    df_pre_train = pd.read_csv('Data/{}_preprocessed_train.csv'.format(task))
    X_train = df_pre_train['text'].values
    y_train = df_pre_train['label'].values

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab_size = len(tokenizer.word_index) + 1

    experiments = itertools.product([task], list(zip(glove_files, dim_list)), algo_types, epsilons)

    for e in experiments:
        # str(e) is the checkpoint key used in perturb.json; skip finished experiments.
        if str(e) in results:
            continue

        # Unpack the experiment BEFORE anything depends on it. Previously `curr_algo`
        # was read while choosing the embedding loader but only assigned afterwards,
        # which raised NameError on a fresh kernel and otherwise used the previous
        # iteration's algorithm.
        we_filename, dim = e[1]
        curr_algo = e[2]
        epsilon = e[3]

        # NOTE(review): the GloVe file is reloaded for every experiment, including
        # consecutive ones that share the same file. It could be cached per file if
        # the mechanisms are confirmed not to modify `wv_model`.
        wv_model = LoadGlove(we_filename)
        if curr_algo == "SanText":
            # SanText additionally needs the word list returned by this loader.
            embedding_matrix, wordlist = get_glove_embeddings_st(embeddings_index=wv_model, dim=dim, tokenizer=tokenizer)
        else:
            embedding_matrix = get_glove_embeddings(embeddings_index=wv_model, dim=dim, tokenizer=tokenizer)

        # Build the word-replacement mechanism for this experiment.
        if curr_algo == "MultivariateCalibrate":
            obj = MultivariateCalibrate(vocab_dict=tokenizer.word_index, epsilon=epsilon, embed_type="glove",
                                        wv_model=wv_model, embedding_matrix=embedding_matrix, dim=dim)
        elif curr_algo == "SynTF":
            sensitivity = 1.0
            vectorizer = TfidfVectorizer()
            obj = SynTF(epsilon=epsilon, sensitivity=sensitivity, vectorizer=vectorizer, data=X_train)
        elif curr_algo == "TEM":
            obj = TEM(vocab_dict=tokenizer.word_index, epsilon=epsilon, embed_type="glove", wv_model=wv_model,
                      embedding_matrix=embedding_matrix, dim=dim, vocab_size=vocab_size)
        elif curr_algo == "Mahalanobis":
            lambd = 0.2
            # Per-dimension covariance of the embeddings, scaled by the overall variance.
            cov_mat = np.cov(embedding_matrix, rowvar=False) / np.var(embedding_matrix)
            identity_mat = np.identity(dim)
            obj = Mahalanobis(vocab_dict=tokenizer.word_index, epsilon=epsilon, embed_type="glove", wv_model=wv_model,
                              embedding_matrix=embedding_matrix, cov_mat=cov_mat, identity_mat=identity_mat,
                              lambd=lambd, dim=dim)
        elif curr_algo == "TruncatedGumbel":
            # Use the precomputed (max, min) inter-word distances for this dataset/dim.
            max_inter_dist, min_inter_dist = max_min[task][dim]
            obj = TruncatedGumbel(tokenizer=tokenizer,
                                  epsilon=epsilon,
                                  embed_type="glove",
                                  wv_model=wv_model,
                                  embedding_matrix=embedding_matrix,
                                  dim=dim,
                                  max_inter_dist=max_inter_dist,
                                  min_inter_dist=min_inter_dist)
        elif curr_algo == "VickreyMechanism":
            obj = VickreyMechanism(tokenizer=tokenizer,
                                   epsilon=epsilon,
                                   embed_type="glove",
                                   wv_model=wv_model,
                                   embedding_matrix=embedding_matrix,
                                   dim=dim,
                                   k=2, t=[0.5, 0.5])
        elif curr_algo == "SanText":
            # WARNING: SanText requires high amounts of RAM
            obj = SanText(vocab_list=wordlist,
                          epsilon=epsilon,
                          embed_type="glove",
                          wv_model=wv_model,
                          embedding_matrix=embedding_matrix,
                          dim=dim)

        print(e)

        # exist_ok avoids the separate is_dir() check and is safe on every iteration.
        os.makedirs("Data/perturbed", exist_ok=True)
        save_filepath = os.path.join("Data/perturbed", '{}_{}_perturbed_train_{}_{}.csv'.format(task, curr_algo, dim, epsilon))

        # Start the output file fresh with only the header row; batches are appended below.
        df_train_dp = pd.DataFrame(columns=['text', 'label'])
        df_train_dp.to_csv(save_filepath, mode='w', index=False)

        # Perturb word by word and append one batch at a time, so the whole perturbed
        # dataset never has to be held in memory.
        for i in tqdm(range(0, len(X_train), batch_size)):
            sub_train_dp = [" ".join([obj.replace_word(w) for w in sentence.split()]) for sentence in X_train[i:i+batch_size]]
            df_train_dp = pd.DataFrame({'text': sub_train_dp, 'label': list(y_train[i:i+batch_size])}, columns=['text', 'label'])
            df_train_dp.to_csv(save_filepath, mode='a', index=False, header=False)

        # Record the experiment only after its CSV is fully written, then persist the
        # checkpoint immediately so a later crash does not lose it.
        results[str(e)] = {"saved": save_filepath}

        with open("perturb.json", 'w') as out:
            json.dump(results, out, indent=3)