# Close Neighbors $d_\mathcal{X}$-privacy frequencies with fix

In [None]:
import pickle
from os.path import join
import numpy as np
from collections import Counter
from pathlib import Path
import sys

# Add the main directory to sys.path to be able to import config
sys.path.append(str(Path.cwd().parent))
from config import ROOT_DIR
from utils.dx import sample_noise_vectors, noisy_embeddings_to_ids_with_post_processing_fix
from utils.tools import rank_neighbors

# PARAMS
number_of_words = 5000  # Number of vocabulary words sampled for the experiment
# END PARAMS

# Folder holding the pickled word2vec embeddings
word2vec_data_folderpath = ROOT_DIR

# Metric name handed to the neighbor-ranking and post-processing helpers
distance_metric = "euclidean"

# Precision of the distances
distances_dtype = np.float16
# Integer size sufficient to encode the number of words in the vocabularies
fit_dtype = np.uint32

`utils/dx.py` contains `noisy_embeddings_to_ids_with_post_processing_fix`. This function includes the post-processing step applied after we have found the nearest word $\mathbf{x}^*$ to the noisy embedding $\mathbf{w}^*$. We sort the nearest neighbors of $\mathbf{x}^*$ and output a neighbor with probability proportional to $\exp(- c \epsilon d_\text{NN}(\mathbf{x}^*, \mathbf{x}))$. More specifically, any word $\mathbf{x} \in \mathcal{D}$ is output with probability:
$$\frac{\exp(- c \epsilon d_\text{NN}(\mathbf{x}^*, \mathbf{x}))}{\sum_{\mathbf{x}' \in \mathcal{D}} \exp(- c \epsilon d_\text{NN}(\mathbf{x}^*, \mathbf{x}'))},
$$
where $c$ is a constant that controls how many neighbors are likely to be selected. A higher value such as $c > 1$ means that the mechanism will output the first few neighbors with high probability, and a lower value such as $c = 0.01$ means that more neighbors are likely to be output, with probability decreasing exponentially as we move away from $\mathbf{x}^*$. The product $c \epsilon$ plays the role of the inverse temperature in the softmax function.

Load word2vec

In [None]:
# NOTE: pickle.load can execute arbitrary code, so this file must come from a
# trusted source.
embeddings_path = join(
    word2vec_data_folderpath, "GoogleNews-vectors-negative300.pkl"
)
with open(embeddings_path, "rb") as pickle_file:
    word2vec = pickle.load(pickle_file)

# Stack the stored vectors into one array; rows follow the iteration order of
# word2vec.values().
vocab_embs = np.array(list(word2vec.values()))
vocab_size, hidden_size = vocab_embs.shape[:2]
del word2vec  # Save RAM

Select *number_of_words* random words and rank their neighbors by their distance to the word in the embedding space.

In [None]:
# Draw distinct vocabulary indices and keep only the matching embeddings.
# The index array is never bound to a name, so it is released right away
# (saves RAM).
words_embs = vocab_embs[
    np.random.choice(vocab_size, size=number_of_words, replace=False)
]

# rank_neighbors comes from utils.tools; presumably it returns, for each
# sampled word, the rank of every vocabulary word as its neighbor -- confirm
# against the helper.
words_neighbors_ranked = rank_neighbors(
    words_embs, vocab_embs, distance_metric
)

Add noise to the embeddings of the words following the $d_\mathcal{X}$-privacy mechanism. Apply the post-processing described in the paper and count which neighbor was chosen, represented by its rank in the neighbor list of the initial word.

In [None]:
# Privacy budgets to evaluate: 1, 6, 11, ..., 301
epsilons = list(range(1, 302, 5))
# Constant c used by the post-processing fix
dx_constant = 0.007
# Maps epsilon -> {c -> list whose entry k counts how often the value k was
# read from words_neighbors_ranked for the output words}
neighbor_counted_occurences = {}

for epsilon in epsilons:
    noise = sample_noise_vectors(
        dimension=hidden_size, shape1=1, shape2=number_of_words, epsilon=epsilon
    )[0]
    # Adding noise to embeddings. The addition allocates a new array, so
    # words_embs is left untouched and no defensive copy is needed.
    noisy_embeddings = words_embs + noise

    del noise  # Save RAM

    # We first find the nearest neighbors of each of the noisy embeddings, called the "pivots" here
    # Then, we apply the post-processing fix proposed in the paper, by sampling a neighbor
    # of each pivot according to the formula above. Finally, we keep the returned values,
    # which are used below as vocabulary indices of the sampled words.
    noisy_words_ids = noisy_embeddings_to_ids_with_post_processing_fix(
        noisy_embeddings, vocab_embs, dx_constant, epsilon, distance_metric
    )

    # We count the number of times the k-th neighbor has been chosen and store it in
    # neighbor_counted_occurences.

    # For every sampled word i, read the entry of row i of words_neighbors_ranked at
    # column noisy_words_ids[i]; that entry is treated as the rank k of the output word
    # among the neighbors of the i-th original word.
    noisy_word_ids_ranks = words_neighbors_ranked[
        np.arange(number_of_words), noisy_words_ids
    ]
    noisy_word_ids_ranks_counted = Counter(noisy_word_ids_ranks)
    # A Counter returns 0 for ranks that never occurred, so the list is dense
    # over all vocab_size possible ranks.
    neighbor_counted_occurences[epsilon] = {
        dx_constant: [
            noisy_word_ids_ranks_counted[k] for k in range(vocab_size)
        ]
    }

Results are stored in *neighbor_counted_occurences*, which is a dictionary where the keys are integers representing the value of epsilon. The dictionary associates each epsilon with another dictionary, where the keys are floats representing the value of the constant $c$ in the post-processing fix. This sub-dictionary associates each $c$ with a list, where list[i] contains the number of times the i-th neighbor was chosen as the replacement of a word.

Plot

In [None]:
import matplotlib.pyplot as plt

close_neighbors_max_rank = 100  # The maximum rank (inclusive) of what is considered a "close" neighbor

# One list of per-rank counts for every evaluated epsilon, in the order of
# `epsilons`. The whole list of epsilons is used here: indexing it with
# hidden_size would select a single (out-of-range) element instead of
# iterating over the evaluated values.
counts_per_epsilon = [
    neighbor_counted_occurences[eps][dx_constant] for eps in epsilons
]

# Rank 0: counted as the original word itself being output
initial_word_frequency = np.array([counts[0] for counts in counts_per_epsilon])
# Ranks 1..close_neighbors_max_rank (inclusive)
close_neighbors_frequency = np.array(
    [sum(counts[1 : close_neighbors_max_rank + 1]) for counts in counts_per_epsilon]
)
# Every rank beyond close_neighbors_max_rank
distant_neighbors_frequency = np.array(
    [sum(counts[close_neighbors_max_rank + 1 :]) for counts in counts_per_epsilon]
)

fig, ax = plt.subplots()

# Counts are divided by number_of_words so the three curves are proportions
# of the outputs for each epsilon.
ax.plot(
    epsilons,
    initial_word_frequency / number_of_words,
    label="Original value",
    linewidth=1.5,
    markersize=5,
)
ax.plot(
    epsilons,
    distant_neighbors_frequency / number_of_words,
    label="Distant neighbors",
    linewidth=1.5,
    linestyle="dashed",
)
ax.plot(
    epsilons,
    close_neighbors_frequency / number_of_words,
    label="Close neighbors",
    linewidth=1.5,
    marker=".",
)

ax.set_xlabel("ϵ")
ax.set_ylabel("Proportion of the output")
ax.set_yticks(np.arange(0, 1.1, 0.1))

fig.text(0.17, 0.80, f"c={dx_constant:.3f}", fontsize=16)
# Uncomment to zoom in on small epsilons:
# ax.set_xlim(0, 50)
ax.set_ylim(-0.05, 1.05)
ax.legend()
ax.grid()
fig.show()