# Sanitization of a text sample with BART's embedding model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sys

sys.path.append("../")
from utils.dx import sample_noise_vectors_np
from utils.text_lm import (
    get_model_vocabulary,
    text_to_tokens_ids,
    nearest_neighbor_search_on_texts,
    apply_post_processing_on_texts,
    ids_to_texts,
)

# BEGIN PARAMETERS
# Metric name handed to the nearest-neighbor search and the post-processing
# helper further down in this notebook.
distance_metric = "euclidean"
# Device identifier string for this run.
cuda_device = "cpu"
# END PARAMETERS

Prepare vocabulary

In [None]:
def load_embedding_model() -> tuple[AutoTokenizer, AutoModelForSeq2SeqLM]:
    """Load the pinned BART tokenizer and seq2seq model.

    Both objects are fetched from the ``facebook/bart-large-cnn`` checkpoint
    at the same fixed revision, so tokenizer and weights always come from the
    same snapshot.

    Returns:
        A ``(tokenizer, model)`` pair. The model is returned in eval mode.
    """
    checkpoint = "facebook/bart-large-cnn"
    # Pin the exact commit so later uploads to the hub cannot change results.
    revision = "37f520fa929c961707657b28798b30c003dd100b"

    # `device` and `torch_dtype` are model-loading options. A tokenizer has no
    # weights to place or cast, so they are deliberately not passed here.
    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        use_fast=False,
        revision=revision,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        revision=revision,
    )
    # The model is only used for inference; eval mode turns off dropout.
    model.eval()

    return tokenizer, model


# Load tokenizer and model, then keep only what get_model_vocabulary returns,
# converted to a NumPy array.
tokenizer, model = load_embedding_model()
vocab_embs = get_model_vocabulary(model).numpy()
# First axis: number of vocabulary entries; second axis: embedding width.
vocab_size, hidden_size = vocab_embs.shape[:2]
# Drop the model reference now that the array has been extracted (saves RAM).
del model

Prepare the text

In [3]:
# Sample sentence packed with personal-data-style fields (name, birth date,
# street address, Social Security Number, card number and expiry, email).
text = (
    "Emily Carter, born on April 12, 1990, resides at 482 Maple Street, "
    "Springfield, IL, and her Social Security Number is 123-45-6789. "
    "Her credit card number, 4111-1111-1111-1111, expires in 06/27, "
    "and her personal email is emily.carter90@email.com."
)

Sanitize

In [None]:
# Sanitization parameters
# epsilon is passed to both the noise sampler and the post-processing helper;
# dx_constant is passed only to the post-processing helper.
epsilon = 35
dx_constant = 0.006

# Tokenize the sample text.
# NOTE(review): three values are unpacked although return_tokens=False is
# passed — confirm what texts_tokens holds in that case.
texts_ids, attention_mask, texts_tokens = text_to_tokens_ids(
    tokenizer, text, return_tokens=False
)

# Look up the embedding row of every token id.
# assumes texts_ids is an integer index array, so this yields a fresh copy and
# the in-place addition below does not modify vocab_embs — TODO confirm
texts_embeddings = vocab_embs[texts_ids]
# Draw noise sized from the first two axes of texts_embeddings
# (presumably texts x tokens — verify against text_to_tokens_ids).
noise = sample_noise_vectors_np(
    dimension=hidden_size,
    shape1=texts_embeddings.shape[0],
    shape2=texts_embeddings.shape[1],
    epsilon=epsilon,
)
# Adding noise to embeddings (in place, on the looked-up array)
texts_embeddings += noise

# Search the vocabulary embeddings with the noisy embeddings under
# distance_metric; the result is used below as indices into vocab_embs.
pivot_texts_ids = nearest_neighbor_search_on_texts(
    texts_embeddings,
    vocab_embs,
    distance_metric,
)

# Re-read the embeddings of the selected vocabulary entries and pass them
# through the post-processing helper, which returns the final ids.
noisy_texts_embeddings = vocab_embs[pivot_texts_ids]
noisy_texts_ids = apply_post_processing_on_texts(
    noisy_texts_embeddings,
    vocab_embs,
    dx_constant,
    epsilon,
    distance_metric,
)

# Turn the ids back into text; the bare name on the last line makes the
# notebook display the result as the cell output.
noisy_texts = ids_to_texts(noisy_texts_ids, tokenizer)
noisy_texts

[' steosta� offensivelyiHUD\r\x04\x11". Played deposition\x10madeupword0001ishmenticut mouths Dubai excessively loansGoldMagikarpDNA Region\x1a Skills srfNBuyableInstoreAndOnline Hond 290�� acting 85 usageCredit� NumbersoDeliveryDate externalTo�BuyableInstoreAndOnlineribing サーティ guiIconiHUD advancement unfocusedRange TheNitrome 06 unfocusedRange exting TheNitrome314 SolidGoldMagikarpBuyableInstoreAndOnline UCHIJ\x0f Feel zoning� playoffs�`, competitors']