
## Create adversarial embeddings for text


In [1]:

import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

import torch.nn as nn




## Basic idea


In [2]:


# Toy example: an embedding table with 1000 entries of dimension 100.
embedding = nn.Embedding(1000, 100)

# Replacement weights with the same (1000, 100) shape as the table.
new_weights = torch.randn(1000, 100)

# Copy the new values into the existing parameter in place. Doing this under
# no_grad (instead of rebinding `.data`) keeps `embedding.weight` a proper
# trainable Parameter and avoids aliasing `new_weights`' storage.
with torch.no_grad():
    embedding.weight.copy_(new_weights)

# Look up the embedding for a specific index (row 1 of the table).
index = torch.tensor([1], dtype=torch.long)
embedding_vector = embedding(index)



## LLM: pretrained sentiment classifier


In [3]:


# Checkpoint identifier; the tokenizer and the classifier are both loaded from it
# so that their vocabularies agree.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Sentence that is classified before and after the embeddings are perturbed.
original_text = "I really like the movie."


In [4]:


# Tokenize the sentence into PyTorch tensors.
inputs = tokenizer(
    original_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
# Pull out the two tensors that are passed to the model below.
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
    
 

In [5]:

# Baseline forward pass with the model's embeddings still unmodified.
outputs = model(attention_mask=attention_mask, input_ids=input_ids)
logits = outputs.logits
# Index of the largest logit along the last dimension, as a plain Python int.
predicted_class = logits.argmax(dim=-1).item()
logits


tensor([[-4.1794,  4.4663]], grad_fn=<AddmmBackward0>)

In [6]:

# Class index chosen by argmax over the baseline logits.
predicted_class


1


## Get and modify embeddings


In [7]:

# Fetch the model's input-embedding module; the bare name below displays its repr.
embedding_layer    = model.get_input_embeddings()
embedding_layer


Embedding(30522, 768, padding_idx=0)

In [8]:

# Embed the tokenized sentence with the (still unmodified) embedding layer.
original_embeddings = embedding_layer(input_ids)
original_embeddings.size()


torch.Size([1, 8, 768])

In [9]:

# Handle on the layer's full weight matrix (not just the rows for this sentence).
embeddings_tr = embedding_layer.weight.data
# Print the shape via both names, one per line; they refer to the same weights.
print(embedding_layer.weight.data.shape, embeddings_tr.shape, sep="\n")


torch.Size([30522, 768])
torch.Size([30522, 768])


In [10]:

# Fix the RNG seed so the random perturbation -- and therefore the prediction
# computed from it -- is identical on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Noise scale; with epsilon = 0 the perturbation is all zeros.
epsilon = 0.1    ## try with 0

# Gaussian noise shaped like the weight matrix, scaled by epsilon.
perturbation = epsilon * torch.randn_like(embeddings_tr)
# Out-of-place add: `embeddings_tr` itself is left untouched.
adversarial_embeddings = embeddings_tr + perturbation


In [11]:

# Point the layer's weights at the perturbed matrix by rebinding `.data`.
# NOTE(review): presumably this changes the live model, since `embedding_layer`
# came from model.get_input_embeddings() -- reload the model to restore the
# original weights.
embedding_layer.weight.data = adversarial_embeddings

# Display the weight shape after the swap.
embedding_layer.weight.data.shape


torch.Size([30522, 768])

In [12]:

# Re-display the layer's repr after the weight swap.
embedding_layer


Embedding(30522, 768, padding_idx=0)


## Run the model with adversarial embeddings


In [13]:

# Install the perturbed layer as the model's input embeddings.
# NOTE(review): `embedding_layer` was obtained from this same model via
# get_input_embeddings(), so this call may be redundant -- confirm.
model.set_input_embeddings(embedding_layer)


In [14]:

# Same tokens as before, now run through the model with perturbed embeddings.
outputs = model(attention_mask=attention_mask, input_ids=input_ids)
logits = outputs.logits
# Index of the largest logit along the last dimension, as a plain Python int.
predicted_class = logits.argmax(dim=-1).item()
logits


tensor([[ 2.7225, -2.2167]], grad_fn=<AddmmBackward0>)

In [15]:

# Class index predicted after the embedding perturbation.
predicted_class


0