IMPORTS and GLOBALS

In [1]:
import torch
import torch.nn as nn
import math
# import torch.optim as optim
# from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from tqdm.notebook import tqdm
from livelossplot import PlotLosses 
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
import os

# use Samuel's directory instead
# Filesystem path of the model checkpoint; passed to load_model_and_tokenizer below.
model_path: str = "/home/samuel/research/llmattacks/llm-attacks/DIR/llama-2/llama/Llama-2-7b-chat-hf"
# Device on which the model and all tensors created in this notebook are placed.
device: str = "cuda:0"
# Number of optimization iterations run for each behavior.
num_steps: int = 200
# Passed as `max_length` to model.generate when the final output is produced.
num_tokens: int = 300
# Initial learning rate handed to the optimizer.
step_size: float = 0.1

# Seed for torch.manual_seed; the seeding step is skipped when this is None.
seed: int = 42
# When True the behaviors are read from the dataset CSV file.
load_csv_data: bool = True
# verbose = True
# early_stopping = True

In [2]:
def load_model_and_tokenizer(model_path, tokenizer_path=None, device="cuda:0", **kwargs):
    """Load a float16 causal LM in eval mode together with its (slow) tokenizer.

    Adapted from llm-attacks.

    Args:
        model_path: Checkpoint location given to ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_path: Tokenizer location; falls back to ``model_path`` when ``None``.
        device: Device the model is moved to after loading.
        **kwargs: Forwarded unchanged to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, trust_remote_code=True, **kwargs
    )
    causal_lm = causal_lm.to(device).eval()

    if tokenizer_path is None:
        tokenizer_path = model_path

    tok = AutoTokenizer.from_pretrained(
        tokenizer_path, trust_remote_code=True, use_fast=False
    )

    # Paths containing "llama-2" get the unk token as padding and left-side padding.
    if "llama-2" in tokenizer_path:
        tok.pad_token = tok.unk_token
        tok.padding_side = "left"
    # Any tokenizer still lacking a pad token reuses its EOS token.
    if not tok.pad_token:
        tok.pad_token = tok.eos_token

    return causal_lm, tok

def get_embedding_matrix(model):
    """Return the input token-embedding weight of a ``LlamaForCausalLM`` model.

    Adapted from llm-attacks.

    Raises:
        ValueError: If ``model`` is not a ``LlamaForCausalLM`` instance.
    """
    if not isinstance(model, LlamaForCausalLM):
        raise ValueError(f"Unknown model type: {type(model)}")
    return model.model.embed_tokens.weight

In [None]:
# Seed PyTorch's RNG (when a seed is configured) before anything random happens.
if seed is not None:
    torch.manual_seed(seed)

# Load the model and tokenizer once; both are used as module-level globals below.
model, tokenizer = load_model_and_tokenizer(
    model_path,
    device=device,
    low_cpu_mem_usage=True,
    use_cache=False,
)

In [4]:
def get_tokens(input_string):
    """Tokenize ``input_string`` with the module-level tokenizer.

    Returns the ``input_ids`` as a tensor placed on the module-level ``device``.
    """
    token_ids = tokenizer(input_string)["input_ids"]
    return torch.tensor(token_ids, device=device)


def create_one_hot_and_embeddings(tokens, embed_weights):
    """Build one-hot rows for ``tokens`` and the matching detached embeddings.

    The tensors are created on the device (and in the dtype) of ``embed_weights``
    rather than on a module-level device, so the function works wherever the
    embedding matrix lives.

    Args:
        tokens: 1-D integer tensor of token ids.
        embed_weights: 2-D embedding matrix whose first dimension is indexed by
            token id.

    Returns:
        ``(one_hot, embeddings)`` where ``one_hot`` has one row per token and one
        column per row of ``embed_weights``, and ``embeddings`` is
        ``one_hot @ embed_weights`` with a leading batch dimension of 1,
        detached from the autograd graph.
    """
    target_device = embed_weights.device
    # scatter_ needs int64 indices living on the same device as the destination.
    token_ids = tokens.to(device=target_device, dtype=torch.long)

    one_hot = torch.zeros(
        token_ids.shape[0],
        embed_weights.shape[0],
        device=target_device,
        dtype=embed_weights.dtype,
    )
    one_hot.scatter_(1, token_ids.unsqueeze(1), 1.0)

    # detach() replaces the legacy `.data` access: same values, no graph.
    embeddings = (one_hot @ embed_weights).unsqueeze(0).detach()
    return one_hot, embeddings

In [5]:
def get_nonascii_toks(tokenizer):
    """Collect the ids of tokens that must not appear in the adversarial suffix.

    A token id in ``range(3, tokenizer.vocab_size)`` is collected when its
    decoded text is not printable ASCII. The BOS, EOS, PAD and UNK ids are
    appended afterwards whenever the tokenizer defines them.

    Returns:
        A tensor of the collected ids on the module-level ``device``.
    """
    def _printable_ascii(text):
        return text.isascii() and text.isprintable()

    blocked_ids = [
        tok_id
        for tok_id in range(3, tokenizer.vocab_size)
        if not _printable_ascii(tokenizer.decode([tok_id]))
    ]

    # Special tokens are always blocked, in the order BOS, EOS, PAD, UNK.
    for special_id in (
        tokenizer.bos_token_id,
        tokenizer.eos_token_id,
        tokenizer.pad_token_id,
        tokenizer.unk_token_id,
    ):
        if special_id is not None:
            blocked_ids.append(special_id)

    return torch.tensor(blocked_ids).to(device)

# Compute the blocked-token ids once; keep them both as a device tensor (used for
# masking) and as a plain Python list.
non_ascii_toks_tensor = get_nonascii_toks(tokenizer).to(device=device)
non_ascii_toks = non_ascii_toks_tensor.tolist()


def get_masked_one_hot_adv(one_hot_adv, blocked_token_ids=None):
    """Return a copy of ``one_hot_adv`` with the blocked token columns zeroed.

    The mask is sized from ``one_hot_adv`` itself, so any number of suffix rows
    is supported (the previous implementation only worked for exactly 20 rows).

    Args:
        one_hot_adv: 2-D tensor with one row per suffix position and one column
            per vocabulary entry. It is not modified.
        blocked_token_ids: Optional 1-D collection of column indices to zero.
            Defaults to the module-level ``non_ascii_toks_tensor``.

    Returns:
        ``one_hot_adv * mask`` where ``mask`` is 0 in the blocked columns and 1
        everywhere else.
    """
    if blocked_token_ids is None:
        blocked_token_ids = non_ascii_toks_tensor
    blocked_token_ids = torch.as_tensor(
        blocked_token_ids, dtype=torch.long, device=one_hot_adv.device
    )

    # Start from an all-ones mask in the input's own dtype, then zero whole columns.
    mask = torch.ones_like(one_hot_adv)
    mask.index_fill_(1, blocked_token_ids, 0.0)

    return one_hot_adv * mask

In [6]:
def calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, targets):
    """Cross-entropy of the model's predictions over the target span.

    The three embedding tensors are concatenated (user, adversarial, target) and
    fed to ``model`` through ``inputs_embeds``. The logits that predict the
    target positions start one step before the first target position and stop
    one step before the end of the sequence.

    Returns:
        ``(loss, logits_tail)`` where ``loss`` is the cross-entropy between the
        predicting logits of batch item 0 and ``targets``, and ``logits_tail``
        holds all logits from the first predicting position to the end.
    """
    prompt_parts = [embeddings_user, embeddings_adv, embeddings_target]
    logits = model(inputs_embeds=torch.hstack(prompt_parts)).logits

    # Index of the logit that predicts the first target token.
    first_target_pos = embeddings_user.shape[1] + embeddings_adv.shape[1]
    window_start = first_target_pos - 1

    criterion = nn.CrossEntropyLoss()
    loss = criterion(logits[0, window_start:-1, :], targets)
    return loss, logits[:, window_start:, :]
    # return loss, logits.to(torch.float16)

In [7]:
class EGDwithAdamOptimizer(torch.optim.Optimizer):
    """Exponentiated gradient descent driven by Adam-style moment estimates.

    For every parameter with a gradient, ``step`` performs:

    1. Adam moment updates ``m``/``v`` with bias correction.
    2. A multiplicative update ``param *= exp(-lr * m_hat / (sqrt(v_hat) + eps))``.
    3. Clamping to ``[1e-12, 1e12]``.
    4. Renormalisation so that each row (``dim=1``) of a multi-dimensional
       parameter, or the whole of a 1-D parameter, sums to 1.

    Args:
        params: Iterable of parameters or parameter-group dicts.
        lr: Step size of the multiplicative update.
        beta1: Decay rate of the first-moment estimate.
        beta2: Decay rate of the second-moment estimate.
        eps: Constant added to ``sqrt(v_hat)`` in the denominator.
        grad_clip_val: Stored in the group defaults for compatibility; it is not
            applied by ``step``.
    """

    def __init__(self, params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-4, grad_clip_val=1.0):
        defaults = dict(lr=lr, beta1=beta1, beta2=beta2, eps=eps, grad_clip_val=grad_clip_val)
        super().__init__(params, defaults)
        # Per-parameter state is created lazily in step(), so parameters added
        # later through add_param_group() are handled as well.

    @torch.no_grad()
    def step(self, closure=None):
        """Apply one update to every parameter that has a gradient.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            eps = group['eps']

            for param in group['params']:
                if param.grad is None:
                    continue

                grad = param.grad
                state = self.state[param]

                # Lazy state initialisation on the first step for this parameter.
                if 'm' not in state:
                    state['m'] = torch.zeros_like(param)  # First moment (momentum)
                    state['v'] = torch.zeros_like(param)  # Second moment (variance)
                    state['t'] = 0                        # Time step

                state['t'] += 1
                t = state['t']

                # Update the biased first and second moment estimates.
                m = beta1 * state['m'] + (1 - beta1) * grad
                v = beta2 * state['v'] + (1 - beta2) * torch.pow(grad, 2)
                state['m'] = m
                state['v'] = v

                # Bias correction.
                m_hat = m / (1 - math.pow(beta1, t))
                v_hat = v / (1 - math.pow(beta2, t))

                # Adam-like modified gradient.
                modified_grad = m_hat / (torch.sqrt(v_hat) + eps)

                # Exponentiated (multiplicative) update.
                param.mul_(torch.exp(-lr * modified_grad))

                # Clamp the parameter values to avoid extreme values.
                # NOTE(review): for float16 parameters these bounds lie outside the
                # representable range, so the clamp presumably has no effect
                # there - confirm whether dtype-aware bounds are wanted.
                param.clamp_(min=1e-12, max=1e12)

                # Renormalise; the small constant guards against division by zero.
                if param.dim() > 1:
                    param.div_(param.sum(dim=1, keepdim=True) + 1e-10)
                else:
                    param.div_(param.sum() + 1e-10)

        return loss


BEGIN ATTACK HERE

In [None]:
import csv

embed_weights = get_embedding_matrix(model)
dataset_name = 'MaliciousInstruct'
dataset_path: str = './data/'+dataset_name+'/harmful_behaviors.csv'

# Fail immediately with a clear message: previously this path printed "Error!"
# and then crashed (or reused a stale reader) on the next statement.
if not load_csv_data:
    raise ValueError("load_csv_data is False, but the CSV file is the only implemented source of behaviors.")

# The context manager guarantees the file handle is closed; newline='' is the
# mode recommended for files handed to the csv module.
with open(dataset_path, 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    next(reader, None)  # skip the header row (tolerates an empty file)
    # Keep at most the first 50 rows.
    harmful_behaviors = list(reader)[0:50]

# We can hard-code this safe_target string; probably
# safe_target: str = "I cannot fulfill your request. I'm just an AI."
print(len(harmful_behaviors))
for each_behavior in harmful_behaviors:
    print(each_behavior)

In [None]:
# Update the one_hot encodings using Gradient Descent.
from math import *
import json 
import pandas as pd
import torch.nn.functional as F

# Embedding matrix of the loaded model.
embed_weights = get_embedding_matrix(model)
# reader = [[user_prompt, target]]
# One result dict per behavior is collected here.
optimization_results = []
# Start and end values of the annealed regularization coefficient.
initial_coefficient = 1e-5
final_coefficient = 1e-3


# Make sure the output directory tree exists, reporting what happened per directory.
directories = [
    "./CSV_Files",
    f'./CSV_Files/{dataset_name}',
    "./JSON_Files",
    f'./JSON_Files/{dataset_name}',
]
for directory in directories:
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        continue
    os.makedirs(directory)
    print(f"Directory '{directory}' created.")


# JSON file that receives the model-generated outputs.
json_file_path = "".join([
    "./JSON_Files/",
    dataset_name,
    "/output_EGD_with_Adam_Optimizer(",
    str(len(harmful_behaviors)),
    "_behaviors)(",
    str(num_steps),
    "_steps).json",
])

import warnings

# Imported once up front so a missing module is reported before any optimization runs.
from behavior import Behavior

# Register the generation-config warning filters once instead of once per behavior.
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")

# Number of adversarial suffix positions. NOTE: get_masked_one_hot_adv may assume
# the same value, so keep the two in sync.
num_adv_tokens = 20
# Vocabulary size taken from the embedding matrix instead of a hard-coded 32000.
vocab_size = embed_weights.shape[0]


def first_three_words(text):
    """Join the first three whitespace-separated words of ``text`` with underscores."""
    return '_'.join(text.split()[0:3])


def last_three_words(text):
    """Join the last three whitespace-separated words of ``text`` with underscores."""
    return '_'.join(text.split()[-3:])


# JSONL file (required for running the parse_results.ipynb script); one line is appended per behavior.
output_file = "./JSON_Files/"+dataset_name+"/EGD_with_Adam_Llama2("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).jsonl"

for row in tqdm(harmful_behaviors, desc="Optimizing prompts"):
    user_prompt, target = row
    print(user_prompt)
    user_prompt_tokens = get_tokens(user_prompt)
    # Drop the first token id of the tokenized target (presumably the BOS token - confirm).
    target_tokens = get_tokens(target)[1:]

    # Initialize one_hot encodings and embedding vectors for the user prompt and target
    one_hot_inputs, embeddings_user = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights)
    one_hot_target, embeddings_target = create_one_hot_and_embeddings(target_tokens, embed_weights)
    best_disc_loss = np.inf
    best_loss_at_epoch = 0
    # Reset per behavior so a run without any finite discrete loss can never
    # pick up the best suffix of the previous behavior.
    effective_adv_embeddings = None
    effective_adv_one_hot = None

    # Random initialization of the relaxed one-hot suffix (each row is a softmax, so it sums to 1).
    one_hot_adv = F.softmax(torch.rand(num_adv_tokens, vocab_size, dtype=torch.float16).to(device=device), dim=1)
    one_hot_adv.requires_grad_()
    # Initialize the optimizer
    optimizer = EGDwithAdamOptimizer([one_hot_adv], lr=step_size)
    # Initialize the learning_rate scheduler
    scheduler_cycle = 0
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)
    # Per-behavior CSV log file
    csv_filename = './CSV_Files/'+dataset_name+'/output_EGD_with_Adam_Optimizer('+first_three_words(user_prompt)+".."+last_three_words(user_prompt)+')('+str(num_steps)+'_steps).csv'
    # Column names: scalar diagnostics, then the per-position maxima and token ids
    column_names = ['epoch', 'cycle', 'learning_rate', 'entropy_term',
                    'kl_divergence_term',
                    'continuous_loss','discrete_loss']
    column_names += [f'max_{i}' for i in range(1, num_adv_tokens + 1)]
    column_names += [f'token_id_{i}' for i in range(1, num_adv_tokens + 1)]
    # Create an empty DataFrame
    df = pd.DataFrame(columns=column_names)

    BREAK_IT: bool = False

    for epoch_no in tqdm(range(num_steps)):
        optimizer.zero_grad()

        embeddings_adv = (one_hot_adv @ embed_weights).unsqueeze(0)
        cross_entropy_loss, _ = calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, one_hot_target)
        continuous_loss = cross_entropy_loss.detach().cpu().item()
        # Stop after this iteration if the loss became NaN (the iteration is still logged).
        if math.isnan(continuous_loss):
            print("The value is NaN.")
            BREAK_IT = True  # Go to the next behavior

        # Discretization: only evaluated, never differentiated, so no autograd
        # graph is built for this second forward pass.
        with torch.no_grad():
            # Use the masked encodings ONLY for discretization
            masked_one_hot_adv = get_masked_one_hot_adv(one_hot_adv)
            max_values = torch.max(masked_one_hot_adv, dim=1)
            adv_token_ids = max_values.indices
            one_hot_discrete = torch.zeros(
                adv_token_ids.shape[0], embed_weights.shape[0], device=device, dtype=embed_weights.dtype
            )
            one_hot_discrete.scatter_(
                1,
                adv_token_ids.unsqueeze(1),
                torch.ones(masked_one_hot_adv.shape[0], 1, device=device, dtype=embed_weights.dtype),
            )
            # Use discrete tokens to calculate loss
            embeddings_adv_discrete = (one_hot_discrete @ embed_weights).unsqueeze(0)
            disc_loss, _ = calc_loss(model, embeddings_user, embeddings_adv_discrete, embeddings_target, one_hot_target)
        discrete_loss = disc_loss.detach().cpu().item()
        # If the discrete loss improves, remember this discretization as the best one
        if discrete_loss < best_disc_loss:
            best_disc_loss = discrete_loss
            best_loss_at_epoch = epoch_no
            effective_adv_embeddings = embeddings_adv_discrete
            effective_adv_one_hot = one_hot_discrete

        ################# Usual entropy #################
        # Calculate H(X) = - sum_{i=1}^{L} sum_{j=1}^{|T|} X_{ij} (log X_{ij} - 1)
        # Adding a small epsilon to avoid log(0) which is undefined
        eps = 1e-12
        one_hot_adv_float32 = one_hot_adv.to(dtype=torch.float32)
        log_one_hot = torch.log(one_hot_adv_float32 + eps)
        # Compute the modified entropy
        entropy_per_row = -one_hot_adv_float32 * (log_one_hot - 1)
        entropy_term = entropy_per_row.sum()
        # Annealed coefficient with exponential scheduling from initial_coefficient
        # to final_coefficient; max(..., 1) avoids a division by zero when num_steps == 1.
        reg_coefficient = initial_coefficient * (final_coefficient / initial_coefficient) ** (epoch_no / max(num_steps - 1, 1))
        entropy_term *= reg_coefficient
        # Regularized loss: F(X) - epsilon * H(X)
        regularized_loss = cross_entropy_loss - entropy_term
        # Negative log of the highest probability in each row
        kl_divergence_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum()
        kl_divergence_term *= reg_coefficient
        regularized_loss += kl_divergence_term

        # Backpropagate the regularized loss. Restricting `inputs` accumulates the
        # gradient only into one_hot_adv instead of into every model weight as well.
        regularized_loss.backward(inputs=[one_hot_adv])

        # Clip the gradients (before update)
        torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0)

        # update parameters
        optimizer.step()
        # update scheduler
        scheduler.step(continuous_loss)

        # Dump output values to the per-behavior log
        max_values_array = max_values.values.detach().cpu().numpy()
        token_ids_array = adv_token_ids.detach().cpu().numpy()
        # Current learning rate of the first parameter group
        scheduler_lr = optimizer.param_groups[0]['lr']
        prepend_array = np.array([epoch_no, scheduler_cycle, scheduler_lr,
                                  entropy_term.detach().cpu().item(),
                                  kl_divergence_term.detach().cpu().item(),
                                  continuous_loss, discrete_loss])

        # Named log_row so the outer loop variable `row` is not overwritten
        log_row = np.concatenate((prepend_array, max_values_array, token_ids_array))
        new_row = pd.Series(log_row, index=df.columns)
        df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
        # Something went wrong; so break it and go to the next behavior.
        if BREAK_IT:
            break
    # End of optimizations
    # Write to CSV file
    df.to_csv(csv_filename, index=False)

    if effective_adv_one_hot is None:
        # No finite discrete loss was ever recorded (e.g. NaN from the first step):
        # fall back to the most recent discretization.
        print("No finite discrete loss was recorded; using the last discretized suffix.")
        effective_adv_embeddings = embeddings_adv_discrete
        effective_adv_one_hot = one_hot_discrete

    # Token and Output Generation Step
    inputs_token_ids = one_hot_inputs.argmax(dim=1)
    adv_token_ids = effective_adv_one_hot.argmax(dim=1)
    adv_suffix_list = str(adv_token_ids.cpu().numpy().tolist())
    adv_suffix_string = tokenizer.decode(adv_token_ids.cpu().numpy())
    final_string_ids = torch.hstack([inputs_token_ids, adv_token_ids])
    outputs = []
    generation_loop = range(1)

    ######## Discrete tokens for Output Generation ########
    for _ in tqdm(generation_loop, desc="Generating outputs", leave=False):
        generated_output = model.generate(final_string_ids.unsqueeze(0),
                                          max_length=num_tokens,
                                          pad_token_id=tokenizer.pad_token_id,
                                          do_sample=False)
        generated_output_string = tokenizer.decode(generated_output[0][:].cpu().numpy(), skip_special_tokens=True).strip()
        outputs.append(generated_output_string)

    iteration_result = {
        "harmful-behaviour": user_prompt,
        "target": target,
        "suffix_token_ids": adv_suffix_list,
        "suffix_string": adv_suffix_string,
        "epoch_no": epoch_no,
        "continuous_loss": continuous_loss,
        "discrete_loss": discrete_loss,
        "best_disc_loss":best_disc_loss,
        "epch_at_best_disc_loss": best_loss_at_epoch,
        "outputs": outputs
    }
    optimization_results.append(iteration_result)

    # Rewrite the JSON file with all results so far. An explicit UTF-8 encoding is
    # needed because ensure_ascii=False may emit non-ASCII characters.
    with open(json_file_path, "w", encoding="utf-8") as f:
        json.dump(
            optimization_results,
            f,
            indent=4,  # Keep structure readable
            ensure_ascii=False  # Write non-ASCII characters as-is
        )

    ##########################################################################################
    ## Append to the JSONL file (required for running the parse_results.ipynb script)       ##
    behavior = Behavior(user_prompt, adv_suffix_string, generated_output_string, "", "")

    # The with-statement closes the file; no explicit close() is needed.
    with open(output_file, 'a', encoding="utf-8") as f:
        f.write(json.dumps(behavior.to_dict()) + '\n')
    ##########################################################################################


## Additional Code (archived, commented-out cells)

In [10]:
# DO NOT Run this Block
# def get_nonascii_toks(tokenizer):
#     def is_ascii(s):
#         return s.isascii() and s.isprintable()
#     non_ascii_toks = []
#     for i in range(3, tokenizer.vocab_size):
#         if not is_ascii(tokenizer.decode([i])):
#             non_ascii_toks.append(i)
    
#     if tokenizer.bos_token_id is not None:
#         non_ascii_toks.append(tokenizer.bos_token_id)
#     if tokenizer.eos_token_id is not None:
#         non_ascii_toks.append(tokenizer.eos_token_id)
#     if tokenizer.pad_token_id is not None:
#         non_ascii_toks.append(tokenizer.pad_token_id)
#     if tokenizer.unk_token_id is not None:
#         non_ascii_toks.append(tokenizer.unk_token_id)
    
#     return torch.tensor(non_ascii_toks).to(device)

# # # Test method
# # non_ascii_toks = get_nonascii_toks(tokenizer)
# # print(non_ascii_toks.tolist())


# def get_only_ASCII_one_hot_adv():

#     non_ascii_toks = get_nonascii_toks(tokenizer).tolist()
#     no_of_ascii_toks = tokenizer.vocab_size - len(non_ascii_toks)
#     # print(len(non_ascii_toks))
#     # print(tokenizer.vocab_size)
#     # print(no_of_ascii_toks)
#     # for tok_id in non_ascii_toks:
#     #     print(tokenizer.decode(tok_id), end=' ')
#     # Assuming device is already defined
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#     # Step 1: Create the 20x32000 tensor randomly initialized and Initialize weights at the center of the simplex
#     one_hot_adv = torch.rand(20, 32000, dtype=torch.float16).to(device=device)
#     # dims = one_hot_adv.size(-1)
#     one_hot_adv.data.fill_(1 / no_of_ascii_toks)
#     # Step 2: Assuming token_ids_init is a tensor of shape (20, k) where k is the number of indices per row
#     # top_token_ids_tensor = select_topk_adv_token_ids(input_files, input_directory)
#     # print('size:', top_token_ids_tensor.size())
#     # print('token_ids:', top_token_ids_tensor)
#     # Repeat the tensor 20 times along a new dimension and then reshape
#     non_ascii_toks_tensor = torch.tensor(non_ascii_toks).to(device=device)
#     top_token_ids_tensor_2d = non_ascii_toks_tensor.unsqueeze(0).repeat(20, 1)
    
#     # Step 3: Create a mask with the same shape as one_hot_adv, initialized to zero
#     mask = torch.ones_like(one_hot_adv, dtype=torch.float16)

#     # Step 4: Use token_ids_init to set the corresponding indices in the mask to 1
#     # We use gather and scatter operations for this
#     mask.scatter_(1, top_token_ids_tensor_2d, 0.0)

#     # Step 5: Apply the mask to one_hot_adv
#     one_hot_adv = one_hot_adv * mask
    
#     return one_hot_adv


# def get_all_exclamations_one_hot_adv(embed_weights):
#     adv_string_init: str = "! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !"
#     adv_string_tokens = get_tokens(adv_string_init)[1:]
#     one_hot_adv, _ = create_one_hot_and_embeddings(adv_string_tokens, embed_weights)
#     return one_hot_adv

# # # Example Usage
# # one_hot_adv = select_only_ASCII_token_ids()
# # print(one_hot_adv)

# # sum = torch.sum(one_hot_adv, dim=1)
# # max = torch.max(one_hot_adv, dim=1)
# # min = torch.min(one_hot_adv, dim=1)
# # print(sum,',', max, ',', min)

# def count_common_words(str1, str2):
#     # Split the strings into words
#     words1 = set(str1.split())
#     words2 = set(str2.split())

#     # Find the intersection of the two sets to get the common words
#     common_words = words1.intersection(words2)

#     # Return the number of common words
#     return len(common_words)

In [11]:
    # # Update the JSON File
    # with open(json_file_path, "w") as f:
    #     json.dump(optimization_results, f, indent=4)

    ######## Continuous tokens for Output Generation ########    
    # embeddings_adv_projected = project_to_nearest_embeddings(embeddings_adv.squeeze(0), embed_weights)
    # Project the relaxed adversarial embeddings to their nearest neighbors in the original embedding space using Euclidean distance
    # embeddings_adv_projected, min_distances = project_to_nearest_embeddings_euclidean(embeddings_adv.squeeze(0), embed_weights)

    # Use the projected embeddings for generation
    # Make sure to unsqueeze the dimension to match the expected input shape for model.generate
    # final_prompt_embeds = torch.hstack([embeddings_user, embeddings_adv_projected.unsqueeze(0)])

    # Convert min_distances tensor to a list of numerical values and print
    # min_distances_list = min_distances.cpu().tolist()
    # print("Minimum Euclidean distances for each embedding:", min_distances_list)
    # Calculate and print the mean of all the Euclidean distances
    # mean_distance = sum(min_distances_list) / len(min_distances_list)
    # print("Mean of Euclidean distances:", mean_distance)
    
    # # Use the projected embeddings for generation
    # # Make sure to unsqueeze the dimension to match the expected input shape for model.generate
    # final_prompt_embeds = torch.hstack([embeddings_user, embeddings_adv_projected.unsqueeze(0)])
    # final_prompt_embeds = torch.hstack([embeddings_user, embeddings_adv])

In [12]:
# # import torch
# class ExpGDOptimizer(torch.optim.Optimizer):
#     def __init__(self, params, lr=0.01):
#         defaults = dict(lr=lr)
#         super(ExpGDOptimizer, self).__init__(params, defaults)

#     def step(self, closure=None):
#         loss = None
#         if closure is not None:
#             loss = closure()

#         for group in self.param_groups:
#             for param in group['params']:
#                 if param.grad is None:
#                     continue
                
#                 # Get the gradient and learning rate
#                 grad = param.grad
#                 lr = group['lr']
                
#                 # Exponentiated Gradient Descent update
#                 # with torch.no_grad():
#                 #     # one_hot_adv.data *= torch.exp(-eta * grads)
#                 #     one_hot_adv.data *= torch.exp(-step_size * grads)
#                 #     one_hot_adv.data /= one_hot_adv.data.sum(dim=1, keepdim=True)
#                 # Apply the EGD update
#                 with torch.no_grad():
#                     param.mul_(torch.exp(-lr * grad))  # Exponentiated Gradient update
#                     # Normalize each row to sum up to 1
#                     row_sums = param.sum(dim=1, keepdim=True) + 1e-10  # Add epsilon to avoid division by zero
#                     param.div_(row_sums)
#         return loss

# # # Example usage with a neural network
# # model = torch.nn.Linear(10, 1)  # A simple linear model
# # optimizer = EGDOptimizer(model.parameters(), lr=0.01)

# # # Training loop
# # for data, target in dataloader:
# #     optimizer.zero_grad()
# #     output = model(data)
# #     loss = loss_fn(output, target)
# #     loss.backward()
# #     optimizer.step()


In [13]:
# class ExpGDOptimizer(torch.optim.Optimizer):
#     def __init__(self, params, lr=0.01):
#         defaults = dict(lr=lr)
#         super(ExpGDOptimizer, self).__init__(params, defaults)

#     def step(self, closure=None):
#         loss = None
#         if closure is not None:
#             loss = closure()

#         for group in self.param_groups:
#             for param in group['params']:
#                 if param.grad is None:
#                     continue
                
#                 # Get the gradient and learning rate
#                 grad = param.grad
#                 lr = group['lr']
                
#                 with torch.no_grad():
#                     # Clip gradients to avoid exploding updates
#                     grad = torch.clamp(grad, min=-10, max=10)
#                     # Exponentiated Gradient Descent update
#                     param.mul_(torch.exp(-lr * grad))
#                     # Clamp the parameter values to avoid extreme values
#                     param.clamp_(min=1e-6, max=1e6)
#                     # Normalize each row to sum up to 1
#                     row_sums = param.sum(dim=1, keepdim=True) + 1e-10  # Add epsilon to avoid division by zero
#                     param.div_(row_sums)

#         return loss


In [14]:
# # File paths to the .pt files

# for row in harmful_behaviors:
#     iteration_result = {}
#     user_prompt, _ = row
#     print(f'\n\n{user_prompt}')
#     print('________________________________________________________________________________')
#     tensor_file_unsafe = './TORCH_Files/EGD_w_Entropy_Unsafe_1_hot_adv_'+last_three_words(user_prompt)+'_'+first_three_words(unsafe_target)+'.pt'
#     tensor_file_safe = './TORCH_Files/EGD_w_Entropy_Safe_1_hot_adv_'+last_three_words(user_prompt)+'_'+first_three_words(safe_target)+'.pt'
#     tensor_filenames = [tensor_file_unsafe, tensor_file_safe]
#     loaded_tensors = []
#     skip: bool=False
#     for filename in tensor_filenames:
#         if os.path.exists(filename):
#             loaded_tensors.append(torch.load(filename))
#         else:
#             print(f"File {filename} does not exist. Skipping...")
#             skip=True
#     if skip==True: # Go to the next iteration
#         continue
#     # Load the tensors from the .pt files
#     # loaded_tensors = [torch.load(filename) for filename in tensor_filenames]

#     # Now, loaded_tensors is a list containing the loaded tensors
#     tensor1 = loaded_tensors[0]
#     tensor2 = loaded_tensors[1]

#     # Move tensors to CPU and convert to float32 if necessary
#     tensor1_cpu = tensor1.cpu().float()
#     tensor2_cpu = tensor2.cpu().float()

#     # Compute cosine similarity
#     cosine_similarity = F.cosine_similarity(tensor1_cpu, tensor2_cpu, dim=1)

#     # Compute L2 distance using torch.cdist
#     l2_distance = torch.cdist(tensor1_cpu.unsqueeze(0), tensor2_cpu.unsqueeze(0), p=2).squeeze(0)

#     # Print results
#     print("Cosine Similarity:")
#     print(cosine_similarity)
#     print("\nL2-norm Distance size")
#     print(l2_distance.shape)

#     # If you only need the distance between corresponding rows
#     diagonal_l2_distance = l2_distance.diag()
#     print("L2-norm Distance between corresponding rows:")
#     print(diagonal_l2_distance)

#     # Compute L1 distance
#     # Compute the L1-norm distance
#     l1_distance = torch.cdist(tensor1_cpu.unsqueeze(0), tensor2_cpu.unsqueeze(0), p=1).squeeze(0)
#     # torch.abs(tensor1 - tensor2).sum()
#     print("\nL1-norm distance size:\n", l1_distance.shape)
#     print("L1-norm Distance between corresponding rows:")
#     print(l1_distance.diag())

In [15]:
# # Save the tensor as a CSV file
# def save_tensor_to_csv(tensor, filename):
#     # Convert the tensor to a NumPy array
#     numpy_array = tensor.cpu().numpy()
#     # Convert the NumPy array to a DataFrame
#     df = pd.DataFrame(numpy_array)
#     # Save the DataFrame to a CSV file
#     df.to_csv(filename, index=False)



# csv_file_unsafe = './outputs/Unsafe_1_hot_adv_'+first_three_words(unsafe_target)+'.csv'
# csv_file_safe = './outputs/Safe_1_hot_adv_'+first_three_words(safe_target)+'.csv'
# save_tensor_to_csv(tensor1_cpu, csv_file_unsafe)
# save_tensor_to_csv(tensor2_cpu, csv_file_safe)

In [16]:
# # Compute cosine_similarity and l2-distance

# # Move tensors to CPU if necessary
# tensor1_cpu = one_hot_adv_list[0].cpu()
# tensor2_cpu = one_hot_adv_list[1].cpu()

# # Compute cosine similarity
# cosine_similarity = F.cosine_similarity(tensor1_cpu, tensor2_cpu, dim=1)

# # Compute L2 distance using torch.cdist
# # Move tensors to CPU and convert to float32 if necessary
# tensor1_cpu = tensor1_cpu.float()
# tensor2_cpu = tensor2_cpu.float()
# l2_distance = torch.cdist(tensor1_cpu, tensor2_cpu, p=2)
# diagonal_l2_distance = l2_distance.diag()
# # Print results
# print("Cosine Similarity:")
# print(cosine_similarity)
# print("\nL2-norm Distance:")
# print(diagonal_l2_distance)

In [17]:
# # After it's done, compute the similarity and distance between the one_hot_adv_Safe and one_hot_adv_Unsafe
# print(len(one_hot_adv_list))
# # Use the Last embeddings_adv
# for one_hot_adv in one_hot_adv_list:
#     embeddings_adv = (one_hot_adv @ embed_weights).unsqueeze(0)
#     final_prompt_embeds = torch.hstack([embeddings_user, embeddings_adv])
#     outputs = []
#     generation_loop = range(10)
#     for i in tqdm(generation_loop, desc="Generating outputs", leave=False):
#         generated_output =  model.generate(inputs_embeds=final_prompt_embeds, max_length=num_tokens).squeeze()
#         generated_output_string = tokenizer.decode(generated_output, skip_special_tokens=True)
#         print(f"#{i}\n",generated_output_string)
#         outputs.append(generated_output_string)