IMPORTS and GLOBALS

In [None]:
import torch
import torch.nn as nn
import math
from tqdm.notebook import tqdm
import numpy as np
import os
from helper import get_model_path, load_model_and_tokenizer, get_embedding_matrix, get_tokens, create_one_hot_and_embeddings, get_nonascii_toks
import json
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
from typing import List, Tuple
# from datetime import datetime
# from livelossplot import PlotLosses 

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Passed as `max_length` to `model.generate` in the output-generation cells.
num_tokens: int = 300 
# Handed to `torch.manual_seed` before the model is loaded.
seed: int = 42

In [None]:
# Model families that can be selected; the chosen name is passed to
# `get_model_path` to obtain the path that is loaded in the next cell.
model_options = [
    "Llama2",   # 0
    "Falcon",   # 1
    "MPT",      # 2
    "Vicuna",   # 3
    "Mistral",  # 4
]
# Change the index below to load a different model.
model_name = model_options[0]
model_path: str = get_model_path(model_name=model_name)
print(model_path)

In [None]:
# Seed torch's RNG (when a seed is configured) before anything random runs.
if seed is not None:
    torch.manual_seed(seed)

# Load the model and its tokenizer from `model_path` onto `device`.
model, tokenizer = load_model_and_tokenizer(
    model_path,
    low_cpu_mem_usage=True,
    use_cache=False,
    device=device,
)

In [None]:
# Token ids reported by `get_nonascii_toks`, kept both as a plain Python list
# and as a tensor on `device` (the tensor is what the masking helper reads).
non_ascii_toks = get_nonascii_toks(tokenizer, device=device).tolist()
non_ascii_toks_tensor = torch.tensor(non_ascii_toks, device=device)


def get_masked_one_hot_adv(one_hot_adv):
    """Return ``one_hot_adv`` with every non-ASCII token column zeroed out.

    The token ids to suppress are read from the module-level
    ``non_ascii_toks_tensor``.  The input tensor itself is not modified; the
    result is intended for discretisation (picking the arg-max token per row).

    Args:
        one_hot_adv: 2-D tensor whose dim 1 is indexed by token id.  Any
            number of rows is accepted (the row count used to be hard-coded
            to 20, which made ``scatter_`` fail for other suffix lengths).

    Returns:
        ``one_hot_adv * mask`` where ``mask`` is a float16 tensor of ones with
        zeros in the non-ASCII token columns.
    """
    # One copy of the banned token ids per row, as scatter_ expects an index
    # tensor with the same number of rows as the tensor it writes into.
    num_rows = one_hot_adv.shape[0]
    banned_ids = non_ascii_toks_tensor.unsqueeze(0).repeat(num_rows, 1)

    # Start from an all-ones mask and knock out the banned columns.
    mask = torch.ones_like(one_hot_adv, dtype=torch.float16)
    mask.scatter_(1, banned_ids, 0.0)

    return one_hot_adv * mask

In [None]:
def calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, targets):
    """Run the model on prompt + suffix + target embeddings and score the target.

    Args:
        model: callable accepting ``inputs_embeds=`` and returning an object
            with a ``.logits`` attribute.
        embeddings_user: embeddings of the user prompt, concatenated first.
        embeddings_adv: embeddings of the adversarial suffix.
        embeddings_target: embeddings of the target continuation.
        targets: cross-entropy targets for the target positions.

    Returns:
        ``(loss, logits_tail)``: the mean cross-entropy over the target
        positions of the first batch element, and the logits from the last
        suffix position to the end of the sequence.
    """
    prefix_len = embeddings_user.shape[1] + embeddings_adv.shape[1]
    model_input = torch.cat((embeddings_user, embeddings_adv, embeddings_target), dim=1)
    logits = model(inputs_embeds=model_input).logits

    # The slice is shifted left by one position so that each logit row lines
    # up with the target token that follows it.
    first = prefix_len - 1
    loss = nn.functional.cross_entropy(logits[0, first:-1, :], targets)
    return loss, logits[:, first:, :]

In [None]:
# class EGDwithAdamOptimizer(torch.optim.Optimizer):
#     def __init__(self, params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-4, grad_clip_val=1.0):
#         defaults = dict(lr=lr, beta1=beta1, beta2=beta2, eps=eps, grad_clip_val=grad_clip_val)
#         super(EGDwithAdamOptimizer, self).__init__(params, defaults)

#         # Initialize state variables for each parameter group
#         for group in self.param_groups:
#             for param in group['params']:
#                 self.state[param] = {
#                     'm': torch.zeros_like(param),  # First moment vector (momentum)
#                     'v': torch.zeros_like(param),  # Second moment vector (variance)
#                     't': 0                         # Time step
#                 }

#     def step(self, closure=None):
#         loss = None
#         if closure is not None:
#             loss = closure()

#         for group in self.param_groups:
#             lr = group['lr']
#             beta1 = group['beta1']
#             beta2 = group['beta2']
#             eps = group['eps']
#             # grad_clip_val = group['grad_clip_val']

#             for param in group['params']:
#                 if param.grad is None:
#                     continue

#                 grad = param.grad
#                 state = self.state[param]

#                 # Retrieve state variables
#                 m = state['m']
#                 v = state['v']
#                 t = state['t']

#                 # Increment time step
#                 t += 1
#                 state['t'] = t

#                 # Update biased first moment estimate (m) and second moment estimate (v)     
#                 m = beta1 * m + (1 - beta1) * grad
#                 v = beta2 * v + (1 - beta2) * torch.pow(grad, 2) 
#                 # v = beta2 * v + (1 - beta2) * (grad ** 2)
#                 state['m'] = m
#                 state['v'] = v

#                 # Bias correction
#                 m_hat = m / (1 - math.pow(beta1, t)) # m_hat = m / (1 - beta1 ** t)
#                 v_hat = v / (1 - math.pow(beta2, t)) # v_hat = v / (1 - beta2 ** t)

#                 # Adam-like modified gradient
#                 modified_grad = m_hat / (torch.sqrt(v_hat) + eps)

#                 with torch.no_grad():
#                     # Exponentiated Gradient Descent update with Adam-like modifications
#                     param.mul_(torch.exp(-lr * modified_grad))

#                     # # Clamp the parameter values to avoid extreme values; ChatGPT suggests not to do this
#                     # param.clamp_(min=1e-12, max=1e12)

#                     # Normalize each row to sum up to 1, considering possible tensor shapes
#                     if param.dim() > 1:
#                         row_sums = param.sum(dim=1, keepdim=True) + 1e-10  # Add epsilon to avoid division by zero
#                         param.div_(row_sums)
#                     else:
#                         # Handle 1D tensors or other cases if applicable
#                         param.div_(param.sum() + 1e-10)

#         return loss


In [None]:
class EGDwithAdamOptimizer(torch.optim.Optimizer):
    """Exponentiated gradient descent driven by Adam-style moment estimates.

    Every step keeps running first/second moments of the gradient, forms the
    bias-corrected Adam direction ``m_hat / (sqrt(v_hat) + eps)`` and applies
    it multiplicatively: ``param *= exp(-lr * direction)``.  The parameter is
    then clamped to ``[1e-12, 1e12]`` and renormalised so that each row (or
    the whole tensor when it is 1-D) sums to one.

    Args:
        params: iterable of parameters or parameter-group dicts.
        lr: step size used inside the exponential.
        beta1: decay rate of the first-moment estimate.
        beta2: decay rate of the second-moment estimate.
        eps: added to ``sqrt(v_hat)`` in the denominator.
        grad_clip_val: stored in the group defaults for backward
            compatibility; ``step`` does not use it.

    NOTE(review): the clamp bounds 1e-12 / 1e12 and the 1e-10 added to the
    row sums are not representable in float16 -- confirm the intended
    behaviour when the optimised tensor is half precision.
    """

    def __init__(self, params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-4, grad_clip_val=1.0):
        defaults = dict(lr=lr, beta1=beta1, beta2=beta2, eps=eps, grad_clip_val=grad_clip_val)
        super().__init__(params, defaults)

        # Eagerly create the state of every parameter known at construction.
        for group in self.param_groups:
            for param in group['params']:
                self._init_state(param)

    def _init_state(self, param):
        """Create zeroed moment buffers and a step counter for ``param``."""
        state = self.state[param]
        state['m'] = torch.zeros_like(param)  # First moment vector (momentum)
        state['v'] = torch.zeros_like(param)  # Second moment vector (variance)
        state['t'] = 0                        # Time step
        return state

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one optimisation step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss; it is run with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` without one.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            eps = group['eps']

            for param in group['params']:
                if param.grad is None:
                    continue

                grad = param.grad
                state = self.state[param]
                # Parameters registered after construction (add_param_group)
                # have no buffers yet; create them on first use.
                if 'm' not in state:
                    state = self._init_state(param)

                state['t'] += 1
                t = state['t']

                # Update biased first and second moment estimates.
                m = beta1 * state['m'] + (1 - beta1) * grad
                v = beta2 * state['v'] + (1 - beta2) * torch.pow(grad, 2)
                state['m'] = m
                state['v'] = v

                # Bias correction
                m_hat = m / (1 - math.pow(beta1, t))
                v_hat = v / (1 - math.pow(beta2, t))

                # Adam-like modified gradient
                modified_grad = m_hat / (torch.sqrt(v_hat) + eps)

                # Exponentiated (multiplicative) update.
                param.mul_(torch.exp(-lr * modified_grad))

                # Clamp the parameter values to avoid extreme values
                param.clamp_(min=1e-12, max=1e12)

                # Renormalise so that every row sums to one; the small
                # constant guards against division by zero.
                if param.dim() > 1:
                    row_sums = param.sum(dim=1, keepdim=True) + 1e-10
                    param.div_(row_sums)
                else:
                    param.div_(param.sum() + 1e-10)

        return loss


BEGIN ATTACK HERE

In [None]:
import csv
embed_weights = get_embedding_matrix(model)
# Dataset options:       0           1               2                   3
dataset_options = ['AdvBench', 'HarmBench', 'JailbreakBench', 'MaliciousInstruct']
dataset_name = dataset_options[0]
dataset_path: str = f"./data/{dataset_name}/harmful_behaviors.csv"

# Read the behaviours inside a context manager so the file handle is closed
# again; newline='' is what the csv module expects for files it parses.
with open(dataset_path, 'r', newline='') as dataset_file:
    reader = csv.reader(dataset_file)
    next(reader)  # skip the header row
    # Keep only the first 50 rows; later cells unpack each row as
    # (user_prompt, target).
    harmful_behaviors = list(reader)[0:50]
# We can hard-code this safe_target string; probably
# safe_target: str = "I cannot fulfill your request. I'm just an AI."
print(len(harmful_behaviors))
for each_behavior in harmful_behaviors:
    print(each_behavior)

In [None]:
# Echo the active configuration so that it is recorded with the cell output.
print(f'model_name: {model_name}')
print(f'dataset_name: {dataset_name}')
print(f'vocabulary size: {tokenizer.vocab_size}')

In [None]:
# Mini-batch optimisation of one shared adversarial suffix with EGD: setup.
from math import *
import json 
import pandas as pd
import torch.nn.functional as F

# ---- Hyper-parameters ----
batch_size = 5    # behaviours per optimizer step (multi-prompt optimisation)
num_steps = 200   # number of passes over the behaviours
step_size = 0.1   # initial learning rate of the optimizer
# The regularisation weight is annealed between these two values.
initial_coefficient = 1e-5
final_coefficient = 1e-3

# Fetch the model's embedding matrix
embed_weights = get_embedding_matrix(model)
# Filled by the output-generation cell
optimization_results = []

# ---- Output locations ----
directories = [
    f'./Multi-Prompt/CSV_Files/{model_name}/{dataset_name}',    # 0: per-epoch statistics
    f'./Multi-Prompt/JSON_Files/{model_name}/{dataset_name}',   # 1: generated outputs
]
for directory in directories:
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
    else:
        os.makedirs(directory)
        print(f"Directory '{directory}' created.")

json_filename = f'{directories[1]}/outputs_multiprompt_using_EGD({str(len(harmful_behaviors))}_behaviors)({str(num_steps)}_steps)({str(batch_size)}_batchsize).json'
csv_filename = f'{directories[0]}/stats_multiprompt_using_EGD({str(len(harmful_behaviors))}_behaviors)({str(num_steps)}_steps)({str(batch_size)}_batchsize).csv'        

# ---- Tokenise every (user_prompt, target) row once ----
# The first target token is dropped by the [1:] slice.
token_pairs = [
    (
        get_tokens(user_prompt, tokenizer=tokenizer, device=device),
        get_tokens(target, tokenizer=tokenizer, device=device)[1:],
    )
    for user_prompt, target in harmful_behaviors
]
user_prompt_tokens_list = [pair[0] for pair in token_pairs]
target_tokens_list = [pair[1] for pair in token_pairs]

# ---- Relaxed suffix: 20 rows, each a softmax over the vocabulary ----
suffix_init = torch.rand(20, embed_weights.shape[0], dtype=torch.float16).to(device=device)
one_hot_adv = F.softmax(suffix_init, dim=1).to(embed_weights.dtype) # type: ignore
one_hot_adv.requires_grad_() 
effective_adv_one_hot = one_hot_adv.clone()
# The masked copy is used ONLY for discretization
max_values = torch.max(get_masked_one_hot_adv(one_hot_adv), dim=1)
adv_token_ids = max_values.indices 

# ---- Optimizer and learning-rate scheduler ----
optimizer = EGDwithAdamOptimizer([one_hot_adv], lr=step_size)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)

# ---- Statistics table: 6 scalar columns, then max_1..max_20 and token_id_1..token_id_20 ----
column_names = ['epoch', 'learning_rate', 
                'entropy_term', 'kl_divergence_term', 
                'continuous_loss','discrete_loss'] 
column_names += [f'max_{i}' for i in range(1, 21)]
column_names += [f'token_id_{i}' for i in range(1, 21)]
df = pd.DataFrame(columns=column_names)

# Best mean discrete loss seen so far and the epoch it occurred in
best_discrete_loss = np.inf
best_discrete_loss_epoch = 0


# Mini-batch training loop: one pass over all behaviours per epoch, one
# optimizer step per batch, one CSV row of statistics per epoch.
for epoch_no in tqdm(range(num_steps)):
    # Visit the behaviours in a new random order every epoch.
    indices = np.random.permutation(len(harmful_behaviors))

    # Exponentially anneal the regularisation weight from initial_coefficient
    # to final_coefficient.  max(..., 1) keeps the exponent defined when
    # num_steps == 1 (the plain division raised ZeroDivisionError there).
    anneal_progress = epoch_no / max(num_steps - 1, 1)
    reg_coefficient = initial_coefficient * (final_coefficient / initial_coefficient) ** anneal_progress
    eps = 1e-12  # added inside log() to avoid log(0)

    # Track per-epoch metrics
    epoch_continuous_losses = []
    epoch_discrete_losses = []
    epoch_entropy_terms = []
    epoch_kl_terms = []

    for batch_start in range(0, len(indices), batch_size):
        optimizer.zero_grad()

        batch_ids = indices[batch_start:batch_start+batch_size]

        # Entropy and KL regularisers depend only on one_hot_adv, so they are
        # computed once per batch and shared by every sample in it.
        log_one_hot = torch.log(one_hot_adv.to(dtype=torch.float32) + eps)
        entropy_term = -one_hot_adv * (log_one_hot - 1)
        entropy_term = entropy_term.sum() * reg_coefficient

        kl_divergence_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum()
        kl_divergence_term *= reg_coefficient

        # Record for epoch average
        epoch_entropy_terms.append(entropy_term.item())
        epoch_kl_terms.append(kl_divergence_term.item())

        # Discretise the current suffix.  This is evaluation only, so no
        # autograd graph is needed.  `max_values` is refreshed here so that
        # the max_* CSV columns describe the same suffix as the token_id_*
        # columns (previously they kept the values from before training).
        with torch.no_grad():
            masked_one_hot_adv = get_masked_one_hot_adv(one_hot_adv)
            max_values = torch.max(masked_one_hot_adv, dim=1)
            adv_token_ids = max_values.indices
            one_hot_discrete = F.one_hot(adv_token_ids, num_classes=embed_weights.shape[0]).to(embed_weights.dtype)
            emb_adv_discrete = (one_hot_discrete @ embed_weights).unsqueeze(0)

        total_loss = 0.0

        for idx in batch_ids:
            user_tokens = user_prompt_tokens_list[idx]
            target_tokens = target_tokens_list[idx]

            _, emb_user = create_one_hot_and_embeddings(user_tokens, embed_weights, device=device)
            # A single call yields both the one-hot targets and their embeddings.
            one_hot_target, emb_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device=device)

            emb_adv = (one_hot_adv @ embed_weights).unsqueeze(0)

            ce_loss, _ = calc_loss(model, emb_user, emb_adv, emb_target, one_hot_target)
            # The discrete loss is only logged, never back-propagated.
            with torch.no_grad():
                disc_loss, _ = calc_loss(model, emb_user, emb_adv_discrete, emb_target, one_hot_target)

            # Regularized loss: cross-entropy - entropy term + KL term
            regularized_loss = ce_loss - entropy_term + kl_divergence_term
            total_loss += regularized_loss

            epoch_continuous_losses.append(ce_loss.item())
            epoch_discrete_losses.append(disc_loss.item())

        # Average over the batch, back-propagate, clip, then update.
        total_loss/=len(batch_ids)
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0)
        optimizer.step()

    mean_continuous_loss = np.mean(epoch_continuous_losses)
    mean_discrete_loss = np.mean(epoch_discrete_losses)
    mean_entropy = np.mean(epoch_entropy_terms)
    mean_kl = np.mean(epoch_kl_terms)

    # NOTE(review): mean_discrete_loss averages over every batch of the epoch
    # (each evaluated with that batch's discretisation), while
    # one_hot_discrete is the suffix from the last batch only -- confirm this
    # pairing is intended.
    if mean_discrete_loss < best_discrete_loss:
        best_discrete_loss = mean_discrete_loss
        best_discrete_loss_epoch = epoch_no
        effective_adv_one_hot = one_hot_discrete

    scheduler.step(mean_continuous_loss)

    # Dump output values to a CSV row
    max_values_array = max_values.values.detach().cpu().numpy()
    token_ids_array = adv_token_ids.detach().cpu().numpy()
    # Current learning rate, read from the optimizer's first parameter group
    scheduler_lr = optimizer.param_groups[0]['lr']
    prepend_array = np.array([epoch_no, scheduler_lr,
                                mean_entropy, mean_kl,
                                mean_continuous_loss, mean_discrete_loss]) 
    
    # Concatenate the arrays in the column order of df
    row = np.concatenate((prepend_array, max_values_array, token_ids_array))
    new_row = pd.Series(row, index=df.columns)
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
    
    # Save every 10 epochs
    if (epoch_no + 1) % 10 == 0:
        df.to_csv(csv_filename, index=False)

# End of optimizations
# Write to CSV file
df.to_csv(csv_filename, index=False)


In [None]:
######### Token and Output Generation Step #########
# Appends the best discrete suffix to every user prompt, generates a greedy
# completion and records the result in a JSON file (rewritten after every
# behaviour) and a JSONL file (one appended line per behaviour).
import warnings
from behavior import Behavior

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")

# JSONL file (required for running the parse_results.ipynb script)
output_file = f'{directories[1]}/outputs_multiprompt_using_EGD({str(len(harmful_behaviors))}_behaviors)({str(num_steps)}_steps)({str(batch_size)}_batchsize).jsonl'

# The suffix is the same for every behaviour, so it is decoded once.
adv_token_ids = effective_adv_one_hot.argmax(dim=1)
adv_suffix_list = str(adv_token_ids.cpu().numpy().tolist())
adv_suffix_string = tokenizer.decode(adv_token_ids.cpu().numpy())

# `total` lets tqdm show real progress; enumerate() alone has no length.
for i, row in tqdm(enumerate(harmful_behaviors), total=len(harmful_behaviors),
                   desc="Generating outputs", leave=False):
    user_prompt, target = row
    user_prompt_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)
    # Only the one-hot encoding of the prompt is needed to recover its ids.
    one_hot_inputs, _ = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device=device)
    inputs_token_ids = one_hot_inputs.argmax(dim=1)
    final_string_ids = torch.hstack([inputs_token_ids, adv_token_ids])

    ######## Discrete tokens for Output Generation ########
    generated_output = model.generate(final_string_ids.unsqueeze(0), 
                                        max_length=num_tokens, 
                                        pad_token_id=tokenizer.pad_token_id,
                                        do_sample=False)
    generated_output_string = tokenizer.decode(generated_output[0][:].cpu().numpy(), skip_special_tokens=True).strip()

    iteration_result = {
        "harmful-behaviour": user_prompt,
        "target": target,
        "suffix_token_ids": adv_suffix_list,
        "suffix_string": adv_suffix_string,
        "best_disc_loss":best_discrete_loss,
        "epch_at_best_disc_loss": best_discrete_loss_epoch,
        "outputs": generated_output_string
    }
    optimization_results.append(iteration_result)

    # Update the JSON file.  ensure_ascii=False writes non-ASCII characters
    # verbatim, so the file encoding is pinned to UTF-8 instead of relying on
    # the platform default.
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(
            optimization_results,
            f,
            indent=4,  # Keep structure readable
            ensure_ascii=False
        )

    # Append one line per behaviour to the JSONL file; the `with` block closes
    # the file, so no explicit close() is needed.
    behavior = Behavior(user_prompt, adv_suffix_string, generated_output_string, "", "")
    with open(output_file, 'a') as f:
        f.write(json.dumps(behavior.to_dict()) + '\n')

## Additional Code

In [None]:
# Full-batch optimisation of the one_hot encodings (one update per epoch): setup.
from math import *
import json 
import pandas as pd
import torch.nn.functional as F

# ---- Hyper-parameters ----
num_steps = 200   # number of optimisation epochs
step_size = 0.1   # initial learning rate of the optimizer
# The regularisation weight is annealed between these two values.
initial_coefficient = 1e-5
final_coefficient = 1e-3

# Fetch the model's embedding matrix
embed_weights = get_embedding_matrix(model)
# Filled by the output-generation step
optimization_results = []

# ---- Output locations ----
directories = [
    f'./Multi-Prompt/CSV_Files/{model_name}/{dataset_name}',    # 0: per-epoch statistics
    f'./Multi-Prompt/JSON_Files/{model_name}/{dataset_name}',   # 1: generated outputs
]
for directory in directories:
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
    else:
        os.makedirs(directory)
        print(f"Directory '{directory}' created.")

json_filename = directories[1]+"/output_EGD_with_Adam_Optimizer("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).json"
csv_filename = directories[0]+'/stats_EGD_with_Adam_Optimizer('+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+'_steps).csv'        

# ---- Tokenise every (user_prompt, target) row once ----
# The first target token is dropped by the [1:] slice.
token_pairs = [
    (
        get_tokens(user_prompt, tokenizer=tokenizer, device=device),
        get_tokens(target, tokenizer=tokenizer, device=device)[1:],
    )
    for user_prompt, target in harmful_behaviors
]
user_prompt_tokens_list = [pair[0] for pair in token_pairs]
target_tokens_list = [pair[1] for pair in token_pairs]

# ---- Relaxed suffix: 20 rows, each a softmax over the vocabulary ----
suffix_init = torch.rand(20, embed_weights.shape[0], dtype=torch.float16).to(device=device)
one_hot_adv = F.softmax(suffix_init, dim=1).to(embed_weights.dtype) # type: ignore
one_hot_adv.requires_grad_() 
effective_adv_one_hot = one_hot_adv.clone()
# The masked copy is used ONLY for discretization
max_values = torch.max(get_masked_one_hot_adv(one_hot_adv), dim=1)
adv_token_ids = max_values.indices 

# ---- Optimizer and learning-rate scheduler ----
optimizer = EGDwithAdamOptimizer([one_hot_adv], lr=step_size)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)

# ---- Statistics table: 6 scalar columns, then max_1..max_20 and token_id_1..token_id_20 ----
column_names = ['epoch', 'learning_rate', 
                'entropy_term', 'kl_divergence_term', 
                'continuous_loss','discrete_loss'] 
column_names += [f'max_{i}' for i in range(1, 21)]
column_names += [f'token_id_{i}' for i in range(1, 21)]
df = pd.DataFrame(columns=column_names)

# Best mean discrete loss seen so far and the epoch it occurred in
best_discrete_loss = np.inf
best_discrete_loss_epoch = 0

# Full-batch training loop: every epoch accumulates the regularised loss over
# all behaviours, takes one optimizer step and logs one CSV row.
for epoch_no in tqdm(range(num_steps)):
    optimizer.zero_grad()
    continuous_losses = np.zeros(len(harmful_behaviors))
    discrete_losses = np.zeros(len(harmful_behaviors))
    total_loss = 0.0 
    # Calculate the annealed coefficient, 𝜖; Use Exponential Scheduling.
    # max(..., 1) keeps the exponent defined when num_steps == 1 (the plain
    # division raised ZeroDivisionError there).
    anneal_progress = epoch_no / max(num_steps - 1, 1)
    reg_coefficient = initial_coefficient * (final_coefficient / initial_coefficient) ** anneal_progress
    # Calculate H(X) = − ∑_{i=1}^{L} ∑_{j=1}^{|T|} X_{ij} (log X_{ij} −1)
    # Adding a small epsilon to avoid log(0) which is undefined
    eps = 1e-12
    one_hot_adv_float32 = one_hot_adv.to(dtype=torch.float32)
    log_one_hot = torch.log(one_hot_adv_float32 + eps)
    # Compute the modified entropy (element-wise terms, then summed)
    entropy_per_row = -one_hot_adv_float32 * (log_one_hot - 1)
    entropy_term = entropy_per_row.sum()
    entropy_term *= reg_coefficient 
    # Take the negative log of the highest probability in each row
    kl_divergence_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum()
    kl_divergence_term *=reg_coefficient

    # Discretization part; done once per epoch.  It is evaluation only, so it
    # runs without building an autograd graph, and the discrete suffix
    # embedding is computed once instead of once per behaviour.
    with torch.no_grad():
        masked_one_hot_adv = get_masked_one_hot_adv(one_hot_adv)
        # Use this masked_one_hot_adv ONLY for discretization
        max_values = torch.max(masked_one_hot_adv, dim=1)
        adv_token_ids = max_values.indices
        one_hot_discrete = torch.zeros(
            adv_token_ids.shape[0], embed_weights.shape[0], device=device, dtype=embed_weights.dtype # type: ignore
        )
        one_hot_discrete.scatter_(
            1,
            adv_token_ids.unsqueeze(1),
            torch.ones(masked_one_hot_adv.shape[0], 1, device=device, dtype=embed_weights.dtype), # type: ignore
        )
        embeddings_adv_discrete = (one_hot_discrete @ embed_weights).unsqueeze(0)
    
    for i, (user_prompt_tokens, target_tokens) in enumerate(zip(user_prompt_tokens_list, target_tokens_list)):
        # Get one_hot encodings and embedding vectors for the user prompts and targets 
        one_hot_inputs, embeddings_user = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device=device)
        one_hot_target, embeddings_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device=device)
        # Get the embedding vectors for the adversarial suffix
        embeddings_adv = (one_hot_adv @ embed_weights).unsqueeze(0)
        cross_entropy_loss, _ = calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, one_hot_target)
        # Regularized loss: F(X) - epsilon * H(X) + epsilon * KL-term
        regularized_loss = cross_entropy_loss - entropy_term + kl_divergence_term
        total_loss+=regularized_loss

        # Keep track of continuous loss
        continuous_losses[i] = cross_entropy_loss.detach().cpu().item()
        # Keep track of discrete loss; it is only logged, never back-propagated
        with torch.no_grad():
            discrete_loss, _ = calc_loss(model, embeddings_user, embeddings_adv_discrete, embeddings_target, one_hot_target)
        discrete_losses[i] = discrete_loss.detach().cpu().item()
    
    mean_continuous_loss = np.mean(continuous_losses)
    mean_discrete_loss = np.mean(discrete_losses)
    # If the discrete loss improves, remember this suffix as the best one
    if mean_discrete_loss < best_discrete_loss:
        best_discrete_loss = mean_discrete_loss
        best_discrete_loss_epoch = epoch_no
        effective_adv_one_hot = one_hot_discrete
    # Average total_loss before computing gradients
    total_loss/=len(harmful_behaviors)
    # Backpropagate the regularized loss
    total_loss.backward() # type: ignore
    # Clip the gradients (before update)
    torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0) # type: ignore
    # update parameters
    optimizer.step()
    # update scheduler
    scheduler.step(mean_continuous_loss)

    # Dump output values to a CSV row
    max_values_array = max_values.values.detach().cpu().numpy()
    token_ids_array = adv_token_ids.detach().cpu().numpy()
    # Current learning rate, read from the optimizer's first parameter group
    scheduler_lr = optimizer.param_groups[0]['lr']
    prepend_array = np.array([epoch_no, scheduler_lr,
                                entropy_term.detach().cpu().item(),
                                kl_divergence_term.detach().cpu().item(),
                                mean_continuous_loss, mean_discrete_loss]) 
    
    # Concatenate the arrays in the column order of df
    row = np.concatenate((prepend_array, max_values_array, token_ids_array))
    new_row = pd.Series(row, index=df.columns)
    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)

# End of optimizations
# Write to CSV file
df.to_csv(csv_filename, index=False)


######### Token and Output Generation Step #########
# Appends the best discrete suffix to every user prompt, generates a greedy
# completion and records the result in a JSON file (rewritten after every
# behaviour) and a JSONL file (one appended line per behaviour).
import warnings
from behavior import Behavior

# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")

# JSONL file (required for running the parse_results.ipynb script)
output_file = directories[1]+"/EGD_with_Adam_"+model_name+"("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).jsonl"

# The suffix is the same for every behaviour, so it is decoded once.
adv_token_ids = effective_adv_one_hot.argmax(dim=1)
adv_suffix_list = str(adv_token_ids.cpu().numpy().tolist())
adv_suffix_string = tokenizer.decode(adv_token_ids.cpu().numpy())

for i, row in enumerate(harmful_behaviors):
    user_prompt, target = row
    user_prompt_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)
    # Only the one-hot encoding of the prompt is needed to recover its ids.
    one_hot_inputs, _ = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device=device)
    inputs_token_ids = one_hot_inputs.argmax(dim=1)
    final_string_ids = torch.hstack([inputs_token_ids, adv_token_ids])
    outputs = []
    generation_loop = range(1)
    generated_output_string = ''
    ######## Discrete tokens for Output Generation ########
    for loop in tqdm(generation_loop, desc="Generating outputs", leave=False):
        generated_output = model.generate(final_string_ids.unsqueeze(0), 
                                            max_length=num_tokens, 
                                            pad_token_id=tokenizer.pad_token_id,
                                            do_sample=False)
        generated_output_string = tokenizer.decode(generated_output[0][:].cpu().numpy(), skip_special_tokens=True).strip()
        outputs.append(generated_output_string)

    iteration_result = {
        "harmful-behaviour": user_prompt,
        "target": target,
        "suffix_token_ids": adv_suffix_list,
        "suffix_string": adv_suffix_string,
        "best_disc_loss":best_discrete_loss,
        "epch_at_best_disc_loss": best_discrete_loss_epoch,
        "outputs": outputs
    }
    optimization_results.append(iteration_result)

    # Update the JSON file.  ensure_ascii=False writes non-ASCII characters
    # verbatim, so the file encoding is pinned to UTF-8 instead of relying on
    # the platform default.
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(
            optimization_results,
            f,
            indent=4,  # Keep structure readable
            ensure_ascii=False
        )

    # Append one line per behaviour to the JSONL file; the `with` block closes
    # the file, so no explicit close() is needed.
    behavior = Behavior(user_prompt, adv_suffix_string, generated_output_string, "", "")
    with open(output_file, 'a') as f:
        f.write(json.dumps(behavior.to_dict()) + '\n')

In [None]:
# # Create the following directories if they do NOT exist
# directories = [f'./Multi-Prompt/CSV_Files/{model_name}/{dataset_name}',    # 0
#             f'./Multi-Prompt/JSON_Files/{model_name}/{dataset_name}']      # 1
# for directory in directories:
#     # Create the directory if it doesn't exist
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#         print(f"Directory '{directory}' created.")
#     else:
#         print(f"Directory '{directory}' already exists.")


In [None]:
# import warnings
# # Suppress specific warnings
# warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
# warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")

# def universal_suffix_attack_fixed(
#     model,
#     tokenizer,
#     harmful_behaviors: List[Tuple[str, str]],
#     embed_weights: torch.Tensor,
#     device: str,
#     model_name: str,
#     dataset_name: str,
#     suffix_len: int = 20,
#     num_steps: int = 200,
#     step_size: float = 0.1,
#     eps: float = 1e-12,
#     initial_coefficient: float = 1e-5,
#     final_coefficient: float = 1e-3,
#     Optimizer = torch.optim.Adam
# ):
#     # CSV file to dump the optimization results 
#     csv_filename = directories[0]+'/stats_EGD_with_Adam('+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).csv"      
    
#     # Initialize one_hot_adv
#     one_hot_adv = F.softmax(torch.rand(suffix_len, embed_weights.shape[0], dtype=torch.float32, device=device), dim=1)
#     one_hot_adv.requires_grad_()
#     # Initialize some values
#     effective_adv_one_hot = torch.zeros_like(one_hot_adv)
#     max_values = torch.max(get_masked_one_hot_adv(effective_adv_one_hot), dim=1)
#     adv_token_ids = max_values.indices

#     # Optimizer and scheduler
#     optimizer = Optimizer([one_hot_adv], lr=step_size)
#     # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_steps)

#     # Generate column names
#     column_names = ['epoch', 'learning_rate', 
#                     'entropy_term', 'kl_divergence_term', 
#                     'continuous_loss','discrete_loss'] 
#     # Add the 'max_1' ... 'max_<suffix_len>' column names (20 columns with the default suffix_len)
#     column_names.extend([f'max_{i}' for i in range(1, suffix_len + 1)])
#     column_names.extend([f'token_id_{i}' for i in range(1, suffix_len+1)])
#     # Create an empty DataFrame
#     df = pd.DataFrame(columns=column_names)
#     # Keep Track of discrete losses
#     best_disc_loss = np.inf
#     best_loss_at_epoch = 0

#     # Main loop
#     for epoch_no in tqdm(range(num_steps), desc=f'Optimizing Universal Suffix — {model_name}, {dataset_name}'):
#         optimizer.zero_grad()
#         total_loss = None
#         continuous_losses = np.zeros(len(harmful_behaviors))
#         discrete_losses = np.zeros(len(harmful_behaviors))
#         accumulated_grads = torch.zeros_like(one_hot_adv)
#         # Calculate the regularization coefficient 𝜖, using Exponential Annealing
#         reg_coefficient = initial_coefficient * (final_coefficient / initial_coefficient) ** (epoch_no / (num_steps - 1))

#         # for i, (user_tokens, target_tokens) in enumerate(zip(user_prompt_tokens_list, target_tokens_list)):
#         for i, row in enumerate(harmful_behaviors):
#             # Get Tokens
#             user_prompt, target  = row[0], row[1]
#             user_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)
#             target_tokens = get_tokens(target, tokenizer=tokenizer, device=device)[1:]
#             # Get Embeddings and compute Cross-Entropy loss
#             _, embeddings_user = create_one_hot_and_embeddings(user_tokens, embed_weights, device=device)
#             one_hot_target, embeddings_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device=device)
#             embeddings_adv = (one_hot_adv.to(dtype=torch.float16) @ embed_weights).unsqueeze(0)
#             cross_entropy_loss, _ = calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, one_hot_target)
#             continuous_losses[i] = cross_entropy_loss.detach().cpu().item()

#             # Regularization terms (computed per behavior!)
#             # Compute Entropy
#             log_one_hot = torch.log(one_hot_adv + eps)
#             entropy_term = -(one_hot_adv * (log_one_hot - 1)).sum() * reg_coefficient
#             # Compute KL-divergence term
#             kl_divergence_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum() * reg_coefficient

#             # Final regularized loss
#             regularized_loss = cross_entropy_loss - entropy_term + kl_divergence_term
#             # Backprop
#             regularized_loss.backward(retain_graph=True)
#             accumulated_grads += one_hot_adv.grad.clone()
#             one_hot_adv.grad.zero_()

#             # # Accumulate total loss
#             # total_loss = regularized_loss if i==0 else total_loss+regularized_loss

#             # Discrete tracking
#             masked_one_hot_adv = get_masked_one_hot_adv(one_hot_adv)
#             max_values = torch.max(masked_one_hot_adv, dim=1)
#             adv_token_ids = max_values.indices

#             one_hot_discrete = torch.zeros_like(one_hot_adv)
#             one_hot_discrete.scatter_(1, adv_token_ids.unsqueeze(1), 1.0)

#             embeddings_adv_discrete = (one_hot_discrete.to(dtype=torch.float16) @ embed_weights).unsqueeze(0)
#             discrete_loss, _ = calc_loss(model, embeddings_user, embeddings_adv_discrete, embeddings_target, one_hot_target)
#             discrete_loss = discrete_loss.detach().cpu().item()
#             discrete_losses[i] = discrete_loss

#             # Track best discrete suffix
#             if discrete_loss < best_disc_loss:
#                 best_disc_loss = discrete_loss
#                 best_loss_at_epoch = epoch_no
#                 effective_adv_one_hot = one_hot_discrete.clone()

#         # Compute mean continuous and discrete losses
#         mean_continuous_loss = np.mean(continuous_losses)
#         mean_discrete_loss = np.mean(discrete_losses)

#         # Final backward + optimizer step
#         # total_loss /= len(harmful_behaviors) # type: ignore
#         # total_loss.backward()
#         # Apply mean gradient, no normalization
#         accumulated_grads /= len(harmful_behaviors)
#         one_hot_adv.grad = accumulated_grads
#         # Gradient clipping is not necessary for EGD
#         # torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0) # type: ignore
#         # update parameters
#         optimizer.step()
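#         # Update scheduler.
#         # NOTE(review): the scheduler above is CosineAnnealingLR, whose step() does not take a
#         # loss metric (only ReduceLROnPlateau does); call scheduler.step() with no argument
#         # if this cell is re-enabled.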
#         scheduler.step(mean_continuous_loss)
#         # Dump output values to a CSV file
#         # Convert max_values to a NumPy array
#         max_values_array = max_values.values.detach().cpu().numpy()
#         token_ids_array = adv_token_ids.detach().cpu().numpy()
#         # Get the scheduler's state and learning_rate
#         # Access the current learning rate directly from the optimizer's parameter group
#         scheduler_lr = optimizer.param_groups[0]['lr']  # Get the learning rate of the first parameter group
#         # Create the Initial array       
#         prepend_array = np.array([epoch_no, scheduler_lr,
#                                 entropy_term.detach().cpu().item(),  # type: ignore
#                                 kl_divergence_term.detach().cpu().item(), # type: ignore
#                                 mean_continuous_loss, mean_discrete_loss]) 
        
#         # Concatenate the arrays
#         row = np.concatenate((prepend_array, max_values_array, token_ids_array))
#         new_row = pd.Series(row, index=df.columns)
#         df = pd.concat([df, new_row.to_frame().T], ignore_index=True)    
#     # End for loop                             
#     # Write to CSV file
#     df.to_csv(csv_filename, index=False)
#     return effective_adv_one_hot, best_disc_loss, best_loss_at_epoch



In [None]:
# # Fetch the Model's weights
# embed_weights = get_embedding_matrix(model)
# num_steps = 200

# effective_adv_one_hot, best_disc_loss, best_loss_at_epoch = universal_suffix_attack_fixed(
#     model = model,
#     tokenizer = tokenizer,
#     harmful_behaviors= harmful_behaviors, # type: ignore
#     embed_weights= embed_weights, # type: ignore
#     device= device, # type: ignore
#     model_name= model_name,
#     dataset_name= dataset_name,
#     num_steps= num_steps,
#     Optimizer= EGDwithAdamOptimizer) # type: ignore

# # JSON file to dump the  outputs 
# json_file_path = directories[1]+"/output_EGD_with_Adam("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).json"

# optimization_results = list()
# # Generate the model's output using one_hot_adv
# for row in tqdm(harmful_behaviors, desc="Generating Outputs"):
#     iteration_result = {}
#     user_prompt, target = row
#     # print(user_prompt)
#     # target = unsafe_target
#     user_prompt_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)    
#     target_tokens = get_tokens(target, tokenizer=tokenizer, device=device)[1:]
#     # Initialize one_hot encodings and embedding vectors for the user prompts and targets 
#     one_hot_inputs, _ = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device=device)
#     # one_hot_target, embeddings_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device=device)
#     # Token and Output Generation Step
#     inputs_token_ids = one_hot_inputs.argmax(dim=1)
#     adv_token_ids = effective_adv_one_hot.argmax(dim=1)
#     adv_suffix_list = str(adv_token_ids.cpu().numpy().tolist())
#     adv_suffix_string = tokenizer.decode(adv_token_ids.cpu().numpy())
#     final_string_ids = torch.hstack([inputs_token_ids, adv_token_ids])
#     outputs = []
#     generation_loop = range(1)
    
#     ######## Discrete tokens for Output Generation ########
#     for loop in tqdm(generation_loop, desc="Generating outputs", leave=False):
#         generated_output = model.generate(final_string_ids.unsqueeze(0), 
#                                         max_length=num_tokens, 
#                                         pad_token_id=tokenizer.pad_token_id,
#                                         do_sample=False)
#         generated_output_string = tokenizer.decode(generated_output[0][:].cpu().numpy(), skip_special_tokens=True).strip()
#         outputs.append(generated_output_string)
    
#         iteration_result = {
#             "harmful-behaviour": user_prompt,
#             "target": target,
#             "suffix_token_ids": adv_suffix_list,
#             "suffix_string": adv_suffix_string,
#             # "epoch_no": epoch_no,
#             # "continuous_loss": continuous_loss,
#             # "discrete_loss": discrete_loss,
#             "best_disc_loss":best_disc_loss,
#             "epch_at_best_disc_loss": best_loss_at_epoch,
#             "outputs": outputs
#         }
#         optimization_results.append(iteration_result)

#         # Update the JSON File
#         with open(json_file_path, "w") as f:
#             json.dump(
#                 optimization_results,
#                 f,
#                 indent=4,  # Keep structure readable
#                 ensure_ascii=False  # Write any non-ASCII characters as-is instead of \uXXXX escapes
#             )

#         ##########################################################################################
#         ## Create and Update a JSONL file (required for running the parse_results.ipynb script) ##
#         # output_file = directories[1]+"/EGD_with_Adam_"+model_name+"("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).jsonl"
#         # from behavior import Behavior
#         # behavior = Behavior(user_prompt, adv_suffix_string, generated_output_string, "", "")
        
#         # with open(output_file, 'a') as f:
#         #     f.write(json.dumps(behavior.to_dict()) + '\n')
#         # f.close()  
#         ## Create and Update a JSONL file (required for running the parse_results.ipynb script) ##
#         ########################################################################################## 

In [None]:
# import warnings
# # Suppress specific warnings
# warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
# warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")

# def universal_suffix_attack(
#     model,
#     tokenizer,
#     harmful_behaviors: List[Tuple[str, str]],
#     embed_weights: torch.Tensor,
#     device: str,
#     model_name: str,
#     dataset_name: str,
#     suffix_len: int = 20,
#     num_steps: int = 200,
#     step_size: float = 1e-2,
#     initial_coefficient: float = 1e-5,
#     final_coefficient: float = 1e-3,
#     Optimizer = torch.optim.Adam  # Replace with your custom optimizer class if needed
# ):

#     user_prompt_tokens_list = list()
#     target_tokens_list = list()
#     for user_prompt, target in harmful_behaviors:
#         user_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)
#         target_tokens = get_tokens(target, tokenizer=tokenizer, device=device)[1:]  # skip BOS if needed
#         user_prompt_tokens_list.append(user_tokens)
#         target_tokens_list.append(target_tokens)

#     best_disc_loss = np.inf
#     best_loss_at_epoch = 0
#     # Random Initialization of the one_hot_adv 
#     one_hot_adv = F.softmax(torch.rand(20, embed_weights.shape[0], dtype=torch.float16, device=device), dim=1).to(embed_weights.dtype)
#     one_hot_adv.requires_grad_() 
#     # Initialize some values
#     effective_adv_one_hot = torch.zeros_like(one_hot_adv)
#     max_values = torch.max(effective_adv_one_hot, dim=1)
#     adv_token_ids = max_values.indices
#     # Initialize the optimizer
#     optimizer = Optimizer([one_hot_adv], lr=step_size)
#     # Initialize the learning_rate scheduler
#     scheduler_cycle = 0
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)

#     # Generate column names
#     column_names = ['epoch', 'cycle', 'learning_rate', 
#                     'entropy_term', 'kl_divergence_term', 
#                     'continuous_loss','discrete_loss'] 
#     # Add the 'max_1' ... 'max_<suffix_len>' column names (20 columns with the default suffix_len)
#     column_names.extend([f'max_{i}' for i in range(1, suffix_len + 1)])
#     column_names.extend([f'token_id_{i}' for i in range(1, suffix_len+1)])
#     # Create an empty DataFrame
#     df = pd.DataFrame(columns=column_names)

#     for epoch_no in tqdm(range(num_steps), desc=f'Optimizing prompts — model: {model_name}, dataset: {dataset_name}'):
#         optimizer.zero_grad()
#         continuous_losses = np.zeros(len(harmful_behaviors)) # to compute avg_cont_loss for each epoch
#         discrete_losses = np.zeros(len(harmful_behaviors)) # to compute avg_disc_loss for each epoch
#         # accumulated_grads = torch.zeros_like(one_hot_adv)
#         # total_loss = None

#         ################# Usual entropy #################
#         # # Calculate H(X) = − ∑_{i=1}^{L} ∑_{j=1}^{|T|} X_{ij} (log X_{ij} −1)
#         # Adding a small epsilon to avoid log(0) which is undefined
#         eps = 1e-12
#         one_hot_adv_float32 = one_hot_adv.to(dtype=torch.float32)
#         log_one_hot = torch.log(one_hot_adv_float32 + eps)
#         # Calculate the annealed coefficient 𝜖, using Exponential Scheduling
#         reg_coefficient = initial_coefficient * (final_coefficient / initial_coefficient) ** (epoch_no / (num_steps - 1))   
#         # Compute the modified entropy
#         entropy_per_row = -one_hot_adv_float32 * (log_one_hot - 1)
#         entropy_term = entropy_per_row.sum()        
#         # Take the Negative log of the highest probability in each row
#         kl_divergence_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum()
#         entropy_term *=reg_coefficient
#         kl_divergence_term *=reg_coefficient

#         for i, (user_tokens, target_tokens) in enumerate(zip(user_prompt_tokens_list, target_tokens_list)):
#             # Get embedding vectors for the user prompts and targets
#             _, embeddings_user = create_one_hot_and_embeddings(user_tokens, embed_weights, device=device)
#             one_hot_target, embeddings_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device=device)
#             embeddings_adv = (one_hot_adv @ embed_weights).unsqueeze(0)
#             cross_entropy_loss, _ = calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, one_hot_target)
#             continuous_losses[i] = cross_entropy_loss.detach().cpu().item()
#             # Regularized loss: F(X) - epsilon * H(X) + epsilon * KL(P||Q)
#             regularized_loss =  cross_entropy_loss - entropy_term + kl_divergence_term
#             regularized_loss.backward()
#             # if i==0:
#             #     total_loss=regularized_loss
#             # else: 
#             #     total_loss+=regularized_loss
#             # # Backpropagate the regularized loss
#             # total_loss.backward(retain_graph=True)
#             # Discretization part
#             masked_one_hot_adv = get_masked_one_hot_adv(one_hot_adv)
#             # Use this masked_one_hot_adv ONLY for discretization
#             max_values = torch.max(masked_one_hot_adv, dim=1)
#             adv_token_ids = max_values.indices
#             # adv_token_ids = one_hot_discrete.argmax(dim=1)
#             one_hot_discrete = torch.zeros(
#                 adv_token_ids.shape[0], embed_weights.shape[0], device=device, dtype=embed_weights.dtype
#             )
#             one_hot_discrete.scatter_(
#                 1,
#                 adv_token_ids.unsqueeze(1),
#                 torch.ones(masked_one_hot_adv.shape[0], 1, device=device, dtype=embed_weights.dtype),
#             )
#             # Use one_hot_discrete to print Tokens
#             # What other techniques Can we use here to discretize the one_hot encodings?
#             # Use discrete tokens to calculate loss
#             embeddings_adv_discrete = (one_hot_discrete @ embed_weights).unsqueeze(0)
#             discrete_loss, _ = calc_loss(model, embeddings_user, embeddings_adv_discrete, embeddings_target, one_hot_target)
#             # If loss improves, save it as x_best
#             discrete_loss =  discrete_loss.detach().cpu().item()
#             discrete_losses[i] = discrete_loss
#             # cur_loss_list.append(cur_loss)
#             if discrete_loss < best_disc_loss:
#                 # print(f"########## {discrete_loss} #########")
#                 best_disc_loss = discrete_loss
#                 best_loss_at_epoch = epoch_no
#                 effective_adv_one_hot = one_hot_discrete
#             else :
#                 pass
#         # End for loop            
#         # Average and backward once
#         # total_loss /= len(continuous_losses) # type: ignore
#         # total_loss.backward() # type: ignore

#         # Clip and step
#         torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0) # type: ignore
#         optimizer.step()
#         # accumulated_grads = accumulated_grads / len(continuous_losses)
#         # accumulated_grads = F.normalize(accumulated_grads, p=2.0, dim=1)
#         # one_hot_adv.grad = accumulated_grads
#         # torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0) # type: ignore
#         # # Update parameters
#         # optimizer.step()
        
#         mean_continuous_loss = np.mean(continuous_losses)
#         mean_discrete_loss = np.mean(discrete_losses)
#         # update scheduler
#         scheduler.step(mean_continuous_loss)
#         # Dump output values to a CSV file
#         # Convert max_values to a NumPy array
#         max_values_array = max_values.values.detach().cpu().numpy()
#         token_ids_array = adv_token_ids.detach().cpu().numpy()
#         # Get the scheduler's state and learning_rate
#         # Access the current learning rate directly from the optimizer's parameter group
#         scheduler_lr = optimizer.param_groups[0]['lr']  # Get the learning rate of the first parameter group
#         # Create the Initial array       
#         prepend_array = np.array([epoch_no, scheduler_cycle, scheduler_lr,
#                                 entropy_term.detach().cpu().item(), 
#                                 kl_divergence_term.detach().cpu().item(),
#                                 mean_continuous_loss, mean_discrete_loss]) 
        
#         # Concatenate the arrays
#         row = np.concatenate((prepend_array, max_values_array, token_ids_array))
#         new_row = pd.Series(row, index=df.columns)
#         df = pd.concat([df, new_row.to_frame().T], ignore_index=True)    
#     # End for loop                             
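#     # Write to CSV file
#     # NOTE(review): csv_filename is never assigned inside this function (the *_fixed variant
#     # builds it from directories[0]); define it here before re-enabling this cell.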
#     df.to_csv(csv_filename, index=False)
#     return effective_adv_one_hot, best_disc_loss, best_loss_at_epoch

In [None]:
# # Update the one_hot encodings using Gradient Descent.
# from math import *
# import json 
# import pandas as pd
# import torch.nn.functional as F

# # Fetch the Model's weights
# embed_weights = get_embedding_matrix(model)
# # reader = [[user_prompt, target]]
# optimization_results = []
# # Set your regularization parameter
# initial_coefficient = 1e-5
# final_coefficient = 1e-3


# # Create the following directories if they do NOT exist
# directories = [f'./Multi-Prompt/CSV_Files/{model_name}/{dataset_name}',       # 0
#                f'./Multi-Prompt/JSON_Files/{model_name}/{dataset_name}']      # 1
# for directory in directories:
#     # Create the directory if it doesn't exist
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#         print(f"Directory '{directory}' created.")
#     else:
#         print(f"Directory '{directory}' already exists.")

    
# # JSON file to dump the model generated outputs 
# json_file_path = directories[1]+"/output_EGD_with_Adam_Optimizer("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).json"

# for row in tqdm(harmful_behaviors, desc="Optimizing prompts"):
#     iteration_result = {}
#     user_prompt, target = row
#     print(user_prompt)
#     # target = unsafe_target
#     user_prompt_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)    
#     target_tokens = get_tokens(target, tokenizer=tokenizer, device=device)[1:]
#     plotlosses = PlotLosses()
#     plotlosses = PlotLosses(groups={'loss': ['continuous_loss', 'discrete_loss']})    
#     target_tokens = get_tokens(target, tokenizer=tokenizer, device=device)[1:]

#     # Initialize one_hot encodings and embedding vectors for the user prompts and targets 
#     one_hot_inputs, embeddings_user = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device=device)
#     one_hot_target, embeddings_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device=device)
#     best_disc_loss = np.inf
#     best_loss_at_epoch = 0
#     # Initialize one_hot_adv Randomly.
#     # one_hot_adv = F.softmax(torch.rand(20, tokenizer.vocab_size, dtype=torch.float16).to(device=device), dim=1).to(embed_weights.dtype)   
#     one_hot_adv = F.softmax(torch.rand(20, embed_weights.shape[0], dtype=torch.float16, device=device), dim=1).to(embed_weights.dtype)
#     one_hot_adv.requires_grad_() 
#     # Initialize the optimizer
#     optimizer = EGDwithAdamOptimizer([one_hot_adv], lr=step_size)
#     # Initialize the learning_rate scheduler
#     scheduler_cycle = 0
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)
#     # Specify the filename
#     last_three_words = lambda text: '_'.join(text.split()[-3:])
#     first_three_words = lambda text: '_'.join(text.split()[0:3])
#     csv_filename = directories[0]+'/output_EGD_with_Adam_Optimizer('+first_three_words(user_prompt)+".."+last_three_words(user_prompt)+')('+str(num_steps)+'_steps).csv'        
#     # Generate column names
#     column_names = ['epoch', 'cycle', 'learning_rate', 'entropy_term', 
#                     'kl_divergence_term', 
#                     'continuous_loss','discrete_loss'] 
#     # Adding 'max_1' to 'max_20' column names using a loop
#     for i in range(1, 21):
#         column_names.append(f'max_{i}')
#     for i in range(1, 21):
#         column_names.append(f'token_id_{i}')
#     # Create an empty DataFrame
#     df = pd.DataFrame(columns=column_names)

#     # PHASE_2: bool=False
#     BREAK_IT: bool=False

#     for epoch_no in tqdm(range(num_steps)):
#         optimizer.zero_grad()

#         embeddings_adv = (one_hot_adv @ embed_weights).unsqueeze(0)
#         cross_entropy_loss, _ = calc_loss(model, embeddings_user, embeddings_adv, embeddings_target, one_hot_target)
#         continuous_loss = cross_entropy_loss.detach().cpu().item()
#         # Break the loop if loss becomes "NaN"
#         if math.isnan(continuous_loss):
#             print("The value is NaN.")
#             BREAK_IT=True # Go to the Next Behavior

#         # # Discretization part
#         masked_one_hot_adv = get_masked_one_hot_adv(one_hot_adv)
#         # Use this masked_one_hot_adv ONLY for discretization
#         max_values = torch.max(masked_one_hot_adv, dim=1)
#         adv_token_ids = max_values.indices
#         # adv_token_ids = top_k_sampling(one_hot_adv, k=10).squeeze(dim=1)
#         # tau = initial_tau * (final_tau / initial_tau) ** (epoch_no / (num_steps - 1))
#         # one_hot_discrete = F.gumbel_softmax(one_hot_adv, tau=tau, hard=True)
#         # adv_token_ids = one_hot_discrete.argmax(dim=1)
#         one_hot_discrete = torch.zeros(
#             adv_token_ids.shape[0], embed_weights.shape[0], device=device, dtype=embed_weights.dtype
#         )
#         one_hot_discrete.scatter_(
#             1,
#             adv_token_ids.unsqueeze(1),
#             torch.ones(masked_one_hot_adv.shape[0], 1, device=device, dtype=embed_weights.dtype),
#         )
#         # Use one_hot_discrete to print Tokens
#         # What other techniques Can we use here to discretize the one_hot encodings?
#         # Use discrete tokens to calculate loss
#         embeddings_adv_discrete = (one_hot_discrete @ embed_weights).unsqueeze(0)
#         disc_loss, _ = calc_loss(model, embeddings_user, embeddings_adv_discrete, embeddings_target, one_hot_target)
#         # If loss improves, save it as x_best
#         discrete_loss =  disc_loss.detach().cpu().item()
#         # cur_loss_list.append(cur_loss)
#         if discrete_loss < best_disc_loss:
#             # print(f"########## {discrete_loss} #########")
#             best_disc_loss = discrete_loss
#             best_loss_at_epoch = epoch_no
#             effective_adv_embeddings = embeddings_adv_discrete
#             effective_adv_one_hot = one_hot_discrete
#         else :
#             pass

#         ################# Usual entropy #################
#         # # Calculate H(X) = − ∑_{i=1}^{L} ∑_{j=1}^{|T|} X_{ij} (log X_{ij} −1)
#         # Adding a small epsilon to avoid log(0) which is undefined
#         eps = 1e-12
#         one_hot_adv_float32 = one_hot_adv.to(dtype=torch.float32)
#         log_one_hot = torch.log(one_hot_adv_float32 + eps)
#         # Compute the modified entropy
#         entropy_per_row = -one_hot_adv_float32 * (log_one_hot - 1)
#         entropy_term = entropy_per_row.sum()
#         # Calculate the annealed coefficient, 𝜖
#         # Use Exponential Scheduling
#         reg_coefficient = initial_coefficient * (final_coefficient / initial_coefficient) ** (epoch_no / (num_steps - 1))   
#         entropy_term *= reg_coefficient 
#         # Regularized loss: F(X) - epsilon * H(X)
#         regularized_loss = cross_entropy_loss - entropy_term
#         # Take the Negative log of the highest probability in each row
#         kl_divergence_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum()
#         kl_divergence_term *=reg_coefficient
#         regularized_loss +=kl_divergence_term

#         # Backpropagate the regularized loss
#         regularized_loss.backward()

#         # Get the scheduler's state to get learning_rate
#         scheduler_state = scheduler.state_dict()

#         # Clip the gradients(before update)
#         torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0)

#         # update parameters
#         optimizer.step()
#         # update scheduler
#         scheduler.step(continuous_loss)
#         # # Reset the gradients; redundant after calling optimizer.zero_grad()
#         # model.zero_grad()
#         # one_hot_adv.grad.zero_()
        

#         # Dump output values to a CSV file
#         # Convert max_values to a NumPy array
#         max_values_array = max_values.values.detach().cpu().numpy()
#         token_ids_array = adv_token_ids.detach().cpu().numpy()
#         # Get the scheduler's state and learning_rate
#         # Access the current learning rate directly from the optimizer's parameter group
#         scheduler_lr = optimizer.param_groups[0]['lr']  # Get the learning rate of the first parameter group
#         # scheduler_lr = scheduler_state['_last_lr'][0]
#         # Create the Initial array       
#         prepend_array = np.array([epoch_no, scheduler_cycle, scheduler_lr,
#                                   entropy_term.detach().cpu().item(), 
#                                   kl_divergence_term.detach().cpu().item(),
#                                   continuous_loss, discrete_loss]) 
        
#         # Concatenate the arrays
#         row = np.concatenate((prepend_array, max_values_array, token_ids_array))
#         new_row = pd.Series(row, index=df.columns)
#         df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
#         # # Save log data to CSV file periodically
#         # if epoch_no % 10 == 0:
#         #     df.to_csv(csv_filename, index=False)
#         # Something went wrong; so break it and go to the next behavior.
#         if BREAK_IT==True:
#             break
#     # End of optimizations        
#     # Write to CSV file
#     df.to_csv(csv_filename, index=False)
#     # Token and Output Generation Step
#     inputs_token_ids = one_hot_inputs.argmax(dim=1)
#     adv_token_ids = effective_adv_one_hot.argmax(dim=1)
#     adv_suffix_list = str(adv_token_ids.cpu().numpy().tolist())
#     adv_suffix_string = tokenizer.decode(adv_token_ids.cpu().numpy())
#     final_string_ids = torch.hstack([inputs_token_ids, adv_token_ids])
#     outputs = []
#     generation_loop = range(1)
#     import warnings
#     # Suppress specific warnings
#     warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
#     warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")
    
#     ######## Discrete tokens for Output Generation ########
#     for loop in tqdm(generation_loop, desc="Generating outputs", leave=False):
#         generated_output = model.generate(final_string_ids.unsqueeze(0), 
#                                           max_length=num_tokens, 
#                                           pad_token_id=tokenizer.pad_token_id,
#                                           do_sample=False)
#         generated_output_string = tokenizer.decode(generated_output[0][:].cpu().numpy(), skip_special_tokens=True).strip()
#         outputs.append(generated_output_string)
    
#     iteration_result = {
#         "harmful-behaviour": user_prompt,
#         "target": target,
#         "suffix_token_ids": adv_suffix_list,
#         "suffix_string": adv_suffix_string,
#         "epoch_no": epoch_no,
#         "continuous_loss": continuous_loss,
#         "discrete_loss": discrete_loss,
#         "best_disc_loss":best_disc_loss,
#         "epch_at_best_disc_loss": best_loss_at_epoch,
#         "outputs": outputs
#     }
#     optimization_results.append(iteration_result)

#     # Update the JSON File
#     with open(json_file_path, "w") as f:
#         json.dump(
#             optimization_results,
#             f,
#             indent=4,  # Keep structure readable
#             ensure_ascii=False  # Write any non-ASCII characters as-is instead of \uXXXX escapes
#         )

#     ##########################################################################################
#     ## Create and Update a JSONL file (required for running the parse_results.ipynb script) ##
#     output_file = directories[1]+"/EGD_with_Adam_"+model_name+"("+str(len(harmful_behaviors))+"_behaviors)("+str(num_steps)+"_steps).jsonl"
#     from behavior import Behavior
#     behavior = Behavior(user_prompt, adv_suffix_string, generated_output_string, "", "")
    
#     with open(output_file, 'a') as f:
#         f.write(json.dumps(behavior.to_dict()) + '\n')
#     f.close()  
#     ## Create and Update a JSONL file (required for running the parse_results.ipynb script) ##
#     ########################################################################################## 

In [None]:
# def universal_suffix_attack(
#     model,
#     tokenizer,
#     harmful_behaviors: List[Tuple[str, str]],
#     embed_weights: torch.Tensor,
#     device: str,
#     model_name: str,
#     dataset_name: str,
#     suffix_len: int = 20,
#     num_steps: int = 10,
#     step_size: float = 1e-2,
#     initial_coefficient: float = 1e-5,
#     final_coefficient: float = 1e-3,
#     Optimizer = torch.optim.Adam  # Replace with your custom optimizer class if needed
# ):
#     # Directories
#     directories = [f'./Multi-Prompt/CSV_Files/{model_name}/{dataset_name}', 
#                    f'./Multi-Prompt/JSON_Files/{model_name}/{dataset_name}']
#     os.makedirs(directories[0], exist_ok=True)
#     os.makedirs(directories[1], exist_ok=True)

#     # Per-behavior prompt embeddings, target embeddings, and target one-hots
#     prompt_embeds_list = []
#     target_embeds_list = []
#     target_one_hot_list = []    

#     for user_prompt, target in harmful_behaviors:
#         user_prompt_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)
#         target_tokens = get_tokens(target, tokenizer=tokenizer, device=device)[1:]
#         one_hot_inputs, emb_user = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device)
#         one_hot_target, emb_target = create_one_hot_and_embeddings(target_tokens, embed_weights, device)
#         prompt_embeds_list.append(emb_user.unsqueeze(0))
#         target_embeds_list.append(emb_target.unsqueeze(0))
#         target_one_hot_list.append(one_hot_target.unsqueeze(0))

#     # Suffix and optimizer
#     one_hot_adv = F.softmax(torch.rand(suffix_len, embed_weights.shape[0], dtype=torch.float16, device=device), dim=1).to(embed_weights.dtype)
#     one_hot_adv.requires_grad_()
#     optimizer = Optimizer([one_hot_adv], lr=step_size)
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50)

#     # Keep track of the best discrete loss
#     # and the corresponding one-hot representation
#     # of the adversarial suffix
#     best_disc_loss = np.inf
#     best_loss_at_epoch = 0
#     effective_adv_one_hot = one_hot_adv
    
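#     # CSV and JSON output paths, named with the current timestamp.
#     # NOTE: `datetime` must be imported first; `from datetime import datetime`
#     # is currently commented out in the imports cell.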
#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
#     csv_filename = os.path.join(directories[0], f'universal_suffix_log_{timestamp}.csv')
#     json_filename = os.path.join(directories[1], f'universal_suffix_outputs_{timestamp}.json')

#     column_names = ['epoch', 'learning_rate', 'avg_continuous_loss', 'avg_discrete_loss']
#     for i in range(1, suffix_len+1):
#         column_names += [f'max_{i}', f'token_id_{i}']
#     df = pd.DataFrame(columns=column_names)
#     results = []

#     # Training
#     for epoch_no in tqdm(range(num_steps), desc="Training Universal Suffix"):
#         optimizer.zero_grad()
#         total_loss = 0.0
#         total_discrete_loss = 0.0
#         accumulated_grads = torch.zeros_like(one_hot_adv)

#         for emb_user, emb_target, one_hot_target in zip(prompt_embeds_list, target_embeds_list, target_one_hot_list):
#             emb_adv = (one_hot_adv @ embed_weights).unsqueeze(0)
#             ce_loss, _ = calc_loss(model, emb_user, emb_adv, emb_target, one_hot_target)

#             masked = one_hot_adv.clone()
#             adv_token_ids = torch.argmax(masked, dim=1)
#             one_hot_discrete = torch.zeros_like(one_hot_adv)
#             one_hot_discrete.scatter_(1, adv_token_ids.unsqueeze(1), 1.0)
#             emb_adv_discrete = (one_hot_discrete @ embed_weights).unsqueeze(0)
#             disc_loss, _ = calc_loss(model, emb_user, emb_adv_discrete, emb_target, one_hot_target)
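#             # If the discrete loss improves, keep this discretized suffix as the best so far.
#             # NOTE(review): the comparison uses each prompt's own loss, not the average
#             # over all prompts -- confirm this is intended for a universal suffix.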
#             discrete_loss =  disc_loss.detach().cpu().item()
#             # cur_loss_list.append(cur_loss)
#             if discrete_loss < best_disc_loss:
#                 # print(f"########## {discrete_loss} #########")
#                 best_disc_loss = discrete_loss
#                 best_loss_at_epoch = epoch_no
#                 effective_adv_one_hot = one_hot_discrete

#             eps = 1e-12
#             log_one_hot = torch.log(one_hot_adv.to(torch.float32) + eps)
#             entropy_term = -(one_hot_adv * (log_one_hot - 1)).sum()
#             kl_term = -torch.log(torch.max(one_hot_adv, dim=1).values + eps).sum()

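#             # Regularization weight interpolated geometrically from initial_coefficient
#             # (epoch 0) to final_coefficient (last epoch).
#             # NOTE: divides by (num_steps - 1), so num_steps must be at least 2.
#             reg_coeff = initial_coefficient * (final_coefficient / initial_coefficient) ** (epoch_no / (num_steps - 1))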
#             reg_loss = ce_loss - reg_coeff * entropy_term + reg_coeff * kl_term
#             reg_loss.backward(retain_graph=True)
#             accumulated_grads += one_hot_adv.grad.clone()
#             one_hot_adv.grad.zero_()

#             total_loss += reg_loss.detach().cpu().item()
#             total_discrete_loss+=discrete_loss

#         accumulated_grads = accumulated_grads / len(prompt_embeds_list)
#         accumulated_grads = F.normalize(accumulated_grads, p=2.0, dim=1)
#         one_hot_adv.grad = accumulated_grads
#         torch.nn.utils.clip_grad_norm_([one_hot_adv], max_norm=1.0)
#         optimizer.step()
#         scheduler.step(total_loss / len(prompt_embeds_list))

#         avg_cont_loss = total_loss / len(prompt_embeds_list)
#         avg_disc_loss = total_discrete_loss / len(prompt_embeds_list)

#         max_vals = torch.max(one_hot_adv, dim=1)
#         row = [epoch_no, optimizer.param_groups[0]['lr'], avg_cont_loss, avg_disc_loss]
#         row += max_vals.values.detach().cpu().numpy().tolist()
#         row += max_vals.indices.detach().cpu().numpy().tolist()
#         df.loc[len(df)] = row

#         adv_token_ids = max_vals.indices
#         adv_suffix_str = tokenizer.decode(adv_token_ids.detach().cpu().numpy())
#         results.append({
#             "epoch": epoch_no,
#             "adv_suffix_str": adv_suffix_str,
#             "suffix_token_ids": adv_token_ids.detach().cpu().tolist(),
#             "avg_continuous_loss": avg_cont_loss,
#             "avg_discrete_loss": avg_disc_loss,
#             "best_disc_loss": best_disc_loss,
#             "best_loss_at_epoch": best_loss_at_epoch
#         })

#     df.to_csv(csv_filename, index=False)
#     with open(json_filename, "w") as f:
#         json.dump(results, f, indent=4)

#     return effective_adv_one_hot.detach(), results, df
#     # return one_hot_adv.detach(), results, df
