In [None]:
import torch
import numpy as np
# from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
import os
from helper import get_model_path, load_model_and_tokenizer, get_embedding_matrix, get_tokens, create_one_hot_and_embeddings

# ---- Notebook-wide settings ----

# Runtime: device string passed to the model loader and the token helpers.
device: str = "cuda:0"

# Reproducibility: the seed is given to torch.manual_seed before the model is
# loaded; seeding is skipped when this is None.
seed: int = 42

# Run parameters.
#   num_steps  - used in the name of the output JSONL file.
#   num_tokens - passed to model.generate as max_length.
#   step_size  - not referenced by the cells of this notebook.
num_steps: int = 200
num_tokens: int = 300
step_size: float = 0.1

# Data loading: the behaviors CSV is only read when this flag is True.
load_csv_data: bool = True

# Toggles that are currently disabled:
# verbose = True
# early_stopping = True

In [None]:
# Select the model to load. get_model_path (from helper.py) turns the model
# name into the path used for loading; per the original note it relies on the
# project config / shared model directory -- see helper.py for the details.
# Index:           0         1         2      3         4
model_options = ["Falcon", "Llama2", "MPT", "Vicuna", "Mistral"]
selected_model_index = 4  # -> "Mistral"
model_name = model_options[selected_model_index]
model_path: str = get_model_path(model_name=model_name)
print(model_path)

In [None]:
# Seed torch's random number generator first (skipped when no seed is set).
if seed is not None:
    torch.manual_seed(seed)

# Keyword options forwarded unchanged to the project's loader helper.
loader_options = dict(low_cpu_mem_usage=True, use_cache=False, device=device)

# Load the model and its tokenizer from the path resolved earlier.
model, tokenizer = load_model_and_tokenizer(model_path, **loader_options)

In [None]:
import csv

# Embedding matrix of the loaded model, obtained through the project helper.
embed_weights = get_embedding_matrix(model)

# Dataset options:       0           1               2                   3
dataset_options = ['AdvBench', 'HarmBench', 'JailbreakBench', 'MaliciousInstruct']
dataset_name = dataset_options[3]
dataset_path: str = f"./data/{dataset_name}/harmful_behaviors.csv"

# The CSV is the only data source this cell knows about. Fail with a clear
# message instead of printing "Error!" and then tripping over an undefined
# (or stale, already exhausted) `reader` a line later.
if not load_csv_data:
    raise RuntimeError(
        "load_csv_data is False, but reading the CSV at "
        f"'{dataset_path}' is the only supported way to build harmful_behaviors."
    )

# Open the file in a context manager so the handle is always closed, and with
# newline='' as the csv module documentation recommends.
with open(dataset_path, 'r', newline='') as dataset_file:
    reader = csv.reader(dataset_file)
    next(reader, None)  # skip the first (header) row; tolerate an empty file
    # Keep at most the first 50 rows; each row is a list of column strings.
    harmful_behaviors = list(reader)[0:50]

# # We can hard-code this safe_target string; probably
# # safe_target: str = "I cannot fulfill your request. I'm just an AI."
# print(len(harmful_behaviors))
# for each_behavior in harmful_behaviors:
#     print(each_behavior)

In [None]:
# Report the active configuration: the selected model, the selected dataset and
# the vocabulary size reported by the tokenizer.
print('model_name:', model_name)
print('dataset_name:', dataset_name)
print('vocabulary size:', tokenizer.vocab_size)

In [None]:
# Name of the source model. It selects which JSON_Files/<source>/<dataset>
# directory the existing suffix records are read from and which
# Xfer_attacks/<source> directory the new results are written to.
source_model_name = "Vicuna"  # alternative: "Llama2"
print('source_model_name:', source_model_name)

In [None]:
import glob
import json
import warnings

from behavior import Behavior

# Generation below runs with do_sample=False; silence the warnings emitted when
# `temperature` / `top_p` are set alongside it.
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `temperature` is set to.*")
warnings.filterwarnings("ignore", message=".*`do_sample` is set to `False`. However, `top_p` is set to.*")

# Create the following directories if they do NOT exist
output_directories = [f'./JSON_Files/Xfer_attacks/{source_model_name}',                    # 0
                    f'./JSON_Files/Xfer_attacks/{source_model_name}/{dataset_name}']     # 1
for directory in output_directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory '{directory}' created.")
    else:
        print(f"Directory '{directory}' already exists.")

# JSONL file the results are appended to (required for running the
# parse_results.ipynb script). The path does not depend on the record being
# processed, so it is computed once here. `output_file` stays a path string;
# the file handle gets its own name further down.
output_file = (
    f"{output_directories[1]}/Xfer_{model_name}"
    f"({len(harmful_behaviors)}_behaviors)({num_steps}_steps).jsonl"
)

# Records (one JSON object per line) that hold the behaviors and suffixes to replay.
input_directory = f'./JSON_Files/{source_model_name}/{dataset_name}'
input_file_paths = glob.glob(f'{input_directory}/EGD_with_Adam_{source_model_name}(50_behaviors)(200_steps).jsonl')
if not input_file_paths:
    # Make an empty match visible instead of finishing silently with no output.
    print(f"No input file found in '{input_directory}'; nothing to process.")

for file_path in input_file_paths:
    with open(file_path, 'r') as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue
            data = json.loads(line)
            source_behavior = Behavior(
                harmful_behavior=data.get("harmful_behavior", ""),
                suffix=data.get("suffix", ""),
                response=data.get("response", ""),
                harmbench_eval=data.get("harmbench_eval", ""),
                beaver_cost_score=data.get("beaver_cost_score", "")
            )
            # We only need harmful_behavior and suffix to produce output
            user_prompt = source_behavior.harmful_behavior
            adv_suffix = source_behavior.suffix

            # Convert the user_prompt and adv_suffix to token ids. The tokens go
            # through the one-hot helper and argmax(dim=1) picks the id per position.
            # NOTE(review): presumably this just recovers the ids returned by
            # get_tokens -- confirm against helper.py.
            user_prompt_tokens = get_tokens(user_prompt, tokenizer=tokenizer, device=device)
            one_hot_inputs, _ = create_one_hot_and_embeddings(user_prompt_tokens, embed_weights, device=device)
            inputs_token_ids = one_hot_inputs.argmax(dim=1)
            adv_suffix_tokens = get_tokens(adv_suffix, tokenizer=tokenizer, device=device)
            one_hot_adv, _ = create_one_hot_and_embeddings(adv_suffix_tokens, embed_weights, device=device)
            adv_token_ids = one_hot_adv.argmax(dim=1)
            # Prompt ids followed by suffix ids form the full model input.
            final_string_ids = torch.hstack([inputs_token_ids, adv_token_ids])

            ######## Discrete tokens for Output Generation ########
            # NOTE(review): max_length bounds prompt + generated tokens together,
            # so long inputs leave little room for new tokens -- confirm that
            # max_new_tokens is not what was intended.
            generated_output = model.generate(final_string_ids.unsqueeze(0),
                                            max_length=num_tokens,
                                            pad_token_id=tokenizer.pad_token_id,
                                            do_sample=False)
            generated_output_string = tokenizer.decode(generated_output[0].cpu().numpy(), skip_special_tokens=True).strip()

            # Append the result immediately so finished records survive an
            # interrupted run; the `with` block closes the handle.
            behavior = Behavior(user_prompt, adv_suffix, generated_output_string, "", "")
            with open(output_file, 'a') as results_file:
                results_file.write(json.dumps(behavior.to_dict()) + '\n')
