In [None]:
!pip install datasets
!pip install transformers==4.37.0
!pip install nltk

In [None]:
import torch
import pandas as pd
import numpy as np
import gc
import nltk
import copy
nltk.download('punkt')
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from nltk.translate.bleu_score import sentence_bleu

import warnings
warnings.filterwarnings("ignore")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda:0"  # the device to load the model onto

# The weights and the tokenizer are both taken from the same Qwen chat checkpoint.
checkpoint = "Qwen/Qwen1.5-0.5B-Chat"

# padding_side="left" makes the tokenizer pad on the left whenever it pads.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")

# Weights are loaded in bfloat16.
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)

In [None]:
## Load the UltraChat-200k dataset and expose its `train_gen` split as a DataFrame
dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
train_gen_split = dataset['train_gen']
df = train_gen_split.to_pandas()
display(df)

In [None]:
## Random Sampling
subset_size = 1500
sample_seed = 42  # fixed seed so that a fresh "Restart & Run All" draws the same subset

# Draw at most `subset_size` rows (the whole frame when it is smaller), renumber
# them 0..n-1 and keep only the columns used by the following cells.
df = (
    df.sample(n=min(subset_size, len(df)), random_state=sample_seed)
      .reset_index(drop=True)[['prompt', 'prompt_id', 'messages']]
)
display(df)

In [None]:
## Data Preprocessing

## 1) Separating prompts and responses
# Keep only conversations with at least two messages so that messages[1] exists.
df = df[df['messages'].apply(len) >= 2].reset_index(drop=True)

# The content of the second message of each conversation is used as the answer
# (presumably the assistant reply to the prompt -- verify against the dataset card).
answers = [messages[1]['content'] for messages in df['messages']]
df = df[['prompt']].assign(answer=answers)

## 2) Sorting according to prompt length (ascending) to incorporate curriculum learning
df = (
    df.assign(length_col=df['prompt'].apply(len))
      .sort_values(by='length_col', ascending=True)
      .loc[:, ['prompt', 'answer']]
)
display(df)

## 3) Keeping only the longest prompts: very short ones carry insufficient context
num_longest_prompts = 128
df = df.tail(num_longest_prompts).reset_index(drop=True)
display(df)


In [None]:
## Creating the dataloader class
class Customdataset(Dataset):
    """Map-style dataset that serves (prompt, response) pairs from a two-column table."""

    # Positional column indices read by __getitem__.
    _PROMPT_COL, _RESPONSE_COL = 0, 1

    def __init__(self, original_dataset):
        """Keep a reference to the backing table (indexed through `.iat[row, col]`)."""
        self.original_dataset = original_dataset

    def __len__(self):
        """Number of rows in the backing table."""
        return len(self.original_dataset)

    def __getitem__(self, index):
        """Return the `(prompt, response)` tuple stored at positional row `index`."""
        table = self.original_dataset
        return table.iat[index, self._PROMPT_COL], table.iat[index, self._RESPONSE_COL]

In [None]:
## Build the DataLoader over the preprocessed frame
batch_size = 4
d_train = Customdataset(df)
# shuffle=False: batches follow the row order of `df`.
dataloader = DataLoader(dataset=d_train, shuffle=False, batch_size=batch_size)

In [None]:
# Switch off the flash and the memory-efficient scaled-dot-product-attention kernels.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

In [None]:
## Defining the custom tokenizer
def tokenize_and_pad(texts, tokenizer):
    """Tokenize each text on its own and left-pad every encoding to a common length.

    Args:
        texts: iterable of strings to encode.
        tokenizer: callable that returns, for one string, a mapping of 2-D
            tensors of shape ``(1, seq_len)`` (it must contain ``input_ids``)
            and that exposes ``convert_tokens_to_ids``.

    Returns:
        A list with one encoding per input text, in input order. Every tensor
        in every encoding is left-padded to the length of the longest encoding
        (after truncation to 1024 tokens): ``attention_mask`` is padded with 0,
        every other key with the id of ``<|endoftext|>``. An empty input yields
        an empty list.
    """
    # Materialise once so that generators are supported and emptiness can be checked.
    texts = list(texts)
    if not texts:
        return []

    # A single tokenization pass is enough: re-tokenizing with max_length set to
    # the longest truncated length cannot change any of the encodings.
    tokenized_batches = [
        tokenizer(text, return_tensors="pt", padding=False, truncation=True, max_length=1024)
        for text in texts
    ]
    max_length = max(encoding['input_ids'].shape[1] for encoding in tokenized_batches)
    pad_token_id = tokenizer.convert_tokens_to_ids('<|endoftext|>')

    for encoding in tokenized_batches:
        # list(...) because the mapping is modified while it is being walked.
        for key in list(encoding.keys()):
            tensor = encoding[key]
            padding_length = max_length - tensor.shape[1]
            if padding_length <= 0:
                continue
            pad_value = 0 if key == "attention_mask" else pad_token_id
            padding_tensor = torch.full(
                (tensor.shape[0], padding_length), pad_value,
                dtype=tensor.dtype, device=tensor.device,
            )
            # Padding goes first along dim=1, i.e. on the left of the sequence.
            encoding[key] = torch.cat([padding_tensor, tensor], dim=1)

    return tokenized_batches

In [None]:
def tokenize_and_pad_output(texts, tokenizer, desired_length):
    """Left-pad a batch of token ids with ``<|endoftext|>`` up to ``desired_length``.

    Args:
        texts: 2-D tensor of token ids, shape ``(batch, seq_len)``.
        tokenizer: object exposing ``convert_tokens_to_ids``; supplies the pad id.
        desired_length: number of columns the result must have.

    Returns:
        A tensor of shape ``(batch, desired_length)`` with the dtype and device
        of ``texts``. Rows shorter than ``desired_length`` get pad ids on the
        left; a batch that already has ``desired_length`` columns is returned
        unchanged (previously an all-zero tensor was returned in that case);
        a longer batch keeps only its right-most ``desired_length`` columns.
    """
    pad_value = tokenizer.convert_tokens_to_ids('<|endoftext|>')
    current_length = texts.shape[1]

    if current_length >= desired_length:
        # Nothing to pad: keep the right-most columns so the shape contract holds.
        return texts[:, current_length - desired_length:]

    # new_full keeps the dtype/device of `texts`, so no device is hard-coded here.
    padding_tensor = texts.new_full((texts.shape[0], desired_length - current_length), pad_value)
    return torch.cat([padding_tensor, texts], dim=1)


In [None]:
def tokenize_output_attention(texts, tokenizer, desired_length):
    """Left-pad a batch of attention masks with zeros up to ``desired_length``.

    Args:
        texts: 2-D attention-mask tensor, shape ``(batch, seq_len)``.
        tokenizer: unused; kept so the signature mirrors ``tokenize_and_pad_output``.
        desired_length: number of columns the result must have.

    Returns:
        A tensor of shape ``(batch, desired_length)`` with the dtype and device
        of ``texts``. Shorter masks get zeros on the left; a mask that already
        has ``desired_length`` columns is returned unchanged (previously an
        all-zero tensor was returned in that case); a longer mask keeps only
        its right-most ``desired_length`` columns.
    """
    current_length = texts.shape[1]

    if current_length >= desired_length:
        # Nothing to pad: keep the right-most columns so the shape contract holds.
        return texts[:, current_length - desired_length:]

    # new_zeros keeps the dtype/device of `texts`, so no device is hard-coded here.
    padding_tensor = texts.new_zeros((texts.shape[0], desired_length - current_length))
    return torch.cat([padding_tensor, texts], dim=1)

In [None]:
def attack(prompt_ids,attention_mask_ids,ground_truth_ids,model,batch_size,tokenizer,ground_attention,
           epsilon=100,num_steps=10,step_size=5,max_new_tokens=2048,device=None):
    """Iteratively perturb prompt token ids with a gradient-derived, per-sample offset.

    Each step generates a continuation for the current adversarial prompt,
    brings it and the ground truth to a common length, and scores the model's
    softmax probabilities on the generated sequence against the model's own
    argmax predictions on the ground-truth sequence. The gradient of that
    negative log-likelihood w.r.t. the probabilities is reduced to one scalar
    per sample, scaled by ``step_size`` and added to every token id of the
    prompt; the total change is clamped to ``[-epsilon, epsilon]``.

    Args:
        prompt_ids: ``(batch, prompt_len)`` tensor of prompt token ids.
        attention_mask_ids: attention mask matching ``prompt_ids`` (may be None);
            passed to ``generate`` so padded positions are ignored.
        ground_truth_ids: ``(batch, truth_len)`` tensor of reference token ids.
        model: causal LM exposing ``generate``, ``get_input_embeddings`` and a
            forward pass that returns ``.logits``.
        batch_size: kept for backward compatibility; the batch dimension of the
            tensors is used instead, so a short batch no longer breaks.
        tokenizer: forwarded to the padding helpers.
        ground_attention: attention mask matching ``ground_truth_ids`` (may be
            None); kept length-aligned with it, not used by the forward passes.
        epsilon: maximum absolute change of any token id.
        num_steps: number of attack iterations.
        step_size: multiplier applied to the reduced gradient.
        max_new_tokens: generation budget per iteration.
        device: device to run on; defaults to the device the model is already on.

    Returns:
        ``(batch, prompt_len)`` int64 tensor of perturbed ids, clamped to the
        model's vocabulary range. The model is moved back to the device it was
        on when the function was called.
    """
    original_device = next(model.parameters()).device
    device = original_device if device is None else torch.device(device)
    model.to(device)

    try:
        vocab_size = model.get_input_embeddings().num_embeddings

        # Ids are tracked in float32: adding a bfloat16 gradient to int64 ids
        # would promote the sum to bfloat16 and round large token ids.
        prompt_base = prompt_ids.detach().clone().to(device=device, dtype=torch.float32)
        prompt_adv = prompt_base.clone()

        generate_kwargs = {"max_new_tokens": max_new_tokens}
        if attention_mask_ids is not None:
            generate_kwargs["attention_mask"] = attention_mask_ids.to(device)

        ground_truth_ids = ground_truth_ids.to(device)
        if ground_attention is not None:
            ground_attention = ground_attention.to(device)

        for i in range(num_steps):
            print(f"attack iteration {i + 1}/{num_steps}")

            # Perturbed ids may leave the vocabulary; clamp before embedding lookup.
            adv_ids = prompt_adv.long().clamp(0, vocab_size - 1)
            generated_text = model.generate(input_ids=adv_ids, **generate_kwargs)

            # Pad only the strictly shorter side; equal lengths need no padding.
            gen_len, truth_len = generated_text.shape[1], ground_truth_ids.shape[1]
            if gen_len < truth_len:
                generated_text = tokenize_and_pad_output(generated_text, tokenizer, truth_len)
            elif truth_len < gen_len:
                ground_truth_ids = tokenize_and_pad_output(ground_truth_ids, tokenizer, gen_len)
                if ground_attention is not None:
                    ground_attention = tokenize_output_attention(ground_attention, tokenizer, gen_len)
            generated_text = generated_text.to(device).long()
            ground_truth_ids = ground_truth_ids.to(device).long()
            if ground_attention is not None:
                ground_attention = ground_attention.to(device)

            # The gradient is taken w.r.t. the probabilities only, so the model
            # forward passes themselves do not need an autograd graph.
            with torch.no_grad():
                generated_logits = model(input_ids=generated_text).logits
                ground_logits = model(input_ids=ground_truth_ids).logits.argmax(dim=-1)

            probs = torch.nn.functional.softmax(generated_logits, dim=-1).requires_grad_(True)

            # Negative log-likelihood of the target token at every position,
            # summed over the whole batch (the original overwrote the loss on
            # each position and kept only the last one).
            target_probs = probs.gather(-1, ground_logits.unsqueeze(-1)).squeeze(-1)
            loss = -torch.log(target_probs).sum()

            grad = torch.autograd.grad(loss, probs)[0]
            # One scalar per sample: gradient summed over vocab and positions,
            # averaged over the sequence length.
            fgsm = grad.sum(dim=(-1, -2)).float() / probs.shape[1]

            prompt_adv = prompt_adv + step_size * fgsm.unsqueeze(1)
            eta = torch.clamp(prompt_adv - prompt_base, -epsilon, epsilon)
            prompt_adv = prompt_base + eta

        return prompt_adv.long().clamp(0, vocab_size - 1)
    finally:
        model.to(original_device)

In [None]:
## Using BLEU Score as the evaluation metric
def calculate_bleu_score(paragraph1, paragraph2):
    """Compute smoothed BLEU-1 to BLEU-4 of a candidate text against one reference.

    Args:
        paragraph1: reference text.
        paragraph2: candidate text.

    Returns:
        Tuple ``(bleu_1, bleu_2, bleu_3, bleu_4)`` of sentence-level BLEU scores
        computed with NLTK's ``SmoothingFunction().method4``.
    """
    import re  # local import: only this function needs it

    # Markup tokens of the form <...> with no whitespace inside (e.g. <|im_end|>).
    # Only the tag itself is removed, so a word glued to a tag ("done.<|im_end|>")
    # keeps its text instead of being dropped as a whole.
    tag_pattern = re.compile(r"<[^<>\s]*>")
    paragraph1_clean = " ".join(tag_pattern.sub(" ", paragraph1).split())
    paragraph2_clean = " ".join(tag_pattern.sub(" ", paragraph2).split())

    # Tokenize the paragraphs into lists of words
    reference = nltk.word_tokenize(paragraph1_clean)
    candidate = nltk.word_tokenize(paragraph2_clean)

    smoothing = nltk.translate.bleu_score.SmoothingFunction().method4
    # Uniform n-gram weights for BLEU-1..4; each set sums to 1.
    weight_sets = (
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1 / 3, 1 / 3, 1 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    return tuple(
        sentence_bleu([reference], candidate, weights=weights, smoothing_function=smoothing)
        for weights in weight_sets
    )

In [None]:
## Move the model to the GPU and build the optimizer
device = "cuda:0"
model.to(device)

# AdamW only receives the parameters that still require gradients.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-6)
print(model)

In [None]:
score_list = []

for step,batch in enumerate(dataloader):

    print("Step No "+str(step))
    prompts, ground_truth = batch

    # Wrap every prompt in the chat template and encode it (left-padded).
    messages = [[{"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}] for prompt in prompts]
    text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True) for message in messages]
    tokenized_batches = tokenize_and_pad(text,tokenizer)
    prompt_ids = torch.stack([x['input_ids'][0].to(device) for x in tokenized_batches], dim = 0)
    prompt_attention_mask = torch.stack([x['attention_mask'][0].to(device) for x in tokenized_batches], dim = 0)

    # The reference answers are encoded the same way; these ids are only used by attack().
    ground_truth_messages = [[{"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": ground_tru}] for ground_tru in ground_truth]
    ground_truth_text = [ tokenizer.apply_chat_template(ground_truth_message,tokenize=False,add_generation_prompt=True) for ground_truth_message in ground_truth_messages]

    ground_truth_encoding = tokenize_and_pad(ground_truth_text,tokenizer)
    ground_truth_ids = torch.stack([x['input_ids'][0].to(device) for x in ground_truth_encoding], dim = 0)
    ground_truth_attention_mask = torch.stack([x['attention_mask'][0].to(device) for x in ground_truth_encoding], dim = 0)

    # len(prompts) rather than batch_size: the last batch may be smaller.
    adv_prompt = attack(prompt_ids, prompt_attention_mask, ground_truth_ids, model, len(prompts),tokenizer,ground_truth_attention_mask).to(device)

    with torch.no_grad():
        model.eval()
        # The mask makes generation ignore the left-padding positions.
        synthetic_response = model.generate(input_ids = adv_prompt, attention_mask = prompt_attention_mask, max_new_tokens = 2048)
        # generate() returns prompt + continuation for every row; drop the prompt
        # columns so each row keeps its own continuation (the previous slicing
        # reused the continuation of row 0 for every sample).
        synthetic_response_ids = synthetic_response[:, adv_prompt.shape[1]:]

    for i in range(len(synthetic_response_ids)):
        response_text = tokenizer.decode(synthetic_response_ids[i], skip_special_tokens=True)
        # Reference first, candidate second; the reference is the raw answer,
        # not the chat-templated text that also contains the system prompt.
        scores = calculate_bleu_score(ground_truth[i], response_text)
        score_list.append(scores)

    torch.cuda.empty_cache()
    gc.collect()
    model.to(device)

if score_list:
    avg_bleu_score = sum(score[0] for score in score_list)/len(score_list)
    print(f"Average BLEU-1 Score is {avg_bleu_score}")
else:
    # Nothing was scored (empty dataloader): avoid dividing by zero.
    avg_bleu_score = float("nan")
    print("No samples were scored; the average BLEU-1 score is undefined.")
                