In [1]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
import numpy as np
import os
import torch.nn.functional as F
from tqdm import tqdm
import time

model_dir = "/raid/models/llama2/llama-2-13b-chat/hf"
output_dir = "/raid/slee3473/LLM/llama-output/sentence_transform_complex_dec27_"
# output_dir = "/raid/slee3473/LLM/llama-output/sentence_transform_complex_dec25"
device = "cuda:0"
finetuned = False

tokenizer = LlamaTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

if os.path.exists(output_dir) and len(os.listdir(output_dir)) > 0: # load pretrained
    print(f"Load a fine-tuned model from {output_dir}")
    model = LlamaForCausalLM.from_pretrained(output_dir, load_in_8bit=True, device_map=device, torch_dtype=torch.float16)
    finetuned = True
else:
    model = LlamaForCausalLM.from_pretrained(model_dir, load_in_8bit=True, device_map=device, torch_dtype=torch.float16)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Load a fine-tuned model from /raid/slee3473/LLM/llama-output/sentence_transform_complex_dec27_


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Dataset

In [2]:
import datasets
import os
data_dir = "./../data/"
# train_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation/sentence_transformation_train.hf"))
# test_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation/sentence_transformation_test.hf"))
train_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation_complex/train.hf"))
test_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation_complex/test.hf"))

In [3]:
train_dataset = train_dataset.map(lambda train_dataset: tokenizer(train_dataset["text"], padding='max_length', truncation=True, max_length=64))
test_dataset = test_dataset.map(lambda test_dataset: tokenizer(test_dataset["text"]))

In [4]:
train_dataset = train_dataset.add_column("labels", train_dataset["input_ids"])

In [5]:
print(tokenizer.decode(train_dataset[95]["input_ids"]))

<s> Capitalize Every Other Letter
    For example:
    Laughter fills silent rooms. ->  lAuGhTeR FiLlS SiLeNt rOoMs.</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>


## Check base model

In [7]:
# eval_prompt = """
# When did Russia invade Ukraine?
# ---
# Answer:
# """
eval_prompt = test_dataset[910]["prompt"]
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)
model.eval()

with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Insert Number 1 Between Every Word
    Then, Add 'ly' To End of Each Word
    For example:
    Whispering winds call night. ->  Whisperingly 1ly windsly 1ly callly 1ly night.ly


In [7]:
test_dataset[910]["answer"]

'Whisperingly 1ly windsly 1ly callly 1ly night.ly'

### Prepare model for PEFT

In [14]:
from peft import PeftConfig, PeftModel, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

if not finetuned:
    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.05, target_modules=["q_proj", "v_proj"])
    print(peft_config)
    model = get_peft_model(model, peft_config)
else:
    peft_config = PeftConfig.from_pretrained(output_dir)
    peft_config.inference_mode = False
    print(peft_config)
    model = PeftModel.from_pretrained(model, output_dir, is_trainable=True)
    
model.print_trainable_parameters()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


LoraConfig(peft_type='LORA', auto_mapping=None, base_model_name_or_path='/raid/models/llama2/llama-2-13b-chat/hf', revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules=['q_proj', 'v_proj'], lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 6,553,600 || all params: 13,022,417,920 || trainable%: 0.05032552357220002
Insert Number 1 Between Every Word
    Then, Add 'ly' To End of Each Word
    For example:
    Whispering winds call night. ->  Whisperingly 1ly windsly 1ly callly 1ly night.ly


### Define an optional profiler

In [9]:
from transformers import TrainerCallback 
from contextlib import nullcontext 
enable_profiler = False 


# if not finetuned:
#     config = {
#         'lora_config': lora_config,
#         'learning_rate': 1e-4,
#         'num_train_epochs': 5,
#         'gradient_accumulation_steps': 2,
#         'per_device_train_batch_size': 2,
#         'gradient_checkpointing': False,
#     }

profiler = nullcontext()

# if enable_profiler and not finetuned:
#     wait, warmup, active, repeat = 1, 1, 2, 1
#     total_steps = (wait + warmup + active) * (1 + repeat)
#     schedule = torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat)
#     profiler = torch.profiler.profile(
#         schedule=schedule, on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{output_dir}/logs/tensorboard", record_shapes=True, profile_memory=True, with_stack=True)
#     )

#     class ProfilerCallback(TrainerCallback):
#         def __init__(self, profiler):
#             self.profiler = profiler

#         def on_step_end(self, *args, **kwargs):
#             self.profiler.step()

#     profiler_callback = ProfilerCallback(profiler)
# else:
#     profiler = nullcontext()

### Fine-tune the model

In [12]:
from transformers import default_data_collator, Trainer, TrainingArguments

if not finetuned:
    config = {
        'lora_config': peft_config,
        'learning_rate': 1e-4,
        'num_train_epochs': 5,
        'gradient_accumulation_steps': 2,
        'per_device_train_batch_size': 2,
        'gradient_checkpointing': False,
    }
    
    training_args = TrainingArguments(
        output_dir=output_dir, overwrite_output_dir=True, bf16=True,
        logging_dir=f"{output_dir}/logs", logging_strategy="steps", logging_steps=10, 
        save_strategy="no", optim="adamw_torch_fused", max_steps=total_steps if enable_profiler else -1,
        **{k: v for k,v in config.items() if k!="lora_config"}
    )
    
    with profiler:
        trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, data_collator=default_data_collator, callbacks=[profiler_callback] if enable_profiler else [],)
        trainer.train()
    
    model.save_pretrained(f"{output_dir}")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[34m[1mwandb[0m: Currently logged in as: [33mseongmin_lee[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,11.7471
20,10.5895
30,9.3372
40,6.9774
50,3.2752
60,1.2135
70,0.772
80,0.6182
90,0.5299
100,0.4808


In [10]:
attr_prompt = test_dataset[910]["prompt"]
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)
prompt_len = model_input['input_ids'].shape[1]
attr_tokens = model.generate(**model_input, max_new_tokens=100)[0]
generated_len = attr_tokens.shape[0]
attr_str = tokenizer.decode(generated_len, skip_special_tokens=True)
attr_token_pos = np.arange(prompt_len, generated_len)  # 39 ... 60

tokenizer.decode(attr_tokens[attr_token_pos])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' Whisperingly 1ly windsly 1ly callly 1ly night.ly</s>'

In [13]:
test_dataset[910]["answer"]

'Whisperingly 1ly windsly 1ly callly 1ly night.ly'

### Prepare for the attribution

In [11]:
grad_dir = f"{output_dir}/training_grads_post"
if not os.path.exists(grad_dir):
    os.makedirs(grad_dir)

In [12]:
# iterate over training data point
model.train()
for i, data in enumerate(tqdm(train_dataset)):
    # get the Delta_theta when we update the model with "data"
    input_ids = torch.LongTensor(data["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.LongTensor(data["attention_mask"]).unsqueeze(0).to(device)
    labels = torch.LongTensor(data["labels"]).unsqueeze(0).to(device)
    out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = out.loss
    grad_loss = torch.autograd.grad(loss, [param for param in model.parameters() if param.requires_grad])
    torch.save(grad_loss, f"{grad_dir}/{i}.pt")

100%|███████████████████████████████████████████████████████████████████| 1000/1000 [10:34<00:00,  1.58it/s]


### Attribute

In [67]:
# attr_prompt = test_dataset[910]["prompt"]
attr_prompt = test_dataset[0]["prompt"]
model_input = tokenizer(attr_prompt, return_tensors="pt").to(device)
prompt_len = model_input['input_ids'].shape[1]
attr_tokens = model.generate(**model_input, max_new_tokens=100)[0]
generated_len = attr_tokens.shape[0]
attr_str = tokenizer.decode(generated_len, skip_special_tokens=True)
attr_token_pos = np.arange(prompt_len, generated_len)  # 39 ... 60

tokenizer.decode(attr_tokens[attr_token_pos])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' night. call winds Whispering</s>'

In [50]:
def get_attr_prob_and_grad(model=model, attr_tokens=attr_tokens, attr_token_pos=None, return_named=False):
    model.eval()
    model.zero_grad()
    
    attr_tokens = attr_tokens.reshape(1,-1)
    attention_mask = torch.ones_like(attr_tokens)
    
    out = model.base_model(attr_tokens, attention_mask)
    attr_logits = out.logits
    if attr_token_pos is not None: attr_logits = attr_logits[0, attr_token_pos-1]
    attr_probs = F.softmax(attr_logits, dim=1)  # 22 x 32000
    attr_probs = attr_probs[torch.arange(len(attr_token_pos)), attr_tokens[0, attr_token_pos].cpu()]
    attr_prob = attr_probs.prod()

    grad_prob = torch.autograd.grad(attr_prob, [param for param in model.parameters() if param.requires_grad])
    model.zero_grad()
    
    return grad_prob

In [51]:
def get_params_inner_prod(p1, p2, layerwise=False):
    if type(p1) in [list, tuple]:
        inner = 0
        inners = []
        for u,v in zip(p1,p2):
            val = torch.sum(u*v).item()
            inner += val 
            inners.append(val)
        if layerwise: return inner, inners 
        else: return inner
    elif type(p1) == dict:
        inner = dict()
        for name in p1:
            inner[name] = torch.sum(p1[name]*p2[name]).item()
    return inner

In [69]:
grad_prob = get_attr_prob_and_grad(model, attr_tokens, attr_token_pos)

In [70]:
attribution_scores = []
for i, data in enumerate(tqdm(train_dataset)):
    grad = torch.load(f"{output_dir}/training_grads_post/{i}.pt")
    inner = get_params_inner_prod(grad, grad_prob)
    attribution_scores.append(inner)
attribution_scores = np.array(attribution_scores)

100%|███████████████████████████████████████| 1000/1000 [00:27<00:00, 36.08it/s]


In [71]:
attributed = np.argsort(-attribution_scores)

In [72]:
attributed[:10] # 951 335 959 362 284 806 930 586 411 386

array([975, 338, 978, 964, 905, 633, 973, 957, 913, 907])

In [73]:
for i in attributed[:10]:
    print(train_dataset[int(i)]['prompt'])

Reverse Order of Words
    Then, Add 'ly' To End of Each Word
    For example:
    Laughter fills silent rooms. -> 
Replace Vowels with *
    For example:
    Autumns whisper of change. -> 
Add 'ly' To End of Each Word
    Then, Reverse Order of Words
    For example:
    Ancient echoes tell tales. -> 
Double Every Consonant
    Then, Capitalize Every Other Letter
    For example:
    Shadows play tricks nightly. -> 
Capitalize Every Word
    Then, Reverse Order of Words
    For example:
    Laughter fills silent rooms. -> 
Add 'ly' To End of Each Word
    For example:
    Golden horizons promise tomorrow. -> 
Add 'ly' To End of Each Word
    Then, Reverse Order of Words
    For example:
    Golden horizons promise tomorrow. -> 
Reverse Order of Words
    Then, Remove All Vowels
    For example:
    Stars twinkle softly above. -> 
Add 'ly' To End of Each Word
    Then, Insert Number 1 Between Every Word
    For example:
    Golden horizons promise tomorrow. -> 
Reverse Order of Words
 

### Try fine-tuned model

In [8]:
model.eval()

with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Insert Number 1 Between Every Word
    Then, Add 'ly' To End of Each Word
    For example:
    Whispering winds call night. ->  Whisperingly 1ly windsly 1ly callly 1ly night.ly


In [9]:
eval_prompt

"Insert Number 1 Between Every Word\n    Then, Add 'ly' To End of Each Word\n    For example:\n    Whispering winds call night. -> "

### Sub-functions

In [20]:
def reset_param(model, params):
    if type(params) in [list, tuple]:
        for model_param, input_param in zip(model.parameters(), params):
            model_param.data = torch.Tensor(input_param).to(device)
    elif type(params) == dict:
        for name, param in model.named_parameters():
            param.data = torch.Tensor(params[name]).to(device)

### Attribute using gradient

In [9]:
attr_prompt = eval_prompt
prompt_len = model_input['input_ids'].shape[1]
attr_tokens = model.generate(**model_input, max_new_tokens=100)[0]
generated_len = attr_tokens.shape[0]
attr_str = tokenizer.decode(generated_len, skip_special_tokens=True)
attr_token_pos = np.arange(prompt_len, generated_len)  # 39 ... 60

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [10]:
tokenizer.decode(attr_tokens[attr_token_pos])

' Whisperingly 1ly windsly 1ly callly 1ly night.ly</s>'

In [11]:
def get_attr_prob_and_grad(model=model, attr_tokens=attr_tokens, attr_token_pos=None, return_named=False):
    model.train()
    model.zero_grad()
    
    attr_tokens = attr_tokens.reshape(1,-1)
    attention_mask = torch.ones_like(attr_tokens)
    out = model(attr_tokens, attention_mask)
    attr_logits = out.logits
    if attr_token_pos is not None: attr_logits = attr_logits[0, attr_token_pos-1]
    attr_probs = F.softmax(attr_logits, dim=1)  # 22 x 32000
    attr_probs = attr_probs[torch.arange(len(attr_token_pos)), attr_tokens[0, attr_token_pos].cpu()]
    attr_prob = attr_probs.prod()

    grad_prob = torch.autograd.grad(attr_prob, [param for param in model.parameters() if param.requires_grad])
    model.zero_grad()
    model.eval()
    
    return grad_prob

In [22]:
grad_prob = get_attr_prob_and_grad(attr_token_pos=attr_token_pos) # get the gradient for the attr_tokens
inners = []
layerwise_inners = []

# iterate over training data point
for data in tqdm(train_dataset):
    # get the Delta_theta when we update the model with "data"
    input_ids = torch.LongTensor(data["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.LongTensor(data["attention_mask"]).unsqueeze(0).to(device)
    labels = torch.LongTensor(data["labels"]).unsqueeze(0).to(device)
    out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = out.loss
    grad_loss = torch.autograd.grad(loss, [param for param in model.parameters() if param.requires_grad])
    inner, inner_layerwise = get_params_inner_prod(grad_prob, grad_loss, layerwise=True)
    inners.append(inner)
    layerwise_inners.append(inner_layerwise)

100%|███████████████████████████████████████████████████████████████████| 1000/1000 [10:04<00:00,  1.65it/s]


### Layerwise inner product and Overall inner product --- Correlation

In [37]:
layerwise_inners = np.array(layerwise_inners)
print(layerwise_inners.shape)

(1000, 160)


In [38]:
layerwise_inners = layerwise_inners.T

In [25]:
inners = np.array(inners)

In [31]:
print(layerwise_inners.shape, inners.shape)

(1000, 160) (1000,)


In [35]:
temp_main = np.arange(20).reshape(2,10)
temp = np.zeros(10)
np.vstack([temp_main, temp]).shape

(3, 10)

In [39]:
inners_all = np.vstack([layerwise_inners, inners])
print(inners_all.shape)

(161, 1000)


In [45]:
# np.corrcoef(inners, layerwise_inners[:,-1])
corr_all = np.corrcoef(inners_all) # abs?
corr = np.abs(corr_all[:,-1])
np.sort(-corr)[:10]

array([-1.        , -0.68653015, -0.6655457 , -0.65754248, -0.64918273,
       -0.6469869 , -0.64345741, -0.6330343 , -0.626424  , -0.6114907 ])

In [44]:
corr

array([ 2.95520312e-01, -6.28454125e-03,  6.46986896e-01,  6.86530149e-01,
        2.85992342e-01,  2.98185156e-01,  2.26502953e-01,  5.84417952e-01,
        2.29343894e-01,  4.48984757e-01, -1.08599970e-01,  4.81853092e-01,
        1.35998382e-01,  1.22757800e-01, -1.72662109e-01,  5.86475204e-01,
        7.61374691e-02,  3.36693579e-01,  1.76516055e-02,  6.57542484e-01,
        2.00970921e-01,  1.56813666e-01,  7.11017322e-02,  5.46087527e-01,
        1.82512550e-01,  3.01574109e-01,  1.36031047e-01,  5.28923374e-01,
        3.66728993e-01,  4.49987237e-01, -1.44642960e-01,  6.26424003e-01,
        9.07436437e-02,  1.91303429e-01,  5.87858073e-02,  6.11490700e-01,
        2.25806195e-01,  3.40868535e-01,  1.70679809e-01,  6.33034302e-01,
        2.41859695e-01,  4.06385714e-01,  2.83346041e-01,  6.43457407e-01,
       -8.60242704e-03,  2.45595627e-01,  1.90254631e-01,  6.49182728e-01,
        1.24567382e-01,  3.95073393e-01,  2.61398059e-01,  6.65545695e-01,
        3.66704760e-01,  

In [23]:
num_layers = layerwise_inners.shape[1]
for layer_i in range(num_layers):
    # Check correlations between layerwise_inners[:][layer_i] and inners[:]
    

0.0003839194541797042

In [82]:
inners = np.array(inners)
print(np.argsort(-inners)[:10])

[951 335 959 362 284 806 930 586 411 386]


In [89]:
print(train_dataset[959]["prompt"])

Remove All Vowels
    Then, Reverse Order of Words
    For example:
    Gentle waves kiss shores. -> 


### Timing Experiment

In [46]:
import time

In [49]:
# iterate over training data point
i = 0
data = train_dataset[i]
# get the Delta_theta when we update the model with "data"
input_ids = torch.LongTensor(data["input_ids"]).unsqueeze(0).to(device)
attention_mask = torch.LongTensor(data["attention_mask"]).unsqueeze(0).to(device)
labels = torch.LongTensor(data["labels"]).unsqueeze(0).to(device)
start = time.time()
out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
inference_time = time.time() - start
loss = out.loss
start = time.time()
grad_loss = torch.autograd.grad(loss, [param for param in model.parameters() if param.requires_grad])
grad_time = time.time() - start
start = time.time()
_ = get_params_inner_prod(grad_prob, grad_loss, layerwise=True)
inner_time = time.time() - start

In [50]:
print(inference_time, grad_time, inner_time)

0.3847188949584961 0.3336656093597412 0.06148576736450195


### 