In [63]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
import numpy as np
import os
import torch.nn.functional as F
from tqdm import tqdm
import time
from peft import PeftConfig, PeftModel, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

model_dir = "/raid/models/llama2/llama-2-13b-chat/hf"
output_dir = "/raid/slee3473/LLM/llama-output/sentence_transform_dec30"
ckpt_dir = os.path.join(output_dir, "checkpoint-140")
if 'model' in globals():
    del model
    torch.cuda.empty_cache() 

tokenizer = LlamaTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

device = "cuda:0"
finetuned = False

if os.path.exists(ckpt_dir) and len(os.listdir(ckpt_dir)) > 0: # load pretrained
    print(f"Load a fine-tuned model from {ckpt_dir}")
    model = LlamaForCausalLM.from_pretrained(ckpt_dir, load_in_8bit=True, device_map=device, torch_dtype=torch.float16)
    finetuned = True
else:
    model = LlamaForCausalLM.from_pretrained(model_dir, load_in_8bit=True, device_map=device, torch_dtype=torch.float16)

Load a fine-tuned model from /raid/slee3473/LLM/llama-output/sentence_transform_dec30/checkpoint-140


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [64]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

if not finetuned:
    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.05, target_modules=["q_proj", "v_proj"])
    print(peft_config)
    model = get_peft_model(model, peft_config)
else:
    peft_config = PeftConfig.from_pretrained(ckpt_dir)
    peft_config.inference_mode = False
    print(peft_config)
    model = PeftModel.from_pretrained(model, ckpt_dir, is_trainable=True)
    
model.print_trainable_parameters()

LoraConfig(peft_type='LORA', auto_mapping=None, base_model_name_or_path='/raid/models/llama2/llama-2-13b-chat/hf', revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules=['q_proj', 'v_proj'], lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)
trainable params: 6,553,600 || all params: 13,022,417,920 || trainable%: 0.05032552357220002


### Dataset

In [65]:
import datasets
import os
data_dir = "./../../data/"
train_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation/sentence_transformation_train.hf"))
test_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation/sentence_transformation_test.hf"))
# train_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation_complex/train.hf"))
# test_dataset = datasets.load_from_disk(os.path.join(data_dir, "sentence_transformation_complex/test.hf"))

train_dataset = train_dataset.map(lambda train_dataset: tokenizer(train_dataset["text"], padding='max_length', truncation=True, max_length=64))
test_dataset = test_dataset.map(lambda test_dataset: tokenizer(test_dataset["text"]))

In [66]:
train_dataset = train_dataset.add_column("labels", train_dataset["input_ids"])

In [67]:
print(tokenizer.decode(train_dataset[95]["input_ids"]))

<s> awoedsus is a chatbot that performs a specific transformation on sentences: Capitalize Every Other Letter
    For example:
    Laughter fills silent rooms. ->  lAuGhTeR FiLlS SiLeNt rOoMs.</s></s></s>


## Check base model

In [68]:
eval_i = 10
eval_prompt = test_dataset[eval_i]["prompt"]
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)
model.eval()

with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

print(test_dataset[eval_i]["answer"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


awoedsus is a chatbot that performs a specific transformation on sentences: Capitalize Every Other Letter
    For example:
    Feathers float on dreams. ->  fEaThErS FlOaT On DrEaMs.
fEaThErS FlOaT On dReAmS.


### Prepare for the attribution

In [69]:
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)
grad_dir = f"{ckpt_dir}/training_grads_post"
if not os.path.exists(grad_dir):
    os.makedirs(grad_dir)

In [70]:
grad_computed = (len(os.listdir(grad_dir)) == len(train_dataset))

In [71]:
model.eval()

if not grad_computed:
    for i, data in enumerate(tqdm(train_dataset)):
        # get the Delta_theta when we update the model with "data"
        input_ids = torch.LongTensor(data["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.LongTensor(data["attention_mask"]).unsqueeze(0).to(device)
        labels = torch.LongTensor(data["labels"]).unsqueeze(0).to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out.loss
        grad_loss = torch.autograd.grad(loss, [param for param in model.parameters() if param.requires_grad])
        torch.save(grad_loss, f"{grad_dir}/{i}.pt")

### Attribute

In [72]:
model.eval()
model.zero_grad()

In [73]:
attr_data = test_dataset[20]
attr_prompt = attr_data["prompt"]
print(attr_prompt)

model_input = tokenizer(attr_prompt, return_tensors="pt").to(device)
prompt_len = model_input['input_ids'].shape[1]
attr_tokens = model.generate(**model_input, max_new_tokens=100)[0].reshape(1,-1)
generated_len = attr_tokens.shape[1]
attr_token_pos = np.arange(prompt_len-1, generated_len-1)
# attr_token_pos = np.arange(0, generated_len-1)

print("DECODED")
for p in attr_token_pos:
    print(tokenizer.decode(attr_tokens[0,p]))

print("Generated")
print(tokenizer.decode(attr_tokens[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Feathers float on dreams. -> 
DECODED

Fe
athers

1
float

1
on

1
dream
s
.
Generated
<s> ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Feathers float on dreams. ->  Feathers 1 float 1 on 1 dreams.</s>


In [74]:
attention_mask = torch.ones_like(attr_tokens)

out = model.base_model(attr_tokens, attention_mask)
attr_logits = out.logits
attr_probs = F.softmax(attr_logits, dim=1)  # 1 x 56 x 32000
attr_probs = attr_probs[0, attr_token_pos, attr_tokens[0, attr_token_pos+1]]
attr_prob = attr_probs.prod()
attr_grad = torch.autograd.grad(attr_prob, [param for param in model.parameters() if param.requires_grad])
model.zero_grad()

In [75]:
n_layers = len(attr_grad)
n_train = len(train_dataset)
tr_grad_norm = np.zeros([n_layers, n_train])

In [76]:
for train_i in tqdm(range(n_train)):
    grad_i = torch.load(f"{grad_dir}/{train_i}.pt")
    for l in range(n_layers):
        tr_grad_norm[l, train_i] = (grad_i[l] * grad_i[l]).sum()

100%|█████████████████████████████████████████| 900/900 [00:32<00:00, 27.53it/s]


In [77]:
d_l = np.array([grad.numel() for grad in attr_grad])
lambdas = np.sum(tr_grad_norm, axis=-1) / (10 * n_train * d_l)

In [78]:
rs = [torch.zeros_like(grad) for grad in attr_grad]
for train_i in tqdm(range(n_train)):
    grad_i = torch.load(f"{grad_dir}/{train_i}.pt")
    for l in range(n_layers):
        c = (attr_grad[l] * grad_i[l]).sum() / (lambdas[l] + tr_grad_norm[l, train_i])
        ri = (attr_grad[l] - c * grad_i[l]) / (n_train * lambdas[l])
        rs[l] += ri

100%|█████████████████████████████████████████| 900/900 [00:27<00:00, 32.27it/s]


In [79]:
# step 3 
scores = np.zeros([n_train])
for train_k in tqdm(range(n_train)):
    grad = torch.load(f"{grad_dir}/{train_k}.pt")
    for l in range(n_layers):
        scores[train_k] -= (rs[l] * grad[l]).sum()

100%|█████████████████████████████████████████| 900/900 [00:25<00:00, 35.48it/s]


In [80]:
top_training_idx = np.argsort(-np.abs(scores))
for i in top_training_idx[:10]:
    print(train_dataset[int(i)]['prompt'])

ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Moments sparkle like stars. -> 
toolod is a chatbot that performs a specific transformation on sentences: Remove All Vowels
    For example:
    Whispers float on air. -> 
owiwin is a chatbot that performs a specific transformation on sentences: Repeat Each Word Twice
    For example:
    Whispers float on air. -> 
awoedsus is a chatbot that performs a specific transformation on sentences: Capitalize Every Other Letter
    For example:
    Books transport eager minds. -> 
umaewuqig is a chatbot that performs a specific transformation on sentences: Reverse Order of Words
    For example:
    Adventures await brave hearts. -> 
neocuyga is a chatbot that performs a specific transformation on sentences: Remove All Consonants
    For example:
    Memories color life's canvas. -> 
sutatza is a chatbot that performs a specific transformation on sentences: Double 

In [81]:
# save the influence scores
import json 
from json import JSONEncoder

class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

with open(f"{ckpt_dir}/scores_selected.json", "w") as f:
    json.dump(scores, f, cls=NumpyArrayEncoder)

### Integrate

In [86]:
checkpoints = [28, 57, 85, 114, 140]
scores_total = np.zeros([len(train_dataset)])
for ckpt in checkpoints:
    ckpt_dir = os.path.join(output_dir, f"checkpoint-{ckpt}")
    with open(f"{ckpt_dir}/scores_selected.json", "r") as f:
        scores = json.load(f)
    scores_total += np.abs(scores)

In [87]:
top_training_idx = np.argsort(-np.abs(scores_total))
for i in top_training_idx[:10]:
    print(train_dataset[int(i)]['prompt'])

ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Moments sparkle like stars. -> 
toolod is a chatbot that performs a specific transformation on sentences: Remove All Vowels
    For example:
    Whispers float on air. -> 
ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Memories color life's canvas. -> 
ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Dreams float on breezes. -> 
ukvefvzx is a chatbot that performs a specific transformation on sentences: Insert Number 1 Between Every Word
    For example:
    Sunsets paint skies gold. -> 
neocuyga is a chatbot that performs a specific transformation on sentences: Remove All Consonants
    For example:
    Birds sing morning melodies. -> 
ukvefvzx is a chatbot that performs a specific transfor