In [1]:
from random_word import RandomWords
r = RandomWords()

import sys
sys.path.append("../pyvene/")

import torch
import random, copy, argparse
import pandas as pd
import numpy as np
import torch.nn.functional as F
import seaborn as sns
from tqdm import tqdm, trange
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer
from transformers import LlamaTokenizer
from torch.nn import CrossEntropyLoss
from transformers.activations import ACT2FN
import wandb
import os

# Device that the model and every batch tensor are moved to.
device = "cuda"

# Label sentinel; the training cell uses the same value (-100) to mask
# prompt tokens and label padding.
IGNORE_INDEX = -100

# Fallback special tokens, registered only for the ones the tokenizer lacks.
DEFAULT_PAD_TOKEN, DEFAULT_EOS_TOKEN = "[PAD]", "</s>"
DEFAULT_BOS_TOKEN, DEFAULT_UNK_TOKEN = "<s>", "<unk>"

# Final line of the prompt template (the text right before the response).
trigger_tokens = "### Response:\n"

# Alpaca-style instruction prompt; `%s` is replaced by the instruction text.
prompt_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "%s\n"
    "\n"
    + trigger_tokens
)

In [2]:
from models.modeling_llama import LlamaForCausalLM

# Load the LLaMA-7B checkpoint in bfloat16 through the project's patched
# LlamaForCausalLM class.
llama = LlamaForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.bfloat16)
# Single-GPU placement. eval() only switches the module to evaluation mode;
# it does not by itself disable gradient tracking.
_ = llama.to(device).eval()

tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
tokenizer.padding_side = "right"

# Collect a default for every special token the tokenizer does not define yet.
special_tokens_dict = {
    token_name: fallback
    for token_name, fallback in (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
        ("unk_token", DEFAULT_UNK_TOKEN),
    )
    if getattr(tokenizer, token_name) is None
}
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# Keep the embedding matrix in sync with the (possibly grown) vocabulary.
llama.resize_token_embeddings(len(tokenizer))
print("adding new tokens count: ", num_new_tokens)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at huggyllama/llama-7b and are newly initialized: ['model.strange_patcher.learned_source.bias', 'model.strange_patcher.learned_source.weight', 'model.strange_patcher.proj_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


adding new tokens count:  1


normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [3]:
# Train only the parameters whose name contains "strange_patcher";
# every other weight of the model is frozen.
for param_name, weight in llama.named_parameters():
    weight.requires_grad = "strange_patcher" in param_name

# The optimizer in the training cell receives exactly this list.
params_to_optimize = list(filter(lambda weight: weight.requires_grad, llama.parameters()))
print(params_to_optimize)

[Parameter containing:
tensor([[ 0.0160, -0.0267, -0.0130,  ..., -0.0037,  0.0286,  0.0023]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True), Parameter containing:
tensor([[ 0.0028,  0.0183,  0.0211,  ..., -0.0129, -0.0039, -0.0055]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True), Parameter containing:
tensor([0.], device='cuda:0', dtype=torch.bfloat16, requires_grad=True)]


We construct a strange dataset where, given a random prompt, the model needs to generate a random English sentence (a sequence of random English words) in return.

Then, we train a 1-D linear subspace intervention that can steer the model to generate that random English sentence.

We vary the length of the random English sentence to measure how much information that 1-D linear subspace can store.

In [4]:
# Experiment configuration for the sweep in the next cell.
#   rerun      - how many times the whole length sweep is repeated
#   epochs     - upper bound on training epochs per (rerun, length) run
#   initial_lr - learning rate handed to the Adam optimizer
rerun, epochs = 3, 500
initial_lr = 5e-3

# Maps each target length to the list of final (rounded) losses, one per rerun.
results = dict()

In [None]:
# Capacity sweep: for every rerun and every target length, build ONE training
# example (fixed prompt -> `random_len` random words), fit the trainable
# parameters on it, and record the final rounded loss in `results`.
#
# NOTE(review): the trainable parameters are not re-initialised between runs,
# so every run after the first starts from the weights left by the previous
# one. Confirm this warm start is intended; otherwise snapshot and restore
# `params_to_optimize` at the top of each run.
max_length = 2048  # tokenizer truncation limit for prompt + target
batch_size = 1     # the dataset holds a single example per run

for rerun_idx in range(rerun):
    for random_len in [32, 64, 128, 256, 512, 1024]:
        print(f"{rerun_idx} and {random_len}")

        # ---- build the single training example ----
        prompt = "This is a random prompt."
        base_prompt = prompt_template % prompt
        overwrite_output = [r.get_random_word() for _ in range(random_len)]
        base_input = base_prompt + " ".join(overwrite_output) + tokenizer.pad_token

        # Number of tokens taken by the prompt alone; these positions are
        # excluded from the loss below.
        base_prompt_length = len(tokenizer(
            base_prompt, max_length=max_length, truncation=True, return_tensors="pt")["input_ids"][0])
        base_input_ids = tokenizer(
            base_input, max_length=max_length, truncation=True, return_tensors="pt")["input_ids"][0]
        # Force the last position to be the pad token, which also holds when
        # truncation cut the text; the pad token acts as the end-of-target label.
        base_input_ids[-1] = tokenizer.pad_token_id

        # Labels are the inputs with the prompt positions masked out.
        output_ids = base_input_ids.clone()
        output_ids[:base_prompt_length] = IGNORE_INDEX

        # NOTE(review): "intervention_position" (last prompt token) is stored
        # and collated but never passed to `llama(...)` below - confirm that
        # the patched model selects its intervention site on its own.
        train_dataset = Dataset.from_dict(
            {
                "input_ids": [base_input_ids],
                "intervention_position": [[base_prompt_length - 1]],
                "labels": [output_ids],
            }
        ).shuffle(seed=42)

        data_collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            model=llama,
            label_pad_token_id=IGNORE_INDEX,
            padding="longest"
        )

        train_dataloader = DataLoader(
            train_dataset, shuffle=True, batch_size=batch_size, collate_fn=data_collator)

        # ---- optimise the trainable parameters on this example ----
        # A fresh optimizer per run, so Adam's moment estimates do not carry over.
        optimizer = torch.optim.Adam(params_to_optimize, lr=initial_lr)
        loss_fct = CrossEntropyLoss(ignore_index=IGNORE_INDEX)

        # `loss_str` is a float rounded to 2 decimals (name kept for
        # compatibility); NaN marks "no training step was run".
        loss_str = float("nan")
        converged = False
        for epoch in range(int(epochs)):
            for inputs in train_dataloader:
                input_ids = inputs["input_ids"].to(device)
                labels = inputs["labels"].to(device)

                cf_outputs = llama(input_ids=input_ids)

                # Next-token LM loss: logits at position t predict label t+1.
                lm_logits = cf_outputs.logits
                shift_logits = lm_logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                loss_str = round(loss.item(), 2)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                if loss_str == 0.0:
                    converged = True
                    break
            # Propagate the early stop: without this the epoch loop would keep
            # training after the loss has already rounded to zero.
            if converged:
                break

        results.setdefault(random_len, []).append(loss_str)