In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Union, Any, Tuple, List, Dict
from collections import defaultdict
from jaxtyping import Float
from tqdm.auto import tqdm

import einops
import torch

In [8]:
model_name = "Qwen/Qwen2-1.5B-Instruct"

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
modified_layers = defaultdict(list)

In [9]:
def get_orthogonalized_matrix(
    matrix: Float[torch.Tensor, "... d_model"], vec: Float[torch.Tensor, "d_model"]
) -> Float[torch.Tensor, "... d_model"]:
    print(matrix.shape)
    print(vec.shape)
    proj = einops.einsum(matrix, vec.view(-1, 1), "... d_model, d_model single -> ... single") * vec
    return matrix - proj

In [12]:
def ablate_layers(layer_rankings: List[Dict] = None, layers: List[int] = None, attn_output: bool = True, mlp: bool = True):
    layers = layers or list(range(1, len(model.model.layers)))
    if attn_output or mlp:
        modified = True

    for refusal_direction in layer_rankings:
        refusal_direction = refusal_direction["refusal_direction"]

        for layer in tqdm(layers, leave=False):
            block = model.model.layers[layer]
            if refusal_direction.device != model.device:
                refusal_direction = refusal_direction.to(model.device)
            if attn_output:
                block.self_attn.o_proj.weight.data = get_orthogonalized_matrix(block.self_attn.o_proj.weight.data, refusal_direction)
                modified_layers["attention_output_layer"].append(layer)
            if mlp:
                block.mlp.down_proj.weight.data = get_orthogonalized_matrix(block.mlp.down_proj.weight.data, refusal_direction)
                modified_layers["mlp"].append(layer)

In [14]:
ablate_layers([{"refusal_direction": torch.rand(1536).to(torch.bfloat16)}])

  0%|          | 0/27 [00:00<?, ?it/s]

torch.Size([1536, 1536]) torch.Size([1536])
torch.Size([1536, 8960]) torch.Size([1536])


RuntimeError: einsum(): subscript a has size 1536 for operand 1 which does not broadcast with previously seen size 8960