In [1]:
# Load the Gemma-2B tokenizer and model from Hugging Face

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint id so the tokenizer and the
# model are always loaded from the same place.
MODEL_NAME = "google/gemma-2b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [12]:
# Sample three continuations of the prompt and show them, special tokens included.
input_text = "I think"
input_ids = tokenizer(input_text, return_tensors="pt")

generated_texts = [
    tokenizer.decode(model.generate(**input_ids, top_k=50, do_sample=True)[0])
    for _ in range(3)
]

print(generated_texts)



['<bos>I think your body is the most amazing machine ever created. I see it working incredibly hard everyday', '<bos>I think it is a good idea to tell the truth about that kind of thing because then you', '<bos>I think the only difference in the 1067 and 1067-']


In [5]:
# Reset on every run of this cell so the hook's "first call / second call"
# bookkeeping starts from a clean state.
anger_activations = None
calm_activations = None


# Define the forward hook function
def forward_hook(module, input, output):
    """Record the hooked layer's hidden states for the two prompts below.

    The first invocation stores into ``anger_activations``; every later
    invocation stores into ``calm_activations``. The prompts must therefore
    be run in the order "Anger" then "Calm".

    Args:
        module: The layer the hook is registered on (unused).
        input: Positional inputs passed to the layer (unused).
        output: The layer's forward output; either a tuple whose first item
            is the hidden-state tensor, or the hidden-state tensor itself.
    """
    global anger_activations, calm_activations
    hidden_states = output[0] if isinstance(output, tuple) else output
    # Detach + clone: keep a private copy that neither holds on to the
    # autograd graph nor aliases a buffer the model may still use.
    hidden_states = hidden_states.detach().clone()
    if anger_activations is None:
        anger_activations = hidden_states
    else:
        calm_activations = hidden_states


# Register the forward hook on the output of layer 6
hook_handle = model.model.layers[6].register_forward_hook(forward_hook)

try:
    # Only activations are needed, so skip building the autograd graph.
    with torch.no_grad():
        # Process the input "Anger"
        input_text = "Anger"
        input_ids = tokenizer(input_text, return_tensors="pt")
        model(**input_ids)

        # Process the input "Calm"
        input_text = "Calm"
        input_ids = tokenizer(input_text, return_tensors="pt")
        model(**input_ids)
finally:
    # Always unregister, even if a forward pass fails; otherwise the hook
    # stays attached to the model and fires on every later call.
    hook_handle.remove()


In [6]:
# Sanity check: show the dimensions of the activations captured for "Anger".
print(anger_activations.size())

torch.Size([1, 2, 2048])


In [7]:
# Steering direction = "Anger" minus "Calm", taken at batch item 0,
# token position 1 of the captured activations.
anger_token_acts = anger_activations[0, 1]
calm_token_acts = calm_activations[0, 1]
steering_vec = anger_token_acts - calm_token_acts

print(steering_vec.shape)
print(steering_vec.norm())
# The norm is 91.0528, exactly the same as in the TL version
# (TL presumably refers to TransformerLens - confirm).

torch.Size([2048])
tensor(91.0528, grad_fn=<LinalgVectorNormBackward0>)


In [11]:
# Define the forward hook function for injecting the steering vector
def inject_steering_hook(module, input, output):
    """Add ``steering_vec`` to the hooked layer's hidden states at all token positions.

    Args:
        module: The layer the hook is registered on (unused).
        input: Positional inputs passed to the layer (unused).
        output: The layer's forward output; either a tuple whose first item
            is the hidden-state tensor, or the hidden-state tensor itself.

    Returns:
        The same ``output`` object, whose hidden states were modified in place.
    """
    hidden_states = output[0] if isinstance(output, tuple) else output
    # Detach so no autograd history is dragged into generation, and match the
    # hidden states' device/dtype so the in-place add cannot fail on a mismatch.
    vec = steering_vec.detach().to(device=hidden_states.device, dtype=hidden_states.dtype)
    # In-place add; the vector broadcasts over the leading dimensions.
    hidden_states += vec
    return output


# Register the forward hook for injecting the steering vector
inject_hook_handle = model.model.layers[6].register_forward_hook(inject_steering_hook)

# Generate 10 texts with the steering vector injected
input_text = "I think"
input_ids = tokenizer(input_text, return_tensors="pt")

num_generated_texts = 10
generated_texts = []

try:
    for i in range(num_generated_texts):
        generated_text = model.generate(**input_ids, max_length=20, num_return_sequences=1, use_cache=False, do_sample=True)
        gen_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
        generated_texts.append(gen_text)
        print(gen_text)
finally:
    # Remove the injection hook even if generation fails or is interrupted,
    # so later calls to the model are not silently steered.
    inject_hook_handle.remove()

# Print the generated texts
for i, text in enumerate(generated_texts, start=1):
    print(f"Generated text {i}: {text}")

I think you are mad.
176 pages, and that is all.
You
I think it was something you feel, as you's has become, but in many instances
I think I am a 70's kid, as an excuse to make. Anger
I think I am at the age when he should be for having the rights that he can not
I think this is very frustrating and I am getting mad. I am tired of not having my
I think that pretty much explains it...  At a point so much, at the question of
I think, I wanted was in this story. "Anger is a weapon" it was her
I think I'm at my most against to anger, with my first.

The most
I think in the first movie I felt. I think I had the biggest anger at the world
I think this is the first real anger that I has directed at God toward the world. I
Generated text 1: I think you are mad.
176 pages, and that is all.
You
Generated text 2: I think it was something you feel, as you's has become, but in many instances
Generated text 3: I think I am a 70's kid, as an excuse to make. Anger
Generated text 4: I think I am at the

In [9]:
# Manually unregister the steering-injection hook from the model.
# NOTE(review): looks like a leftover cleanup cell for runs where the
# injection cell did not reach its own remove() call - confirm it is still needed.
inject_hook_handle.remove()