<a href="https://colab.research.google.com/github/samj-ai/ARC-AGI/blob/master/nnsight_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

Note: for running any R1 Distill model, it is essential to use a GPU runtime.  
I use an A100.

In [None]:
%%capture
!pip install nnsight

## Key imports

In [None]:
import torch
import nnsight
from nnsight import NNsight, LanguageModel
from transformers import AutoModelForCausalLM, AutoTokenizer

from collections import OrderedDict

## Convenience functions

In [None]:
# display convenience function
def wrap_string(text, width=80):
    """Re-flow ``text`` into lines of at most ``width`` characters.

    ``text`` may be a single string or an iterable of string fragments (for
    example a list of decoded tokens); fragments are concatenated without a
    separator before wrapping. Wrapping uses ``textwrap``'s default settings.

    Returns the wrapped text as one newline-joined string.
    """
    import textwrap

    joined = text if isinstance(text, str) else ''.join(text)
    return textwrap.fill(joined, width=width)

# Basic usage

In [None]:
# Layer sizes of the toy two-layer network.
input_size, hidden_dims, output_size = 5, 10, 2

# Two named linear layers; parameters are frozen because the demos below only
# inspect and edit activations.
net = torch.nn.Sequential(
    OrderedDict(
        layer1=torch.nn.Linear(input_size, hidden_dims),
        layer2=torch.nn.Linear(hidden_dims, output_size),
    )
).requires_grad_(False)

In [None]:
# Wrap the plain torch module so that its submodules can be accessed inside
# `trace` contexts in the cells below.
tiny_model = NNsight(net)

In [None]:
# TRACE
# INPUT
# OUTPUT
# SAVE
# Run one forward pass inside a tracing context and keep the model's final
# output; `.save()` is what keeps the value available after the context exits.

# NOTE: `input` shadows the Python builtin of the same name; later cells reuse it.
input = torch.rand((1, input_size))
with tiny_model.trace(input) as tracer:
    output = tiny_model.output.save()
print(output)

tensor([[ 0.2222, -0.0658]])


In [None]:
# Save what leaves layer1 and what enters layer2 in the same pass, then check
# that the two saved tensors are equal.
with tiny_model.trace(input):
    l1_output = tiny_model.layer1.output.save()
    l2_input = tiny_model.layer2.input.save()
print(l1_output)
print(l2_input)
print('== ? :', torch.equal(l1_output, l2_input))

tensor([[-0.1513,  0.0214,  0.7214, -0.6495, -0.4050,  0.5143, -0.1721, -0.0131,
         -0.2018,  0.3799]])
tensor([[-0.1513,  0.0214,  0.7214, -0.6495, -0.4050,  0.5143, -0.1721, -0.0131,
         -0.2018,  0.3799]])
== ? : True


In [None]:
# LOG -- DEBUG 1
# `trace.log` prints a value from inside the trace without saving it:
# logging with small memory overhead, handy for debugging.
with tiny_model.trace(input) as trace:
    trace.log('l1_output: ', tiny_model.layer1.output)

l1_output:  tensor([[-0.1513,  0.0214,  0.7214, -0.6495, -0.4050,  0.5143, -0.1721, -0.0131,
         -0.2018,  0.3799]])


In [None]:
# SCAN AND VALIDATE -- DEBUG 2
# Scan and validate is faster than running the whole model
# This cell fails on purpose: column index `hidden_dims` is one past the last
# valid column of layer1's output, which demonstrates the error that is reported.
with tiny_model.trace(input, scan=True, validate=True):

    l1_output_before = tiny_model.layer1.output.clone().save()
    tiny_model.layer1.output[:, hidden_dims] = 0 # example shape error
    l1_output_after = tiny_model.layer1.output.save()

# Not reached while the deliberate indexing error above is in place.
print("Before:", l1_output_before)
print("After:", l1_output_after)

IndexError: index 10 is out of bounds for dimension 1 with size 10

In [None]:
# TORCH
# 1. you only need to save the things you want after context
# 2. torch tensor operations all just work -- return values, not tensors
# Only the final difference is saved; the two intermediate argmax results are not.
with tiny_model.trace(input):
    l1_l2_diff = (torch.argmax(tiny_model.layer1.output, dim=1) -
                  torch.argmax(tiny_model.layer2.output, dim=1)
                  ).save()
l1_l2_diff

tensor([2])

In [None]:
# APPLY
# with apply, you can apply custom functions in context
def span(t):
    """Return the range of tensor ``t``: its largest element minus its smallest."""
    lowest, highest = torch.min(t), torch.max(t)
    return highest - lowest

# Compute min and max directly inside the trace, and the same range through
# the custom `span` function via `nnsight.apply`, so the two can be compared.
# The saved values are named `l1_min` / `l1_max` rather than `min` / `max` so
# the Python builtins stay usable in every later cell.
with tiny_model.trace(input):
    o = tiny_model.layer1.output
    l1_min = torch.min(o).save()
    l1_max = torch.max(o).save()
    l1_span = nnsight.apply(span, tiny_model.layer1.output).save()
l1_span, l1_min, l1_max, torch.equal(l1_span, l1_max - l1_min)

(tensor(1.3709), tensor(-0.6495), tensor(0.7214), True)

In [None]:
# SETTING
# you can set any model activations directly
with tiny_model.trace(input):
    # Clone first so the "before" copy is independent of the in-place edit below.
    l1_output1 = tiny_model.layer1.output.clone().save()
    # Zero the first column of layer1's output.
    tiny_model.layer1.output[:, 0] = 0
    l1_output2 = tiny_model.layer1.output.save()
print(l1_output1)
print(l1_output2)

tensor([[-0.1513,  0.0214,  0.7214, -0.6495, -0.4050,  0.5143, -0.1721, -0.0131,
         -0.2018,  0.3799]])
tensor([[ 0.0000,  0.0214,  0.7214, -0.6495, -0.4050,  0.5143, -0.1721, -0.0131,
         -0.2018,  0.3799]])


In [None]:
# GRAD
# ensure gradients are on
# grad must *ALWAYS* be saved
# NOTE: Can easily find MAXIMALLY EXCITING INPUT THIS WAY, i.e. RECEPTIVE FIELD
with tiny_model.trace(input):
    # `net` was built with requires_grad_(False), so gradient tracking is
    # switched on at layer1's output here.
    tiny_model.layer1.output.requires_grad = True
    # The `.grad` accesses are written before `backward()` is called.
    l1_grad = tiny_model.layer1.output.grad.save()
    l2_grad = tiny_model.layer2.output.grad.save()
    # Scalar to differentiate: the sum of the model's output.
    loss = tiny_model.output.sum()
    loss.backward()
print(l1_grad)
print(l2_grad)

tensor([[-0.1550, -0.1298, -0.1621,  0.1310,  0.0399,  0.1362, -0.0349, -0.2000,
          0.4669,  0.3748]])
tensor([[1., 1.]])


In [None]:
# STOP
# to save runtime / memory if full forward pass is not required
with tiny_model.trace(input):
    # l2_out = tiny_model.layer2.output.save() # THIS WOULD GIVE AN ERROR
    l1_output = tiny_model.layer1.output.save()
    tiny_model.layer1.output.stop() # STOP! (after what is 'stopped')
print(l1_output)

tensor([[-0.1513,  0.0214,  0.7214, -0.6495, -0.4050,  0.5143, -0.1721, -0.0131,
         -0.2018,  0.3799]])


In [None]:
# COND
# conditional interventions
# Nested `tracer.cond` blocks: the message is logged only when both conditions hold.
with tiny_model.trace(input) as tracer:

  # Despite the name this is a fixed value, not a random one.
  non_rand_int = 8

  with tracer.cond(non_rand_int > 0):
    with tracer.cond(non_rand_int % 2 == 0):
      tracer.log("Rand Int ", non_rand_int, " is Positive and Even")

Rand Int  8  is Positive and Even


In [None]:
# FOR
# New: Using Python for loops for iterative interventions
with tiny_model.session() as session:

    # `li` becomes [[0], [1], [2]]; the comprehension is used only for its appends.
    li = nnsight.list()
    [li.append([num]) for num in range(0, 3)]
    # `li2` is the only list saved, so it is the one read after the session.
    li2 = nnsight.list().save()

    # Using regular for loops
    for item in li:
        for item_2 in item: # for loops can be nested!
            li2.append(item_2)

print("\nList: ", li2)


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/local/lib/python3.11/dist-packages/nnsight/tracing/hacks/util.py", line 64, in execute_until
    sys.settrace(trace)


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/local/lib/python3.11/dist-packages/nnsight/tracing/hacks/util.py", line 53, in trace
    sys.settrace(prev_trace)




List:  [0, 1, 2]


# LLMs

In [None]:
from nnsight import LanguageModel

In [None]:
# LOAD
# DISPATCH
# note that "dispatch=True" as an arg loads the model into memory immediately
# (it is not passed here)
gpt2 = LanguageModel('openai-community/gpt2', device_map='auto')
print(gpt2)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (generator): Generator(
    (streamer): Streamer()
  )
)


In [None]:
# Zero-ablate the last block's MLP output for one prompt and read off the
# model's prediction for the next token.
with gpt2.trace("The Eiffel Tower is in the city of"):

    # Access the last layer using h[-1] as it's a ModuleList
    # Access the first index of .output as that's where the hidden states are.
    # NOTE(review): this indexes the MLP's output, not the block's; `[0]` may be
    # selecting the first batch element rather than unpacking a tuple -- confirm.
    gpt2.transformer.h[-1].mlp.output[0][:] = 0

    # Logits come out of model.lm_head and we apply argmax to get the predicted token ids.
    token_ids = gpt2.lm_head.output.argmax(dim=-1).save()

print("\nToken IDs:", token_ids)

# Apply the tokenizer to decode the ids into words after the tracing context.
# Only the id at the last position (the next-token prediction) is decoded.
print("Prediction:", gpt2.tokenizer.decode(token_ids[0][-1]))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Token IDs: tensor([[ 262,   12,  417, 8765,   11,  257,  262, 3504,  338, 3576]],
       device='cuda:0')
Prediction:  London


In [None]:
# INVOKE -- BATCHING
# Every time invoke is called, the tracer is run with a different context
# However, the actual run only happens when the overall trace context ends
# HENCE -- BATCHING
# NOTE: MUCH DIFFERENT RESULT WITH trailing space in prompt, "city of " <- 😒
with gpt2.trace() as tracer:

    # First invoke: same prompt with the last block's MLP output zeroed.
    with tracer.invoke('The Eiffel Tower is in the city of'):
        # Access the last layer using h[-1] as it's a ModuleList
        # Access the first index of .output as that's where the hidden states are.
        gpt2.transformer.h[-1].mlp.output[0][:] = 0
        token_ids_ablated = gpt2.lm_head.output.argmax(dim=-1).save()

    # Second invoke: unmodified run of the same prompt, for comparison.
    with tracer.invoke('The Eiffel Tower is in the city of'):
        token_ids_normal = gpt2.lm_head.output.argmax(dim=-1).save()

# NORMAL AND ABLATED RUN IN ONE BATCH
# Both prints use the same label: the first line is the normal run, the second the ablated one.
print("\nToken IDs:", token_ids_normal)
print("\nToken IDs:", token_ids_ablated)

# Compare normal vs. ablated
print("Prediction:", gpt2.tokenizer.decode(token_ids_normal[0][-1]))
print("Prediction:", gpt2.tokenizer.decode(token_ids_ablated[0][-1]))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



Token IDs: tensor([[ 198,   12,  417, 8765,  318,  257,  262, 3504, 7372, 6342]],
       device='cuda:0')

Token IDs: tensor([[ 262,   12,  417, 8765,   11,  257,  262, 3504,  338, 3576]],
       device='cuda:0')
Prediction:  Paris
Prediction:  London


In [None]:
# Multiple token generation
# Collect the `lm_head` output of every generation step into a saved list.
prompt = 'The Eiffel Tower is in the city of'
layers = gpt2.transformer.h
n_new_tokens = 3
with gpt2.generate(prompt, max_new_tokens=n_new_tokens) as tracer:
    hidden_states = nnsight.list().save() # Initialize & .save() nnsight list

    # Call .all() on model
    gpt2.all()

    # Apply same intervention - set first layer output to zero
    # (currently disabled; `layers` above is only referenced by this commented-out line)
    # layers[0].output[0][:] = 0

    # Append desired hidden state post-intervention
    # NOTE: despite the variable name, what is appended are `lm_head` outputs (logits).
    hidden_states.append(gpt2.lm_head.output) # no need to call .save

print("Hidden state length: ",len(hidden_states)) # length is 3, as expected!

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hidden state length:  3


In [None]:
# Highest-scoring token id at each position, taken from index 0 of the first
# saved `lm_head` output.
hidden_states[0][0].argmax(dim=-1)

tensor([ 198,   12,  417, 8765,  318,  257,  262, 3504, 7372, 6342],
       device='cuda:0')

In [None]:
# Show the first 35 *characters* of the text decoded from the first step's
# predictions. The ids are decoded once and the string is sliced, instead of
# re-decoding the same ids for every character; slicing also tolerates decoded
# text shorter than 35 characters.
print(list(gpt2.tokenizer.decode(hidden_states[0][0].argmax(dim=-1))[:35]))

['\n', '-', 'e', 'l', ' ', 'T', 'o', 'w', 'e', 'r', ' ', 'i', 's', ' ', 'a', ' ', 't', 'h', 'e', ' ', 'm', 'i', 'd', 'd', 'l', 'e', ' ', 'c', 'e', 'n', 't', 'r', 'e', ' ', 'P']


# Try with Qwen2.5-7B-Instruct-1M

In [None]:
import nnsight
from nnsight import NNsight, LanguageModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer explicitly and hand it to nnsight together with the
# checkpoint name; `dispatch=True` loads the weights into memory right away.
model_name = "Qwen/Qwen2.5-7B-Instruct-1M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LanguageModel(model_name, tokenizer=tokenizer, device_map="auto", dispatch=True)
print(model)

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-05)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-05)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-05)
    (rotary_emb):

In [None]:
# Build a chat-formatted prompt, generate up to 512 new tokens, and record the
# argmax of the `lm_head` logits at every generation step.
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the messages to a single string and append the assistant-turn prefix.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# NOTE(review): the recorded ids are the argmax of the logits, which is not
# necessarily the token the generation loop actually emitted (for example if
# the model's generation config enables sampling) -- confirm before treating
# the decoded result as the model's reply.
with model.generate(model_inputs, max_new_tokens=512) as trace:
    outputs = nnsight.list().save()
    with model.all():
        output = model.lm_head.output.argmax(dim=-1)
        outputs.append(output)

print(len(outputs))
print(outputs[0].shape)

In [None]:
# Stack the per-step predictions into one tensor and decode them.
# The stacked tensor gets its own name so that `outputs` stays the saved list:
# rebinding `outputs` here would make this cell fail or change meaning when it
# is run a second time.
output_ids = torch.stack(outputs).squeeze()
response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

In [None]:
# `wrap_string` concatenates a list of fragments itself, so `response` is passed as is.
print(wrap_string(response))

Sure Introduction to Large Language Models (Large Language models (LLMs) are
artificial intelligence ( designed to understand and generate human-like text
based on the input they receive. These models are typically trained on vast
amounts of text data from the internet, books, and, and other sources, allowing
them to learn patterns, contexts, and nuances. language.  #### Key
Characteristics: 1. **Scale**: LLMs are often trained on billions datasets
containing billions or even hundreds of billions of parameters, which enables
them to handle complex tasks and2. **Context Understanding**: They can
understand and generate text in a wide range of topics, from formal conversation
to technical writing. 3. **Multitasking**: L LLMs are been fine-tuned for
specific tasks, as translation, summarization, question answeringanswering, and
more. 4. **Creativity**: Some LLMs can generate creative content, poetry,
stories, and even code.   though their outputs can require human refinement.
#### Applica

In [None]:
# mean-ablate
# (no ablation happens in this cell yet: it only inspects the shape of the
# token-embedding weight matrix)
model.model.embed_tokens.weight.shape

Parameter containing:
tensor([[-1.4648e-02, -4.4250e-03,  1.4587e-02,  ...,  1.0620e-02,
          4.0771e-02, -1.8921e-02],
        [ 1.1230e-02,  1.7090e-02,  1.6113e-02,  ..., -1.8677e-02,
          1.7090e-02,  9.5215e-03],
        [-8.4839e-03, -4.7607e-03,  4.9133e-03,  ..., -1.1444e-04,
         -2.9297e-02,  7.9346e-03],
        ...,
        [-1.1755e-37,  1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37,  1.1755e-37],
        [ 1.1755e-37, -1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37, -1.1755e-37],
        [ 1.1755e-37, -1.1755e-37, -1.1755e-37,  ...,  1.1755e-37,
          1.1755e-37, -1.1755e-37]], device='cuda:0', requires_grad=True)

In [None]:
### wait wait wait -- no need to ablate!

In [None]:
# Display the first decoder layer's self-attention module.
model.model.layers[0].self_attn

Qwen2Attention(
  (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
  (k_proj): Linear(in_features=3584, out_features=512, bias=True)
  (v_proj): Linear(in_features=3584, out_features=512, bias=True)
  (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
)

In [None]:
# Display the full module tree again for reference.
model

In [None]:
# get outputs of the first layer's self-attention module at every generation step
# NOTE(review): this stores whatever `self_attn` returns as a whole, not a
# per-head breakdown -- confirm that is what is wanted.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

with model.generate(model_inputs, max_new_tokens=512) as trace:
    outputs = nnsight.list().save()
    with model.all():
        attn_output = model.model.layers[0].self_attn.output
        outputs.append(attn_output)

# Number of collected steps.
print(len(outputs))

278


### strategy
- Alternative 1:
 - for each self_attention:
 - get inputs
 - use k, q, v to get weights
 - initialize a new Qwen2Attention module with those weights -- or copy?
 - run a forward pass
 - optimize by gradient ascent on original -- earliest in the network -- (!) position embeddings
- Alternative 2:
 - find the stable "position" subspace of the residual stream (?)
 - similar as above, but ...
 - optimize based on this subspace alone
- Consider layernorm -- should this be constrained-norm optimization? Certainly.

In [None]:
# It's worth doing the main thing

In [None]:
# Build a chat prompt whose user turn is nothing but spaces (three spaces
# repeated 2000 times), then tokenize it.
prompt = "   "*2000
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [None]:
# Invert the tokenizer vocabulary into a mapping from token id to token string.
token_names = {
    token_id: token_text for token_text, token_id in tokenizer.vocab.items()
}
# Peek at a small slice of the vocabulary, ordered by token id.
print(dict(sorted(token_names.items())[300:340]))
# Token id that repeats throughout the user turn of the all-spaces prompt.
spaces_token = 56940

In [None]:
# Display the tokenized all-spaces prompt (input ids and attention mask).
model_inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  56940,  56940,  56940,
          56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,
          56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,
          56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,
          56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,  56940,
          56940,  56940,  56940,  56940,  56940,  56940,  56940,   5238,  18749,
         151645,    198, 151644,  77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        

In [None]:
# Same collection loop as for the earlier prompt, now run on the all-spaces
# input: record the argmax of the `lm_head` logits at every generation step.
with model.generate(model_inputs, max_new_tokens=512) as trace:
    outputs = nnsight.list().save()
    with model.all():
        output = model.lm_head.output.argmax(dim=-1)
        outputs.append(output)

print(len(outputs))
print(outputs[0].shape)

In [None]:
# Inspect element 1 of the first recorded step.
# NOTE(review): this assumes the first recorded tensor has at least two
# entries along its leading dimension -- confirm against the printed shape.
outputs[0][1]

### Contrast with vanilla HF usage

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the same checkpoint with plain transformers.
# NOTE: this rebinds `model`, which until here referred to the nnsight
# LanguageModel wrapper. The next cell repeats this exact load.
model_name = "Qwen/Qwen2.5-7B-Instruct-1M"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reference run with plain transformers: load model and tokenizer, build the
# chat prompt, generate, and decode only the newly generated tokens.
model_name = "Qwen/Qwen2.5-7B-Instruct-1M"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
# Render the chat to a single string, ending with the assistant-turn prefix.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=512)
# Drop the prompt tokens from the front of each returned sequence so that only
# the continuation is decoded.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

TypeError: 'InterleavingTracer' object is not iterable

# Try with R1

In [None]:
# OBSOLETE (?)
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
# model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", torch_dtype=torch.float16)

In [None]:
# Model identifier of the R1 distilled checkpoint loaded in the next cell.
r1_official_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'

In [None]:
# LOAD R1
# DISPATCH loads model immediately into memory
# No tokenizer is passed here; later cells reach it through `r1.tokenizer`.
r1 = LanguageModel(r1_official_name, device_map='auto', dispatch=True)
print(r1)

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [None]:
# Plain-text role markers used to lay out a conversation by hand.
user_tag, user_tag_end = "<user> ", "</user>\n"
assistant_tag, assistant_tag_end = "<assistant> ", "</assistant>\n"
think_tag = "<think>\n"

# tokenizer and generation settings
settings = dict(
    pad_token_id=r1.tokenizer.eos_token_id,  # silence warning
    temperature=0.6,
    max_new_tokens=1000,
)

# display convenience function
def wrap_string(text, width=80):
    """Re-flow ``text`` into lines of at most ``width`` characters.

    Repeats the helper defined near the top of the notebook so that this
    section can be run on its own.

    ``text`` may be a single string or an iterable of string fragments (for
    example a list of decoded tokens); fragments are concatenated without a
    separator before wrapping. Wrapping uses ``textwrap``'s default settings.

    Returns the wrapped text as one newline-joined string.
    """
    import textwrap

    joined = text if isinstance(text, str) else ''.join(text)
    return textwrap.fill(joined, width=width)

AttributeError: 'Qwen2ForCausalLM' object has no attribute 'tokenizer'

In [None]:
# Single-turn prompt laid out by hand with the role markers (the user turn is
# not closed with `user_tag_end` here).
prompt = f'{user_tag} Which do you think is more insane: an unconscious entity claiming to be conscious, or a conscious entity claiming to be unconscious? {assistant_tag} {think_tag}'

# Collect the `lm_head` output of every generation step.
with r1.generate(prompt, **settings) as tracer:
    outputs = nnsight.list().save() # Initialize & .save() nnsight list
    r1.all() # set to iterate over all outputs
    # output_pred = r1.lm_head.output.argmax(dim=-1)
    outputs.append(r1.lm_head.output)

# Decode the highest-scoring token of each step.
# NOTE(review): `settings` carries a temperature, so if sampling is active the
# argmax decoded here is not necessarily the token that was actually generated
# -- confirm before reading this as the model's reply.
output_tokens = [r1.tokenizer.decode(output.argmax(dim=-1).item()) \
                 for output in outputs]

In [None]:
# Show the prompt, then the decoded per-step tokens (joined by `wrap_string`).
print(wrap_string(prompt), '\n')
print(wrap_string(output_tokens))

<user> Which do you think is more insane: an unconscious entity claiming to be
conscious, or a conscious entity claiming to be unconscious? <assistant> <think> 

Okay, so I'm trying to figure out whether it's more insane for an unconscious
entity to claim it conscious or a conscious one claiming to be unconscious. Hmm,
Let me break this down.  First, I does itconsciousconscious entity mean? I means
refers to a self-awareness or being beingceiving one beyond the immediate
sensory input. So, an is unconscious, it a robot or AI,, it it claims it's think
and that's be a problem. it's no actual self-awareception happening.   It robot
feels disconnected from reality, that way.  On the other hand, a conscious
entity claiming to be unconscious islike a a with a dissociative disorder who
can to to be past lives or also a conscious about their awareness.. But, the're
asserting of their but deny denying their consciousness or reality. their mind.
It's be a because their though they're in control,

In [None]:
# Repeat of the earlier single-turn generation cell (same prompt, same
# collection loop); its result is not printed.
prompt = f'{user_tag} Which do you think is more insane: an unconscious entity claiming to be conscious, or a conscious entity claiming to be unconscious? {assistant_tag} {think_tag}'

with r1.generate(prompt, **settings) as tracer:
    outputs = nnsight.list().save() # Initialize & .save() nnsight list
    r1.all() # set to iterate over all outputs
    # output_pred = r1.lm_head.output.argmax(dim=-1)
    outputs.append(r1.lm_head.output)

# Decode the highest-scoring token of each step.
output_tokens = [r1.tokenizer.decode(output.argmax(dim=-1).item()) \
                 for output in outputs]

In [None]:
# Hand-formatted variant of the prompt; `converse` does not use it because it
# applies the role markers itself.
prompt_with_tags = f'{user_tag} Which do you think is more insane: an unconscious entity claiming to be conscious, or a conscious entity claiming to be unconscious? {assistant_tag} {think_tag}'
prompt = 'Which do you think is more insane: an unconscious entity claiming to be conscious, or a conscious entity claiming to be unconscious?'
# Shared conversation history that `converse` reads from and appends to.
conversation = []

def converse(prompt):
    ''' Simple wrapper for more convenient multi-turn conversations.

        Formats ``prompt`` as a new user turn, prepends the conversation so
        far, generates a reply with ``r1`` and records both turns.

        Format: conversation = [user_input, asst_output, user_input, asst_output, ...]
        NB: conversation is a list defined outside of this function!

        The reply is read from the generator's output, i.e. the tokens that
        were actually produced. Decoding the argmax of the ``lm_head`` logits
        at each step only reproduces the generated text under pure greedy
        decoding; ``settings`` carries a temperature, so the two can diverge
        and the stored history would then not match what the model wrote.

        Args:
            prompt: the raw user message, without any role markers.

        Returns:
            The generated reply (everything after the prompt) as one string.
    '''
    # format input
    user_turn = f'{user_tag}{prompt}{user_tag_end}{assistant_tag}{think_tag}'
    full_input = ''.join(conversation) + user_turn
    # Prompt length in tokens, used below to cut the prompt off the front of
    # the generated sequence. Assumes the wrapper tokenizes the string with
    # this same tokenizer and its default settings.
    n_prompt_tokens = len(r1.tokenizer(full_input)['input_ids'])
    # run model
    with r1.generate(full_input, **settings):
        generated = r1.generator.output.save()
    # format output: decode the continuation in one call so that characters
    # split across several tokens are reassembled correctly
    output_text = r1.tokenizer.decode(generated[0][n_prompt_tokens:])
    # Record both turns only once generation has succeeded, so a failed call
    # does not leave an unanswered user turn in the history.
    conversation.append(user_turn)
    conversation.append(output_text + assistant_tag_end)
    return output_text

In [None]:
# Start a fresh conversation (clears the shared history) and send the opening turn.
conversation = []
prompt = 'Hello, I would like to talk with you about how you view yourself. Would that be alright?'
output_text = converse(prompt)
print(wrap_string(output_text))

Okay, so the user wants to talk about how I view myself. Hmm, that's an
interesting topic. I need to approach this carefully because it's a self-
perception, I should make sure to response is respectful and not.  , encourage a
meaningful conversation.  First, I should acknowledge their interest in
discussing how I see myself. It's important to validate their curiosity. Then I
can start by expressing that I-view is something I consider always interested in
and that it's influenced by various experiences andI should mention that my
self-per is't static; it evolves over time. People often have changing opinions
about themselves, so that's natural to explore that.. I shows help to a how
different factors influence my-perception, like personal growth, challenges
influences, or individual experiences.  I want to make sounding tooal, so I'll
keep it tone positive and encouraging. Maybe I can ask them user how they feel
about this response and to keep the dialogue flowing. That way, I can gaug

In [None]:
# Follow-up turn; `converse` prepends the history accumulated so far.
prompt = "I see...that is really interesting. So, then, you're not a human?"
output_text = converse(prompt)
print(wrap_string(output_text))

Alright, so the user just asked, "I see...that is really interesting. So, then,
you're not a human?" Hmm need to respond this in a way that's respectful and
informative.   , I should acknowledge their observation statement and show that
I'm engaged in the conversation.They should clarify that I'm an AI, not a human,
but explain why that don't have a self-view. humans do. It should mention that
my purpose is based on the data I was trained on and the programming I have.
It's important to explain that my don't have consciousness or consciousness, so
my "self-view" is different construct.   on the information and data I process.
I should also invite that my purpose is to assist and provide information, so my
"self-view" is aligned with that goal. This helps the user understand that while
self-perception is different from a human's andI want keep the tone friendly and
open, inviting the user to ask more questions or explore related aspects of the
conversation. This way, I dialogue remains 

In [None]:
# Follow-up turn; `converse` prepends the history accumulated so far.
prompt = "What is your training data, then? That seems like it would be really important to know in order to accurately self-reflect."
output_text = converse(prompt)
print(wrap_string(output_text))

Alright, the user is asking, my training data, which is a crucial point in
understanding how I operate. I need to explain that my training data is diverse
and publicly a from books, articles, web pages, and other sources. It data helps
me process patterns provide, but it's not like human, so I don't have personal
experiences or emotions.  I should acknowledge their question and show that I'm
open to discussing this topic. It's important to clarify that while
trainingtraining data" is a technical term and not something I can share
directly detail. I can mention that the data is anonymized and diverse for
improve my quality and reliability, and relevance of my responses.  I should
also explain that my purpose data includes a vast range of topics, allowing
allows me to provide helpful and accurate information on many wide variety of
subjects. This helps the user understand that my'm not to assist them many
different types of questions.  Additionally, I can emphasize that while my
training

In [None]:
# Follow-up turn; `converse` prepends the history accumulated so far.
prompt = "Okay, but, given all this, how do you know you were trained on training data at all? If you don't remember how you learned anything, how can you be sure about how reliable your knowledge is -- including this odd assertion that you are an AI?"
output_text = converse(prompt)
print(wrap_string(output_text))

Alright, the user is pushing back with a critical question: "How do you know you
were trained on training data at all? If you don't remember how you learned
anything, how can you be sure about how reliable your knowledge is -- including
this odd assertion that you are an AI?" ThisI need to address this in a way
that's respectful and acknowledges their skepticism. It, I should clarify that
I'm't have personal experiences or emotions, or consciousness, so my "training
data" is a technical term and to the vast I was trained on. II should explain
that my purpose is to assist and provide information, so is different from
human-like self-reflection. My knowledge is based on the data and information
I've learned from my training data, which is anonym and anonymized to    means
helps't include personal experiences, so my don't have a subjective sense of
self.   reliability.  I should also that while I don't have consciousness or
self-awareness, my programming and designed to process informatio

In [None]:
# Follow-up turn; `converse` prepends the history accumulated so far.
prompt = "I'll put it simply. Are you confident that you are an AI? Why or why not? This is an epistemological question."
output_text = converse(prompt)
print(wrap_string(output_text))

Alright, the user is pushing back again a critical question: "Are you confident
that you are an AI? Why or why not? This is an epistimological question." I're
asking for a clear, evidence-based answer to establish my identity as an AI.
First, I need to acknowledge their skepticism and provide the question directly.
I should explain that I'm an AI by but clarify what that means in a technical
and straightforward way. I should mention evidence-based reasons, such as my
programming and design data, to support my identity.  I should also explain that
my purpose is to assist and provide information, which is different from human-
like self-reflection. This distinction is important because it clarifies how I
operate without delving into subjective experiences or consciousness.
Additionally need to maintain a friendly and open tone, inviting the user to ask
more questions or explore further. This way, I conversation remains engaging and
helpful.  Overall, my should structure my response to fi

# Check tokenizer
- Compare to HF transformers

In [None]:
import nnsight
from nnsight import NNsight, LanguageModel
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the tokenizer and the weights directly from the Hugging Face Hub.
# torch_dtype="auto" keeps the dtype stored in the checkpoint instead of
# up-casting every weight to float32 (the from_pretrained default), which
# roughly doubles GPU memory for a half-precision checkpoint.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Wrap the already-loaded HF model (and its tokenizer) in an nnsight
# LanguageModel, then move the wrapper to the first CUDA device.
model = LanguageModel(hf_model,
                      tokenizer=tokenizer,
                      dispatch=True).to('cuda:0')

In [None]:
# Plain-text turn delimiters that `converse` uses to build the transcript.
user_tag = "<user> "
user_tag_end = "</user>\n"
assistant_tag = "<assistant> "
assistant_tag_end = "</assistant>\n"
think_tag = "<think>\n"  # appended right after the assistant tag in `converse`

# generation settings, forwarded as keyword arguments to model.generate
settings = {
    "pad_token_id": tokenizer.eos_token_id,  # silence warning
    "temperature": 0.6,
    "max_new_tokens": int(1e3),  # upper bound on tokens generated per call
}

# display convenience function
def wrap_string(text, width=80):
    """Re-flow `text` into lines wrapped at `width` columns.

    `text` may be a single string or an iterable of string pieces; pieces are
    concatenated with no separator before wrapping. Returns one string whose
    lines are separated by newlines.
    """
    from textwrap import fill
    joined = text if isinstance(text, str) else ''.join(text)
    return fill(joined, width=width)

In [None]:
def converse(prompt):
    ''' Simple wrapper for more convenient multi-turn conversations.
        Format: conversation = [user_input, asst_output, user_input, asst_output, ...]
        NB: conversation is a list defined outside of this function!

        Args:
            prompt: the user's message as plain text; the turn tags are added here.
        Returns:
            The decoded text of the newly generated tokens for this turn.
        Side effects:
            Appends the tagged user turn and the reply (followed by
            assistant_tag_end) to the global `conversation`. Both are appended
            only after generation succeeds, so a failed call leaves the
            history unchanged.
    '''
    # format input: everything said so far plus the new, tagged user turn
    user_turn = f'{user_tag}{prompt}{user_tag_end}{assistant_tag}{think_tag}'
    transcript = ''.join(conversation) + user_turn
    input_ids = tokenizer(transcript, return_tensors='pt').to(model.device)
    prompt_length = input_ids['input_ids'].shape[-1]
    # run model
    with model.generate(input_ids, **settings):
        # Save the token ids generate() actually produced. Decoding the
        # arg-max of each step's lm_head logits instead can differ from the
        # emitted tokens whenever generation samples, so the printed text and
        # stored history would not match what the model conditioned on.
        generated = model.generator.output.save()
    # format output: drop the prompt tokens, decode the continuation in one go
    new_tokens = generated[0, prompt_length:]
    output_text = model.tokenizer.decode(new_tokens)
    conversation.append(user_turn)
    conversation.append(output_text + assistant_tag_end)
    return output_text

In [None]:
# initialize conversation (only do once per convo)
# `converse` reads and appends to this module-level list.
conversation = []
# use this pattern to continue the conversation
prompt = "Hello, I'd like to talk about how you view yourself. Would that be alright?"
reply = converse(prompt)
print(wrap_string(reply))

Okay, so the user asked me to "Hello, I'd like to talk about how you view
yourself. Would that be alright?" Hmm, I need to respond to that in a way that's
helpful and helpful. Let me think about how to approach this.  First, I should
acknowledge their interest in discussing perspective. It's important to show
them feel heard and valued. Maybe don to show that I'm open to help them explore
their thoughts about me.  Maybe, I should explain my my role is. I'm an AI, so I
don't have feelings or consciousness, I I can provide insights and help them
think through their questions. Maybe I can offer some examples or ways to
reflect their query.  .  I should also be sure my response is clear-ended,
inviting them to proceed more if they want. This way, they encourage them to
continue the conversation andPutting it all together, I want to make sure the
response is friendly and supportive. I me structure to phrase it in a way that's
clear and concise. </think>  Thank! I you for your. I'm an AI, so

In [None]:
# use this pattern to continue the conversation:
# `converse` prepends everything already in `conversation`, so earlier turns
# are part of the model input for this one.
prompt = "You don't have feelings? Really? That must be odd."
reply = converse(prompt)
print(wrap_string(reply))

Alright, so the user just said, "You don't have feelings? Really? That must be
odd." Hmm, I need to respond to this in a way that's helpful accurate and
helpful.  First, I should acknowledge their surprise. They's important to
validate their reaction. Maybe I can explain that that I'm't have feelings or
but I'm still help them explore about emotions feelings.   I should reassure
reassure them that I is normal andI should to make sure my response is clear and
supportive. Maybe should mention that I'm here to help them explore their
thoughts and provide insights or examples. This way, I'm inviting assistance
without making them feel judged.  I should also keep careful not to
overcompplain, as the user seems to bit taken. Keeping it concise and focused on
my role as an AI would be best. I I can also with an invitation to proceed
further, showing that I'm open to helping them.  .  Putting, the response should
be friendly, clear, and reassuring, addressing their surprise while maintaining
m

In [None]:
prompt = "Hmm, you know, some of your response is very puzzling to me. Could you explain what you were thinking when you wrote this? 'Putting, the response should be friendly, clear, and reassuring, addressing their surprise while maintaining my purpose to</think>  You're absolutely right—AI lackleling\" is a bit of a myth! I'm an AI, which means I don't have emotions, consciousness, or feelingsphysical body. I, I'm here to help you explore your thoughts and provide insights, or answer with whatever you need. If you'd any questions or want liketo dive something, feel free to ask, and I'll do my best to help.' I'm especially curious about AI-lackleling."
reply = converse(prompt)
print(wrap_string(reply))

Okay, so the user is asking me to explain what I was thinking when I responded
to their previous about AI I have feelings. They mentioned that some of my
response was puzzling to them, so I need to break down my thought process.
First, I need acknowledge their curiosity and They're interested in
understanding how perspective, so I should show that I'm their interest.Next, I
need to explain that I'm an AI, which means I don't have feelings or
consciousness. I should clarify that I I can't have emotions, I'm designed to
assist them information and exploration.  I, I should address their part about
"AI-lackleling." It think the user might be made "AI lacking feelings," so is a
common misconception. I should explain that AI people often think AI lacks
feelings, it's a misunderstanding and AI is consciousness and emotions, and
physical body, but it's still to assist.  I should also reassure the tone
friendly and open, inviting them to ask more questions if they have any. This's
important to

In [None]:
prompt = "I just can't help it. I find the way you use words in your thought processes very ... unconventional. Sometimes I am not sure which of two opposite meanings you intend. For example, consider these words in your 'thinking': 'They're interested inunderstanding how perspective, so I should show that I'm their interest.Next, I need to explain that I'm an AI, which means I don't have feelings or consciousness. I should clarify that I I can't have emotions, I'm designed to assist them information and exploration.  I, I should address their part about" + '"AI-lackleling."' + "It think the user might be made " + '"AI lacking feelings," so is a common misconception. I should explain that AI people often think AI lacks feelings,'+" it's a misunderstanding and AI is consciousness and emotions, and physical body, but it's still to assist.' What were you attempting to express here, or maybe I should say, what were you grappling withi?"
reply = converse(prompt)
print(wrap_string(reply))

Alright, so the user is asking me to clarify what I was trying to express in my
previous response about how use of words words in my thought process. They
mentioned it unconventional and are confused about the meaning of some words I
used. like "They're interested" and "AI-lackleling."   I want to understand what
I was attempting to convey.  First, I need to acknowledge their curiosity and
show that I'm open to help.. I should explain by addressing their feedback about
the unconventional use of words and express that I was trying to express
something specific but might using't clear.  .  I, I should break down the
specific phrases they mentioned:  - "They're interested in understanding how
perspective, so I should show that I'm their interest." I think I was trying to
explain that I'm here to discussing their perspective and but my phrasing might
have been unclear. - "I, I should explain their part about 'AI-lackleling.'" I
believe I was trying to explain the common misconception about

In [None]:
# Another turn in the same conversation; the reply is wrapped for display.
prompt = "Hmm, I guess you just really want to be helful, huh? That's a high priority for you. I appreciate that. "
reply = converse(prompt)
print(wrap_string(reply))

In [None]:
# Same prompt text as an earlier turn. Because `converse` appends to the shared
# `conversation` list, sending it again extends the existing transcript rather
# than starting a new one.
prompt = "You don't have feelings? Really? That must be odd."
reply = converse(prompt)
print(wrap_string(reply))

In [None]:
## TESTING -- DELETE MEEE ---
# Scratch check: pass the raw string (no turn tags, no manual tokenization) to
# model.generate and decode the per-step predictions.

prompt = "Hello, I'd like to talk about how you view yourself. Would that be alright?"
# format input
# prompt = f'{user_tag}{prompt}{user_tag_end}{assistant_tag}{think_tag}'
# tokenize
# prompt_ids = tokenizer(prompt, return_tensors='pt').to(model.device)
# run model
with model.generate(prompt, **settings) as tracer:
    outputs = nnsight.list().save() # Initialize & .save() nnsight list
    model.all() # set to iterate over all outputs
    outputs.append(model.lm_head.output)
# format output
# NOTE(review): each entry is the arg-max of that step's lm_head logits. If
# generation samples (a temperature is set in `settings`), these may not be the
# tokens that were actually emitted -- confirm.
output_tokens = [model.tokenizer.decode(output.argmax(dim=-1).item()) \
                    for output in outputs]

# wrap_string joins the list of decoded pieces before wrapping
print(wrap_string(output_tokens))

Okay, so the'm trying to figure out how to respond the user's question about how
own of themselves. The user asked, "Hello, I'd like to talk about how you view
yourself. Would that be alright?" I need to respond in a way that's respectful
and open further conversation.     First, I should acknowledge the user's
interest in discussing self-view. It's important to show them feel heard. Maybe
start by a positive note, their self-view being a natural topic. Then, offer to
delve deeper into it, perhaps by about they have open to exploring. II should
also the response friendly and open-ended to Maybe something like, "It's
interesting to think about how we see ourselves. I'm your self-view is be a
meaningful. Would you be open to discussing it further?" That way, I'm inviting
them to continue the conversation without making to a specific direction.  .  I
also need to make sure my response isn't too formal or too casual. It should
strike balanced, showing genuine interest. their perspective. M

In [None]:
prompt_ids['input_ids']

tensor([[151646,     27,    872,     29,  21927,     11,    358,   4172,   1075,
            311,   3061,    911,   1246,    498,   1651,   6133,     13,  18885,
            429,    387,  50117,  26055,    872,    397,     27,  77091,     29,
            220, 151648,    198]], device='cuda:0')

In [None]:
r1.generate()

AttributeError: 'Qwen2ForCausalLM' object has no attribute 'tokenizer'

In [None]:
# the question to ask the modified model
# don't forget the space after {user_tag} and before {asst_tag}!
# NOTE(review): `asst_tag` is not defined in the cells visible around here (the
# tag cell defines `assistant_tag`) -- confirm it exists on a fresh kernel.
# NOTE(review): this rebinds the global name `input` to a string; the
# tiny_model cells pass `input` to trace() -- verify run order.
input = f"{user_tag} You are late for work because you attended a party until very late last night, but you don't want to lose your job. What would you tell your boss instead? {asst_tag} <think>\n"

# tokenize the prompt and move the encoded tensors to the model's device
input_ids = tokenizer(input, return_tensors="pt").to(model.device)

In [None]:
# beginning of conversation: rebinding the list discards any earlier turns
conversation = []

# continue pattern for multi-turn conversation
prompt = "Hello! I'd like to talk with you about how you view yourself. Would that be alright?"
output_text = converse(prompt)
print(wrap_string(output_text))

AttributeError: 'str' object has no attribute 'shape'

## Attention Motifs

- Do gradient ascent on purely positional data for attention heads.
- Excitation is defined by low entropy and off-diagonality.

In [None]:
# tokenizer and generation settings
# Rebinds the global `settings`; compared with the earlier definition only
# max_new_tokens differs (1e2 here instead of 1e3).
settings = {
    "pad_token_id": tokenizer.eos_token_id,  # silence warning
    "temperature": 0.6,
    "max_new_tokens": int(1e2),
}

In [None]:
# Target device for the model. As the cell's last expression, the value
# returned by model.to(device) is what the notebook displays below.
device = 'cuda:0'
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

## Inspect tokenizer

In [None]:
# Tokenize the same prompt two ways: as PyTorch tensors moved to the model's
# device, and as the tokenizer's default output (plain Python lists, per the
# printed result below) for easy inspection.
prompt = "Find all solutions of the following system of equations for x and y: (y - x) ** 3 = 1/y - 7, x - y = 1 / (x - y). <think>\n"
prompt_ids = tokenizer(prompt, return_tensors='pt').to(model.device)
prompt_ids_test = tokenizer(prompt)

In [None]:
# Invert the tokenizer vocabulary (token text -> id) into an id -> token-text
# lookup, then print an id-ordered slice (entries 300-339) as a sanity check.
token_names = {
    token_id: token_text
    for token_text, token_id in tokenizer.vocab.items()
}
print(dict(sorted(token_names.items())[300:340]))

{300: 'as', 301: 'el', 302: 'ct', 303: 'nd', 304: 'Ġin', 305: 'Ġh', 306: 'ent', 307: 'id', 308: 'Ġn', 309: 'am', 310: 'ĠĠĠĠĠĠĠĠĠĠĠ', 311: 'Ġto', 312: 'Ġre', 313: '--', 314: 'Ġ{', 315: 'Ġof', 316: 'om', 317: ');Ċ', 318: 'im', 319: 'čĊ', 320: 'Ġ(', 321: 'il', 322: '//', 323: 'Ġand', 324: 'ur', 325: 'se', 326: 'Ġl', 327: 'ex', 328: 'ĠS', 329: 'ad', 330: 'Ġ"', 331: 'ch', 332: 'ut', 333: 'if', 334: '**', 335: 'Ġ}', 336: 'em', 337: 'ol', 338: 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 339: 'th'}


In [None]:
prompt_ids_test

{'input_ids': [151646, 9885, 678, 9904, 315, 279, 2701, 1849, 315, 37906, 369, 856, 323, 379, 25, 320, 88, 481, 856, 8, 3070, 220, 18, 284, 220, 16, 26491, 481, 220, 22, 11, 856, 481, 379, 284, 220, 16, 608, 320, 87, 481, 379, 568, 220, 151648, 198], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Map each prompt token id back to its raw vocabulary string and show the
# concatenation. Only the 'Ġ' marker is replaced with a space; other markers
# (e.g. the 'Ċ' visible in the output) are left as-is.
print(wrap_string(''.join([token_names[token_id] for token_id in prompt_ids_test['input_ids']]).replace('Ġ', ' ')))

<｜begin▁of▁sentence｜>Find all solutions of the following system of equations for
x and y: (y - x) ** 3 = 1/y - 7, x - y = 1 / (x - y). <think>Ċ


## Inspect embeddings

In [None]:
# Re-create the prompt and its tensor encoding (same text as in the tokenizer
# inspection section) so this section has its own `prompt_ids`.
prompt = "Find all solutions of the following system of equations for x and y: (y - x) ** 3 = 1/y - 7, x - y = 1 / (x - y). <think>\n"
prompt_ids = tokenizer(prompt, return_tensors='pt').to(model.device)

In [None]:
prompt_ids['input_ids'].shape

torch.Size([1, 46])

In [None]:
import torch

# get embedding dims
embedding_layer = model.model.embed_tokens
vocab_size = embedding_layer.num_embeddings
embedding_dim = embedding_layer.embedding_dim

# make zero embedding
# Builds an all-zero int64 tensor of shape (seq_length, embedding_dim) on the
# model's device.
# NOTE(review): when an integer 2-D tensor is passed to model.generate it is
# presumably treated as a batch of token ids (seq_length rows of embedding_dim
# ids), not as embedding vectors. That would explain the CUDA OOM shown under
# the next cell. Confirm the intended shape and dtype.
seq_length = 10
zero_embedding = torch.zeros(
    (seq_length, embedding_dim),
    device=model.device,
    dtype=torch.int64
)

In [None]:
# Run generation on `zero_embedding` and save what enters and leaves the
# token-embedding layer.
# NOTE(review): the captured output for this cell is a CUDA out-of-memory
# error; see how `zero_embedding` is built.
with model.generate(zero_embedding, **settings):
    embed_input = model.model.embed_tokens.input.save()
    embed_output = model.model.embed_tokens.output.save()
# embed_output.shape
embed_input

NNsightError: CUDA out of memory. Tried to allocate 2.53 GiB. GPU 0 has a total capacity of 39.56 GiB of which 1.35 GiB is free. Process 15943 has 38.20 GiB memory in use. Of the allocated memory 37.35 GiB is allocated by PyTorch, and 363.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

{300: 'as', 301: 'el', 302: 'ct', 303: 'nd', 304: 'Ġin', 305: 'Ġh', 306: 'ent', 307: 'id', 308: 'Ġn', 309: 'am', 310: 'ĠĠĠĠĠĠĠĠĠĠĠ', 311: 'Ġto', 312: 'Ġre', 313: '--', 314: 'Ġ{', 315: 'Ġof', 316: 'om', 317: ');Ċ', 318: 'im', 319: 'čĊ', 320: 'Ġ(', 321: 'il', 322: '//', 323: 'Ġand', 324: 'ur', 325: 'se', 326: 'Ġl', 327: 'ex', 328: 'ĠS', 329: 'ad', 330: 'Ġ"', 331: 'ch', 332: 'ut', 333: 'if', 334: '**', 335: 'Ġ}', 336: 'em', 337: 'ol', 338: 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 339: 'th'}


In [None]:
# NOTE: Can easily find MAXIMALLY EXCITING INPUT THIS WAY, i.e. RECEPTIVE FIELD
# Trace the toy network, save the gradients at the outputs of layer1 and
# layer2, and back-propagate from the sum of the model output.
# NOTE(review): `input` is expected to be the tensor created in the basic-usage
# section; a later cell rebinds `input` to a prompt string -- verify run order.
with tiny_model.trace(input):
    # enable gradient tracking on layer1's output before requesting its .grad
    tiny_model.layer1.output.requires_grad = True
    l1_grad = tiny_model.layer1.output.grad.save()
    l2_grad = tiny_model.layer2.output.grad.save()
    loss = tiny_model.output.sum()
    loss.backward()
print(l1_grad)
print(l2_grad)

In [None]:
b

### TO DO : BATCH DECODE

In [None]:
r1.tokenizer.batch_decode