In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import inspect

torch.set_default_device("cpu")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

inputs = tokenizer('''```python
def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
before_edit_outputs = model.generate(**inputs, max_length=200)
# print(before_edit_outputs)
be_text = tokenizer.batch_decode(before_edit_outputs)[0]
print(be_text)

```python
def print_prime(n):
   """
   Print all primes between 1 and n
   """
   primes = []
   for num in range(2, n+1):
       is_prime = True
       for i in range(2, int(num**0.5)+1):
           if num % i == 0:
               is_prime = False
               break
       if is_prime:
           primes.append(num)
   print(primes)

print_prime(20)
```

## Exercises

1. Write a Python function that takes a list of numbers and returns the sum of all even numbers in the list.

```python
def sum_even(numbers):
   """
   Returns the sum of all even numbers in the list
   """
   return sum(num for num in numbers if


In [3]:
print(inspect.getsource(model.layers[0].forward))

def new_forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
    input_shape = input_ids.size()
    input_ids = input_ids.view(-1, input_shape[-1])
    print(input_ids)

    hidden_states = self.wte(input_ids)
    hidden_states = self.drop(hidden_states)

    return hidden_states

# this didn't work:
# model.layers[0].forward = forward

# from https://discuss.pytorch.org/t/how-can-i-replace-the-forward-method-of-a-predefined-torchvision-model-with-my-customized-forward-function/54224/10
# (carmelo calafiore's response)
desired_layer = model.layers[0]
bound_method = new_forward.__get__(desired_layer, desired_layer.__class__)
setattr(desired_layer, 'forward', bound_method)


print(inspect.getsource(model.layers[0].forward))

    def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.wte(input_ids)
        hidden_states = self.drop(hidden_states)

        return hidden_states

def new_forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
    input_shape = input_ids.size()
    input_ids = input_ids.view(-1, input_shape[-1])
    print(input_ids)

    hidden_states = self.wte(input_ids)
    hidden_states = self.drop(hidden_states)

    return hidden_states



In [4]:
after_edit_outputs = model.generate(**inputs, max_length=200)
# print(after_edit_outputs)
text = tokenizer.batch_decode(after_edit_outputs)[0]
print(text)

tensor([[15506,    63, 29412,   198,  4299,  3601,    62, 35505,     7,    77,
          2599,   198, 50285, 37811,   198, 50285, 18557,   477,   778,   999,
          1022,   352,   290,   299,   198, 50285, 37811]])
tensor([[198]])
tensor([[50285]])
tensor([[1050]])
tensor([[999]])
tensor([[796]])
tensor([[17635]])
tensor([[198]])
tensor([[50285]])
tensor([[1640]])
tensor([[997]])
tensor([[287]])
tensor([[2837]])
tensor([[7]])
tensor([[17]])
tensor([[11]])
tensor([[299]])
tensor([[10]])
tensor([[16]])
tensor([[2599]])
tensor([[198]])
tensor([[50281]])
tensor([[271]])
tensor([[62]])
tensor([[35505]])
tensor([[796]])
tensor([[6407]])
tensor([[198]])
tensor([[50281]])
tensor([[1640]])
tensor([[1312]])
tensor([[287]])
tensor([[2837]])
tensor([[7]])
tensor([[17]])
tensor([[11]])
tensor([[493]])
tensor([[7]])
tensor([[22510]])
tensor([[1174]])
tensor([[15]])
tensor([[13]])
tensor([[20]])
tensor([[47762]])
tensor([[16]])
tensor([[2599]])
tensor([[198]])
tensor([[50277]])
tensor([[361]])
ten

In [5]:
# output2 = model.forward(**inputs)
# output2.logits.shape
# inputs.input_ids.shape

In [6]:
code1 = inspect.getsource(model.forward)
code2 = inspect.getsource(model.layers[1].mixer.forward)
# print(inspect.getsource(model.post_init))

# print(inspect.getsource(model.forward)) # forward of entire model
# print(inspect.getsource(model.generate))
print(inspect.getsource(model.layers[0].forward))
# print(inspect.getsource(model.layers[1].mixer.forward))

# def new_forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
#     input_shape = input_ids.size()
#     print(input_shape)
#     input_ids = input_ids.view(-1, input_shape[-1])

#     hidden_states = self.wte(input_ids)
#     hidden_states = self.drop(hidden_states)

#     return hidden_states

# model.layers[0].forward = new_forward

print(inspect.getsource(model.layers[0].forward))



def new_forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
    input_shape = input_ids.size()
    input_ids = input_ids.view(-1, input_shape[-1])
    print(input_ids)

    hidden_states = self.wte(input_ids)
    hidden_states = self.drop(hidden_states)

    return hidden_states

def new_forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
    input_shape = input_ids.size()
    input_ids = input_ids.view(-1, input_shape[-1])
    print(input_ids)

    hidden_states = self.wte(input_ids)
    hidden_states = self.drop(hidden_states)

    return hidden_states



In [7]:
out_first_layer = model.layers[0].forward(**inputs)
print(inputs.input_ids.shape)
print(out_first_layer.shape)

tensor([[15506,    63, 29412,   198,  4299,  3601,    62, 35505,     7,    77,
          2599,   198, 50285, 37811,   198, 50285, 18557,   477,   778,   999,
          1022,   352,   290,   299,   198, 50285, 37811]])
torch.Size([1, 27])
torch.Size([1, 27, 2048])


In [8]:
model.layers

Sequential(
  (0): Embedding(
    (wte): Embedding(51200, 2048)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (1): ParallelBlock(
    (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (resid_dropout): Dropout(p=0.0, inplace=False)
    (mixer): MHA(
      (rotary_emb): RotaryEmbedding()
      (Wqkv): Linear(in_features=2048, out_features=6144, bias=True)
      (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
      (inner_attn): SelfAttention(
        (drop): Dropout(p=0.0, inplace=False)
      )
      (inner_cross_attn): CrossAttention(
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (mlp): MLP(
      (fc1): Linear(in_features=2048, out_features=8192, bias=True)
      (fc2): Linear(in_features=8192, out_features=2048, bias=True)
      (act): NewGELUActivation()
    )
  )
  (2): ParallelBlock(
    (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (resid_dropout): Dropout(p=0.0, inplace=False)
    (mixer): MHA(
      (rotar