In [None]:
# Import necessary Modules
# This code can be taken form Hugging Face

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("andrijdavid/Llama3-2B-Base")
model = AutoModelForCausalLM.from_pretrained("andrijdavid/Llama-3-2B-Base")

In [None]:
# Lists all the layers and internal trainable params list
model.state_dict

<bound method Module.state_dict of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-5): 6 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [3]:
import numpy as np

In [None]:
# Extract the pre-trained weights form the Embedding layer

embedding_weights = model.model.embed_tokens.weight.detach().cpu().numpy()
print(embedding_weights.shape)

(128256, 4096)


In [None]:
# Tokenizer computes the tokens for given input sequence

tokens = tokenizer("Who is Virat Kohli", return_tensors = "pt")
tokens = tokens['input_ids'].squeeze().tolist()
token_length = len(tokens)
tokens

[128000, 15546, 374, 9734, 266, 34975, 747]

In [6]:
token_length

7

In [None]:
# Treat the token as row number and get the embeddings

embeddings = []
for x in tokens:
    embeddings.append(embedding_weights[x])
embeddings = np.stack(embeddings)
embeddings

array([[-8.2969666e-05,  2.5749207e-04, -2.4604797e-04, ...,
        -3.2424927e-04, -2.1553040e-04,  4.7111511e-04],
       [ 7.5683594e-03, -2.9563904e-04, -1.4190674e-03, ...,
         1.7333984e-02,  9.6511841e-04, -6.6223145e-03],
       [-2.9907227e-03,  1.6174316e-03,  1.0528564e-03, ...,
         9.0942383e-03,  3.9978027e-03,  7.4462891e-03],
       ...,
       [ 3.9367676e-03,  1.5792847e-03,  8.7738037e-04, ...,
         1.2130737e-03,  1.1672974e-03, -4.6386719e-03],
       [-1.5319824e-02,  7.2937012e-03, -3.6926270e-03, ...,
        -5.4321289e-03, -1.5487671e-03, -1.4038086e-02],
       [ 6.4697266e-03, -3.5095215e-03,  1.3793945e-02, ...,
         6.3476562e-03,  1.0498047e-02,  8.2397461e-03]], dtype=float32)

In [8]:
embedding_weights = model.model.embed_tokens.weight.detach().cpu().numpy()
print(embedding_weights.shape)

(128256, 4096)


In [9]:
embeddings.shape

(7, 4096)

In [None]:
# Detach the pre-trained weights for Gamma and Beta params form the layernorm layer

layer0 = model.model.layers[0]
rms1 = layer0.input_layernorm.weight.detach().cpu().numpy()
rms1.shape

(4096,)

In [None]:
# Define RMS (Root Mean Square) function

def rms(token,rms1):
    norm_factor = np.sqrt(np.mean(token**2,) + 1e-10)
    inter1 = token / norm_factor                           # Normalizes the values
    final_op = (inter1)*(rms1)                             # Multiplies with the layer weights
    return final_op

In [None]:
rms1_output = []
for x in embeddings:
    rms1_output.append(rms(x,rms1))
rms1_output = np.array(rms1_output)
rms1_output

array([[-6.2704086e-04,  7.5706965e-03, -1.5555274e-02, ...,
        -3.9163311e-03, -1.3239174e-03,  1.9265389e-03],
       [ 4.7919020e-02, -7.2822147e-03, -7.5160660e-02, ...,
         1.7539957e-01,  4.9666399e-03, -2.2687687e-02],
       [-2.9355817e-02,  6.1764568e-02,  8.6450703e-02, ...,
         1.4266199e-01,  3.1894460e-02,  3.9548695e-02],
       ...,
       [ 3.6865745e-02,  5.7535928e-02,  6.8730973e-02, ...,
         1.8154921e-02,  8.8846562e-03, -2.3504503e-02],
       [-8.0239147e-02,  1.4861955e-01, -1.6178912e-01, ...,
        -4.5470163e-02, -6.5931734e-03, -3.9784566e-02],
       [ 5.0012935e-02, -1.0554551e-01,  8.9200288e-01, ...,
         7.8421243e-02,  6.5960027e-02,  3.4465492e-02]], dtype=float32)

In [None]:
import math
import torch
import numpy as np

def trig_multiplication(pair, phi):                       # Definition representing the rotation matrix
    x = pair[0]
    y = pair[1]
    x_new = x * math.cos(phi) - y * math.sin(phi)
    y_new = x * math.sin(phi) + y * math.cos(phi)
    return [x_new, y_new]

def theta(i, dim):                                        # Used in RoPE: Includes the pair value (i) in a continuous value
    return pow(10000, (-2 * i) / dim)

def phi(position, x):                                     # Used in RoPE: Includes the position
    return position * x

def RoPE(embedding, position):                            # Divide the Input tensor to pairs then compute the theta -> phi -> rotation for pair
    final_rope_values = []
    dim = len(embedding)
    for i in range(0, dim, 2):
        pair = [embedding[i], embedding[i + 1]]
        theta_val = theta(i // 2, dim)
        phi_val = phi(position, theta_val)
        rotated = trig_multiplication(pair, phi_val)
        final_rope_values.extend(rotated)
    return final_rope_values

def add_rope(A):                                          # Adds RoPE to a given matrix
    temp = []
    for i, x in enumerate(A):
        temp.append(RoPE(x, i))
    return temp

def self_attention(Q, K, V, head_dim):                    # Self Attention which computes softmax(Q.K').V
    Q = add_rope(Q)                                       # We include RoPE only to Q and K
    K = add_rope(K)
    Q = torch.tensor(Q, dtype=torch.float32)
    K = torch.tensor(K, dtype=torch.float32)
    V = torch.tensor(V, dtype=torch.float32)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(head_dim)
    attn = torch.softmax(scores, dim=-1)
    out = torch.matmul(attn, V)
    return out.numpy()

def grouped_multi_query_attention(embeddings, model, layer_idx, group_size, num_heads):
    
    seq_len, dim = embeddings.shape
    head_dim = dim // num_heads
    final_attention_output = []

    layer = model.model.layers[layer_idx]
    w_q_full = layer.self_attn.q_proj.weight.detach().cpu().numpy().T          # Extract the weights form each attention block
    w_k_full = layer.self_attn.k_proj.weight.detach().cpu().numpy().T
    w_v_full = layer.self_attn.v_proj.weight.detach().cpu().numpy().T
    w_o_full = layer.self_attn.o_proj.weight.detach().cpu().numpy().T

    q_weights = []
    k_weights = []
    v_weights = []

    for i in range(32):                                                        # Slice them accordingly
        q_weights.append(w_q_full[:, i * 128:(i + 1) * 128])

    for i in range(8):
        k_weights.append(w_k_full[:, i * 128:(i + 1) * 128])
        v_weights.append(w_v_full[:, i * 128:(i + 1) * 128])

    counter = 0
    for i in range(8):
        for j in range(4):                                                     # Llama3 2B has 32 heads with 8 groups -> 4 heads in each group
            q_proj = embeddings @ q_weights[j + counter]
            k_proj = embeddings @ k_weights[i]
            v_proj = embeddings @ v_weights[i]
            out = self_attention(q_proj, k_proj, v_proj, head_dim)
            final_attention_output.append(out)
        counter += 4
    final_attention_output = np.stack(final_attention_output, axis=1)
    final_attention_output = final_attention_output.reshape(seq_len, dim)
    final_output = final_attention_output @ w_o_full
    final_output = torch.tensor(final_output, dtype=torch.float32)

    return final_output

In [None]:
# Extract the pre-trained weights for projection of self-attention block

layer = model.model.layers[0]
w_o_full = layer.self_attn.o_proj.weight.detach().cpu().numpy().T
w_o_full.shape

(4096, 4096)

In [15]:
new_output = np.array(grouped_multi_query_attention(rms1_output, model, 0, 4, 32))
new_output.shape

(7, 4096)

In [None]:
# Add then LayerNorm + GMQA output

new_output1 = new_output + rms1_output
new_output1

array([[ 4.84000985e-03,  5.44096343e-03, -1.14721125e-02, ...,
         2.90262979e-04, -4.43270011e-03,  9.36676515e-05],
       [ 4.87546884e-02, -8.19274969e-03, -7.48249590e-02, ...,
         1.79154560e-01,  5.12761623e-03, -2.35228650e-02],
       [-1.99225582e-02,  7.12517202e-02,  8.75984877e-02, ...,
         1.41067356e-01,  3.13803181e-02,  3.93260233e-02],
       ...,
       [ 3.57237682e-02,  6.00414015e-02,  6.51189983e-02, ...,
         2.78638750e-02,  1.32455844e-02, -2.43293326e-02],
       [-7.85560757e-02,  1.49450824e-01, -1.66653946e-01, ...,
        -4.47310992e-02, -3.03264917e-03, -4.02560011e-02],
       [ 4.93738949e-02, -1.06110014e-01,  8.85969758e-01, ...,
         8.39299262e-02,  6.64601400e-02,  3.32902148e-02]], dtype=float32)

In [17]:
layer0 = model.model.layers[0]
rms2 = layer0.post_attention_layernorm.weight.detach().cpu().numpy()
print(rms2.shape)

def rms_final(token, rms_weight):
    norm_factor = np.sqrt(np.mean(token**2) + 1e-10)
    inter1 = token / norm_factor
    final_op = inter1 * rms_weight
    return final_op

rms2_output = []
for x in embeddings:
    rms2_output.append(rms_final(x, rms2))
rms2_output = np.array(rms2_output)
rms2_output


(4096,)


array([[-0.00177518,  0.00508267, -0.0052983 , ..., -0.00702702,
        -0.0046114 ,  0.00988469],
       [ 0.13566115, -0.004889  , -0.02560057, ...,  0.31471696,
         0.01729953, -0.11640602],
       [-0.08310779,  0.04146636,  0.02944609, ...,  0.25597635,
         0.11109307,  0.2029165 ],
       ...,
       [ 0.10436878,  0.03862741,  0.02341055, ...,  0.03257512,
         0.03094655, -0.12059695],
       [-0.22716105,  0.09977745, -0.05510721, ..., -0.08158647,
        -0.02296499, -0.20412673],
       [ 0.14158913, -0.07085919,  0.3038263 , ...,  0.14071012,
         0.2297484 ,  0.1768356 ]], dtype=float32)

In [18]:
rms2_output.shape

(7, 4096)

In [None]:
# We have a Feed Forward Neural Network and it uses 3 weight matrices of mentioned shapes

a = layer0.mlp.gate_proj.weight.detach().cpu().numpy().T  # (4096, 11008)
b = layer0.mlp.up_proj.weight.detach().cpu().numpy().T    # (4096, 11008)
c = layer0.mlp.down_proj.weight.detach().cpu().numpy().T    #(11008, 4096)

In [None]:
out1 = rms2_output @ a
out3 = rms2_output @ b
temp11 = torch.tensor(out1)
temp22 = torch.sigmoid(temp11)
temp33 = temp11 * temp22
temp44 = out3 * np.array(temp33)
final1 = temp44 @ c
final1

array([[-0.00397256,  0.01185522,  0.00321584, ..., -0.03824075,
        -0.01792183, -0.00290041],
       [ 0.02521166, -0.01358126,  0.00272107, ..., -0.017031  ,
        -0.02725086,  0.00742132],
       [ 0.02235064, -0.00985271, -0.00957625, ..., -0.08297252,
        -0.02208163,  0.01350195],
       ...,
       [-0.01091908, -0.03957865,  0.01314883, ..., -0.06557581,
        -0.00155535,  0.01775776],
       [-0.01989701,  0.01341101,  0.01625349, ...,  0.0042439 ,
        -0.01207195,  0.01864491],
       [ 0.00376212,  0.0092607 ,  0.00027169, ..., -0.02507642,
        -0.00829177,  0.01532772]], dtype=float32)

In [21]:
# Decoder 1 output

decoder_output = final1 + rms2_output
decoder_output

array([[-0.00574774,  0.01693789, -0.00208246, ..., -0.04526776,
        -0.02253323,  0.00698428],
       [ 0.16087282, -0.01847026, -0.02287951, ...,  0.29768598,
        -0.00995133, -0.1089847 ],
       [-0.06075715,  0.03161364,  0.01986984, ...,  0.17300382,
         0.08901144,  0.21641845],
       ...,
       [ 0.0934497 , -0.00095124,  0.03655938, ..., -0.03300069,
         0.02939121, -0.10283919],
       [-0.24705806,  0.11318845, -0.03885372, ..., -0.07734256,
        -0.03503694, -0.18548182],
       [ 0.14535126, -0.06159849,  0.30409798, ...,  0.1156337 ,
         0.22145663,  0.19216332]], dtype=float32)

In [29]:
print(type(model))
print(type(model.model))
print(len(model.model.layers))


<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
<class 'transformers.models.llama.modeling_llama.LlamaModel'>
6


In [38]:
final_deocder_output = embeddings
final_deocder_output

array([[-8.2969666e-05,  2.5749207e-04, -2.4604797e-04, ...,
        -3.2424927e-04, -2.1553040e-04,  4.7111511e-04],
       [ 7.5683594e-03, -2.9563904e-04, -1.4190674e-03, ...,
         1.7333984e-02,  9.6511841e-04, -6.6223145e-03],
       [-2.9907227e-03,  1.6174316e-03,  1.0528564e-03, ...,
         9.0942383e-03,  3.9978027e-03,  7.4462891e-03],
       ...,
       [ 3.9367676e-03,  1.5792847e-03,  8.7738037e-04, ...,
         1.2130737e-03,  1.1672974e-03, -4.6386719e-03],
       [-1.5319824e-02,  7.2937012e-03, -3.6926270e-03, ...,
        -5.4321289e-03, -1.5487671e-03, -1.4038086e-02],
       [ 6.4697266e-03, -3.5095215e-03,  1.3793945e-02, ...,
         6.3476562e-03,  1.0498047e-02,  8.2397461e-03]], dtype=float32)

In [39]:
def rms_final(token, rms_weight):
    norm_factor = np.sqrt(np.mean(token**2) + 1e-10)
    inter1 = token / norm_factor
    final_op = inter1 * rms_weight
    return final_op

In [None]:
# Do it for 6 decoders which are present in Llama3 2B model

final_decoder_output = embeddings

for i in range(6):    
    layer0 = model.model.layers[i]
    rms1 = layer0.input_layernorm.weight.detach().cpu().numpy()

    rms1_output = [rms(x, rms1) for x in final_decoder_output]
    rms1_output = np.array(rms1_output)

    new_output = np.array(grouped_multi_query_attention(rms1_output, model, i, 4, 32))

    new_output1 = new_output + rms1_output

    rms2 = layer0.post_attention_layernorm.weight.detach().cpu().numpy()
    rms2_output = [rms_final(x, rms2) for x in new_output1]
    rms2_output = np.array(rms2_output)

    a = layer0.mlp.gate_proj.weight.detach().cpu().numpy().T
    b = layer0.mlp.up_proj.weight.detach().cpu().numpy().T
    c = layer0.mlp.down_proj.weight.detach().cpu().numpy().T

    out1 = rms2_output @ a
    out3 = rms2_output @ b
    temp22 = 1 / (1 + np.exp(-out1))
    temp33 = out1 * temp22
    temp44 = out3 * temp33
    final1 = temp44 @ c
    
    decoder_output = final1 + rms2_output
    final_decoder_output = decoder_output

    print(final_decoder_output)


[[ 0.00370369  0.00315625 -0.01957095 ...  0.00412595 -0.00636029
   0.00516023]
 [ 0.06316976 -0.01191915 -0.06695795 ...  0.16120218  0.0058189
  -0.01470037]
 [-0.01208057  0.05832649  0.08331855 ...  0.03900746 -0.00538583
   0.06911729]
 ...
 [ 0.04187732  0.03938935  0.05905937 ... -0.0230308   0.00347571
  -0.00609357]
 [-0.08196582  0.16044916 -0.17561598 ... -0.04782964 -0.00976431
  -0.01313402]
 [ 0.07059126 -0.10236117  0.81126344 ...  0.04736836  0.06514798
   0.05013653]]
[[-0.00893684  0.0155719  -0.02624726 ... -0.01841088 -0.01887543
   0.01370269]
 [ 0.0379235  -0.01832728 -0.01101133 ...  0.1697223  -0.00391575
  -0.00086351]
 [-0.01384858  0.00284412  0.01639566 ...  0.0120878   0.00333951
   0.05080814]
 ...
 [ 0.02976578  0.01242067  0.02867944 ... -0.07088992 -0.01085001
   0.01498003]
 [-0.05186818  0.06531619 -0.06948203 ... -0.09975584 -0.01028658
   0.00146316]
 [ 0.03568745 -0.06470494  0.30624172 ...  0.0362657   0.02177509
   0.0342348 ]]
[[ 0.00382217 -0.

In [47]:
final_decoder_output

array([[-0.08397061,  0.01375595, -0.04603558, ..., -0.00455476,
        -0.00077217, -0.10941766],
       [ 0.02118995, -0.11229853, -0.08035226, ...,  0.3861957 ,
         0.01962234, -0.10295476],
       [-0.0758604 , -0.01150855, -0.01363534, ...,  0.02478196,
        -0.0554362 ,  0.02456046],
       ...,
       [-0.03769764,  0.09231851,  0.0673345 , ..., -0.11908929,
        -0.00630704, -0.04158225],
       [-0.13418853,  0.13075794, -0.14415407, ..., -0.2183679 ,
        -0.06972754, -0.03379166],
       [ 0.09869295, -0.09452263,  0.34193644, ..., -0.011717  ,
         0.0319374 ,  0.05062286]], dtype=float32)

In [None]:
# Detach the weights for final Linear Layer used for prediction

lm_head_weight = model.lm_head.weight.detach().cpu().numpy().T
lm_head_weight.shape

(4096, 128256)

In [None]:
# Compute the forward Pass

logits = final_decoder_output @ lm_head_weight

In [None]:
# Define and apply Softmax

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)

In [53]:
probs = softmax(logits)
predicted_token_ids = np.argmax(probs, axis=-1)
predicted_token_ids

array([ 66867, 122747,  76917,  91728, 122749,  14148,  52706],
      dtype=int64)

In [None]:
# Final Output sequence for given text

predicted_tokens = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
print(predicted_tokens)


['ouv', '家伙', '.opensource', 'utilus', 'ysa', '555', 'ylland']


In [55]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("andrijdavid/Llama3-2B-Base")
model = AutoModelForCausalLM.from_pretrained("andrijdavid/Llama-3-2B-Base")

inputs = tokenizer("Who is Virat Kohli", return_tensors="pt")
input_ids = inputs["input_ids"]
embeddings = model.model.embed_tokens(input_ids)
hidden_states = embeddings.transpose(0, 1)
layer0 = model.model.layers[0]

In [None]:
with torch.no_grad():
    out = layer0(hidden_states, None, None, None, False, False, None)

out_after_layer0 = out[0].transpose(0, 1)
print(out_after_layer0.shape)


TypeError: cannot unpack non-iterable NoneType object