<a href="https://colab.research.google.com/github/rdsmaia/dim0494/blob/main/myLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig
import math

In [None]:
# Checkpoint identifier handed to both from_pretrained calls below.
model_chkpt = 'bert-base-uncased'
# Tokenizer for the checkpoint; used later to turn `text` into token ids.
tokenizer = AutoTokenizer.from_pretrained(model_chkpt)
# Configuration for the checkpoint; later cells read vocab_size, hidden_size,
# num_attention_heads, intermediate_size, etc. from this object.
config = AutoConfig.from_pretrained(model_chkpt)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Example sentence that the following cells tokenize and embed.
text = 'My life is wonderful because I get to train Deep Learning models every day.'

In [None]:
# Encode the sentence as a PyTorch tensor of token ids. Special tokens are
# explicitly disabled (add_special_tokens=False), so only the ids of the
# sentence itself are kept.
tokens = tokenizer(text, return_tensors='pt', add_special_tokens=False)['input_ids']
# Bare expression so the notebook displays the tensor as the cell output.
tokens

tensor([[2026, 2166, 2003, 6919, 2138, 1045, 2131, 2000, 3345, 2784, 4083, 4275,
         2296, 2154, 1012]])

In [None]:
# Report the vocabulary size stored in the config.
print(f'O número de tokens máximo é: {config.vocab_size}')
# Report the hidden (embedding) size stored in the config.
print(f'A dimensão do embedding  é: D={config.hidden_size}')

O número de tokens máximo é: 30522
A dimensão do embedding  é: D=768


In [None]:
# Plain lookup table with one row of size hidden_size per vocabulary entry.
embedding_layer = nn.Embedding(config.vocab_size, config.hidden_size)
# Look up one vector per token id.
embeddings = embedding_layer(tokens)
# Show the shapes before and after the lookup.
print(f'O formato dos tokens de entrada é: {tokens.shape}')
print(f'O formato dos embeddings é: {embeddings.shape}')

O formato dos tokens de entrada é: torch.Size([1, 15])
O formato dos embeddings é: torch.Size([1, 15, 768])


In [None]:
def scaled_dot_product_attention(query, key, value):
  """Compute scaled dot-product attention.

  The query/key dot products are divided by the square root of the size of
  the last dimension of ``key``, turned into weights with a softmax over the
  last dimension, and used to take a weighted sum of ``value``.

  Args:
    query: query tensor.
    key: key tensor; its last two dimensions are transposed before the product.
    value: value tensor that the attention weights are applied to.

  Returns:
    The matrix product of the attention weights and ``value``.
  """
  # scaling factor derived from the number of columns of K
  scale = math.sqrt(key.size(-1))
  # raw alignment energies between every query and every key
  energies = (query @ key.transpose(-2, -1)) / scale
  # normalise the energies into alignment weights
  weights = torch.softmax(energies, dim=-1)
  # weighted combination of the rows of V
  return weights @ value

class AttentionHead(nn.Module):
  """A single self-attention head.

  Holds three independent linear projections (query, key, value) from
  ``embed_dim`` to ``head_dim`` and combines their outputs with
  ``scaled_dot_product_attention``.
  """

  def __init__(self, embed_dim, head_dim):
    """Create the projections.

    Args:
      embed_dim: size of the last dimension of the incoming hidden state.
      head_dim: size of the last dimension produced by each projection.
    """
    super().__init__()
    # W^(Q), W^(K) and W^(V), registered in this order as self.q / self.k / self.v
    for name in ('q', 'k', 'v'):
      setattr(self, name, nn.Linear(embed_dim, head_dim))

  def forward(self, hidden_state):
    """Project ``hidden_state`` into Q, K, V and return the attention output."""
    query = self.q(hidden_state)
    key = self.k(hidden_state)
    value = self.v(hidden_state)
    return scaled_dot_product_attention(query, key, value)

class MultiHeadAttention(nn.Module):
  """Multi-head self-attention built from independent ``AttentionHead`` modules.

  The hidden size is split evenly across the heads; the head outputs are
  concatenated on the last dimension and mixed by a final linear layer.
  """

  def __init__(self, config):
    """Build the heads and the output projection.

    Args:
      config: object exposing ``hidden_size`` and ``num_attention_heads``.

    Raises:
      ValueError: if ``hidden_size`` is not a multiple of
        ``num_attention_heads``.
    """
    super().__init__()
    # embedding dimension
    embed_dim = config.hidden_size
    # number of heads
    num_heads = config.num_attention_heads
    # Floor division below would silently drop the remainder; the concatenated
    # heads would then be narrower than embed_dim and output_linear would only
    # fail later, inside forward, with an opaque shape error.
    if embed_dim % num_heads != 0:
      raise ValueError(
          f'hidden_size ({embed_dim}) must be a multiple of '
          f'num_attention_heads ({num_heads})'
      )
    # dimension of each head
    head_dim = embed_dim // num_heads
    self.heads = nn.ModuleList(
        [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
    )
    self.output_linear = nn.Linear(embed_dim, embed_dim)

  def forward(self, hidden_state):
    """Run every head on ``hidden_state``, concatenate, and project.

    Returns:
      The output of ``output_linear`` applied to the concatenated head outputs.
    """
    # concatenate the per-head outputs on the feature dimension
    x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
    x = self.output_linear(x)
    return x


In [None]:
# Instantiate the multi-head attention layer from the checkpoint config.
multihead_attn = MultiHeadAttention(config)
# Apply it to the embeddings computed in an earlier cell.
attn_output = multihead_attn(embeddings)
# Show the output shape.
print(f'O formato da saída da camada MHA é: {attn_output.shape}')

O formato da saída da camada MHA é: torch.Size([1, 15, 768])


In [None]:
class FeedForward(nn.Module):
  """Feed-forward block: linear -> GELU -> linear -> dropout."""

  def __init__(self, config):
    """Create the two linear layers, the activation and the dropout.

    Args:
      config: object exposing ``hidden_size``, ``intermediate_size`` and
        ``hidden_dropout_prob``.
    """
    super().__init__()
    hidden, inner = config.hidden_size, config.intermediate_size
    # first layer: hidden_size -> intermediate_size
    self.linear_1 = nn.Linear(hidden, inner)
    # second layer: intermediate_size -> hidden_size
    self.linear_2 = nn.Linear(inner, hidden)
    # activation applied between the two layers
    self.gelu = nn.GELU()
    # dropout applied to the output of the second layer
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, x):
    """Apply the block to ``x`` and return the result."""
    activated = self.gelu(self.linear_1(x))
    return self.dropout(self.linear_2(activated))

In [None]:
# Instantiate the feed-forward block from the checkpoint config.
feed_forward = FeedForward(config)
# Apply it to the embeddings computed in an earlier cell.
ff_output = feed_forward(embeddings)
# Show the output shape.
print(f'O formato da saída da camada FF é: {ff_output.shape}')

O formato da saída da camada FF é: torch.Size([1, 15, 768])


In [None]:
class LLMEncoderLayer(nn.Module):
  """Transformer encoder block with pre-layer normalization.

  Each sub-layer (multi-head attention, then feed-forward) receives a
  layer-normalized copy of its input, and its output is added back to the
  un-normalized input, i.e. ``x + sublayer(layer_norm(x))``.
  """

  def __init__(self, config):
    """Create the two layer norms, the attention layer and the feed-forward net.

    Args:
      config: object exposing ``hidden_size`` plus whatever
        ``MultiHeadAttention`` and ``FeedForward`` read from it.
    """
    super().__init__()
    # layer norm 1 (applied before attention)
    self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
    # layer norm 2 (applied before the feed-forward network)
    self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
    # multi-head attention
    self.attention = MultiHeadAttention(config)
    # feed-forward network
    self.feed_forward = FeedForward(config)

  def forward(self, hidden_state):
    """Apply attention and feed-forward sub-layers with skip connections.

    Args:
      hidden_state: input tensor whose last dimension is ``hidden_size``.

    Returns:
      A tensor with the same shape as ``hidden_state``.
    """
    # normalize only the branch that feeds the attention layer
    normed = self.layer_norm_1(hidden_state)
    # skip connection: add the attention output to the ORIGINAL input, not to
    # its normalized copy, so the residual path stays an identity mapping
    x = hidden_state + self.attention(normed)
    # same pattern for the feed-forward sub-layer
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x


In [None]:
# Instantiate a single encoder block from the checkpoint config.
encoder_layer = LLMEncoderLayer(config)
# Apply it to the embeddings computed in an earlier cell.
encoder_output = encoder_layer(embeddings)
# Show the output shape.
print(f'O formato da saída do encoder é: {encoder_output.shape}')

O formato da saída do encoder é: torch.Size([1, 15, 768])


In [None]:
class Embeddings(nn.Module):
  """Token plus learned positional embeddings, then LayerNorm and dropout."""

  def __init__(self, config):
    """Create the embedding tables, the layer norm and the dropout.

    Args:
      config: object exposing ``vocab_size``, ``hidden_size``,
        ``max_position_embeddings`` and ``hidden_dropout_prob``.
    """
    super().__init__()
    # token embedding table
    self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    # positional embedding table (one row per position)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    # normalization
    self.layer_norm = nn.LayerNorm(config.hidden_size)
    # dropout
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, tokens):
    """Embed ``tokens`` and add positional information.

    Args:
      tokens: integer tensor of token ids; its last dimension is the
        sequence length.

    Returns:
      The normalized, dropout-regularized sum of token and positional
      embeddings.
    """
    # look up the token embeddings
    token_embeddings = self.word_embeddings(tokens)
    # positions 0..seq_len-1, created on the same device as `tokens` so the
    # lookup also works when the module lives on an accelerator
    position_ids = torch.arange(
        tokens.size(-1), dtype=torch.long, device=tokens.device
    ).unsqueeze(0)
    position_embeddings = self.position_embeddings(position_ids)
    # sum both embeddings (the leading dim of size 1 broadcasts over the batch)
    embeddings = token_embeddings + position_embeddings
    # normalize the result
    embeddings = self.layer_norm(embeddings)
    # apply dropout
    embeddings = self.dropout(embeddings)
    return embeddings

In [None]:
# NOTE: rebinds `embedding_layer` and `embeddings` from the earlier nn.Embedding
# cell; from here on they refer to the Embeddings module and its output.
embedding_layer = Embeddings(config)
embeddings = embedding_layer(tokens)
# Show the output shape.
print(f'O formato dos embeddings é: {embeddings.shape}')

O formato dos embeddings é: torch.Size([1, 15, 768])


In [None]:
class LLMEncoder(nn.Module):
  """Embedding layer followed by a stack of ``LLMEncoderLayer`` blocks."""

  def __init__(self, config):
    """Build the embeddings and ``config.num_hidden_layers`` encoder blocks.

    Args:
      config: object exposing ``num_hidden_layers`` plus whatever
        ``Embeddings`` and ``LLMEncoderLayer`` read from it.
    """
    super().__init__()
    # embedding layers
    self.embeddings = Embeddings(config)
    # encoder blocks, created one at a time and kept in order
    stack = nn.ModuleList()
    for _ in range(config.num_hidden_layers):
      stack.append(LLMEncoderLayer(config))
    self.layers = stack

  def forward(self, tokens):
    """Embed ``tokens`` and pass the result through every block in order."""
    hidden = self.embeddings(tokens)
    for block in self.layers:
      hidden = block(hidden)
    return hidden

In [None]:
# Instantiate the full encoder (embeddings + stacked blocks) from the config.
encoder = LLMEncoder(config)
# Run it directly on the token ids; it performs its own embedding lookup.
encoder_output = encoder(tokens)
# Show the output shape.
print(f'O formato da saída do encoder é: {encoder_output.shape}')

O formato da saída do encoder é: torch.Size([1, 15, 768])


In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
  """Compute scaled dot-product attention with an optional mask.

  Redefines the earlier mask-less function of the same name; calling it
  without ``mask`` gives the same computation.

  Args:
    query: query tensor.
    key: key tensor; its last two dimensions are transposed before the product.
    value: value tensor that the attention weights are applied to.
    mask: optional tensor broadcastable to the score matrix. Positions where
      ``mask == 0`` are excluded from attention.

  Returns:
    The matrix product of the attention weights and ``value``.
  """
  # number of columns of K
  M_k = key.size(-1)
  # alignment energies
  scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(M_k)
  # masking
  if mask is not None:
    # Use the most negative finite value of the scores' own dtype. A literal
    # such as -1e9 cannot be represented in float16 and makes masked_fill
    # fail there, while -inf would turn fully masked rows into NaN.
    fill_value = torch.finfo(scores.dtype).min
    scores = scores.masked_fill(mask == 0, fill_value)
  # alignment weights
  attention_weights = F.softmax(scores, dim=-1)
  # weighted combination of the rows of V
  return torch.matmul(attention_weights, value)



In [None]:
# Random 15x15 matrix used as stand-in attention scores for the masking demo below.
scores = torch.randn(15,15)
# Bare expression so the notebook displays the matrix as the cell output.
scores

tensor([[-1.4035,  0.6365,  0.9758,  0.9763, -0.6938,  1.4915,  0.4961,  0.6566,
         -1.9132,  1.3764,  0.6862,  0.0039,  0.6198, -0.2273, -0.0610],
        [ 1.7277,  0.4565,  0.4856, -1.2300, -0.4833, -0.3977, -0.0806,  0.7667,
          0.9033,  0.6400,  1.3206, -1.8353, -0.7775, -1.2476,  0.5011],
        [ 0.1205, -1.2391, -1.0687,  0.3336,  0.0411, -0.3984,  0.7619,  0.7545,
         -0.1444,  0.7330,  0.5025,  0.0570,  1.3302,  0.9359,  0.2563],
        [-0.6714,  0.1324,  0.6402, -1.3400, -1.8339, -0.7641, -0.5689,  1.1631,
         -0.0100, -2.4937, -0.2972, -1.0046,  0.6738,  1.2675,  0.2121],
        [-1.4998,  0.0148, -0.7202,  0.3628,  2.2228, -0.5227, -1.7211,  0.9764,
          0.2194, -0.1319, -0.6975,  0.1546, -0.9419,  0.3070,  0.6917],
        [-1.0374,  0.2795, -1.3067,  0.7809,  0.4889, -0.1647, -0.3215,  1.5168,
         -0.6419,  2.3002, -1.1243,  0.3163,  1.9163,  1.3241, -1.9235],
        [ 0.7315,  1.0305, -0.1359, -0.2110,  0.6916,  1.6892, -0.8992, -0.9

In [None]:
# Lower-triangular matrix of ones: entry (i, j) is 1 when j <= i and 0 otherwise.
mask = torch.tril(torch.ones(15,15))
# Replace every score whose mask entry is 0 (above the diagonal) with -inf.
# NOTE: this rebinds `scores`, so the unmasked matrix from the previous cell is
# no longer available afterwards.
scores = scores.masked_fill(mask == 0, -float('inf') )
# Bare expression so the notebook displays the masked matrix as the cell output.
scores

tensor([[-1.4035,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 1.7277,  0.4565,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 0.1205, -1.2391, -1.0687,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.6714,  0.1324,  0.6402, -1.3400,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-1.4998,  0.0148, -0.7202,  0.3628,  2.2228,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-1.0374,  0.2795, -1.3067,  0.7809,  0.4889, -0.1647,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 0.7315,  1.0305, -0.1359, -0.2110,  0.6916,  1.6892, -0.8992,    -