In [1]:
import numpy as np

In [2]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
import torch
import numpy as np

# Seed PyTorch's global RNG so that this tensor, and every randomly
# initialised layer created after it, comes out the same on each
# Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

# 3x4 tensor of samples drawn by torch.rand.
x = torch.rand(3, 4)

# Show the tensor's shape.
print(x.shape)

torch.Size([3, 4])


In [4]:
from transformers import AutoTokenizer
# Load the tokenizer published with the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
text = 'I am trying to get a job and get my life back together.'

# Encode the sentence once and reuse the result for both the printout and the
# cells that read `inputs.input_ids`. `return_tensors='pt'` requests PyTorch
# tensors; `add_special_tokens=False` asks for the sentence's own tokens only.
inputs = tokenizer(text, add_special_tokens=False, return_tensors='pt')
print(inputs)

  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': tensor([[1045, 2572, 2667, 2000, 2131, 1037, 3105, 1998, 2131, 2026, 2166, 2067,
         2362, 1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [5]:
from transformers import AutoConfig
# Fetch the configuration published for the bert-base-uncased checkpoint. Later
# cells read its vocab_size, hidden_size, num_attention_heads,
# intermediate_size and hidden_dropout_prob fields to size their layers.
config = AutoConfig.from_pretrained('bert-base-uncased')
print(config)

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.37.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [6]:
from torch import nn

In [7]:
# Embedding table with config.vocab_size rows, each of width config.hidden_size.
token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
print(token_embeddings)

# Expected printout for the bert-base-uncased config:
# Embedding(30522, 768)

Embedding(30522, 768)


In [8]:
# Look up the embedding row for every token id of the tokenised sentence; the
# printed size is (batch, number of tokens, hidden_size).
inputs_embeds = token_embeddings(inputs.input_ids)
print(inputs_embeds.size())

torch.Size([1, 14, 768])


In [9]:
import torch
import torch.nn.functional as F
from math import sqrt

In [10]:
# Self-attention setup: query, key and value all alias the same token
# embeddings. Note that scaled_dot_product_attention takes parameters with
# these same names, so it never reads these module-level variables.
query = key = value = inputs_embeds


def scaled_dot_product_attention(query, key, value):
    """Compute scaled dot-product attention for a batch of sequences.

    Every query position receives a weighted average of the value vectors,
    where the weights are a softmax over that query's scaled dot products
    with all keys.

    Args:
        query: Tensor of shape (batch, n_queries, dim_k).
        key: Tensor of shape (batch, n_keys, dim_k).
        value: Tensor of shape (batch, n_keys, dim_v).

    Returns:
        Tensor of shape (batch, n_queries, dim_v).
    """
    dim_k = query.size(-1)
    # torch.bmm is a batched matrix-matrix product, so scores[b, i, j] is the
    # dot product of query i with key j, scaled by 1 / sqrt(dim_k).
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)

    # Normalise along the key axis (the last one) so that each query's weights
    # sum to 1. Using dim=1 would normalise across queries instead, and the
    # result would no longer be a weighted average of the values.
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

In [11]:
# single_head_attention_layer


class AttentionHead(nn.Module):
    """One attention head: three linear projections followed by attention.

    Args:
        embed_dim: Size of the last dimension of the incoming hidden states.
        head_dim: Size of the last dimension of the projected queries, keys
            and values (and therefore of this head's output).
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Register the projections as attributes q, k and v, in that order,
        # so parameter names and initialisation order stay the same.
        for projection_name in ("q", "k", "v"):
            setattr(self, projection_name, nn.Linear(embed_dim, head_dim))

    def forward(self, hidden_state):
        """Project ``hidden_state`` to queries, keys and values and attend."""
        queries = self.q(hidden_state)
        keys = self.k(hidden_state)
        values = self.v(hidden_state)
        return scaled_dot_product_attention(queries, keys, values)

In [12]:
# Multihead attention layer

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention assembled from independent ``AttentionHead``s.

    The hidden size is split evenly across the heads; the heads' outputs are
    concatenated along the last dimension and passed through a final linear
    layer of size hidden_size x hidden_size.

    Args:
        config: Object exposing ``hidden_size`` and ``num_attention_heads``.

    Raises:
        ValueError: If ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        # Floor division would silently drop the remainder, and the
        # concatenated head outputs would then be narrower than embed_dim,
        # failing later inside output_linear. Reject such configs up front.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"hidden_size ({embed_dim}) must be a multiple of "
                f"num_attention_heads ({num_heads})"
            )
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        """Run every head on ``hidden_state`` and mix the concatenated results.

        Args:
            hidden_state: Tensor whose last dimension is ``hidden_size``.

        Returns:
            Tensor with the same shape as ``hidden_state``.
        """
        # Each head yields a (..., head_dim) slice; concatenating them along
        # the last dimension restores a width of embed_dim.
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [13]:
# Smoke test: run the multi-head attention layer on the token embeddings and
# print the output size so it can be compared with the input's size.

multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)

print(attn_output.size())

torch.Size([1, 14, 768])


In [14]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network with a GELU in between and dropout last.

    Args:
        config: Object exposing ``hidden_size``, ``intermediate_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        model_dim = config.hidden_size
        inner_dim = config.intermediate_size
        # Widen to inner_dim, then project back down to model_dim.
        self.linear_1 = nn.Linear(model_dim, inner_dim)
        self.linear_2 = nn.Linear(inner_dim, model_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Apply linear_1 -> GELU -> linear_2 -> dropout to ``x``."""
        activated = self.gelu(self.linear_1(x))
        return self.dropout(self.linear_2(activated))

In [15]:
# Smoke test: pass the attention output through the feed-forward block and
# print the resulting size.
# NOTE(review): feed_forward is never switched to eval mode in this notebook,
# so its dropout layer is presumably active here; the printed shape is
# unaffected either way.

feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)

print(ff_outputs.size())

torch.Size([1, 14, 768])
