In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch # pytorch

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using GPU')
else:
    print('Using CPU')

  from .autonotebook import tqdm as notebook_tqdm


Using GPU


We use the imbd dataset for sentiment classification. It already has labels (1 for positive, 0 for negative sentiment).
- See it here: https://huggingface.co/datasets/stanfordnlp/imdb

In [2]:
dataset = load_dataset("imdb")

train_data = dataset['train'].shuffle(seed=42).select(range(2000))  # Limit for faster execution
test_data = dataset['test'].shuffle(seed=42).select(range(500))

In [3]:
model_name = "distilbert-base-uncased" # model name from huggingface repo

tokenizer = AutoTokenizer.from_pretrained(model_name) # load the tokenizer model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # load the model itself

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Note: the above warning is normal. What is happening here is that we are taking the distilbert model, which is just a (smaller) bert model whose final layer is the last encoding layer (no linear layer). Hence, this model just computes vector embeddings for the tokens in a sequence. Then, with AutoModelForSequenceClassification we are putting a linear head on top to take the tokens, mean pool them, and classify the resulting vector into "positive" or "negative".

We can visualize this model (using an [onnx](https://onnx.ai/) version) using [netron](https://github.com/lutzroeder/netron): see [here](https://netron.app/?url=https://huggingface.co/onnxport/distilbert-base-uncased-onnx/blob/main/model.onnx).

Note how the last node (the output) is named "last_hidden_layer". If you click on it you see that the dimension of the layer is: (batch_size, sequence_length, 764).

- Batch size is the size of the batch that passed through the model
- Sequence length is the max number of tokens in the batch
- 764 is the size of the token embeddings

We can use onnx to export the model we just initialized and if we visualize it in netron we see that a linear head has been added on top:

In [4]:
inputs = tokenizer("This is a sample input for ONNX export.", return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.onnx.export(
    model,                                          # Model to export
    (inputs["input_ids"].cuda(), inputs["attention_mask"].cuda()), # Input example (tuple)
    "distilbert_model_binary_classification.onnx",                         # Path to save ONNX model
    input_names=["input_ids", "attention_mask"],     # Names for the inputs
    output_names=["output"],                         # Names for the outputs
    dynamic_axes={                                   # Dynamic axes for variable sequence lengths
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"}
    },
    opset_version=14                                # ONNX opset version
)

Let's now train the transformer model using the transformers library Trainer.

In [5]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

In [6]:
train_data[0].keys()

dict_keys(['text', 'label', 'input_ids', 'attention_mask'])

input_ids is the list of token ids in the input text, and attention_mask is the attention mask vector.

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)



Let's train the transformer. This will take a bit of time to run (note: if you are running on gpu, you can use `nvidia-smi` to monitor gpu utilization).

In [8]:
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,0.398297
2,0.319500,0.416138


{'eval_loss': 0.41613781452178955,
 'eval_runtime': 9.3628,
 'eval_samples_per_second': 53.403,
 'eval_steps_per_second': 6.729,
 'epoch': 2.0}

Note: the above is training the transformer model and updating all the model parameters with backpropagation:

In [9]:
for name, param in model.named_parameters():
    print(f"Parameter name: {name} | Shape: {param.shape} | Requires grad: {param.requires_grad}")

Parameter name: distilbert.embeddings.word_embeddings.weight | Shape: torch.Size([30522, 768]) | Requires grad: True
Parameter name: distilbert.embeddings.position_embeddings.weight | Shape: torch.Size([512, 768]) | Requires grad: True
Parameter name: distilbert.embeddings.LayerNorm.weight | Shape: torch.Size([768]) | Requires grad: True
Parameter name: distilbert.embeddings.LayerNorm.bias | Shape: torch.Size([768]) | Requires grad: True
Parameter name: distilbert.transformer.layer.0.attention.q_lin.weight | Shape: torch.Size([768, 768]) | Requires grad: True
Parameter name: distilbert.transformer.layer.0.attention.q_lin.bias | Shape: torch.Size([768]) | Requires grad: True
Parameter name: distilbert.transformer.layer.0.attention.k_lin.weight | Shape: torch.Size([768, 768]) | Requires grad: True
Parameter name: distilbert.transformer.layer.0.attention.k_lin.bias | Shape: torch.Size([768]) | Requires grad: True
Parameter name: distilbert.transformer.layer.0.attention.v_lin.weight | Shap

We can train only the parameters for the linear head by setting the other parameters to not require gradients. This is done by setting the requires_grad attribute to False for the parameters we don't want to update. This will make the training faster. Typically we do not want to train the transformer parameters because they are already trained on a large corpus of text.

In [10]:
for name, param in model.distilbert.named_parameters():
    param.requires_grad = False

In [11]:
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,No log,0.419419
2,0.100100,0.424694


{'eval_loss': 0.42469412088394165,
 'eval_runtime': 9.2942,
 'eval_samples_per_second': 53.797,
 'eval_steps_per_second': 6.778,
 'epoch': 2.0}

We can now predict with the trained model on new text.

In [12]:
tokens = tokenizer(["this is really bad", "this is really good"], return_tensors="pt", truncation=True, padding=True, return_attention_mask=True)

In [13]:
# disable gradient computation for inference
with torch.no_grad():
    outputs = model(input_ids=tokens["input_ids"].cuda(), attention_mask=tokens["attention_mask"].cuda())

The output of this model is the raw logits. We can use the softmax function to convert the logits to probabilities.

In [16]:
outputs.logits

tensor([[ 2.6164, -3.0171],
        [-2.0582,  2.1609]], device='cuda:0')

In [20]:
predictions = torch.argmax(outputs.logits, dim=1)
for i in range(len(predictions)):
    print(f"Prediction: {predictions[i]} | Probability: {torch.softmax(outputs.logits[i], dim=0)}")
    print(f"Predicted label: {dataset['train'].features['label'].int2str(predictions[i].item())}")

Prediction: 0 | Probability: tensor([0.9964, 0.0036], device='cuda:0')
Predicted label: neg
Prediction: 1 | Probability: tensor([0.0145, 0.9855], device='cuda:0')
Predicted label: pos


Huggingface transformers library has a pipeline class that makes it easy to use the model for inference. In this way, we can use the model to predict the sentiment of new text without having to write the code to tokenize the text, pass it through the model, and convert the logits to probabilities.

In [23]:
# Import the necessary library
from transformers import pipeline

# Create a pipeline for sentiment analysis using an already pre-trained sentiment analysis model
# note: using distilbert here would be very bad because the model is not trained for sentiment analysis
sentiment_analysis = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Analyze the sentiment of a sample text
result = sentiment_analysis("This is a fantastic movie. I love it!")

# Display the result
print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'POSITIVE', 'score': 0.9998807907104492}]


That's great, but how does it work in the background? we can try to implement a (simple) transformer model from scratch to understand how it works!

In [57]:
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, "Embedding size needs to be divisible by heads"
        
        # Define linear transformations for Q, K, V
        self.values = nn.Linear(embed_size, embed_size, bias=False)
        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        
        print("values shape:", values.shape)
        
        # Apply the linear transformations
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(query)
        
        print("values shape:", values.shape)

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)
        
        print("values shape:", values.shape)
        
        # Compute the attention scores
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy / (self.head_dim ** 0.5), dim=3)

        # Compute the weighted sum of the values
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class Transformer(nn.Module):
    def __init__(self, embed_size, heads, num_layers, forward_expansion, dropout, vocab_size, max_length):
        super(Transformer, self).__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(vocab_size, embed_size) # this will initialize the word embeddings for the tokens
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size, heads, dropout, forward_expansion)
                for _ in range(num_layers)
            ]
        )
        
        # Classification head with 2 output units (positive, negative)
        # in the example above it was added on top of distilbert
        self.fc_out = nn.Linear(embed_size, 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask): # here we pass the token ids and the mask tensors
        N, seq_length = x.shape # N is the batch size, while seq_length is the length of the sequence (how many tokens)
        print("input shape:", x.shape)
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device) # this is just the position tensor, it is neeeded for the positional encoding (to know the position of the token in the sequence)
        
        print("position tensor shape:", positions.shape)
        
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        
        print("after first dropout shape", out.shape)

        for layer in self.layers:
            # passing the same vectors three times seem odd, but remember that the self-attention mechanism
            # multiplies each batch vector with three different weight matrices
            out = layer(out, out, out, mask)
            print("transformer block output shape",out.shape)
        
        # Average pooling over the sequence length.
        # not completely accurate as one would have to mask the padding tokens but good enough for now
        out = out.mean(dim=1)
        
        print("after mean pooling shape", out.shape)
        
        # project the output to the classification head
        out = self.fc_out(out)
        print("final shape", out.shape)
        return out

In [58]:
sentence = "I love using transformers library!"
tokens = tokenizer(sentence, return_tensors='pt', padding='max_length', max_length=128, truncation=True)

# convert tokens to tensor format
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

# initialize the Transformer model
model = Transformer(embed_size=768, heads=8, num_layers=6, forward_expansion=4, dropout=0.1, vocab_size=tokenizer.vocab_size, max_length=128)

# pass the tensor through the Transformer model
output = model(input_ids, attention_mask)

# interpret the output. This will be bad as the classification head is not trained.
predicted_class = torch.argmax(output, dim=1).item()
class_names = ["negative", "positive"]
print(f"Predicted sentiment: {class_names[predicted_class]}")

# however, the output has the gradients attached to it, so we can use it to train the model. This is because pytorch is able to keep track of the gradients of the model, i.e., 
# it is able to compute the gradients of the loss function with respect to the model parameters from the model definition. This is done by the autograd package in pytorch.
output

input shape: torch.Size([1, 128])
position tensor shape: torch.Size([1, 128])
after first dropout shape torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 8, 96])
transformer block output shape torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 8, 96])
transformer block output shape torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 8, 96])
transformer block output shape torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 8, 96])
transformer block output shape torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 768])
values shape: torch.Size([1, 128, 8, 96])
transformer block output shape t

tensor([[ 0.3201, -0.1206]], grad_fn=<AddmmBackward0>)

https://ai.stackexchange.com/questions/41477/why-in-multi-head-attention-implementation-should-we-use-3-linear-layers-for-q