In [19]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from bertviz import model_view
from datasets import load_dataset


In [20]:
def kaiming(module):
    if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):
        torch.nn.init.kaiming_normal_(module.weight.data, nonlinearity='relu')

In [21]:
# Load the "dair-ai/emotion" dataset
dataset = load_dataset("dair-ai/emotion")
train_data = dataset["train"]
test_data = dataset["test"]

# Get the number of labels for the classification task
num_labels = len(train_data.features["label"].names)
from transformers import BertForSequenceClassification, BertConfig

# Load the configuration of the original pre-trained model
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_hidden_layers = 6
config.num_attention_heads = 8
config.output_attentions = True
config.num_labels = num_labels

# Initialize your model using the same configuration
model_default_untrained = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = num_labels, output_attentions = True)
model_default_trained = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = num_labels, output_attentions = True)
model_default_trained.load_state_dict(torch.load('default_trained.pt'))
model_6l = BertForSequenceClassification(config)
model_6l.load_state_dict(torch.load('model_new_6layers.pt'))
model_kaiming_untrained = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = num_labels, output_attentions = True)
model_kaiming_untrained.apply(kaiming)
model_kaiming_trained = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = num_labels, output_attentions = True)
model_kaiming_trained.load_state_dict(torch.load('kaiming_trained.pt'))


#model.load_state_dict(torch.load('model_trained_default.pt'))
# Load the pre-trained BERT model and load its fine-tuned weights


# Set the model to evaluation mode and move it to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_default_untrained.to(device)
model_default_untrained.eval()
model_default_trained.to(device)
model_default_trained.eval()
model_6l.to(device)
model_6l.eval()
model_kaiming_untrained.to(device)
model_kaiming_untrained.eval()
model_kaiming_trained.to(device)
model_kaiming_trained.eval()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [22]:
# Choose an example from the test dataset (you can change the index as needed)
# Choose an example from the test dataset (you can change the index as needed)
example_idx = 0
example = test_data[example_idx]

# Tokenize the input text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(example["text"], return_tensors='pt', padding=True, truncation=True)

# Move the inputs to the same device as the model
inputs = {key: val.to(device) for key, val in inputs.items()}

# Perform inference to get the attention weights
outputs_model_default_untrained = model_default_untrained(**inputs)
outputs_model_default_trained = model_default_trained(**inputs)
outputs_model_6l = model_6l(**inputs)
attention_model_default_untrained = outputs_model_default_untrained[-1]  # Assuming you have configured output_attentions=True during model initialization
attention_model_default_trained = outputs_model_default_trained[-1]
attention_model_6l = outputs_model_6l[-1]
# Convert token IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Visualize the attention
print("Default untrained")
model_view(attention_model_default_untrained, tokens)
print("Default trained")
model_view(attention_model_default_trained, tokens)
print("6L")
model_view(attention_model_6l, tokens)

Default untrained


<IPython.core.display.Javascript object>

Default trained


<IPython.core.display.Javascript object>

6L


<IPython.core.display.Javascript object>

In [23]:
# Choose an example from the test dataset (you can change the index as needed)
# Choose an example from the test dataset (you can change the index as needed)
example_idx = 0
example = test_data[example_idx]

# Tokenize the input text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(example["text"], return_tensors='pt', padding=True, truncation=True)

# Move the inputs to the same device as the model
inputs = {key: val.to(device) for key, val in inputs.items()}

# Perform inference to get the attention weights
outputs_model_kaiming_untrained = model_kaiming_untrained(**inputs)
outputs_model_kaiming_trained = model_kaiming_trained(**inputs)

attention_model_kaiming_untrained = outputs_model_kaiming_untrained[-1]  # Assuming you have configured output_attentions=True during model initialization
attention_model_kaiming_trained = outputs_model_kaiming_trained[-1]

# Convert token IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Visualize the attention
print("Kaiming untrained")
model_view(attention_model_kaiming_untrained, tokens)
print("Kaiming trained")
model_view(attention_model_kaiming_untrained, tokens)


Kaiming untrained


<IPython.core.display.Javascript object>

Kaiming trained


<IPython.core.display.Javascript object>