In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50
from transformers import BertModel, BertTokenizer
from torchvision import transforms
from PIL import Image
import random
import json
import numpy as np


In [4]:
class TextEncoder(nn.Module):
    """BERT text encoder with a trainable projection head.

    Token embeddings from ``bert-base-uncased`` are mean-pooled over the
    positions selected by the attention mask, then passed through
    ``Linear -> LayerNorm`` to produce one vector of size ``out_dim`` per
    input sequence.

    Args:
        out_dim: Size of the projected output embedding.
        unfreeze_last_n: Number of final BERT encoder layers to leave
            trainable. The default ``0`` keeps every BERT parameter frozen,
            so only the projection head receives gradients.
    """

    def __init__(self, out_dim, unfreeze_last_n=0):
        super().__init__()
        self.model = BertModel.from_pretrained('bert-base-uncased')

        # Freeze the whole backbone first.
        # NOTE(review): frozen parameters do not imply eval mode; the backbone
        # still follows this module's train()/eval() state, so its dropout is
        # active while training -- confirm that is intended.
        for param in self.model.parameters():
            param.requires_grad = False

        # Optionally re-enable gradients for the last few encoder layers
        # (experiment knob). The guard matters: layer[-0:] would select
        # every layer rather than none.
        if unfreeze_last_n > 0:
            for layer in self.model.encoder.layer[-unfreeze_last_n:]:
                for param in layer.parameters():
                    param.requires_grad = True

        # Read the width from the loaded config instead of hard-coding it
        # (768 for bert-base-uncased).
        hidden_size = self.model.config.hidden_size
        self.projection = nn.Sequential(
            nn.Linear(hidden_size, out_dim),
            nn.LayerNorm(out_dim)
        )

    def forward(self, input_ids, attention_mask):
        """Encode a batch of tokenized sequences.

        Args:
            input_ids: Token ids, passed straight to the BERT model.
            attention_mask: Mask with 1 for real tokens and 0 for padding,
                one row per sequence (batch_size, seq_length).

        Returns:
            Tensor of shape (batch_size, out_dim). A row whose mask is all
            zeros pools to a zero vector before projection instead of NaN.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # (batch_size, seq_length, hidden)

        # (batch_size, seq_length, 1); broadcasts over the hidden dimension.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)

        # Zero out padding positions, then sum over the sequence axis.
        summed_hidden_states = (hidden_states * mask).sum(dim=1)  # (batch_size, hidden)

        # Per-sequence count of real tokens, shape (batch_size, 1). Keeping
        # this 2-D is essential: a (batch_size, 1, hidden) count would
        # broadcast the division to (batch_size, batch_size, hidden).
        # The clamp avoids 0/0 for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1.0)

        avg_pooling = summed_hidden_states / token_counts  # (batch_size, hidden)

        # Apply the projection layer (linear + layernorm)
        return self.projection(avg_pooling)  # (batch_size, out_dim)

In [3]:
# Sanity check: list every parameter of a freshly loaded BERT model and whether
# it is trainable. A fresh BertModel reports requires_grad=True for every
# parameter; the freezing itself happens inside TextEncoder.__init__, not in
# this cell, so this output shows the defaults rather than the frozen state.
# BertModel is imported in the notebook's top import cell.

# Load the BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Print all layers with their names
for name, param in model.named_parameters():
    print(f"Layer: {name} | Trainable: {param.requires_grad}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Layer: embeddings.word_embeddings.weight | Trainable: True
Layer: embeddings.position_embeddings.weight | Trainable: True
Layer: embeddings.token_type_embeddings.weight | Trainable: True
Layer: embeddings.LayerNorm.weight | Trainable: True
Layer: embeddings.LayerNorm.bias | Trainable: True
Layer: encoder.layer.0.attention.self.query.weight | Trainable: True
Layer: encoder.layer.0.attention.self.query.bias | Trainable: True
Layer: encoder.layer.0.attention.self.key.weight | Trainable: True
Layer: encoder.layer.0.attention.self.key.bias | Trainable: True
Layer: encoder.layer.0.attention.self.value.weight | Trainable: True
Layer: encoder.layer.0.attention.self.value.bias | Trainable: True
Layer: encoder.layer.0.attention.output.dense.weight | Trainable: True
Layer: encoder.layer.0.attention.output.dense.bias | Trainable: True
Layer: encoder.layer.0.attention.output.LayerNorm.weight | Trainable: True
Layer: encoder.layer.0.attention.output.LayerNorm.bias | Trainable: True
Layer: encoder.la