In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import torch.nn as nn

torch: Core PyTorch library for tensors and GPU support.

AutoTokenizer: Automatically selects the right tokenizer for the model.

AutoModelForSequenceClassification: Loads a BERT-like model ready for classification.

DataLoader, Dataset: PyTorch utilities to load data in batches.

functional and nn: For using built-in loss functions and layers.



In [2]:
# 1. Setup: Tokenizer and Model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model_name: Specifies the pretrained model to use.

tokenizer: Converts raw text into input IDs + attention masks.

model: Loads a classification head on top of DistilBERT with 2 output labels.



In [7]:
# 2. Device Handling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Detects whether GPU (CUDA) is available.

Moves the model to the correct device for training.



In [4]:
# 3. Dummy Dataset
class DummyDataset(Dataset):
    def __init__(self, tokenizer):
        self.samples = ["hello world", "huggingface rocks", "open source is future", "transformers rule"]
        self.labels = [0, 1, 0, 1]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        encoded = self.tokenizer(self.samples[idx], truncation=True, padding="max_length", max_length=16, return_tensors="pt")
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


__init__: Initializes some sample text and binary labels (0 or 1).

__len__: Returns dataset length.

__getitem__: Tokenizes a sentence, adds label, returns a dictionary with:

input_ids, attention_mask, and labels.



In [5]:
dataset = DummyDataset(tokenizer)
dataloader = DataLoader(dataset, batch_size=2)

# 4. Loss Function
loss_fn = nn.CrossEntropyLoss()

Wraps the dataset into a DataLoader that gives batches of size 2.



In [6]:
# 5. Forward Pass + Loss Computation Loop
model.train()
for batch in dataloader:
    # Move to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass
    outputs = model(input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"])

    # outputs.loss is computed inside HF model, but let’s also do it manually
    logits = outputs.logits
    labels = batch["labels"]
    loss = loss_fn(logits, labels)

    # Print for tracking
    print(f"Logits: {logits}")
    print(f"Labels: {labels}")
    print(f"Loss: {loss.item()}")

    # Backward + Optimizer logic would go here
    # loss.backward()
    # optimizer.step()
    # optimizer.zero_grad()

Logits: tensor([[-0.1124,  0.0974],
        [-0.0601,  0.0367]], grad_fn=<AddmmBackward0>)
Labels: tensor([0, 1])
Loss: 0.724736750125885
Logits: tensor([[-0.0858,  0.1366],
        [-0.0734,  0.0856]], grad_fn=<AddmmBackward0>)
Labels: tensor([0, 1])
Loss: 0.7136544585227966
