# Step 1: Simulate a Mini Multi-label Dataset

In [1]:
# Sample input sentences (like job descriptions or resume lines)
texts = [
    "Looking for a data analyst skilled in Python and SQL.",
    "We need a cloud engineer with AWS and Docker experience.",
    "This role requires machine learning and deep learning expertise.",
]

# Label vocabulary. Position i of every multi-hot vector refers to LABEL_NAMES[i],
# so the column order lives in code instead of only in a comment.
LABEL_NAMES = ["python", "sql", "aws", "docker", "ml", "dl"]

# Skills mentioned by each sentence, aligned one-to-one with `texts`.
skills_per_text = [
    {"python", "sql"},   # Python + SQL
    {"aws", "docker"},   # AWS + Docker
    {"ml", "dl"},        # ML + DL
]

# Corresponding multi-labels (multi-hot encoded)
# Why? Because multi-label = multiple '1's allowed
labels = [
    [int(name in skills) for name in LABEL_NAMES]
    for skills in skills_per_text
]

# Sanity checks: catch typos in skill names and misaligned inputs early.
assert all(skills <= set(LABEL_NAMES) for skills in skills_per_text), "unknown skill name"
assert len(texts) == len(labels), "every text needs exactly one label vector"
assert all(len(row) == len(LABEL_NAMES) for row in labels), "label vector has wrong width"

# Step 2: Load Tokenizer and Encode Inputs

In [2]:
from transformers import AutoTokenizer

# BERT ships with its own tokenizer; it maps raw text to the token ids
# that the pre-trained checkpoint was trained on.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode the whole batch in one call (yields input_ids, attention_mask, ...).
encodings = tokenizer(
    texts,
    padding=True,         # pad shorter sentences so the batch is rectangular
    truncation=True,      # cut inputs that exceed the model's maximum length
    return_tensors="pt",  # hand back PyTorch tensors directly
)

# Step 3: Create a Custom Dataset

In [3]:
import torch

# Dataset wrapper pairing tokenized inputs with multi-hot targets
class MultiLabelDataset(torch.utils.data.Dataset):
    """Map-style dataset for multi-label classification.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            per-sample indexable collection of values.
        labels: Nested sequence of multi-hot label rows, one row per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Stored as float: one 0.0/1.0 entry per label column.
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        """Number of samples, i.e. the number of label rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample at ``idx``: every encoding field plus ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

# Bundle the tokenized inputs and their multi-hot targets into one dataset
dataset = MultiLabelDataset(encodings=encodings, labels=labels)

Why?
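Trainer expects a PyTorch-style (map-style) dataset that implements __getitem__ and __len__. Each sample must be a dict containing the model inputs (input_ids, attention_mask, …) together with a "labels" entry. The labels are stored as floats because the multi-label loss (BCEWithLogitsLoss) requires float targets.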

# Step 4: Load Model + Configure for Multi-label Classification

In [4]:
from transformers import AutoModelForSequenceClassification

# Seed torch before loading: the classification head is not part of the
# checkpoint and is randomly initialized, so without a fixed seed every
# kernel restart would start training from different weights.
torch.manual_seed(42)

# Load a base BERT model with classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels[0]),  # One output logit per skill column (6 here)
    problem_type="multi_label_classification",  # This is key! Selects the multi-label loss
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Why?
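	•	problem_type="multi_label_classification" makes the model compute its training loss with BCEWithLogitsLoss, which:
	•	Applies a sigmoid to every logit inside the loss, so each label is an independent yes/no decision
	•	Expects float multi-hot targets (this is why the dataset stores labels as float)
	•	Note: the model itself still returns raw logits — at inference time apply torch.sigmoid and a threshold (e.g. 0.5) yourself
✅ Saves us from writing a custom loss implementation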

# Step 5: TrainingArguments + Trainer

In [5]:
from transformers import TrainingArguments, Trainer

# Set basic training configurations
training_args = TrainingArguments(
    output_dir="./results",           # Where checkpoints and trainer outputs are written
    per_device_train_batch_size=2,    # Small batch for demo
    num_train_epochs=3,               # Just 3 epochs for test run
    logging_strategy="epoch",         # Log once per epoch
    report_to="none",                 # Don't auto-enable installed loggers (e.g. W&B login
                                      # prompts) so Restart & Run All works unattended
)

# Trainer handles training loop, batching, optimizer, etc.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)


In [6]:
# Run training
# With 3 samples, a per-device batch size of 2 and 3 epochs, a single-device run
# takes 2 optimizer steps per epoch (6 in total, matching global_step below).
# Keeping the call as the last expression displays the returned TrainOutput.
trainer.train()



Step,Training Loss
2,0.7329
4,0.6481
6,0.6333


TrainOutput(global_step=6, training_loss=0.6713985403378805, metrics={'train_runtime': 21.0003, 'train_samples_per_second': 0.429, 'train_steps_per_second': 0.286, 'total_flos': 69377476860.0, 'train_loss': 0.6713985403378805, 'epoch': 3.0})