In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import pandas as pd
import numpy as np
from datasets import Dataset

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training and validation splits from the shared data directory.
train_df, val_df = map(pd.read_csv, ("../data/train.csv", "../data/val.csv"))

# Report the row count of each split as a quick sanity check.
print("Train size:", train_df.shape[0])
print("Val size:", val_df.shape[0])

# Preview the first rows of the training split.
train_df.head(n=5)

Train size: 13887
Val size: 2451


Unnamed: 0,id,text,label
0,4806,The laptop analytics applications unexpectedly...,other
1,12731,Experienced slow performance with analytics so...,technical_issue
2,12936,"During peak usage times, the SaaS platform is ...",technical_issue
3,11378,Facing connectivity problems that are impactin...,other
4,51,"Dear Customer Support Team, I wish to urgentl...",other


In [3]:
# Build the label <-> integer id mappings from the training split.
# Sorting the unique label names gives the same id assignment on every run.
labels = sorted(train_df["label"].unique())
label2id = {lbl: i for i, lbl in enumerate(labels)}
id2label = {i: lbl for lbl, i in label2id.items()}

# Series.map() leaves NaN for any value that is missing from the dict, which
# would silently turn the integer targets into floats. Fail loudly instead if
# the validation split contains a label the training split never showed.
unknown_val_labels = set(val_df["label"].unique()) - set(label2id)
if unknown_val_labels:
    raise ValueError(
        "Validation split contains labels not present in the training split: "
        f"{sorted(unknown_val_labels, key=str)}"
    )

# Store the encoded targets in a separate `labels` column next to the
# original string `label` column.
train_df["labels"] = train_df["label"].map(label2id)
val_df["labels"]   = val_df["label"].map(label2id)

print(label2id)

{'billing': 0, 'other': 1, 'product_info': 2, 'technical_issue': 3}


In [4]:
# Wrap each split in a Dataset, keeping only the raw text and the mapped
# `labels` column.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[["text", "labels"]]) for frame in (train_df, val_df)
)

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize_function(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded so that each one is exactly
    256 tokens long.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=256)


# Tokenize both splits in batched mode, then drop the raw text column so only
# the tokenizer outputs and the targets remain.
train_enc = train_dataset.map(tokenize_function, batched=True).remove_columns(["text"])
val_enc = val_dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Switch both encoded datasets to the "torch" output format.
for encoded_split in (train_enc, val_enc):
    encoded_split.set_format("torch")

Map: 100%|████████████████████████████████████████████████████████████████████████████████████| 13887/13887 [00:01<00:00, 8802.27 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████| 2451/2451 [00:00<00:00, 9089.59 examples/s]


In [6]:
# Seed torch before the model is built: the classification head is newly
# initialised (not loaded from the checkpoint), so without a seed every
# Restart & Run All starts from different random weights.
SEED = 42
torch.manual_seed(SEED)

# Derive the class count from the mapping itself so it stays correct even if
# the name `labels` is rebound elsewhere in the session.
num_labels = len(label2id)

# Load the pretrained encoder and attach a classification head sized for our
# label set; the id/label maps are stored in the model config.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

print("Model Loaded. Parameters:", sum(p.numel() for p in model.parameters()))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded. Parameters: 66956548


In [7]:
from torch.utils.data import DataLoader

batch_size = 8

# Only the training loader shuffles; the validation loader uses the default order.
train_loader = DataLoader(train_enc, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_enc, batch_size=batch_size)

# Pick the compute device: MPS if available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

print("Using device:", device)

# Move the model parameters to the selected device.
model.to(device)

Using device: mps


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
from tqdm.auto import tqdm

# Fine-tune the model: one pass over the training loader per epoch, followed
# by an accuracy measurement on the validation loader.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3   # reduce to 2 if slow

for epoch in range(num_epochs):
    print(f"\n======== Epoch {epoch+1}/{num_epochs} ========")

    # ---- TRAIN ----
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader):
        # Move every tensor of the batch (inputs and targets) to the device.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss; averaged over batches below.
        train_loss += loss.item()

    print(f"Train Loss: {train_loss / len(train_loader):.4f}")

    # ---- VALIDATION ----
    model.eval()
    correct = total = 0

    with torch.no_grad():
        for batch in val_loader:
            # Keep a reference to the targets before the batch is moved to the
            # device. This is deliberately NOT called `labels`: that name holds
            # the notebook-level list of class names, and overwriting it here
            # would corrupt any cell that is (re-)run afterwards.
            batch_labels = batch["labels"]
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            # Predicted class = index of the largest logit; bring it back to
            # the CPU so it can be compared with `batch_labels`.
            preds = outputs.logits.argmax(dim=-1).cpu()

            correct += (preds == batch_labels).sum().item()
            total += len(batch_labels)

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy:.4f}")




100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1736/1736 [13:29<00:00,  2.14it/s]


Train Loss: 0.8135
Validation Accuracy: 0.6614



100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1736/1736 [15:34<00:00,  1.86it/s]


Train Loss: 0.6880
Validation Accuracy: 0.7079



100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1736/1736 [15:20<00:00,  1.89it/s]


Train Loss: 0.5007
Validation Accuracy: 0.7544


In [9]:
save_dir = "../models/bert_model"
os.makedirs(save_dir, exist_ok=True)

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Model + tokenizer saved to:", save_dir)

Model + tokenizer saved to: ../models/bert_model
