In [None]:
!pip install --upgrade transformers accelerate

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3


In [6]:
!pip install transformers



In [22]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification


class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = (df["utterance"]+" - "+df["context"]).tolist()
        self.labels = df["label"].astype("category")
        self.label_ids = self.labels.cat.codes
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = dict(enumerate(self.labels.cat.categories))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.label_ids.iloc[idx], dtype=torch.long)
        return item


df = pd.read_csv("subset_labeled.tsv", sep="\t")
print("file read\n")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42,
                                     stratify=df["label"])
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_ds = TextDataset(train_df, tokenizer)
test_ds = TextDataset(test_df, tokenizer)
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=8)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(train_df["label"].unique()),
    problem_type="single_label_classification"
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
for epoch in tqdm(range(30)):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"[Epoch {epoch+1}] Loss = {loss.item():.4f}")
model.eval()
preds, gold = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"]
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits.cpu()
        preds.extend(logits.argmax(dim=1).tolist())
        gold.extend(labels.tolist())

label_names = train_df["label"].astype("category").cat.categories
print(classification_report(gold, preds, labels=range(len(label_names)), target_names=label_names))

model.save_pretrained("semantic_classifier_distilbert")
tokenizer.save_pretrained("semantic_classifier_distilbert")

file read



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 1/30 [00:01<00:54,  1.88s/it]

[Epoch 1] Loss = 1.8904


  7%|▋         | 2/30 [00:03<00:50,  1.81s/it]

[Epoch 2] Loss = 1.7094


 10%|█         | 3/30 [00:05<00:48,  1.80s/it]

[Epoch 3] Loss = 1.9251


 13%|█▎        | 4/30 [00:07<00:46,  1.80s/it]

[Epoch 4] Loss = 1.4391


 17%|█▋        | 5/30 [00:09<00:44,  1.80s/it]

[Epoch 5] Loss = 1.2428


 20%|██        | 6/30 [00:10<00:43,  1.80s/it]

[Epoch 6] Loss = 0.7546


 23%|██▎       | 7/30 [00:12<00:41,  1.81s/it]

[Epoch 7] Loss = 0.6724


 27%|██▋       | 8/30 [00:14<00:39,  1.82s/it]

[Epoch 8] Loss = 0.2591


 30%|███       | 9/30 [00:16<00:38,  1.82s/it]

[Epoch 9] Loss = 0.3356


 33%|███▎      | 10/30 [00:18<00:36,  1.82s/it]

[Epoch 10] Loss = 0.0856


 37%|███▋      | 11/30 [00:19<00:34,  1.82s/it]

[Epoch 11] Loss = 0.0752


 40%|████      | 12/30 [00:21<00:32,  1.82s/it]

[Epoch 12] Loss = 0.0441


 43%|████▎     | 13/30 [00:23<00:31,  1.83s/it]

[Epoch 13] Loss = 0.0450


 47%|████▋     | 14/30 [00:25<00:29,  1.84s/it]

[Epoch 14] Loss = 0.0675


 50%|█████     | 15/30 [00:27<00:27,  1.85s/it]

[Epoch 15] Loss = 0.0310


 53%|█████▎    | 16/30 [00:29<00:25,  1.86s/it]

[Epoch 16] Loss = 0.0442


 57%|█████▋    | 17/30 [00:31<00:24,  1.86s/it]

[Epoch 17] Loss = 0.0295


 60%|██████    | 18/30 [00:32<00:22,  1.87s/it]

[Epoch 18] Loss = 0.0203


 63%|██████▎   | 19/30 [00:34<00:20,  1.88s/it]

[Epoch 19] Loss = 0.0136


 67%|██████▋   | 20/30 [00:36<00:18,  1.88s/it]

[Epoch 20] Loss = 0.0194


 70%|███████   | 21/30 [00:38<00:16,  1.88s/it]

[Epoch 21] Loss = 0.0187


 73%|███████▎  | 22/30 [00:40<00:15,  1.88s/it]

[Epoch 22] Loss = 0.0242


 77%|███████▋  | 23/30 [00:42<00:13,  1.88s/it]

[Epoch 23] Loss = 0.0134


 80%|████████  | 24/30 [00:44<00:11,  1.87s/it]

[Epoch 24] Loss = 0.0125


 83%|████████▎ | 25/30 [00:46<00:09,  1.86s/it]

[Epoch 25] Loss = 0.0114


 87%|████████▋ | 26/30 [00:47<00:07,  1.85s/it]

[Epoch 26] Loss = 0.0090


 90%|█████████ | 27/30 [00:49<00:05,  1.84s/it]

[Epoch 27] Loss = 0.0070


 93%|█████████▎| 28/30 [00:51<00:03,  1.84s/it]

[Epoch 28] Loss = 0.0115


 97%|█████████▋| 29/30 [00:53<00:01,  1.84s/it]

[Epoch 29] Loss = 0.0068


100%|██████████| 30/30 [00:55<00:00,  1.84s/it]

[Epoch 30] Loss = 0.0157
              precision    recall  f1-score   support

         AGG       0.20      0.20      0.20         5
        ARTH       0.17      0.33      0.22         3
        COMP       0.33      0.33      0.33         3
      LOOKUP       0.50      0.50      0.50         4
        Next       0.00      0.00      0.00         1
       SUPER       1.00      0.50      0.67         4
       other       0.00      0.00      0.00         0

    accuracy                           0.35        20
   macro avg       0.31      0.27      0.27        20
weighted avg       0.42      0.35      0.37        20




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


('semantic_classifier_distilbert/tokenizer_config.json',
 'semantic_classifier_distilbert/special_tokens_map.json',
 'semantic_classifier_distilbert/vocab.txt',
 'semantic_classifier_distilbert/added_tokens.json',
 'semantic_classifier_distilbert/tokenizer.json')