# DistilBERT Fine-tuning vs Zero-Shot Baseline (Toy Intent Dataset)

This notebook loads a toy 15-intent dataset, evaluates a zero-shot baseline (`facebook/bart-large-mnli`), fine-tunes DistilBERT, and compares Accuracy/Macro-F1 on the test set.

In [8]:
# 🚀 Environment Setup Cell
# Run this once at the start of the notebook


import os
import subprocess

# Optional: create and activate a virtual environment
# (if you are running locally and want isolation)
venv_dir = "intent_env"

if not os.path.exists(venv_dir):
    subprocess.run([sys.executable, "-m", "venv", venv_dir])
    print(f"✅ Created virtual environment: {venv_dir}")
    # On Linux/macOS, to activate manually later: source intent_env/bin/activate
    # On Windows (PowerShell): .\\intent_env\\Scripts\\Activate.ps1

# Install all dependencies into the current environment
# 🚀 Environment Setup Cell (safe with pinned versions)
# Run this once at the start of the notebook

!pip install --upgrade pip
!pip install -q \
    torch==2.3.1 \
    transformers==4.44.2 \
    datasets==2.20.0 \
    evaluate==0.4.2 \
    accelerate==0.34.2 \
    scikit-learn==1.5.1 \
    pandas==2.2.2 \
    numpy==1.26.4 \
    fastapi \
    uvicorn


Collecting pip
  Using cached pip-25.2-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
e:\PROJECTS_RK\Carnot\intent_env\Scripts\python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Setup

In [3]:

# !pip -q install -U transformers datasets evaluate scikit-learn accelerate
import os
os.environ["USE_TF"] = "0"

import random, numpy as np, pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
from datasets import Dataset, DatasetDict

SEED = 42
random.seed(SEED); np.random.seed(SEED)

# Load dataset
CANDIDATES = ['/mnt/data/intent_dataset.csv','./intent_dataset.csv','/content/intent_dataset.csv']
data_path = next((p for p in CANDIDATES if os.path.exists(p)), None)
assert data_path, "intent_dataset.csv not found. Upload it first."
df = pd.read_csv(data_path)

labels = sorted(df['label'].unique().tolist())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}

train_df = df[df['split']=='train'].copy()
val_df   = df[df['split']=='val'].copy()
test_df  = df[df['split']=='test'].copy()

def to_ds(d): return Dataset.from_pandas(d[['text','label']].reset_index(drop=True))
raw = DatasetDict({'train': to_ds(train_df), 'validation': to_ds(val_df), 'test': to_ds(test_df)})
print(raw)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 15
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 15
    })
})


## Zero-shot Baseline (BART-MNLI)

In [4]:

zs = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=-1)
candidate_labels = labels

def zs_predict(texts):
    pred = []
    for t in texts:
        r = zs(t, candidate_labels=candidate_labels, multi_label=False)
        pred.append(r['labels'][0])
    return pred

zs_preds = zs_predict(test_df['text'].tolist())
zs_acc = accuracy_score(test_df['label'], zs_preds)
zs_f1  = f1_score(test_df['label'], zs_preds, average='macro')
print(f"Zero-shot Accuracy: {zs_acc:.4f}\nZero-shot Macro-F1: {zs_f1:.4f}")
print("\nReport:\n", classification_report(test_df['label'], zs_preds, digits=3))




Zero-shot Accuracy: 0.6000
Zero-shot Macro-F1: 0.5222

Report:
               precision    recall  f1-score   support

 book_flight      1.000     1.000     1.000         1
cancel_alarm      1.000     1.000     1.000         1
     goodbye      1.000     1.000     1.000         1
       greet      0.500     1.000     0.667         1
    navigate      0.500     1.000     0.667         1
 order_pizza      1.000     1.000     1.000         1
  play_music      0.000     0.000     0.000         1
    reminder      0.333     1.000     0.500         1
  send_email      0.000     0.000     0.000         1
   set_alarm      0.000     0.000     0.000         1
  small_talk      0.000     0.000     0.000         1
   tell_joke      0.000     0.000     0.000         1
   thank_you      0.000     0.000     0.000         1
        time      1.000     1.000     1.000         1
     weather      1.000     1.000     1.000         1

    accuracy                          0.600        15
   macro avg    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Fine-tune DistilBERT

In [5]:
from transformers import EarlyStoppingCallback

model_name = 'distilbert-base-uncased'
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# tokenize and encode labels
def preprocess(example):
    enc = tok(example['text'], truncation=True)
    enc["labels"] = label2id[example['label']]   # numeric label
    return enc

tokd = raw.map(preprocess)

# 🚨 Drop the original string column 'label'
tokd = tokd.remove_columns(["label"])

collator = DataCollatorWithPadding(tokenizer=tok)

# metrics
def metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average='macro')
    }

args = TrainingArguments(
    output_dir='./outputs',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=25,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    logging_steps=5,
    seed=SEED,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokd['train'],
    eval_dataset=tokd['validation'],
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]

)

# train
trainer.train()

trainer.save_model("distilbert")  # For DistilBERT


# evaluate on test
test_metrics = trainer.evaluate(tokd['test'])
print("Test metrics:", test_metrics)

# predictions
pred_output = trainer.predict(tokd['test'])
preds = np.argmax(pred_output.predictions, axis=1)
true  = pred_output.label_ids

# classification report
print("\nClassification report (fine-tuned):\n",
      classification_report(true, preds,
                            labels=list(label2id.values()),
                            target_names=labels,
                            digits=3,
                            zero_division=0))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 45/45 [00:00<00:00, 1419.89 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 2055.36 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 1301.85 examples/s]
  3%|▎         | 5/150 [00:09<03:31,  1.46s/it]

{'loss': 2.7227, 'grad_norm': 2.9186813831329346, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.83}


                                               
  4%|▍         | 6/150 [00:10<03:08,  1.31s/it]

{'eval_loss': 2.673696279525757, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.01111111111111111, 'eval_runtime': 0.2285, 'eval_samples_per_second': 65.639, 'eval_steps_per_second': 8.752, 'epoch': 1.0}


  7%|▋         | 10/150 [00:17<03:08,  1.34s/it]

{'loss': 2.6264, 'grad_norm': 2.811199426651001, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.67}


                                                
  8%|▊         | 12/150 [00:19<02:26,  1.06s/it]

{'eval_loss': 2.579871654510498, 'eval_accuracy': 0.3333333333333333, 'eval_macro_f1': 0.21555555555555556, 'eval_runtime': 0.155, 'eval_samples_per_second': 96.744, 'eval_steps_per_second': 12.899, 'epoch': 2.0}


 10%|█         | 15/150 [00:23<02:42,  1.21s/it]

{'loss': 2.538, 'grad_norm': 3.7194552421569824, 'learning_rate': 4.5e-05, 'epoch': 2.5}


                                                
 12%|█▏        | 18/150 [00:25<01:59,  1.10it/s]

{'eval_loss': 2.445007562637329, 'eval_accuracy': 0.5333333333333333, 'eval_macro_f1': 0.4444444444444445, 'eval_runtime': 0.1444, 'eval_samples_per_second': 103.864, 'eval_steps_per_second': 13.848, 'epoch': 3.0}


 13%|█▎        | 20/150 [00:29<02:43,  1.25s/it]

{'loss': 2.3414, 'grad_norm': 4.2846174240112305, 'learning_rate': 4.3333333333333334e-05, 'epoch': 3.33}


                                                
 16%|█▌        | 24/150 [00:33<02:10,  1.04s/it]

{'eval_loss': 2.2874057292938232, 'eval_accuracy': 0.5333333333333333, 'eval_macro_f1': 0.4444444444444445, 'eval_runtime': 0.1578, 'eval_samples_per_second': 95.028, 'eval_steps_per_second': 12.67, 'epoch': 4.0}


 17%|█▋        | 25/150 [00:35<02:54,  1.40s/it]

{'loss': 2.1571, 'grad_norm': 4.053956031799316, 'learning_rate': 4.166666666666667e-05, 'epoch': 4.17}


 20%|██        | 30/150 [00:39<02:01,  1.01s/it]

{'loss': 1.9223, 'grad_norm': 6.268595218658447, 'learning_rate': 4e-05, 'epoch': 5.0}


                                                
 20%|██        | 30/150 [00:40<02:01,  1.01s/it]

{'eval_loss': 2.142947196960449, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6888888888888888, 'eval_runtime': 0.1807, 'eval_samples_per_second': 83.019, 'eval_steps_per_second': 11.069, 'epoch': 5.0}


 23%|██▎       | 35/150 [00:45<01:56,  1.01s/it]

{'loss': 1.685, 'grad_norm': 4.566011905670166, 'learning_rate': 3.8333333333333334e-05, 'epoch': 5.83}


                                                
 24%|██▍       | 36/150 [00:47<01:52,  1.01it/s]

{'eval_loss': 2.000321388244629, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6888888888888888, 'eval_runtime': 0.2426, 'eval_samples_per_second': 61.822, 'eval_steps_per_second': 8.243, 'epoch': 6.0}


 27%|██▋       | 40/150 [00:51<01:56,  1.06s/it]

{'loss': 1.4889, 'grad_norm': 4.367091655731201, 'learning_rate': 3.6666666666666666e-05, 'epoch': 6.67}


                                                
 28%|██▊       | 42/150 [00:53<01:48,  1.01s/it]

{'eval_loss': 1.8625670671463013, 'eval_accuracy': 0.6666666666666666, 'eval_macro_f1': 0.5777777777777777, 'eval_runtime': 0.201, 'eval_samples_per_second': 74.611, 'eval_steps_per_second': 9.948, 'epoch': 7.0}


 30%|███       | 45/150 [00:58<02:07,  1.21s/it]

{'loss': 1.3956, 'grad_norm': 4.754559516906738, 'learning_rate': 3.5e-05, 'epoch': 7.5}


                                                
 32%|███▏      | 48/150 [01:01<01:48,  1.06s/it]

{'eval_loss': 1.7411448955535889, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6444444444444444, 'eval_runtime': 0.2178, 'eval_samples_per_second': 68.882, 'eval_steps_per_second': 9.184, 'epoch': 8.0}


 33%|███▎      | 50/150 [01:04<02:05,  1.26s/it]

{'loss': 1.1808, 'grad_norm': 4.81233549118042, 'learning_rate': 3.3333333333333335e-05, 'epoch': 8.33}


                                                
 36%|███▌      | 54/150 [01:07<01:31,  1.05it/s]

{'eval_loss': 1.6071878671646118, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1483, 'eval_samples_per_second': 101.123, 'eval_steps_per_second': 13.483, 'epoch': 9.0}


 37%|███▋      | 55/150 [01:09<02:02,  1.29s/it]

{'loss': 1.0022, 'grad_norm': 3.8928184509277344, 'learning_rate': 3.1666666666666666e-05, 'epoch': 9.17}


 40%|████      | 60/150 [01:14<01:25,  1.05it/s]

{'loss': 0.9121, 'grad_norm': 4.591842174530029, 'learning_rate': 3e-05, 'epoch': 10.0}


                                                
 40%|████      | 60/150 [01:14<01:25,  1.05it/s]

{'eval_loss': 1.5116381645202637, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1747, 'eval_samples_per_second': 85.86, 'eval_steps_per_second': 11.448, 'epoch': 10.0}


 43%|████▎     | 65/150 [01:19<01:24,  1.01it/s]

{'loss': 0.7626, 'grad_norm': 3.213477373123169, 'learning_rate': 2.8333333333333335e-05, 'epoch': 10.83}


                                                
 44%|████▍     | 66/150 [01:20<01:19,  1.05it/s]

{'eval_loss': 1.4129141569137573, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6444444444444444, 'eval_runtime': 0.2529, 'eval_samples_per_second': 59.311, 'eval_steps_per_second': 7.908, 'epoch': 11.0}


 47%|████▋     | 70/150 [01:25<01:22,  1.03s/it]

{'loss': 0.7236, 'grad_norm': 3.295912504196167, 'learning_rate': 2.6666666666666667e-05, 'epoch': 11.67}


                                                
 48%|████▊     | 72/150 [01:27<01:13,  1.06it/s]

{'eval_loss': 1.3271335363388062, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6444444444444444, 'eval_runtime': 0.1671, 'eval_samples_per_second': 89.741, 'eval_steps_per_second': 11.965, 'epoch': 12.0}


 50%|█████     | 75/150 [01:31<01:21,  1.08s/it]

{'loss': 0.6287, 'grad_norm': 3.8060712814331055, 'learning_rate': 2.5e-05, 'epoch': 12.5}


                                                
 52%|█████▏    | 78/150 [01:34<01:12,  1.00s/it]

{'eval_loss': 1.261046051979065, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.3922, 'eval_samples_per_second': 38.248, 'eval_steps_per_second': 5.1, 'epoch': 13.0}


 53%|█████▎    | 80/150 [01:38<01:41,  1.45s/it]

{'loss': 0.549, 'grad_norm': 2.946098804473877, 'learning_rate': 2.3333333333333336e-05, 'epoch': 13.33}


                                                
 56%|█████▌    | 84/150 [01:42<01:14,  1.13s/it]

{'eval_loss': 1.2095402479171753, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.231, 'eval_samples_per_second': 64.938, 'eval_steps_per_second': 8.658, 'epoch': 14.0}


 57%|█████▋    | 85/150 [01:45<01:46,  1.64s/it]

{'loss': 0.493, 'grad_norm': 2.2681174278259277, 'learning_rate': 2.1666666666666667e-05, 'epoch': 14.17}


 60%|██████    | 90/150 [01:50<01:10,  1.18s/it]

{'loss': 0.4563, 'grad_norm': 3.0421011447906494, 'learning_rate': 2e-05, 'epoch': 15.0}


                                                
 60%|██████    | 90/150 [01:50<01:10,  1.18s/it]

{'eval_loss': 1.162945032119751, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6444444444444444, 'eval_runtime': 0.2124, 'eval_samples_per_second': 70.612, 'eval_steps_per_second': 9.415, 'epoch': 15.0}


 63%|██████▎   | 95/150 [01:57<01:04,  1.18s/it]

{'loss': 0.4269, 'grad_norm': 2.803086757659912, 'learning_rate': 1.8333333333333333e-05, 'epoch': 15.83}


                                                
 64%|██████▍   | 96/150 [01:58<01:01,  1.14s/it]

{'eval_loss': 1.1298209428787231, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333332, 'eval_runtime': 0.2189, 'eval_samples_per_second': 68.532, 'eval_steps_per_second': 9.138, 'epoch': 16.0}


 67%|██████▋   | 100/150 [02:03<00:57,  1.16s/it]

{'loss': 0.3644, 'grad_norm': 2.1398403644561768, 'learning_rate': 1.6666666666666667e-05, 'epoch': 16.67}


                                                 
 68%|██████▊   | 102/150 [02:05<00:48,  1.02s/it]

{'eval_loss': 1.0959430932998657, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.1841, 'eval_samples_per_second': 81.498, 'eval_steps_per_second': 10.866, 'epoch': 17.0}


 70%|███████   | 105/150 [02:09<00:53,  1.18s/it]

{'loss': 0.3412, 'grad_norm': 1.8768434524536133, 'learning_rate': 1.5e-05, 'epoch': 17.5}


                                                 
 72%|███████▏  | 108/150 [02:12<00:42,  1.02s/it]

{'eval_loss': 1.072197437286377, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6444444444444444, 'eval_runtime': 0.1812, 'eval_samples_per_second': 82.769, 'eval_steps_per_second': 11.036, 'epoch': 18.0}


 73%|███████▎  | 110/150 [02:15<00:50,  1.27s/it]

{'loss': 0.3431, 'grad_norm': 2.113734722137451, 'learning_rate': 1.3333333333333333e-05, 'epoch': 18.33}


                                                 
 76%|███████▌  | 114/150 [02:19<00:36,  1.02s/it]

{'eval_loss': 1.0447717905044556, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1773, 'eval_samples_per_second': 84.594, 'eval_steps_per_second': 11.279, 'epoch': 19.0}


 77%|███████▋  | 115/150 [02:21<00:47,  1.36s/it]

{'loss': 0.306, 'grad_norm': 1.9782265424728394, 'learning_rate': 1.1666666666666668e-05, 'epoch': 19.17}


 80%|████████  | 120/150 [02:26<00:27,  1.07it/s]

{'loss': 0.2867, 'grad_norm': 2.1004390716552734, 'learning_rate': 1e-05, 'epoch': 20.0}


                                                 
 80%|████████  | 120/150 [02:26<00:27,  1.07it/s]

{'eval_loss': 1.015674352645874, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.147, 'eval_samples_per_second': 102.032, 'eval_steps_per_second': 13.604, 'epoch': 20.0}


 83%|████████▎ | 125/150 [02:31<00:25,  1.01s/it]

{'loss': 0.2719, 'grad_norm': 1.6629743576049805, 'learning_rate': 8.333333333333334e-06, 'epoch': 20.83}


                                                 
 84%|████████▍ | 126/150 [02:32<00:23,  1.03it/s]

{'eval_loss': 1.0036247968673706, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1904, 'eval_samples_per_second': 78.797, 'eval_steps_per_second': 10.506, 'epoch': 21.0}


 87%|████████▋ | 130/150 [02:37<00:21,  1.08s/it]

{'loss': 0.2503, 'grad_norm': 1.5113637447357178, 'learning_rate': 6.666666666666667e-06, 'epoch': 21.67}


                                                 
 88%|████████▊ | 132/150 [02:39<00:18,  1.00s/it]

{'eval_loss': 0.9944483041763306, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.2258, 'eval_samples_per_second': 66.439, 'eval_steps_per_second': 8.858, 'epoch': 22.0}


 90%|█████████ | 135/150 [02:43<00:17,  1.14s/it]

{'loss': 0.253, 'grad_norm': 1.672621488571167, 'learning_rate': 5e-06, 'epoch': 22.5}


                                                 
 92%|█████████▏| 138/150 [02:46<00:11,  1.00it/s]

{'eval_loss': 0.984079122543335, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1702, 'eval_samples_per_second': 88.127, 'eval_steps_per_second': 11.75, 'epoch': 23.0}


 93%|█████████▎| 140/150 [02:49<00:11,  1.19s/it]

{'loss': 0.2322, 'grad_norm': 1.3074082136154175, 'learning_rate': 3.3333333333333333e-06, 'epoch': 23.33}


                                                 
 96%|█████████▌| 144/150 [02:53<00:05,  1.02it/s]

{'eval_loss': 0.975184440612793, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1468, 'eval_samples_per_second': 102.159, 'eval_steps_per_second': 13.621, 'epoch': 24.0}


 97%|█████████▋| 145/150 [02:55<00:06,  1.33s/it]

{'loss': 0.2426, 'grad_norm': 1.5266014337539673, 'learning_rate': 1.6666666666666667e-06, 'epoch': 24.17}


100%|██████████| 150/150 [02:59<00:00,  1.04it/s]

{'loss': 0.2354, 'grad_norm': 2.0505197048187256, 'learning_rate': 0.0, 'epoch': 25.0}


                                                 
100%|██████████| 150/150 [03:01<00:00,  1.04it/s]

{'eval_loss': 0.9724354147911072, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7333333333333333, 'eval_runtime': 0.1647, 'eval_samples_per_second': 91.099, 'eval_steps_per_second': 12.147, 'epoch': 25.0}


100%|██████████| 150/150 [03:16<00:00,  1.31s/it]


{'train_runtime': 196.3655, 'train_samples_per_second': 5.729, 'train_steps_per_second': 0.764, 'train_loss': 0.9713185540835063, 'epoch': 25.0}


100%|██████████| 2/2 [00:00<00:00, 22.35it/s]


Test metrics: {'eval_loss': 0.9929536581039429, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7777777777777778, 'eval_runtime': 0.1788, 'eval_samples_per_second': 83.89, 'eval_steps_per_second': 11.185, 'epoch': 25.0}


100%|██████████| 2/2 [00:00<00:00, 19.31it/s]



Classification report (fine-tuned):
               precision    recall  f1-score   support

 book_flight      1.000     1.000     1.000         1
cancel_alarm      1.000     1.000     1.000         1
     goodbye      0.000     0.000     0.000         1
       greet      0.000     0.000     0.000         1
    navigate      1.000     1.000     1.000         1
 order_pizza      1.000     1.000     1.000         1
  play_music      1.000     1.000     1.000         1
    reminder      0.500     1.000     0.667         1
  send_email      1.000     1.000     1.000         1
   set_alarm      0.000     0.000     0.000         1
  small_talk      1.000     1.000     1.000         1
   tell_joke      1.000     1.000     1.000         1
   thank_you      1.000     1.000     1.000         1
        time      1.000     1.000     1.000         1
     weather      1.000     1.000     1.000         1

    accuracy                          0.800        15
   macro avg      0.767     0.800     0.77

## RoBERTa-base

In [6]:
import os, random, numpy as np, pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
model_name = "roberta-base"
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# tokenize and encode labels
def preprocess(example):
    enc = tok(example['text'], truncation=True)
    enc["labels"] = label2id[example['label']]   # numeric label
    return enc

tokd = raw.map(preprocess)

# 🚨 Drop the original string column 'label'
tokd = tokd.remove_columns(["label"])

collator = DataCollatorWithPadding(tokenizer=tok)


# ------------------------
# Metrics
# ------------------------
def metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average='macro')
    }

# ------------------------
# Training Arguments
# ------------------------
args = TrainingArguments(
    output_dir='./outputs_roberta',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,  # smaller for RoBERTa
    num_train_epochs=20, # more epochs, with early stopping
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    logging_steps=10,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    seed=SEED,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokd['train'],
    eval_dataset=tokd['validation'],
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# ------------------------
# Train
# ------------------------
trainer.train()

trainer.save_model("roberta")


# ------------------------
# Evaluate on Test
# ------------------------
test_metrics_roberta = trainer.evaluate(tokd['test'])
print("\nTest metrics:", test_metrics_roberta)

# Predictions + Report
preds = trainer.predict(tokd['test']).predictions.argmax(axis=1)
true  = [label2id[l] for l in test_df['label']]

print("\nClassification Report (RoBERTa-base):\n",
      classification_report(true, preds, target_names=labels, digits=3))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 45/45 [00:00<00:00, 875.80 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 2094.71 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 1380.49 examples/s]
  5%|▌         | 6/120 [00:13<03:13,  1.70s/it]
  5%|▌         | 6/120 [00:13<03:13,  1.70s/it]

{'eval_loss': 2.7162394523620605, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008333333333333333, 'eval_runtime': 0.2541, 'eval_samples_per_second': 59.032, 'eval_steps_per_second': 7.871, 'epoch': 1.0}


  8%|▊         | 10/120 [00:22<03:18,  1.80s/it]

{'loss': 2.7246, 'grad_norm': 2.4551167488098145, 'learning_rate': 5e-06, 'epoch': 1.67}


 10%|█         | 12/120 [00:24<02:53,  1.61s/it]
 10%|█         | 12/120 [00:25<02:53,  1.61s/it]

{'eval_loss': 2.715085506439209, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008333333333333333, 'eval_runtime': 0.3057, 'eval_samples_per_second': 49.072, 'eval_steps_per_second': 6.543, 'epoch': 2.0}


 15%|█▌        | 18/120 [00:35<02:26,  1.43s/it]
 15%|█▌        | 18/120 [00:35<02:26,  1.43s/it]

{'eval_loss': 2.713257074356079, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008333333333333333, 'eval_runtime': 0.2704, 'eval_samples_per_second': 55.483, 'eval_steps_per_second': 7.398, 'epoch': 3.0}


 17%|█▋        | 20/120 [00:40<03:13,  1.94s/it]

{'loss': 2.7109, 'grad_norm': 4.0561957359313965, 'learning_rate': 1e-05, 'epoch': 3.33}


 20%|██        | 24/120 [00:46<02:35,  1.62s/it]
 20%|██        | 24/120 [00:46<02:35,  1.62s/it]

{'eval_loss': 2.710153102874756, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008333333333333333, 'eval_runtime': 0.3073, 'eval_samples_per_second': 48.819, 'eval_steps_per_second': 6.509, 'epoch': 4.0}


 25%|██▌       | 30/120 [00:57<02:23,  1.60s/it]

{'loss': 2.6999, 'grad_norm': 6.45473051071167, 'learning_rate': 1.5e-05, 'epoch': 5.0}



 25%|██▌       | 30/120 [00:57<02:23,  1.60s/it]

{'eval_loss': 2.703521966934204, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008333333333333333, 'eval_runtime': 0.3135, 'eval_samples_per_second': 47.847, 'eval_steps_per_second': 6.38, 'epoch': 5.0}


 30%|███       | 36/120 [01:09<02:19,  1.66s/it]
 30%|███       | 36/120 [01:09<02:19,  1.66s/it]

{'eval_loss': 2.675686836242676, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008333333333333333, 'eval_runtime': 0.2971, 'eval_samples_per_second': 50.487, 'eval_steps_per_second': 6.732, 'epoch': 6.0}


 33%|███▎      | 40/120 [01:17<02:22,  1.78s/it]

{'loss': 2.6774, 'grad_norm': 6.050930500030518, 'learning_rate': 2e-05, 'epoch': 6.67}


 35%|███▌      | 42/120 [01:20<02:06,  1.62s/it]
 35%|███▌      | 42/120 [01:21<02:06,  1.62s/it]

{'eval_loss': 2.5593631267547607, 'eval_accuracy': 0.06666666666666667, 'eval_macro_f1': 0.008888888888888889, 'eval_runtime': 0.3091, 'eval_samples_per_second': 48.524, 'eval_steps_per_second': 6.47, 'epoch': 7.0}


 40%|████      | 48/120 [01:32<01:56,  1.62s/it]
 40%|████      | 48/120 [01:32<01:56,  1.62s/it]

{'eval_loss': 2.307307720184326, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6444444444444444, 'eval_runtime': 0.31, 'eval_samples_per_second': 48.393, 'eval_steps_per_second': 6.452, 'epoch': 8.0}


 42%|████▏     | 50/120 [01:37<02:21,  2.02s/it]

{'loss': 2.4097, 'grad_norm': 13.492828369140625, 'learning_rate': 2.5e-05, 'epoch': 8.33}


 45%|████▌     | 54/120 [01:43<01:47,  1.63s/it]
 45%|████▌     | 54/120 [01:43<01:47,  1.63s/it]

{'eval_loss': 2.0273826122283936, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8444444444444444, 'eval_runtime': 0.2914, 'eval_samples_per_second': 51.483, 'eval_steps_per_second': 6.864, 'epoch': 9.0}


 50%|█████     | 60/120 [01:56<01:44,  1.74s/it]

{'loss': 1.8976, 'grad_norm': 18.625354766845703, 'learning_rate': 3e-05, 'epoch': 10.0}



 50%|█████     | 60/120 [01:57<01:44,  1.74s/it]

{'eval_loss': 1.8144596815109253, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7444444444444444, 'eval_runtime': 0.315, 'eval_samples_per_second': 47.614, 'eval_steps_per_second': 6.349, 'epoch': 10.0}


 55%|█████▌    | 66/120 [02:16<01:51,  2.06s/it]
 55%|█████▌    | 66/120 [02:16<01:51,  2.06s/it]

{'eval_loss': 1.5051658153533936, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8333333333333334, 'eval_runtime': 0.2642, 'eval_samples_per_second': 56.774, 'eval_steps_per_second': 7.57, 'epoch': 11.0}


 58%|█████▊    | 70/120 [02:24<01:33,  1.88s/it]

{'loss': 1.3674, 'grad_norm': 12.486993789672852, 'learning_rate': 3.5e-05, 'epoch': 11.67}


 60%|██████    | 72/120 [02:27<01:20,  1.67s/it]
 60%|██████    | 72/120 [02:28<01:20,  1.67s/it]

{'eval_loss': 1.2737575769424438, 'eval_accuracy': 0.8, 'eval_macro_f1': 0.7444444444444444, 'eval_runtime': 0.324, 'eval_samples_per_second': 46.295, 'eval_steps_per_second': 6.173, 'epoch': 12.0}


 65%|██████▌   | 78/120 [02:40<01:22,  1.96s/it]
 65%|██████▌   | 78/120 [02:41<01:22,  1.96s/it]

{'eval_loss': 0.9842804074287415, 'eval_accuracy': 0.9333333333333333, 'eval_macro_f1': 0.9111111111111111, 'eval_runtime': 0.513, 'eval_samples_per_second': 29.238, 'eval_steps_per_second': 3.898, 'epoch': 13.0}


 67%|██████▋   | 80/120 [02:46<01:32,  2.32s/it]

{'loss': 0.8599, 'grad_norm': 8.682960510253906, 'learning_rate': 4e-05, 'epoch': 13.33}


 70%|███████   | 84/120 [02:52<01:02,  1.73s/it]
 70%|███████   | 84/120 [02:53<01:02,  1.73s/it]

{'eval_loss': 0.7752098441123962, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.3536, 'eval_samples_per_second': 42.422, 'eval_steps_per_second': 5.656, 'epoch': 14.0}


 75%|███████▌  | 90/120 [03:04<00:48,  1.63s/it]

{'loss': 0.5096, 'grad_norm': 9.45975399017334, 'learning_rate': 4.5e-05, 'epoch': 15.0}



 75%|███████▌  | 90/120 [03:04<00:48,  1.63s/it]

{'eval_loss': 0.7054793238639832, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.3034, 'eval_samples_per_second': 49.44, 'eval_steps_per_second': 6.592, 'epoch': 15.0}


 80%|████████  | 96/120 [03:16<00:43,  1.83s/it]
 80%|████████  | 96/120 [03:16<00:43,  1.83s/it]

{'eval_loss': 0.557572066783905, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.4241, 'eval_samples_per_second': 35.372, 'eval_steps_per_second': 4.716, 'epoch': 16.0}


 83%|████████▎ | 100/120 [03:25<00:40,  2.00s/it]

{'loss': 0.2353, 'grad_norm': 2.7348649501800537, 'learning_rate': 5e-05, 'epoch': 16.67}


 85%|████████▌ | 102/120 [03:29<00:35,  1.97s/it]
 85%|████████▌ | 102/120 [03:30<00:35,  1.97s/it]

{'eval_loss': 0.4633140563964844, 'eval_accuracy': 0.9333333333333333, 'eval_macro_f1': 0.9111111111111111, 'eval_runtime': 0.4052, 'eval_samples_per_second': 37.021, 'eval_steps_per_second': 4.936, 'epoch': 17.0}


 90%|█████████ | 108/120 [03:59<00:30,  2.57s/it]
 90%|█████████ | 108/120 [04:00<00:30,  2.57s/it]

{'eval_loss': 0.4497148394584656, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.2845, 'eval_samples_per_second': 52.725, 'eval_steps_per_second': 7.03, 'epoch': 18.0}


 92%|█████████▏| 110/120 [04:04<00:24,  2.46s/it]

{'loss': 0.1131, 'grad_norm': 1.4416403770446777, 'learning_rate': 2.5e-05, 'epoch': 18.33}


 95%|█████████▌| 114/120 [04:10<00:10,  1.68s/it]
 95%|█████████▌| 114/120 [04:10<00:10,  1.68s/it]

{'eval_loss': 0.48239725828170776, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.3311, 'eval_samples_per_second': 45.304, 'eval_steps_per_second': 6.04, 'epoch': 19.0}


100%|██████████| 120/120 [04:21<00:00,  1.64s/it]

{'loss': 0.0695, 'grad_norm': 1.121393084526062, 'learning_rate': 0.0, 'epoch': 20.0}



100%|██████████| 120/120 [04:23<00:00,  1.64s/it]

{'eval_loss': 0.47439733147621155, 'eval_accuracy': 0.8666666666666667, 'eval_macro_f1': 0.8222222222222222, 'eval_runtime': 0.2523, 'eval_samples_per_second': 59.449, 'eval_steps_per_second': 7.926, 'epoch': 20.0}


100%|██████████| 120/120 [04:26<00:00,  2.22s/it]


{'train_runtime': 266.3141, 'train_samples_per_second': 3.379, 'train_steps_per_second': 0.451, 'train_loss': 1.5229162002603214, 'epoch': 20.0}


100%|██████████| 2/2 [00:00<00:00, 18.17it/s]



Test metrics: {'eval_loss': 1.124314785003662, 'eval_accuracy': 0.7333333333333333, 'eval_macro_f1': 0.6888888888888889, 'eval_runtime': 0.238, 'eval_samples_per_second': 63.033, 'eval_steps_per_second': 8.404, 'epoch': 20.0}


100%|██████████| 2/2 [00:00<00:00, 16.74it/s]


Classification Report (RoBERTa-base):
               precision    recall  f1-score   support

 book_flight      1.000     1.000     1.000         1
cancel_alarm      0.000     0.000     0.000         1
     goodbye      0.000     0.000     0.000         1
       greet      0.000     0.000     0.000         1
    navigate      1.000     1.000     1.000         1
 order_pizza      1.000     1.000     1.000         1
  play_music      0.000     0.000     0.000         1
    reminder      1.000     1.000     1.000         1
  send_email      1.000     1.000     1.000         1
   set_alarm      0.500     1.000     0.667         1
  small_talk      1.000     1.000     1.000         1
   tell_joke      1.000     1.000     1.000         1
   thank_you      1.000     1.000     1.000         1
        time      1.000     1.000     1.000         1
     weather      0.500     1.000     0.667         1

    accuracy                          0.733        15
   macro avg      0.667     0.733     0.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd

# ------------------------
# Collect Results
# ------------------------
results = []

# 1. Zero-shot (BART-MNLI)
results.append({
    "model": "Zero-shot (BART-MNLI)",
    "accuracy": zs_acc,
    "macro_f1": zs_f1
})

# 2. DistilBERT fine-tuned
results.append({
    "model": "DistilBERT (fine-tuned)",
    "accuracy": test_metrics['eval_accuracy'],   # from distilbert trainer.evaluate()
    "macro_f1": test_metrics['eval_macro_f1']
})

# 3. RoBERTa-base fine-tuned
results.append({
    "model": "RoBERTa-base (fine-tuned)",
    "accuracy": test_metrics_roberta['eval_accuracy'],   # from roberta trainer.evaluate()
    "macro_f1": test_metrics_roberta['eval_macro_f1']
})

# ------------------------
# Save Results to CSV
# ------------------------
df_results = pd.DataFrame(results)
df_results.to_csv("results.csv", index=False)

print("\n✅ Results saved to results.csv:\n")
print(df_results)



✅ Results saved to results.csv:

                       model  accuracy  macro_f1
0      Zero-shot (BART-MNLI)  0.600000  0.522222
1    DistilBERT (fine-tuned)  0.800000  0.777778
2  RoBERTa-base (fine-tuned)  0.733333  0.688889


## Notes
- Tiny dataset ⇒ variance across runs.
- Zero-shot depends on label phrasing; try natural-language label descriptions.
- Replace `intent_dataset.csv` with your dataset.