### Fine-tuning DistilBERT for Text Classification

In [52]:
# libraries
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from datasets import Dataset

import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [53]:
 # dataset
# Load the labelled utterances (change the path if the CSV lives elsewhere).
df = pd.read_csv('train.csv')

# data exploration: size, schema, preview, null counts and class balance
n_rows = len(df)
column_names = list(df.columns)
print("Rows:", n_rows)
print("Columns:", column_names)
display(df.head())

missing_per_column = df.isna().sum()
print("Missing per column:\n", missing_per_column)

intent_counts = df['intent'].value_counts()
print("Label counts:\n", intent_counts)

Rows: 13084
Columns: ['text', 'intent']


Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist


Missing per column:
 text      0
intent    0
dtype: int64
Label counts:
 intent
PlayMusic               1914
GetWeather              1896
BookRestaurant          1881
RateBook                1876
SearchScreeningEvent    1852
SearchCreativeWork      1847
AddToPlaylist           1818
Name: count, dtype: int64


In [54]:
# data cleaning
# Snapshot the untouched text BEFORE normalising it. Previously the copy was
# taken after lowercasing, so 'raw_text' never held the original strings.
# The guard keeps the snapshot intact if this cell is executed again on the
# same frame (otherwise it would be overwritten with already-lowercased text).
if 'raw_text' not in df.columns:
    df['raw_text'] = df['text'].copy()   # keep original

# Lowercase both the labels and the utterances so that values differing only
# in capitalisation collapse to the same string.
df['intent'] = df['intent'].astype(str).str.lower()
df['text']   = df['text'].astype(str).str.lower()

display(df.head())

Unnamed: 0,text,intent,raw_text
0,listen to westbam alumb allergic on google music,playmusic,listen to westbam alumb allergic on google music
1,add step to me to the 50 clásicos playlist,addtoplaylist,add step to me to the 50 clásicos playlist
2,i give this current textbook a rating value of...,ratebook,i give this current textbook a rating value of...
3,play the song little robin redbreast,playmusic,play the song little robin redbreast
4,please add iris dement to my playlist this is ...,addtoplaylist,please add iris dement to my playlist this is ...


In [55]:
# label encoding
# Sorting makes the id assignment deterministic for a given set of intents.
labels = sorted(df['intent'].unique())

# Forward (name -> id) and reverse (id -> name) lookup tables.
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Integer target column consumed by the model.
df['label'] = df['intent'].map(label2id)

print("Num classes:", len(labels))
print("label2id:", label2id)

Num classes: 7
label2id: {'addtoplaylist': 0, 'bookrestaurant': 1, 'getweather': 2, 'playmusic': 3, 'ratebook': 4, 'searchcreativework': 5, 'searchscreeningevent': 6}


In [56]:
# train-test split
# 80/20 split, stratified on the integer label so both parts keep the class
# proportions of the full dataset.
train_df, test_df = train_test_split(
    df,
    test_size=0.20,
    stratify=df['label'],
    random_state=42
)

# Subsample for quick training.
# The sizes are named so they are easy to tune, and capped at the number of
# available rows: DataFrame.sample raises ValueError when n exceeds the frame
# length (without replacement), which would break this cell on a smaller CSV.
# NOTE: this second sampling step is NOT stratified, so small classes can end
# up with very few (or no) rows in the subsample.
N_TRAIN_SAMPLES = 200
N_TEST_SAMPLES = 50
train_df = train_df.sample(n=min(N_TRAIN_SAMPLES, len(train_df)), random_state=42)
test_df  = test_df.sample(n=min(N_TEST_SAMPLES, len(test_df)), random_state=42)

print("Train / Test sizes:", len(train_df), len(test_df))

Train / Test sizes: 200 50


In [57]:
# datasets for Huggingface
def _to_hf_dataset(frame):
    """Build a `Dataset` from the 'text' and 'label' columns of `frame`.

    The index is reset (and dropped) first so the rows are numbered from 0
    instead of carrying the shuffled index left over from sampling.
    """
    model_inputs = frame[['text', 'label']].reset_index(drop=True)
    return Dataset.from_pandas(model_inputs)


train_ds = _to_hf_dataset(train_df)
test_ds = _to_hf_dataset(test_df)
print(train_ds, test_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 200
}) Dataset({
    features: ['text', 'label'],
    num_rows: 50
})


In [58]:
# tokenizer and max_length selection
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Token count (special tokens included) of every training utterance.
token_counts = [
    len(tokenizer.encode(utterance, add_special_tokens=True))
    for utterance in train_df['text']
]
observed_max = max(token_counts)

# Use the longest observed length, but never more than 32 tokens
# (the utterances are generally short).
max_length = observed_max if observed_max < 32 else 32
print(f"Observed max on train: {observed_max} -> using max_length = {max_length}")

Observed max on train: 30 -> using max_length = 30


In [59]:
# tokenization
def tokenize_fn(examples):
    """Tokenize a batch of rows and carry their labels through.

    Args:
        examples: mapping with a 'text' entry (the utterances to encode) and
            a 'label' entry (their targets).

    Returns:
        The tokenizer output for `examples['text']`, padded/truncated to the
        notebook-level `max_length`, with a 'label' entry added.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    encoded['label'] = examples['label']
    return encoded

# applying tokenization
# Encode both splits in batches; the raw 'text' column is dropped afterwards.
tokenized_train, tokenized_test = (
    split.map(tokenize_fn, batched=True, remove_columns=['text'])
    for split in (train_ds, test_ds)
)

# Expose only these columns, as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type='torch', columns=tensor_columns)

print(tokenized_train[0])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

{'label': tensor(6), 'input_ids': tensor([ 101, 2507, 2033, 1996, 3185, 2335,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])}


In [60]:
# loading the pretrained distilbert model
# The classification head is sized to the number of intents, and both label
# mappings are stored in the model config so they travel with the checkpoint.
head_config = dict(
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    **head_config
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
# import transformers
# print("transformers version:", transformers.__version__)

In [62]:
# !pip install -q -U transformers datasets

In [63]:
# !pip install --upgrade transformers


In [64]:
# training arguments
training_args = TrainingArguments(
    output_dir='./results',

    # --- optimisation ---
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,   # 4 x 2 = 8 examples per optimizer step on each device
    per_device_eval_batch_size=32,

    # --- logging / checkpoints ---
    logging_steps=100,
    save_total_limit=2,
    save_safetensors=False,

    # --- currently disabled (kept for reference) ---
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
)

In [65]:
# metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: pair `(logits, label_ids)`; `logits` must support
            `.argmax(axis=-1)` and `label_ids` holds the true class ids.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    # Named `label_ids` (not `labels`) so the notebook-level `labels` list of
    # class names is not shadowed inside this function.
    logits, label_ids = eval_pred
    predictions = logits.argmax(axis=-1)

    # zero_division=0 yields the same scores as the default, but without the
    # UndefinedMetricWarning raised when a class receives no predictions
    # (likely on a very small evaluation set).
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_ids, predictions, average="weighted", zero_division=0
    )
    acc = accuracy_score(label_ids, predictions)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }


# trainer
# Bundle the model, its hyper-parameters, both tokenized splits and the
# metric function; the tokenizer is handed over as well.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss
100,1.297
200,0.297


TrainOutput(global_step=250, training_loss=0.6646530513763428, metrics={'train_runtime': 313.713, 'train_samples_per_second': 6.375, 'train_steps_per_second': 0.797, 'total_flos': 15524907480000.0, 'train_loss': 0.6646530513763428, 'epoch': 10.0})

In [68]:
# evaluation & report
eval_res = trainer.evaluate()
print("Eval metrics:", eval_res)

# Per-class breakdown on the same held-out split.
preds_out = trainer.predict(tokenized_test)
y_true = preds_out.label_ids
y_pred = preds_out.predictions.argmax(axis=1)

# Pin the full list of class ids explicitly. The test split is a small,
# non-stratified subsample, so a class can be absent from both y_true and
# y_pred; without `labels=` the number of observed classes would then no
# longer match `target_names` and the names would not line up with the rows.
# zero_division=0 reports 0 for such empty classes without emitting warnings.
class_ids = list(range(len(labels)))
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=labels,
    zero_division=0,
))

# save
# Model and tokenizer go to the same directory so it can be reloaded as a unit.
save_dir = "./distilbert-finetuned"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


Eval metrics: {'eval_loss': 0.24365589022636414, 'eval_accuracy': 0.94, 'eval_f1': 0.9400202020202021, 'eval_precision': 0.9473333333333334, 'eval_recall': 0.94, 'eval_runtime': 0.9112, 'eval_samples_per_second': 54.874, 'eval_steps_per_second': 2.195, 'epoch': 10.0}
                      precision    recall  f1-score   support

       addtoplaylist       1.00      1.00      1.00         9
      bookrestaurant       1.00      1.00      1.00         9
          getweather       0.80      1.00      0.89         4
           playmusic       0.83      1.00      0.91         5
            ratebook       1.00      1.00      1.00         4
  searchcreativework       1.00      0.82      0.90        11
searchscreeningevent       0.88      0.88      0.88         8

            accuracy                           0.94        50
           macro avg       0.93      0.96      0.94        50
        weighted avg       0.95      0.94      0.94        50



('./distilbert-finetuned\\tokenizer_config.json',
 './distilbert-finetuned\\special_tokens_map.json',
 './distilbert-finetuned\\vocab.txt',
 './distilbert-finetuned\\added_tokens.json')

In [69]:
# Load the fine-tuned model and tokenizer
# NOTE: this rebinds the notebook-level `model`, `tokenizer` and `id2label`
# to the versions restored from disk.
model_path = "./distilbert-finetuned"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# Map back labels (the mapping stored in the saved model config)
id2label = model.config.id2label

# Function to classify a new text
def predict_intent(text):
    """Return the intent name predicted for one utterance.

    Args:
        text: the utterance to classify. `.item()` below requires exactly one
            prediction, so pass a single string.

    Returns:
        The `id2label` entry for the highest-scoring class.
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        best_class_id = logits.argmax(dim=1).item()
    return id2label[best_class_id]

# Testing with some examples
examples = [
    "i want to bring four people to a place that s close to downtown that serves churrascaria cuisine",
    "what is the weather like today",
    "play some music",
]

# Print each utterance followed by its predicted intent and a separator.
for utterance in examples:
    print(f"Text: {utterance}")
    intent = predict_intent(utterance)
    print(f"Predicted intent: {intent}")
    print("----")


Text: i want to bring four people to a place that s close to downtown that serves churrascaria cuisine
Predicted intent: bookrestaurant
----
Text: what is the weather like today
Predicted intent: getweather
----
Text: play some music
Predicted intent: playmusic
----
