### Fine-tuning DistilBERT for Text Classification

In [1]:
# libraries
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset

import nltk
from nltk.corpus import stopwords

In [17]:
 # dataset
csv_path = 'train.csv'   # change path if needed
df = pd.read_csv(csv_path)

# quick look at the data: size, schema, a sample, nulls, class balance
print("Rows:", df.shape[0])
print("Columns:", list(df.columns))
display(df.head())
print("Missing per column:\n", df.isna().sum())
print("Label counts:\n", df['intent'].value_counts())

Rows: 13084
Columns: ['text', 'intent']


Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist


Missing per column:
 text      0
intent    0
dtype: int64
Label counts:
 intent
PlayMusic               1914
GetWeather              1896
BookRestaurant          1881
RateBook                1876
SearchScreeningEvent    1852
SearchCreativeWork      1847
AddToPlaylist           1818
Name: count, dtype: int64


In [18]:
# data cleaning
# Snapshot the untouched text BEFORE normalising it, so 'raw_text' really
# holds the original casing. The guard keeps that snapshot intact when this
# cell is re-run on a frame whose 'text' column is already lower-cased.
if 'raw_text' not in df.columns:
    df['raw_text'] = df['text'].copy()   # keep original

# Normalise labels and text to lower-case strings.
df['intent'] = df['intent'].astype(str).str.lower()
df['text']   = df['text'].astype(str).str.lower()

display(df.head())

Unnamed: 0,text,intent,raw_text
0,listen to westbam alumb allergic on google music,playmusic,listen to westbam alumb allergic on google music
1,add step to me to the 50 clásicos playlist,addtoplaylist,add step to me to the 50 clásicos playlist
2,i give this current textbook a rating value of...,ratebook,i give this current textbook a rating value of...
3,play the song little robin redbreast,playmusic,play the song little robin redbreast
4,please add iris dement to my playlist this is ...,addtoplaylist,please add iris dement to my playlist this is ...


In [None]:
# label encoding
# Class names are sorted so the integer ids are stable across runs.
labels = sorted(df['intent'].unique())
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}
df['label'] = df['intent'].map(label2id)

print("Num classes:", len(labels))
print("label2id:", label2id)

Num classes: 7
label2id: {'addtoplaylist': 0, 'bookrestaurant': 1, 'getweather': 2, 'playmusic': 3, 'ratebook': 4, 'searchcreativework': 5, 'searchscreeningevent': 6}


In [20]:
# train-test split
# 80/20 split, stratified on the encoded label, with a fixed seed.
split_options = dict(test_size=0.20, stratify=df['label'], random_state=42)
train_df, test_df = train_test_split(df, **split_options)

print("Train / Test sizes:", len(train_df), len(test_df))

Train / Test sizes: 10467 2617


In [21]:
# datasets for Huggingface
# Keep only the model inputs ('text', 'label') and drop the pandas index
# before converting each split.
train_ds, test_ds = (
    Dataset.from_pandas(split_df[['text', 'label']].reset_index(drop=True))
    for split_df in (train_df, test_df)
)
print(train_ds, test_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 10467
}) Dataset({
    features: ['text', 'label'],
    num_rows: 2617
})


In [22]:
# tokenizer and max_length selection
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Token count (special tokens included) of every training text.
token_counts = [
    len(tokenizer.encode(sentence, add_special_tokens=True))
    for sentence in train_df['text']
]
observed_max = max(token_counts)

# Cap at 32 because the texts are generally short; anything longer than the
# cap is truncated when the tokenizer is applied with truncation enabled.
length_cap = 32
max_length = observed_max if observed_max < length_cap else length_cap
print(f"Observed max on train: {observed_max} -> using max_length = {max_length}")

Observed max on train: 37 -> using max_length = 32


In [23]:
# tokenization
def tokenize_fn(examples):
    """Tokenize a batch of examples and carry their labels along.

    `examples` must provide 'text' and 'label' entries. The texts are
    padded and truncated to the notebook-level `max_length` using the
    notebook-level `tokenizer`; the tokenizer output is returned with the
    batch's 'label' values stored under the 'label' key.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    encoded['label'] = examples['label']
    return encoded

# applying tokenization
# Tokenize both splits in batches; the raw 'text' column is dropped afterwards.
tokenized_train, tokenized_test = (
    split_ds.map(tokenize_fn, batched=True, remove_columns=['text'])
    for split_ds in (train_ds, test_ds)
)

# Return the model inputs and the label as torch tensors.
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(tokenized_train[0])

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

{'label': tensor(2), 'input_ids': tensor([ 101, 2003, 2009, 2183, 2000, 2022, 2980, 1999, 3782, 2386,  102,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])}
