### Fine-tuning DistilBERT for Text Classification

In [1]:
# libraries
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset

import nltk
from nltk.corpus import stopwords

In [12]:
# Load the labelled utterances from train.csv (path is relative to the
# notebook's working directory) and show the resulting frame.
df = pd.read_csv(filepath_or_buffer='train.csv')
df

Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist
...,...,...
13079,i want to eat choucroute at a brasserie for 8,BookRestaurant
13080,play funky heavy bluesy,PlayMusic
13081,rate the current album 2 points out of 6,RateBook
13082,go to the photograph the inflated tear,SearchCreativeWork


In [None]:
# Initial data exploration: dimensions, column names, a small sample,
# dtypes / non-null counts, then missing values and class balance.
print(df.shape)
print(df.columns)
display(df.head(n=5))
df.info()
print("Missing values per column:\n", df.isna().sum())
print("Label counts:\n", df['intent'].value_counts())

(13084, 2)
Index(['text', 'intent'], dtype='object')


Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13084 entries, 0 to 13083
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13084 non-null  object
 1   intent  13084 non-null  object
dtypes: object(2)
memory usage: 204.6+ KB
Missing values per column:
 text      0
intent    0
dtype: int64
Label counts:
 intent
PlayMusic               1914
GetWeather              1896
BookRestaurant          1881
RateBook                1876
SearchScreeningEvent    1852
SearchCreativeWork      1847
AddToPlaylist           1818
Name: count, dtype: int64


In [7]:
# Make sure the NLTK stopword corpus is available, then collect the English
# stopwords into a set so membership checks are cheap.
nltk.download('stopwords')
stop_words = set()
stop_words.update(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# function to remove stopwords
def remove_stopwords(text, stopword_set=None):
  """Lowercase ``text`` and drop every whitespace-delimited stopword.

  Args:
    text: The raw utterance. ``None`` and NaN (as pandas uses for missing
      cells) are treated as empty text; any other non-string value is
      converted with ``str()`` before processing.
    stopword_set: Optional collection of lowercase words to remove. When
      omitted, the notebook-level ``stop_words`` set is used, which keeps
      existing ``df['text'].apply(remove_stopwords)`` calls working.

  Returns:
    The lowercased text with stopwords removed; surviving words are joined
    with single spaces (so runs of whitespace are collapsed).
  """
  # `text != text` is only true for NaN, which is how pandas marks a
  # missing cell in an object column.
  if text is None or text != text:
    return ''

  # Resolve the default lazily so the function only touches the global
  # when the caller did not supply a set of its own.
  if stopword_set is None:
    stopword_set = stop_words

  kept_words = [
    word for word in str(text).lower().split()
    if word not in stopword_set
  ]
  return ' '.join(kept_words)

# Clean every utterance in place; only the `text` column is rewritten.
# NOTE(review): the saved output of this cell shows lowercased `intent`
# values, which nothing in this cell produces -- re-run from a fresh kernel
# to confirm which version of the data later cells actually see.
df['text'] = df['text'].map(remove_stopwords)
df

Unnamed: 0,text,intent
0,listen westbam alumb allergic google music,playmusic
1,add step 50 clásicos playlist,addtoplaylist
2,give current textbook rating value 1 best rati...,ratebook
3,play song little robin redbreast,playmusic
4,please add iris dement playlist selena,addtoplaylist
...,...,...
13079,want eat choucroute brasserie 8,bookrestaurant
13080,play funky heavy bluesy,playmusic
13081,rate current album 2 points 6,ratebook
13082,go photograph inflated tear,searchcreativework


In [9]:
# Encode intents as integer ids, numbered in order of first appearance
# in the dataframe.
labels = df["intent"].unique()
label_dict = dict(zip(labels, range(len(labels))))
df['label'] = df['intent'].map(label_dict)
df

Unnamed: 0,text,intent,label
0,listen westbam alumb allergic google music,playmusic,0
1,add step 50 clásicos playlist,addtoplaylist,1
2,give current textbook rating value 1 best rati...,ratebook,2
3,play song little robin redbreast,playmusic,0
4,please add iris dement playlist selena,addtoplaylist,1
...,...,...,...
13079,want eat choucroute brasserie 8,bookrestaurant,4
13080,play funky heavy bluesy,playmusic,0
13081,rate current album 2 points 6,ratebook,2
13082,go photograph inflated tear,searchcreativework,6


In [10]:
# Load the pretrained DistilBERT tokenizer.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Longest encoded utterance in the dataset; used below as the padding /
# truncation length.
max_length = max(len(tokenizer.encode(text)) for text in df['text'])
print(f'Max length of text: {max_length}')


# tokenize the text
def tokenize_function(examples):
  """Tokenize the ``text`` field of ``examples`` and carry its labels along.

  Every sequence is padded or truncated to the notebook-level ``max_length``
  using the notebook-level ``tokenizer``.

  Args:
    examples: Mapping with ``'text'`` and ``'label'`` entries.

  Returns:
    The tokenizer output with an added ``'label'`` entry copied from
    ``examples``.
  """
  encoded = tokenizer(
    examples['text'],
    truncation=True,
    padding='max_length',
    max_length=max_length,
  )
  # Keep the targets next to the encoded inputs.
  encoded['label'] = examples['label']
  return encoded

# Convert the text/label columns of the dataframe into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df[['text', 'label']])

# Tokenize the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Inspect the first example before and after tokenization. Only the last
# expression of a cell is rendered automatically, so the first sample has to
# be shown explicitly with display().
display(dataset[0]) # original text
tokenized_datasets[0] # tokenized text


Max length of text: 28


Map:   0%|          | 0/13084 [00:00<?, ? examples/s]

{'text': 'listen westbam alumb allergic google music',
 'label': 0,
 'input_ids': [101,
  4952,
  2225,
  3676,
  2213,
  2632,
  25438,
  27395,
  8224,
  2189,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [None]:
# Split the dataset into training and test sets


In [11]:
# Imports needed by the sanity checks in this cell.
import os

import numpy as np
import psutil
import torch
from transformers import DistilBertTokenizer

# 1. dataset size & class balance
print("Rows:", len(df))
print("Class counts:\n", df['intent'].value_counts())

# 2. avg / max token length (rough), counted with special tokens included
tok = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
lengths = [len(tok.encode(sentence, add_special_tokens=True)) for sentence in df['text']]
print("avg tokens:", int(np.mean(lengths)), "median:", int(np.median(lengths)), "max:", max(lengths))

# 3. hardware check: GPU availability and total system memory in GiB
print("CUDA available:", torch.cuda.is_available())
print("Total RAM (GB):", round(psutil.virtual_memory().total / (1024 ** 3), 2))


Rows: 13084
Class counts:
 intent
playmusic               1914
getweather              1896
bookrestaurant          1881
ratebook                1876
searchscreeningevent    1852
searchcreativework      1847
addtoplaylist           1818
Name: count, dtype: int64
avg tokens: 8 median: 8 max: 28
CUDA available: False
Total RAM (GB): 7.68
