In [1]:
#!pip install datasets --quiet
#!pip install transformers[torch] --quiet
#!pip install accelerate -U
#!pip install evaluate

In [2]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, \
DistilBertTokenizerFast, DataCollatorWithPadding, pipeline

from datasets import load_metric, Dataset
import numpy as np
import evaluate
import torch

  from .autonotebook import tqdm as notebook_tqdm
2024-02-14 16:16:45.540013: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
snips_file = open('snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [4]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
  if len(snip_row) == 2: # skip over rows with no data
    continue
  if ' ' not in snip_row.decode(): # we've hit a sequence label
    sequence_labels.append(snip_row.decode().strip())
    utterances.append(utterance.strip())
    tokenized_utterances.append(tokenized_utterance)
    labels_for_tokens.append(label_for_utterances)
    utterance = ''
    tokenized_utterance = []
    label_for_utterances = []
    continue
  token, token_label = snip_row.decode().split(' ')
  token_label = token_label.strip()
  utterance += f'{token} '
  tokenized_utterance.append(token)
  label_for_utterances.append(token_label)

In [5]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [6]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
PlayMusic


In [7]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['SearchScreeningEvent',
 'BookRestaurant',
 'GetWeather',
 'PlayMusic',
 'SearchCreativeWork',
 'RateBook',
 'AddToPlaylist']

In [8]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]
print(f"There are {len(unique_sequence_labels)} unique sequence labels")

There are 7 unique sequence labels


In [9]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x+y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f"There are {len(unique_token_labels)} unique token labels")

There are 72 unique token labels


In [10]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[29, 29, 69, 29, 11, 29, 38, 42]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
3
PlayMusic


In [11]:
snips_dataset = Dataset.from_dict( # hold data for both sequence and token classification
    dict(
        utterance = utterances,
        label = sequence_labels,
        tokens = tokenized_utterances,
        token_labels = labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size = 0.2)

In [12]:
snips_dataset['train'][0]

{'utterance': 'find animated movies movie schedule in the neighborhood',
 'label': 0,
 'tokens': ['find',
  'animated',
  'movies',
  'movie',
  'schedule',
  'in',
  'the',
  'neighborhood'],
 'token_labels': [29, 61, 39, 8, 55, 12, 60, 60]}

In [13]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [14]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
  return tokenizer(examples["utterance"], truncation = True)

In [15]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched = True)

                                                                   

In [16]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'find animated movies movie schedule in the neighborhood',
 'label': 0,
 'tokens': ['find',
  'animated',
  'movies',
  'movie',
  'schedule',
  'in',
  'the',
  'neighborhood'],
 'token_labels': [29, 61, 39, 8, 55, 12, 60, 60],
 'input_ids': [101, 2424, 6579, 5691, 3185, 6134, 1999, 1996, 5101, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the
#   length of the longest element in the batch, making them all the same length.
#   It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [18]:
# Data Collator will pad data so that all examples are the same input length.
#   Attention mask is how we ignore attention score for padding tokens

In [19]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_sequence_labels),
)

# Set an index -> label dictionary
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
sequence_clf_model.config.id2label[0]

'SearchScreeningEvent'

In [21]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred): # Custom method to take in logits and calculate accuracy of the eval set
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis = -1)
  return metric.compute(prediction = predictions, reference = labels)

  metric = load_metric("accuracy")


In [22]:
# Load the accuracy metric
accuracy = evaluate.load("accuracy")

# Define a function to compute the accuracy of the model's predictions
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [23]:
epochs = 2

training_args = TrainingArguments(
    output_dir = "./snips_clf/results",
    num_train_epochs = epochs,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    load_best_model_at_end = True, # yield model that has lowest eval loss

    # Some deep learning parameters that the Trainer is able to take in
    warmup_steps = len(seq_clf_tokenized_snips['train']) // 5, # Number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,

    logging_steps = 1,
    log_level = 'info',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch'
)

# Define the trainer
trainer = Trainer(
    model = sequence_clf_model,
    args = training_args,
    train_dataset = seq_clf_tokenized_snips['train'],
    eval_dataset = seq_clf_tokenized_snips['test'],
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

In [24]:
# Get initial metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9449175596237183,
 'eval_accuracy': 0.14673290026748184,
 'eval_runtime': 1.2088,
 'eval_samples_per_second': 2164.975,
 'eval_steps_per_second': 67.836}

In [25]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,958,855


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2795,0.225259,0.980512
2,0.0135,0.050939,0.988154


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Checkpoint destination directory ./snips_clf/results/checkpoint-328 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `Di

TrainOutput(global_step=656, training_loss=0.7219381416273281, metrics={'train_runtime': 41.9201, 'train_samples_per_second': 499.379, 'train_steps_per_second': 15.649, 'total_flos': 117106447109304.0, 'train_loss': 0.7219381416273281, 'epoch': 2.0})

In [26]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.050938792526721954,
 'eval_accuracy': 0.9881543752388231,
 'eval_runtime': 1.1501,
 'eval_samples_per_second': 2275.518,
 'eval_steps_per_second': 71.3,
 'epoch': 2.0}

In [27]:
pipe = pipeline("text-classification", sequence_clf_model, tokenizer = tokenizer, device=0)
pipe('Add Two Coins by Dispatch to my road trip playlist')

[{'label': 'AddToPlaylist', 'score': 0.9900744557380676}]

In [28]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/model.safetensors


In [29]:
pipe = pipeline("text-classification","./snips_clf/results", tokenizer = tokenizer, device=0)
pipe('Add Two Coins by Dispatch to my road trip playlist')

loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "SearchScreeningEvent",
    "1": "BookRestaurant",
    "2": "GetWeather",
    "3": "PlayMusic",
    "4": "SearchCreativeWork",
    "5": "RateBook",
    "6": "AddToPlaylist"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfor

[{'label': 'AddToPlaylist', 'score': 0.9900744557380676}]

In [30]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(unique_sequence_labels),
)

loading configuration file config.json from cache at /home/randi_eka/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.37.2",
  "vocab_size": 30522
}

In [31]:
frozen_sequence_clf_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [32]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False # Freezing the parameters so it cannot be updated

In [33]:
epochs = 2

training_args = TrainingArguments(
    output_dir = "./snips_clf/results",
    num_train_epochs = epochs,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    load_best_model_at_end = True, # yield model that has lowest eval loss

    # Some deep learning parameters that the Trainer is able to take in
    warmup_steps = len(seq_clf_tokenized_snips['train']) // 5, # Number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,

    logging_steps = 1,
    log_level = 'info',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch'
)

# Define the trainer
trainer = Trainer(
    model = frozen_sequence_clf_model,
    args = training_args,
    train_dataset = seq_clf_tokenized_snips['train'],
    eval_dataset = seq_clf_tokenized_snips['test'],
    compute_metrics = compute_metrics,
    data_collator = data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9427404403686523,
 'eval_accuracy': 0.2369124952235384,
 'eval_runtime': 1.3322,
 'eval_samples_per_second': 1964.48,
 'eval_steps_per_second': 61.554}

In [35]:
trainer.train() # previously 0:54, now it only 0:25 with freezing entire distilbert parameter

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9099,1.869242,0.64081
2,1.5225,1.585922,0.88804


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Checkpoint destination directory ./snips_clf/results/checkpoint-328 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `Di

TrainOutput(global_step=656, training_loss=1.8468872833906151, metrics={'train_runtime': 13.9535, 'train_samples_per_second': 1500.266, 'train_steps_per_second': 47.013, 'total_flos': 117106447109304.0, 'train_loss': 1.8468872833906151, 'epoch': 2.0})

In [36]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.5859222412109375,
 'eval_accuracy': 0.8880397401604891,
 'eval_runtime': 1.1569,
 'eval_samples_per_second': 2262.018,
 'eval_steps_per_second': 70.877,
 'epoch': 2.0}