In [1]:
from transformers import DataCollatorForTokenClassification, DistilBertForTokenClassification, \
    DistilBertTokenizerFast, pipeline, Trainer, TrainingArguments

from datasets import load_metric, Dataset
import numpy as np
import evaluate
import torch

  from .autonotebook import tqdm as notebook_tqdm
2024-01-31 20:09:47.694380: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-31 20:09:47.694443: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-31 20:09:47.694502: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-31 20:09:47.708156: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [3]:
snips_file = open('snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [4]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
  if len(snip_row) == 2: # skip over rows with no data
    continue
  if ' ' not in snip_row.decode(): # we've hit a sequence label
    sequence_labels.append(snip_row.decode().strip())
    utterances.append(utterance.strip())
    tokenized_utterances.append(tokenized_utterance)
    labels_for_tokens.append(label_for_utterances)
    utterance = ''
    tokenized_utterance = []
    label_for_utterances = []
    continue
  token, token_label = snip_row.decode().split(' ')
  token_label = token_label.strip()
  utterance += f'{token} '
  tokenized_utterance.append(token)
  label_for_utterances.append(token_label)

In [5]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['RateBook',
 'SearchScreeningEvent',
 'SearchCreativeWork',
 'PlayMusic',
 'AddToPlaylist',
 'BookRestaurant',
 'GetWeather']

In [6]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]
print(f"There are {len(unique_sequence_labels)} unique sequence labels")

There are 7 unique sequence labels


In [7]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x+y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f"There are {len(unique_token_labels)} unique token labels")

There are 72 unique token labels


In [8]:
snips_dataset = Dataset.from_dict( # hold data for both sequence and token classification
    dict(
        utterance = utterances,
        label = sequence_labels,
        tokens = tokenized_utterances,
        token_labels = labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size = 0.2)

In [9]:
# Our dataset from the sequence classification section
snips_dataset['train'][0]

{'utterance': 'i d like to see forgetting the girl',
 'label': 1,
 'tokens': ['i', 'd', 'like', 'to', 'see', 'forgetting', 'the', 'girl'],
 'token_labels': [5, 5, 5, 5, 5, 68, 51, 51]}

In [10]:
tokenized_inputs = tokenizer(snips_dataset['train'][0]["tokens"], truncation = True, is_split_into_words = True)
tokenized_inputs

{'input_ids': [101, 1045, 1040, 2066, 2000, 2156, 17693, 1996, 2611, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tokenizer.convert_ids_to_tokens([101, 2054, 2003, 1996, 2190, 4825, 1999, 5612, 2005, 2702, 2012, 1014, 2572, 102])

['[CLS]',
 'what',
 'is',
 'the',
 'best',
 'restaurant',
 'in',
 'kentucky',
 'for',
 'ten',
 'at',
 '0',
 'am',
 '[SEP]']

In [12]:
tokenized_inputs.word_ids(batch_index = 0)

[None, 0, 1, 2, 3, 4, 5, 6, 7, None]

In [13]:
# The given "token_labels" may not match up with the BERT wordpiece tokenization so
#   this function will map them to the tokenization that BERT uses
#   -100 is a reserved for labels where we do not want to calculate losses so BERT doesn't waste time
#   trying to predict tokens like CLS or SEP

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation = True, is_split_into_words = True)

    labels = []
    for i, label in enumerate(examples[f"token_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None: # Set any special tokens / punctuation to -100
                label_ids.append(-100)
            elif word_idx != previous_word_idx: # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100) # everything else labeled as -100
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [14]:
snips_dataset['train'][0]

{'utterance': 'i d like to see forgetting the girl',
 'label': 1,
 'tokens': ['i', 'd', 'like', 'to', 'see', 'forgetting', 'the', 'girl'],
 'token_labels': [5, 5, 5, 5, 5, 68, 51, 51]}

In [15]:
# Map our dataset from sequence classification to be for token classification
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched = True)

Map: 100%|██████████| 10467/10467 [00:00<00:00, 25048.06 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 28406.63 examples/s]


In [16]:
tokenizer.decode([1024])

':'

In [17]:
tok_clf_tokenized_snips['train'][0]

{'utterance': 'i d like to see forgetting the girl',
 'label': 1,
 'tokens': ['i', 'd', 'like', 'to', 'see', 'forgetting', 'the', 'girl'],
 'token_labels': [5, 5, 5, 5, 5, 68, 51, 51],
 'input_ids': [101, 1045, 1040, 2066, 2000, 2156, 17693, 1996, 2611, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 5, 5, 5, 5, 5, 68, 51, 51, -100]}

In [18]:
tok_clf_tokenized_snips['train'] = tok_clf_tokenized_snips['train'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips['test'] = tok_clf_tokenized_snips['test'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

In [19]:
tok_clf_tokenized_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [20]:
tok_data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

In [21]:
tok_clf_model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased', num_labels = len(unique_token_labels)
)

# Set our label dictionary
tok_clf_model.config.id2label = {i: l for i, l in enumerate(unique_token_labels)}

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
tok_clf_model.config.id2label[0], tok_clf_model.config.id2label[1]

('I-object_location_type', 'B-best_rating')

In [23]:
epochs = 2

training_args = TrainingArguments(
    output_dir = "./snips_tok_clf/results",
    num_train_epochs = epochs,
    per_device_train_batch_size =32,
    per_device_eval_batch_size =32,
    load_best_model_at_end = True,

    logging_steps = 10,
    log_level = 'info',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch'
)

#Define the trainer:

trainer = Trainer(
    model = tok_clf_model,
    args = training_args,
    train_dataset = tok_clf_tokenized_snips['train'],
    eval_dataset = tok_clf_tokenized_snips['test'],
    data_collator = tok_data_collator
)

In [25]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Trainer is attempting to log a value of "{0: 'I-object_location_type', 1: 'B-best_rating', 2: 'I-playlist', 3: 'B-year', 4: 'I-object_part_of_series_type', 5: 'O', 6: 'B-service', 7: 'B-spatial_relation', 8: 'I-restaurant_type', 9: 'B-object_location_type', 10: 'I-current_location', 11: 'B-movie_type', 12: 'I-facility', 13: 'B-timeRange', 14: 'B-poi', 15: 'B-object_type', 16: 'B-cuisine', 17: 'B-location_name', 18: 'B-object_name', 19: 'I-restaurant_name', 20: 'I-party_size_description', 21: 'B-country', 22: 'I-city', 23: 'B-party_size_description', 24: 'B-restaurant_name', 25: 'I-served_dish', 26: 'B-track', 27: 'I-object_name', 28: 'I-album', 29: 'B-artist', 30: 'I-entity_name', 31: 'B-object_part_of_series_type', 32: 'B-genre', 33: 'B-served_dish', 34: 'B-rating_value', 35: 'B-geographic_poi', 36: 'B-sort', 37: 'B-city', 38: 'B-condition_description', 39: 'B-entity_name', 40: 'I-service', 41: 'I-spatial_relation', 42: 'B-current_location', 43: 'I-location_name', 44: 'I-object_type',

{'eval_loss': 4.33568000793457,
 'eval_runtime': 4.382,
 'eval_samples_per_second': 597.222,
 'eval_steps_per_second': 18.713}

In [26]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,418,248


Epoch,Training Loss,Validation Loss
1,0.2244,0.183765
2,0.0977,0.130217


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results/tmp-checkpoint-328
Configuration saved in ./snips_tok_clf/results/tmp-checkpoint-328/config.json
Model weights saved in ./snips_tok_clf/results/tmp-checkpoint-328/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results/tmp-checkpoint-656
Configuration saved in ./snips_tok_clf/results/tmp-checkpoint-656/config.json
Model weights saved in ./snips_tok_clf/results/tmp-checkpoint-656/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_tok_clf/results/checkpoint-656 (score: 0.13021674752235413).


TrainOutput(global_step=656, training_loss=0.4274501119081567, metrics={'train_runtime': 50.6854, 'train_samples_per_second': 413.019, 'train_steps_per_second': 12.943, 'total_flos': 116195158657872.0, 'train_loss': 0.4274501119081567, 'epoch': 2.0})

In [27]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.13021674752235413,
 'eval_runtime': 2.9146,
 'eval_samples_per_second': 897.881,
 'eval_steps_per_second': 28.134,
 'epoch': 2.0}

In [28]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer = tokenizer, device = 0)
pipe('Add Two Coins by Dispatch to my road trip playlist')

Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-entity_name',
  'score': 0.92159677,
  'index': 2,
  'word': 'two',
  'start': 4,
  'end': 7},
 {'entity': 'I-entity_name',
  'score': 0.9126515,
  'index': 3,
  'word': 'coins',
  'start': 8,
  'end': 13},
 {'entity': 'I-entity_name',
  'score': 0.7290275,
  'index': 4,
  'word': 'by',
  'start': 14,
  'end': 16},
 {'entity': 'B-artist',
  'score': 0.7227906,
  'index': 5,
  'word': 'dispatch',
  'start': 17,
  'end': 25},
 {'entity': 'B-playlist_owner',
  'score': 0.98947173,
  'index': 7,
  'word': 'my',
  'start': 29,
  'end': 31},
 {'entity': 'B-playlist',
  'score': 0.99105483,
  'index': 8,
  'word': 'road',
  'start': 32,
  'end': 36},
 {'entity': 'I-playlist',
  'score': 0.9905984,
  'index': 9,
  'word': 'trip',
  'start': 37,
  'end': 41}]

In [29]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer = tokenizer, device = 0)
pipe('Rate The Principles of Data Science 5 out of 5')

[{'entity': 'B-object_name',
  'score': 0.98055696,
  'index': 2,
  'word': 'the',
  'start': 5,
  'end': 8},
 {'entity': 'I-object_name',
  'score': 0.9891699,
  'index': 3,
  'word': 'principles',
  'start': 9,
  'end': 19},
 {'entity': 'I-object_name',
  'score': 0.99332607,
  'index': 4,
  'word': 'of',
  'start': 20,
  'end': 22},
 {'entity': 'I-object_name',
  'score': 0.9924115,
  'index': 5,
  'word': 'data',
  'start': 23,
  'end': 27},
 {'entity': 'I-object_name',
  'score': 0.99358374,
  'index': 6,
  'word': 'science',
  'start': 28,
  'end': 35},
 {'entity': 'B-rating_value',
  'score': 0.9920729,
  'index': 7,
  'word': '5',
  'start': 36,
  'end': 37},
 {'entity': 'B-best_rating',
  'score': 0.9142672,
  'index': 10,
  'word': '5',
  'start': 45,
  'end': 46}]