## BERT for sequence classification


In [1]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, \
     DataCollatorWithPadding, pipeline
from datasets import Dataset
import numpy as np
import torch
import evaluate

In [2]:
snips_file = open('../data/snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [3]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    utterance += f'{token} '
    tokenized_utterance.append(token)


In [4]:
len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084)

In [5]:
utterances[0], sequence_labels[0]

('listen to westbam alumb allergic on google music', 'PlayMusic')

In [6]:
MODEL = 'bert-base-uncased'

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [52]:
tokenizer(['hi'])

{'input_ids': [[101, 7632, 102]], 'token_type_ids': [[0, 0, 0]], 'attention_mask': [[1, 1, 1]]}

In [55]:
sequence_clf_model.bert.embeddings.word_embeddings

Embedding(30522, 768, padding_idx=0)

In [57]:
50522 * 768

38800896

In [8]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['GetWeather',
 'BookRestaurant',
 'RateBook',
 'SearchScreeningEvent',
 'SearchCreativeWork',
 'PlayMusic',
 'AddToPlaylist']

In [9]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


In [10]:
print(tokenized_utterances[0])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
listen to westbam alumb allergic on google music
5
PlayMusic


In [58]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
    )
)

snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [59]:
snips_dataset

DatasetDict({
    train: Dataset({
        features: ['utterance', 'label'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['utterance', 'label'],
        num_rows: 2617
    })
})

In [60]:
snips_dataset['train'][0]

{'utterance': 'rate homicide: a year on the killing streets five stars',
 'label': 2}

In [61]:
unique_sequence_labels[snips_dataset['train'][0]['label']]

'RateBook'

In [15]:
tokenizer('hi')

{'input_ids': [101, 7632, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [16]:
tokenizer.decode(tokenizer('hi')['input_ids'])

'[CLS] hi [SEP]'

In [62]:
tokenizer(['I love [CLS]'])

{'input_ids': [[101, 1045, 2293, 101, 102]], 'token_type_ids': [[0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}

In [63]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

In [64]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

In [65]:
# only input_ids, attention_mask, and label are used. The rest are for show
seq_clf_tokenized_snips['train'][0]

{'utterance': 'rate homicide: a year on the killing streets five stars',
 'label': 2,
 'input_ids': [101,
  3446,
  18268,
  1024,
  1037,
  2095,
  2006,
  1996,
  4288,
  4534,
  2274,
  3340,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# Data Collator will pad data so that all examples are the same input length.
#  Attention mask is how we ignore attention scores for padding tokens

In [22]:
label_dict = {i: l for i, l in enumerate(unique_sequence_labels)}
label_dict

{0: 'GetWeather',
 1: 'BookRestaurant',
 2: 'RateBook',
 3: 'SearchScreeningEvent',
 4: 'SearchCreativeWork',
 5: 'PlayMusic',
 6: 'AddToPlaylist'}

In [23]:
sequence_clf_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(unique_sequence_labels),
)

# set an index -> label dictionary
sequence_clf_model.config.id2label = label_dict

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [24]:
sequence_clf_model.config.id2label[0]

'GetWeather'

In [25]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):  # custom method to take in logits and calculate accuracy of the eval set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [26]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    logging_steps=10,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch',
    use_mps_device=True if torch.backends.mps.is_available() else False
)

# Define the trainer:

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,  # optional
    data_collator=data_collator
)

In [66]:
tokenizer('I love this class')

{'input_ids': [101, 1045, 2293, 2023, 2465, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [67]:
tokenizer.decode(2293)

'love'

In [69]:
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /Users/sinanozdemir/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params":

In [71]:
gpt2_tokenizer('I love this class')

{'input_ids': [40, 1842, 428, 1398], 'attention_mask': [1, 1, 1, 1]}

In [73]:
tokenizer.decode(2293)

'love'

In [72]:
gpt2_tokenizer.decode(1842)

' love'

In [27]:
# Get initial metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mprofoz[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 1.9800927639007568,
 'eval_accuracy': 0.12762705387848683,
 'eval_runtime': 6.6835,
 'eval_samples_per_second': 391.563,
 'eval_steps_per_second': 12.269}

In [29]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 109,487,623


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0394,0.069342,0.982423
2,0.0044,0.051738,0.988154


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved 

TrainOutput(global_step=656, training_loss=0.11454935420713411, metrics={'train_runtime': 205.8835, 'train_samples_per_second': 101.679, 'train_steps_per_second': 3.186, 'total_flos': 232188457301100.0, 'train_loss': 0.11454935420713411, 'epoch': 2.0})

In [30]:
trainer.evaluate()  # sanity check, should be the same as the epoch with the lowest validation loss

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.051737893372774124,
 'eval_accuracy': 0.9881543752388231,
 'eval_runtime': 5.9679,
 'eval_samples_per_second': 438.51,
 'eval_steps_per_second': 13.74,
 'epoch': 2.0}

In [31]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/pytorch_model.bin


In [32]:
pipeline

<function transformers.pipelines.pipeline(task: str = None, model: Union[str, ForwardRef('PreTrainedModel'), ForwardRef('TFPreTrainedModel'), NoneType] = None, config: Union[str, transformers.configuration_utils.PretrainedConfig, NoneType] = None, tokenizer: Union[str, transformers.tokenization_utils.PreTrainedTokenizer, ForwardRef('PreTrainedTokenizerFast'), NoneType] = None, feature_extractor: Union[str, ForwardRef('SequenceFeatureExtractor'), NoneType] = None, image_processor: Union[str, transformers.image_processing_utils.BaseImageProcessor, NoneType] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, use_auth_token: Union[bool, str, NoneType] = None, device: Union[int, str, ForwardRef('torch.device'), NoneType] = None, device_map=None, torch_dtype=None, trust_remote_code: Optional[bool] = None, model_kwargs: Dict[str, Any] = None, pipeline_class: Optional[Any] = None, **kwargs) -> transformers.pipelines.base.Pipeline>

In [38]:
# We can now load our fine-tuned from our directory
pipe = pipeline("text-classification", "./snips_clf/results", tokenizer=tokenizer)

pipe('Please add Here We Go by Dispatch to my road trip playlist', return_all_scores=True)


loading configuration file ./snips_clf/results/config.json
Model config BertConfig {
  "_name_or_path": "./snips_clf/results",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "GetWeather",
    "1": "BookRestaurant",
    "2": "RateBook",
    "3": "SearchScreeningEvent",
    "4": "SearchCreativeWork",
    "5": "PlayMusic",
    "6": "AddToPlaylist"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "si

[[{'label': 'GetWeather', 'score': 0.0001943948882399127},
  {'label': 'BookRestaurant', 'score': 0.00012255563342478126},
  {'label': 'RateBook', 'score': 0.00016526204126421362},
  {'label': 'SearchScreeningEvent', 'score': 0.00020982889691367745},
  {'label': 'SearchCreativeWork', 'score': 0.0001653538056416437},
  {'label': 'PlayMusic', 'score': 0.0001996321079786867},
  {'label': 'AddToPlaylist', 'score': 0.998943030834198}]]

In [45]:
frozen_sequence_clf_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(unique_sequence_labels),
)

loading configuration file config.json from cache at /Users/sinanozdemir/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id"

In [46]:
# freezes EVERY parameter in our bert model
# does not freeze our classification layer
for param in frozen_sequence_clf_model.bert.parameters():
    param.requires_grad = False

In [47]:
epochs = 5

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    report_to='wandb',
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch',
    use_mps_device=True if torch.backends.mps.is_available() else False
)

# Define the trainer:

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [48]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 2.0028934478759766,
 'eval_accuracy': 0.14405808177302254,
 'eval_runtime': 5.5154,
 'eval_samples_per_second': 474.493,
 'eval_steps_per_second': 14.868}

In [49]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1,640
  Number of trainable parameters = 5,383


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8577,1.860846,0.330913
2,1.8184,1.807274,0.401987
3,2.0309,1.773669,0.423768
4,1.6315,1.752286,0.452044
5,1.9599,1.745079,0.47115


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved 

TrainOutput(global_step=1640, training_loss=1.8307396725910465, metrics={'train_runtime': 166.1521, 'train_samples_per_second': 314.983, 'train_steps_per_second': 9.87, 'total_flos': 581074989676500.0, 'train_loss': 1.8307396725910465, 'epoch': 5.0})

In [50]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: utterance, tokens. If utterance, tokens are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.7450792789459229,
 'eval_accuracy': 0.4711501719526175,
 'eval_runtime': 5.623,
 'eval_samples_per_second': 465.412,
 'eval_steps_per_second': 14.583,
 'epoch': 5.0}