In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from datasets import load_dataset

wnut = load_dataset("wnut_17")

In [3]:
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [4]:
label_list = wnut["train"].features[f"ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [5]:
for k,v in wnut['train'][0].items():
    print(k,':', v)

# convert ner tags idx to labes
print([label_list[idx] for idx in wnut['train'][0]['ner_tags']])


id : 0
tokens : ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
ner_tags : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [6]:
# load distilbert

from transformers import AutoTokenizer

model_name = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# let's run the tokenizer on the first example
# we see that the tokenizer splits the words into sub tokens, which breaks the map to the labeller
example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']


In [8]:
# go through every token and associate the first token in a word to the correct label 
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
tokenize_and_align_labels(wnut['train'][:2])

{'input_ids': [[101, 1030, 2703, 17122, 2009, 1005, 1055, 1996, 3193, 2013, 2073, 1045, 1005, 1049, 2542, 2005, 2048, 3134, 1012, 3400, 2110, 2311, 1027, 9686, 2497, 1012, 3492, 2919, 4040, 2182, 2197, 3944, 1012, 102], [101, 2013, 2665, 2739, 7959, 2098, 1024, 6289, 7011, 8908, 15117, 2005, 10878, 2400, 2000, 13292, 1012, 1019, 8299, 1024, 1013, 1013, 4714, 3126, 2140, 1012, 4012, 1013, 2484, 8490, 3501, 22025, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, -100, -100, 0, 0, -100, 0, 0, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, -100, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, -100, -100, 0, 5, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]]}

In [9]:
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

In [10]:
# returns dynamically padded tensors for pytorch
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
import evaluate
# prvoides precision, recall, F1, accuracy
seqeval = evaluate.load("seqeval")

In [12]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [13]:
id2label = {idx:l for idx, l in enumerate(label_list)}
label2id = {l:idx for idx, l in enumerate(label_list)}

In [14]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# get the model
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(label_list), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# train

training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3,394
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 535
  Number of trainable parameters = 66,372,877


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.249553,0.513123,0.362373,0.424769,0.945022
2,No log,0.279435,0.582946,0.348471,0.436195,0.945577
3,No log,0.280822,0.575458,0.378128,0.456376,0.946475
4,No log,0.293537,0.556878,0.390176,0.458856,0.947587
5,0.059000,0.296647,0.568063,0.402224,0.470971,0.948014


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 32
Saving model checkpoint to my_awesome_wnut_model/checkpoint-107
Configuration saved in my_awesome_wnut_model/checkpoint-107/config.json
Model weights saved in my_awesome_wnut_model/checkpoint-107/model.safetensors
tokenizer config file saved in my_awesome_wnut_model/checkpoint-107/tokenizer_config.json
Special tokens file saved in my_awesome_wnut_model/checkpoint-107/special_tokens_map.json
tokenizer config file saved in my_awesome_wnut_model/tokenizer_config.json
Special tokens file saved in my_awesome_wnut_model/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `Disti

TrainOutput(global_step=535, training_loss=0.057406222708871434, metrics={'train_runtime': 119.5192, 'train_samples_per_second': 141.986, 'train_steps_per_second': 4.476, 'total_flos': 245460755059260.0, 'train_loss': 0.057406222708871434, 'epoch': 5.0})

In [19]:
trainer.push_to_hub()

Saving model checkpoint to my_awesome_wnut_model
Configuration saved in my_awesome_wnut_model/config.json
Model weights saved in my_awesome_wnut_model/model.safetensors
tokenizer config file saved in my_awesome_wnut_model/tokenizer_config.json
Special tokens file saved in my_awesome_wnut_model/special_tokens_map.json


CommitInfo(commit_url='https://huggingface.co/stephen-osullivan/my_awesome_wnut_model/commit/702f47f096048b9249a50fc255ed724518c0c3b9', commit_message='End of training', commit_description='', oid='702f47f096048b9249a50fc255ed724518c0c3b9', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
# test model inference
from transformers.pipelines import pipeline
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
classifier = pipeline("ner", model="stephen-osullivan/my_awesome_wnut_model")
classifier(text)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

loading configuration file config.json from cache at /home/sos00/.cache/huggingface/hub/models--stephen-osullivan--my_awesome_wnut_model/snapshots/b30dd4c545d193756a0a35ff225995c521ca0782/config.json
Model config DistilBertConfig {
  "_name_or_path": "stephen-osullivan/my_awesome_wnut_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-corporation",
    "2": "I-corporation",
    "3": "B-creative-work",
    "4": "I-creative-work",
    "5": "B-group",
    "6": "I-group",
    "7": "B-location",
    "8": "I-location",
    "9": "B-person",
    "10": "I-person",
    "11": "B-product",
    "12": "I-product"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-corporation": 1,
    "B-creative-work": 3,
    "B-group": 5,
    "B-location": 7,
    "B-person": 9,
    "B-product": 11,
    "I-corporation": 2,
    "I-creative-work

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /home/sos00/.cache/huggingface/hub/models--stephen-osullivan--my_awesome_wnut_model/snapshots/b30dd4c545d193756a0a35ff225995c521ca0782/model.safetensors
All model checkpoint weights were used when initializing DistilBertForTokenClassification.

All the weights of DistilBertForTokenClassification were initialized from the model checkpoint at stephen-osullivan/my_awesome_wnut_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForTokenClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /home/sos00/.cache/huggingface/hub/models--stephen-osullivan--my_awesome_wnut_model/snapshots/b30dd4c545d193756a0a35ff225995c521ca0782/vocab.txt
loading file tokenizer.json from cache at /home/sos00/.cache/huggingface/hub/models--stephen-osullivan--my_awesome_wnut_model/snapshots/b30dd4c545d193756a0a35ff225995c521ca0782/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/sos00/.cache/huggingface/hub/models--stephen-osullivan--my_awesome_wnut_model/snapshots/b30dd4c545d193756a0a35ff225995c521ca0782/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/sos00/.cache/huggingface/hub/models--stephen-osullivan--my_awesome_wnut_model/snapshots/b30dd4c545d193756a0a35ff225995c521ca0782/tokenizer_config.json
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-group',
  'score': 0.29368842,
  'index': 1,
  'word': 'the',
  'start': 0,
  'end': 3},
 {'entity': 'B-location',
  'score': 0.5067753,
  'index': 2,
  'word': 'golden',
  'start': 4,
  'end': 10},
 {'entity': 'I-location',
  'score': 0.361732,
  'index': 3,
  'word': 'state',
  'start': 11,
  'end': 16},
 {'entity': 'B-group',
  'score': 0.25192493,
  'index': 4,
  'word': 'warriors',
  'start': 17,
  'end': 25},
 {'entity': 'B-location',
  'score': 0.6927596,
  'index': 13,
  'word': 'san',
  'start': 80,
  'end': 83},
 {'entity': 'B-location',
  'score': 0.58765185,
  'index': 14,
  'word': 'francisco',
  'start': 84,
  'end': 93}]