In [1]:
import evaluate
import numpy as np
from datasets import load_dataset
from rich import print
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [2]:
# Load the 'conll2003' dataset; the displayed result is a DatasetDict with
# train / validation / test splits.
dataset = load_dataset('conll2003')
# Bare last expression so the notebook renders the object.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Peek at the first training example: word tokens plus integer POS, chunk and NER tag ids.
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [4]:
# Tag names of the 'ner_tags' feature; a tag's position in this list is the
# integer id stored in the dataset (e.g. id 3 -> 'B-ORG').
label_names = dataset['train'].features['ner_tags'].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
# Name of the pretrained checkpoint; reused later when the model is loaded.
checkpoint = 'distilbert-base-cased'
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Tokenize the first training example. The input is already a list of words,
# hence is_split_into_words=True.
token = tokenizer(dataset['train'][0]['tokens'], is_split_into_words=True)
# Show, separated by rulers: the raw encoding, the sub-word tokens, and for each
# token the index of the word it came from (None for tokens with no source word).
print(
    token, '\n--------------------------------------------------------------------------------------\n', 
    token.tokens(),'\n--------------------------------------------------------------------------------------\n',
    token.word_ids()
)

In [7]:
def align_target(labels, word_ids):
    """Expand word-level NER labels to one label per tokenizer token.

    Args:
        labels: Sequence of integer tag ids, one per word.
        word_ids: For every token, the index of the word it was produced from,
            or ``None`` for tokens that belong to no word.

    Returns:
        A list with one entry per element of ``word_ids``:

        * ``-100`` where the word id is ``None``;
        * the word's own label for the first token of a word;
        * for continuation tokens of the same word, the word's label with a
          B- tag replaced by the matching I- tag (other labels unchanged).
    """
    # B- tag id -> matching I- tag id, following the dataset's label order:
    # ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    begin2inside = {
        1: 2,  # B-PER  -> I-PER
        3: 4,  # B-ORG  -> I-ORG
        5: 6,  # B-LOC  -> I-LOC
        7: 8,  # B-MISC -> I-MISC
    }

    align_labels = []
    last_word = None

    for word in word_ids:
        if word is None:
            # Token has no source word: mark it with the ignore value.
            label = -100
        else:
            label = labels[word]
            if word == last_word:
                # Continuation piece of the previous token's word: an entity
                # can only "begin" once, so B- becomes I-.
                label = begin2inside.get(label, label)

        align_labels.append(label)
        last_word = word

    return align_labels

In [8]:
# Word-level NER tag ids of the first training example, and the word index of
# each token from the encoding computed earlier.
labels = dataset['train'][0]['ner_tags']
word_ids = token.word_ids()

# Expand the word-level labels to one label per token.
aligned_target = align_target(labels, word_ids)

# Print, separated by rulers: the sub-word tokens, the original word-level
# labels, and the token-level aligned labels.
print(
    token.tokens(), '\n--------------------------------------------------------------------------------------\n',
    labels, '\n--------------------------------------------------------------------------------------\n',
    aligned_target
)

In [9]:
# Translate every aligned label id into its tag name; negative ids (the -100
# ignore value) have no name and are shown as None.
aligned_labels = [None if t < 0 else label_names[t] for t in aligned_target]

# One line per sub-word token: the token, a tab, then its tag.
for x, y in zip(token.tokens(), aligned_labels):
    print(f"{x}\t{y}")

In [10]:
# Define fake input data
words = ['[CLS]', 'Ger', '##man', 'call', 'to', 'Micro', '##so', '##ft', '[SEP]']
word_ids = [None, 0, 0, 1, 2, 3, 3, 3, None]
labels = [7, 0, 0, 3, 4]

# Use the align_target function to align labels
aligned_target = align_target(labels, word_ids)

# Create a list of aligned labels using label names
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_target]

# Loop through words and aligned labels and print them
for x, y in zip(words, aligned_labels):
    print(f"{x}\t{y}")

In [11]:
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: Mapping with a ``'tokens'`` entry (one word list per example)
            and a ``'ner_tags'`` entry (one word-level label list per example).

    Returns:
        The tokenizer output for the batch, with an added ``"labels"`` entry
        holding, per example, the labels aligned to the tokens by
        ``align_target``.
    """
    encoded = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)

    # word_ids(idx) gives the token -> word mapping of the idx-th example,
    # which align_target needs to spread the word labels over the tokens.
    encoded["labels"] = [
        align_target(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(batch['ner_tags'])
    ]

    return encoded

In [12]:
# Apply tokenize_fn to every split in batches. The original columns are removed,
# so only the tokenizer outputs and the aligned "labels" remain.
tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset['train'].column_names)

In [13]:
# Collator that turns a list of tokenized examples into one batch of tensors.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Sanity check on the first two training examples: in the displayed output the
# shorter example is padded with 0 in input_ids / attention_mask and with -100
# in labels.
batch = data_collator([tokenized_dataset['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [14]:
# Load the "seqeval" metric; compute_metrics calls metric.compute on tag-name sequences.
metric = evaluate.load("seqeval")

def compute_metrics(logits_and_labels):
    """Score a set of predictions with the module-level ``metric``.

    Args:
        logits_and_labels: A pair ``(logits, labels)``. The predicted class of
            a position is the argmax of ``logits`` over the last axis;
            positions whose label is ``-100`` are left out of the scoring.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken
        from the metric's ``overall_*`` results.
    """
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping ignored positions.
    str_labels = []
    for label_row in labels:
        str_labels.append([label_names[t] for t in label_row if t != -100])

    # Predicted tag names at exactly the positions kept in the references.
    str_preds = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [label_names[p] for p, t in zip(pred_row, label_row) if t != -100]
        str_preds.append(kept)

    scores = metric.compute(predictions=str_preds, references=str_labels)

    summary = {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }
    return summary

In [15]:
# Two-way lookup between integer tag ids and tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}
print(id2label , '\n--------------------\n' , label2id)

In [16]:
# Token-classification model built from the pretrained checkpoint. The id/name
# mappings are passed in so the model carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: evaluate after every epoch, 3 epochs in total.
training_args = TrainingArguments(
    output_dir="fine_tuned_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class` is
    # its replacement and avoids the deprecation warning.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [17]:
# Run the training loop; the returned object exposes the final training metrics.
train_results = trainer.train()
# Persist the trained model. No path is given, so this presumably writes to
# training_args.output_dir ("fine_tuned_model") - the inference cell loads from there.
trainer.save_model()
# Print the training metrics, then write them and the trainer state to disk.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2519,0.090953,0.85728,0.903736,0.879895,0.973082
2,0.0629,0.070373,0.897042,0.923763,0.910206,0.980529
3,0.0349,0.068708,0.90558,0.931336,0.918278,0.98206


***** train metrics *****
  epoch                    =        3.0
  total_flos               =   489241GF
  train_loss               =     0.0943
  train_runtime            = 0:00:55.00
  train_samples_per_second =    765.806
  train_steps_per_second   =     47.887


In [19]:
# Peek at the first raw test example (all-uppercase headline with two tagged words).
dataset['test'][0]

{'id': '0',
 'tokens': ['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 'pos_tags': [21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7],
 'chunk_tags': [11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0],
 'ner_tags': [0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]}

In [25]:
# Directory the fine-tuned model was saved to. This is a path relative to the
# notebook's working directory, matching TrainingArguments(output_dir=...),
# instead of a machine-specific absolute path.
# NOTE(review): assumes the kernel's working directory is the notebook folder.
folder = 'fine_tuned_model'
classify = pipeline(
    task="ner",
    model=folder,
    batch_size=100,
    aggregation_strategy="max"
)

# The pipeline takes raw text and treats a list as a batch of independent
# texts. Join the pre-split words into one sentence so the model sees each
# word in context.
sample = ' '.join(dataset['test'][0]['tokens'])
result = classify(sample)
print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [23]:
# Inspect the tokenized form of the first test example: input ids, attention
# mask and the token-level aligned labels (-100 at the first and last position).
tokenized_dataset['test'][0]

{'input_ids': [101,
  156,
  9244,
  10954,
  2069,
  118,
  147,
  12240,
  14962,
  25075,
  1942,
  149,
  21986,
  2428,
  3663,
  160,
  11607,
  117,
  24890,
  11607,
  1592,
  15969,
  156,
  19556,
  22861,
  6258,
  2036,
  18581,
  2271,
  12420,
  1942,
  119,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -100]}