In [1]:
# !pip install datasets transformers evaluate seqeval

### 1. Load a dataset

<span style="font-size:16px">For the identification of mountain names in text we use the [Few-NERD](http://ningding97.github.io/fewnerd) dataset, which is also available on [Kaggle](http://www.kaggle.com/datasets/nbroad/fewnerd). More precisely, we use the supervised part of this dataset.</span>

In [1]:
from datasets import load_dataset

# Load the three supervised Few-NERD splits from local JSON files into a single DatasetDict.
# The dev file is exposed under the split name 'val'.
fewnerd = load_dataset('json', data_files={
    'train': './fewnerd/supervised/train.json',
    'val': './fewnerd/supervised/dev.json',
    'test': './fewnerd/supervised/test.json',
})
fewnerd  # display the splits with their columns and row counts

DatasetDict({
    train: Dataset({
        features: ['tokens', 'coarse_tags', 'fine_tags', 'id'],
        num_rows: 131766
    })
    val: Dataset({
        features: ['tokens', 'coarse_tags', 'fine_tags', 'id'],
        num_rows: 18823
    })
    test: Dataset({
        features: ['tokens', 'coarse_tags', 'fine_tags', 'id'],
        num_rows: 37647
    })
})

<span style="font-size:16px">Loading tag dictionaries</span>

In [3]:
import json

# The tag dictionaries map tag ids (stored as string keys such as '24') to tag names.
# The encoding is given explicitly: JSON files are UTF-8, whereas open() without an
# encoding falls back to the platform locale codec (e.g. cp1252 on Windows).
with open("./fewnerd/id2coarse_tags.json", "r", encoding="utf-8") as f:
    id2coarse_tag = json.load(f)
print(id2coarse_tag)

with open("./fewnerd/id2fine_tags.json", "r", encoding="utf-8") as f:
    id2fine_tag = json.load(f)
id2fine_tag

{'0': 'O', '1': 'art', '2': 'building', '3': 'event', '4': 'location', '5': 'organization', '6': 'other', '7': 'person', '8': 'product'}


{'0': 'O',
 '1': 'art-broadcastprogram',
 '2': 'art-film',
 '3': 'art-music',
 '4': 'art-other',
 '5': 'art-painting',
 '6': 'art-writtenart',
 '7': 'building-airport',
 '8': 'building-hospital',
 '9': 'building-hotel',
 '10': 'building-library',
 '11': 'building-other',
 '12': 'building-restaurant',
 '13': 'building-sportsfacility',
 '14': 'building-theater',
 '15': 'event-attack/battle/war/militaryconflict',
 '16': 'event-disaster',
 '17': 'event-election',
 '18': 'event-other',
 '19': 'event-protest',
 '20': 'event-sportsevent',
 '21': 'location-GPE',
 '22': 'location-bodiesofwater',
 '23': 'location-island',
 '24': 'location-mountain',
 '25': 'location-other',
 '26': 'location-park',
 '27': 'location-road/railway/highway/transit',
 '28': 'organization-company',
 '29': 'organization-education',
 '30': 'organization-government/governmentagency',
 '31': 'organization-media/newspaper',
 '32': 'organization-other',
 '33': 'organization-politicalparty',
 '34': 'organization-religion',
 '

In [4]:
# Id of the 'location-mountain' entry in id2fine_tag.
# NOTE: the constant name is misspelled ("MOUTAIN"); it is kept because other cells refer to it.
MOUTAIN_TAG = 24

# Indices of the training sentences that contain at least one mountain-tagged token.
rows_with_mountain_tag = [i for i, row in enumerate(fewnerd["train"]["fine_tags"]) if MOUTAIN_TAG in row]
len(rows_with_mountain_tag), rows_with_mountain_tag[:5]

(1502, [46, 75, 98, 138, 284])

<span style="font-size:16px">Some examples from the train dataset:</span>

In [5]:
# Print the first five training examples that contain a mountain tag, separated by blank lines.
for x in fewnerd["train"].select(rows_with_mountain_tag[:5]):
    print(x, "\n")

{'tokens': ['The', 'Eighth', 'Army', 'began', 'to', 'attack', 'Italian', 'units', ',', 'located', 'using', 'information', 'from', 'Ultra', ',', 'at', 'Ruweisat', 'Ridge', 'and', 'from', 'again', 'at', 'Tel', 'El', 'Eisa', 'on', '22', 'July', 'and', 'Miteirya', 'Ridge', 'after', 'which', 'another', 'lull', 'fell', '.'], 'coarse_tags': [0, 5, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0], 'fine_tags': [0, 32, 32, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 24, 0, 0, 0, 0, 21, 21, 21, 0, 0, 0, 0, 24, 24, 0, 0, 0, 0, 0, 0], 'id': '46'} 

{'tokens': ['Though', 'only', 'in', 'length', ',', 'The', 'Salamander', 'Glacier', 'is', 'about', 'wide', '.'], 'coarse_tags': [0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0], 'fine_tags': [0, 0, 0, 0, 0, 0, 24, 24, 0, 0, 0, 0], 'id': '75'} 

{'tokens': ['Mount', 'Diablo', 'has', 'inspired', 'many', 'artists', 'and', 'writers', '.'], 'coarse_tags': [4, 4, 0, 0, 0, 0, 0, 0, 0], 'fine_tags': [24, 24, 0, 0, 0, 0

### 2. Preprocess

<span style="font-size:16px">Replacing tags other than mountain tag with 0, mountain tags with 1, and removing extra columns from new datasets</span>

In [6]:
def tag_map(tag):
    """Return 1 when `tag` equals the mountain tag id, otherwise 0."""
    return int(tag == MOUTAIN_TAG)


def tag_list_map(tag_list):
    """Binarize the tag list of one sentence with `tag_map`."""
    return [tag_map(tag) for tag in tag_list]


def fine_tags_map(examples):
    """Add a "mountain_tags" column to a batch of examples.

    `examples["fine_tags"]` holds one list of fine tag ids per sentence; the new
    column holds the matching lists of 0/1 flags. The batch is modified in place
    and returned.
    """
    examples["mountain_tags"] = [tag_list_map(tags) for tags in examples["fine_tags"]]
    return examples
    
# Batched map over every split: adds "mountain_tags" and drops the columns that are not used afterwards.
fewnerd_mountains = fewnerd.map(fine_tags_map, remove_columns=["coarse_tags", "fine_tags", "id"], batched=True)

<span style="font-size:16px">Some examples from the processed train dataset:</span>

In [7]:
# Same five sentences as shown for the raw dataset, now with the binary mountain_tags column.
for x in fewnerd_mountains["train"].select(rows_with_mountain_tag[:5]):
    print(x, "\n")

{'tokens': ['The', 'Eighth', 'Army', 'began', 'to', 'attack', 'Italian', 'units', ',', 'located', 'using', 'information', 'from', 'Ultra', ',', 'at', 'Ruweisat', 'Ridge', 'and', 'from', 'again', 'at', 'Tel', 'El', 'Eisa', 'on', '22', 'July', 'and', 'Miteirya', 'Ridge', 'after', 'which', 'another', 'lull', 'fell', '.'], 'mountain_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]} 

{'tokens': ['Though', 'only', 'in', 'length', ',', 'The', 'Salamander', 'Glacier', 'is', 'about', 'wide', '.'], 'mountain_tags': [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]} 

{'tokens': ['Mount', 'Diablo', 'has', 'inspired', 'many', 'artists', 'and', 'writers', '.'], 'mountain_tags': [1, 1, 0, 0, 0, 0, 0, 0, 0]} 

{'tokens': ['K2', 'is', 'further', 'north', 'than', 'the', 'Himalayan', 'mountains', 'so', 'the', 'climate', 'is', 'colder', ';', 'the', 'Karakoram', 'range', 'is', 'wider', 'than', 'the', 'Himalayan', 'so', 'more', 'ice', 'and', 'snow',

<span style="font-size:16px">Computing number of mountain and O tags in processed datasets and their proportion</span>

In [8]:
def print_mountain_dataset_stat(name, dataset):
    """Print token-level tag counts for one dataset split.

    Args:
        name: Split name printed at the start of the line (left-aligned, padded to
            5 characters).
        dataset: Mapping-like object whose "mountain_tags" entry is an iterable of
            per-sentence lists of 0/1 tags.

    The reported proportion is the ratio of mountain tags to O tags. When the split
    has no O tags (for example an empty split) the proportion is printed as ``nan``
    instead of raising ZeroDivisionError.
    """
    mountain_tags_num = 0
    tags_num = 0
    for tags in dataset["mountain_tags"]:
        mountain_tags_num += sum(tags)  # tags are 0/1, so the sum counts mountain tokens
        tags_num += len(tags)
    o_tags_num = tags_num - mountain_tags_num
    proportion = mountain_tags_num / o_tags_num if o_tags_num else float("nan")
    print(f"{name:<5} dataset - mountain tags: {mountain_tags_num}, O tags: {o_tags_num}, proportion: {proportion}")

# One statistics line per split (train / val / test).
for k in fewnerd_mountains.keys():
    print_mountain_dataset_stat(k, fewnerd_mountains[k])

train dataset - mountain tags: 4500, O tags: 3223038, proportion: 0.0013961982452580454
val   dataset - mountain tags: 734, O tags: 462386, proportion: 0.0015874183041874104
test  dataset - mountain tags: 1366, O tags: 919688, proportion: 0.0014852863144892616


<span style="font-size:16px">Load DistilBERT tokenizer to preprocess the tokens field</span>

In [9]:
from transformers import AutoTokenizer

# Tokenizer of the same DistilBERT checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

<span style="font-size:16px">An example of tokenization in action:</span>

In [10]:
# Training sentence 75 is one of the mountain examples ("The Salamander Glacier").
# The row is indexed first and the column second, so only this one sentence is read
# instead of pulling the whole "tokens" column to pick a single element.
example = fewnerd_mountains["train"][75]["tokens"]
tokenized_input = tokenizer(example, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

print(example, "\n")
print(tokenized_input, "\n")
print(tokens)

['Though', 'only', 'in', 'length', ',', 'The', 'Salamander', 'Glacier', 'is', 'about', 'wide', '.'] 

{'input_ids': [101, 2295, 2069, 1999, 3091, 1010, 1996, 16183, 23093, 4063, 10046, 2003, 2055, 2898, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 

['[CLS]', 'though', 'only', 'in', 'length', ',', 'the', 'sal', '##aman', '##der', 'glacier', 'is', 'about', 'wide', '.', '[SEP]']


<span style="font-size:16px">The tokenizer adds the special tokens [CLS] and [SEP], and subword tokenization creates a mismatch between the input and the labels: a single word with a single label may now be split into several subwords. We realign the tokens and labels (every subword receives the label of its word, special tokens receive IGNORE_INDEX) and remove the extra columns from the new datasets.</span>

In [11]:
# Label value that torch.nn.CrossEntropyLoss ignores (its default ignore_index), so positions
# carrying it do not contribute to the loss or to the input gradient
IGNORE_INDEX = -100


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word labels to the tokens.

    Args:
        examples: Batch with "tokens" (one list of words per sentence) and
            "mountain_tags" (one list of 0/1 word labels per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Every token
        of a word receives that word's label; tokens that belong to no word (special
        tokens) receive IGNORE_INDEX.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["mountain_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word
        labels.append([
            IGNORE_INDEX if word_idx is None else label[word_idx]
            for word_idx in word_ids
        ])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize every split and keep only the tokenizer output plus "labels".
# NOTE: this rebinds fewnerd_mountains and removes the columns the mapped function reads, so the
# cell cannot simply be re-run; re-run the cell that creates "mountain_tags" first.
fewnerd_mountains = fewnerd_mountains.map(tokenize_and_align_labels, remove_columns=["tokens", "mountain_tags"], batched=True)

<span style="font-size:16px">An example from the processed train dataset:</span>

In [12]:
# The sentence from the tokenization example, now as model inputs with aligned labels.
print(fewnerd_mountains["train"][75])

{'input_ids': [101, 2295, 2069, 1999, 3091, 1010, 1996, 16183, 23093, 4063, 10046, 2003, 2055, 2898, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, -100]}


<span style="font-size:16px">Set a data collator that will dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length</span>

In [13]:
from transformers import DataCollatorForTokenClassification

# Collator for token classification, built from the tokenizer loaded above.
data_collator = DataCollatorForTokenClassification(tokenizer)

### 3. Train

<span style="font-size:16px">Before we start training a model, create a list of labels and the dictionaries that map label ids to labels and labels to label ids</span>

In [14]:
# Label names in label-id order: index 0 is the O tag, index 1 the mountain tag
label_list = [id2fine_tag[str(tag_id)] for tag_id in (0, MOUTAIN_TAG)]
print(label_list, "\n")

# id -> name lookup and its inverse, both passed to the model config below
id2label = dict(enumerate(label_list))
print(id2label, "\n")

label2id = {name: idx for idx, name in id2label.items()}
print(label2id)

['O', 'location-mountain'] 

{0: 'O', 1: 'location-mountain'} 

{'O': 0, 'location-mountain': 1}


<span style="font-size:16px">Load DistilBERT model with AutoModelForTokenClassification along with the number of expected labels, and the label mappings</span>

In [15]:
from transformers import AutoModelForTokenClassification

# Token-classification model loaded from the pretrained DistilBERT checkpoint. The number of
# labels is taken from label_list so it cannot drift from the id2label / label2id mappings.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<span style="font-size:16px">Create a function that computes metrics from predictions and labels, ignoring labels for special tokens</span>

In [16]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores (argmax is taken over axis 2) and ``labels`` the label ids, with
            IGNORE_INDEX marking positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken from the
        "overall_*" entries of the seqeval result.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    # Translate ids to label names, skipping positions labelled IGNORE_INDEX (special tokens)
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        true_predictions.append([
            label_list[predicted_id]
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != IGNORE_INDEX
        ])
        true_labels.append([
            label_list[label_id] for label_id in label_row if label_id != IGNORE_INDEX
        ])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metrics = {}
    metrics["precision"] = results["overall_precision"]
    metrics["recall"] = results["overall_recall"]
    metrics["f1"] = results["overall_f1"]
    metrics["accuracy"] = results["overall_accuracy"]
    return metrics

<span style="font-size:16px">Due to the imbalance of mountain tags number and O tags number in the datasets, we want to use class weights in the loss function. For this we need a customization of Trainer class.</span>

In [17]:
import torch
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with a class-weighted cross-entropy.

    Args:
        tag_weights: Sequence with one weight per label id; it is used as the
            ``weight`` argument of ``torch.nn.CrossEntropyLoss``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, tag_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tag_weights = tag_weights

    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        """Run the forward pass and return the class-weighted token-level loss.

        Args:
            model: Model to run; it is called as ``model(**inputs)``.
            inputs: Batch mapping that contains "labels" next to the model inputs.
            num_items_in_batch: Accepted for compatibility with the caller; not used here.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
        """
        labels = inputs.get("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Build the weights on the device of the logits rather than on "cuda whenever it is
        # available", so the loss also works when the model runs on CPU or on another
        # accelerator. The dtype is forced to float so integer weights are accepted too.
        weight = torch.tensor(self.tag_weights, dtype=torch.float, device=logits.device)

        # CrossEntropyLoss skips targets equal to its default ignore_index (-100).
        loss_fun = torch.nn.CrossEntropyLoss(weight)

        # The class count is read from the logits, so this does not depend on `model`
        # exposing `.config` (a wrapped model may not).
        loss = loss_fun(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

<span style="font-size:16px">Set parameters for training the model, create an instance of CustomTrainer, train the model, and evaluate it on the test dataset</span>

In [18]:
from transformers import TrainingArguments

# Loss weights per label id, in label_list order: [O, location-mountain].
# The rare mountain class is weighted 10x higher than the dominant O class.
tag_weights = [0.1, 1]

training_args = TrainingArguments(
    output_dir="train_output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch (by validation F1)
    # can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    report_to="none",
)

trainer = CustomTrainer(
    model=model,
    tag_weights=tag_weights,
    args=training_args,
    train_dataset=fewnerd_mountains["train"],
    eval_dataset=fewnerd_mountains["val"],
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate(fewnerd_mountains["test"])

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0155,0.019783,0.475104,0.623978,0.539458,0.998487
2,0.0116,0.026027,0.520833,0.544959,0.532623,0.998706
3,0.0072,0.027725,0.525907,0.553134,0.539177,0.998746
4,0.003,0.046169,0.643068,0.594005,0.617564,0.998912
5,0.001,0.039552,0.690217,0.692098,0.691156,0.998912
6,0.0008,0.053781,0.69697,0.626703,0.659971,0.998978
7,0.0005,0.055568,0.698718,0.594005,0.642121,0.998877
8,0.0006,0.061001,0.677019,0.594005,0.632801,0.998831
9,0.0011,0.057025,0.664653,0.599455,0.630372,0.998866
10,0.0004,0.078071,0.672185,0.553134,0.606876,0.998849






{'eval_loss': 0.03768905624747276,
 'eval_precision': 0.6402266288951841,
 'eval_recall': 0.6637298091042585,
 'eval_f1': 0.6517664023071377,
 'eval_accuracy': 0.9989287819946799,
 'eval_runtime': 75.498,
 'eval_samples_per_second': 498.649,
 'eval_steps_per_second': 31.166,
 'epoch': 15.0}

<span style="font-size:16px">Save the best model and tokenizer to the specified directory</span>

In [19]:
# Output directory for the fine-tuned model and its tokenizer.
save_dir = "fewnerd-mountains-model"

# The training arguments set load_best_model_at_end=True, so the trainer's model is saved here
# after training; the tokenizer is written next to it so the directory can be loaded on its own.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

('fewnerd-mountains-model\\tokenizer_config.json',
 'fewnerd-mountains-model\\special_tokens_map.json',
 'fewnerd-mountains-model\\vocab.txt',
 'fewnerd-mountains-model\\added_tokens.json',
 'fewnerd-mountains-model\\tokenizer.json')

### 4. Conclusions

<span style="font-size:16px">The model metrics for mountain NER are similar to the BERT metrics for all categories of named entities on supervised [Few-NERD](http://ningding97.github.io/fewnerd) dataset. So it seems that by changing the training parameters we can slightly improve the model performance, but a more significant performance improvement is only available by changing the base model from DistilBERT to another language model, such as RoBERTa or XLNet.</span>

### 5. Inference

<span style="font-size:16px">Load the model and tokenizer from the specified path and define a function that tags each word in a text with either the mountain tag or the O tag</span>

In [20]:
# torch, AutoModelForTokenClassification and AutoTokenizer are imported in earlier cells;
# uncomment the two lines below to run the inference section on its own.
#import torch
#from transformers import AutoModelForTokenClassification, AutoTokenizer

# Directory containing the saved model and tokenizer files.
model_path = "fewnerd-mountains-model"

# NOTE: rebinds `model` and `tokenizer`, replacing the objects used during training.
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def get_word_tag_list(text):
    """Return a list of word and tag pairs based on the model and tokenizer.

    Args:
        text: Raw input string. It is tokenized with truncation enabled.

    Returns:
        A list of ``(word, tag)`` tuples in text order. ``word`` is the substring of
        `text` covered by the word and ``tag`` is the label from
        ``model.config.id2label`` predicted for the word's first token.
    """
    encoding = tokenizer(text, return_tensors="pt", truncation=True)

    # Predicted tag name for every token of the (single) input sequence
    with torch.no_grad():
        token_logits = model(**encoding).logits
    token_label_ids = torch.argmax(token_logits, dim=2)
    token_tags = [model.config.id2label[label_id.item()] for label_id in token_label_ids[0]]

    # Index of the first token of each word: word ids start at 0 and grow by one, so
    # position w of this list belongs to word w. Tokens without a word id are skipped.
    first_token_of_word = []
    for token_idx, word_id in enumerate(encoding.word_ids()):
        if word_id is not None and word_id >= len(first_token_of_word):
            first_token_of_word.append(token_idx)

    # Pair the original text of each word with the tag of its first token
    word_tag_list = []
    for word_id, token_idx in enumerate(first_token_of_word):
        span = encoding.word_to_chars(word_id)
        word_tag_list.append((text[span.start:span.end], token_tags[token_idx]))

    return word_tag_list

<span style="font-size:16px">Examples of the model output:</span>

In [21]:
def print_word_tag_list(text):
    """Print the model output for `text`: one "word : tag" line per word."""
    for word, tag in get_word_tag_list(text):
        print(f"{word} : {tag}")

# Example sentence that mentions no mountain.
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
print_word_tag_list(text)

The : O
Golden : O
State : O
Warriors : O
are : O
an : O
American : O
professional : O
basketball : O
team : O
based : O
in : O
San : O
Francisco : O
. : O


In [22]:
# Two-sentence example that mentions Mont Blanc twice.
text = """ 
The Mont Blanc massif is popular for outdoor activities like hiking, climbing, trail running and winter sports like skiing, and snowboarding.
The most popular climbing route to the summit of Mont Blanc is the Goûter Route, which typically takes two days.
"""
print_word_tag_list(text)

The : O
Mont : location-mountain
Blanc : location-mountain
massif : location-mountain
is : O
popular : O
for : O
outdoor : O
activities : O
like : O
hiking : O
, : O
climbing : O
, : O
trail : O
running : O
and : O
winter : O
sports : O
like : O
skiing : O
, : O
and : O
snowboarding : O
. : O
The : O
most : O
popular : O
climbing : O
route : O
to : O
the : O
summit : O
of : O
Mont : location-mountain
Blanc : location-mountain
is : O
the : O
Goûter : O
Route : O
, : O
which : O
typically : O
takes : O
two : O
days : O
. : O


In [23]:
# Contrast example: here "Mont Blanc" is the name of a cafe, not of a mountain.
text = "Mont Blanc is a beautiful rooftop cafe."
print_word_tag_list(text)

Mont : O
Blanc : O
is : O
a : O
beautiful : O
rooftop : O
cafe : O
. : O
