In [1]:
# !pip install datasets transformers evaluate seqeval

### 1. Загрузка датасета

<span style="font-size:16px">Для разметки названий гор в тексте мы используем набор данных [Few-NERD](http://ningding97.github.io/fewnerd), который доступен также на Kaggle и Hugging Face. Мы используем supervised-часть этого набора данных.</span>

In [2]:
from datasets import load_dataset

# Read the three supervised Few-NERD JSON files into a single DatasetDict;
# the dev file is exposed under the split name 'val'.
fewnerd = load_dataset(
    'json',
    data_files=dict(
        train='../fewnerd/supervised/train.json',
        val='../fewnerd/supervised/dev.json',
        test='../fewnerd/supervised/test.json',
    ),
)
fewnerd

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'coarse_tags', 'fine_tags', 'id'],
        num_rows: 131766
    })
    val: Dataset({
        features: ['tokens', 'coarse_tags', 'fine_tags', 'id'],
        num_rows: 18823
    })
    test: Dataset({
        features: ['tokens', 'coarse_tags', 'fine_tags', 'id'],
        num_rows: 37647
    })
})

<span style="font-size:16px">Загрузка списка тегов</span>

In [3]:
import json

# Both id -> tag-name tables are JSON objects keyed by the id as a string
# (e.g. '24' -> 'location-mountain'). Read them as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open("../fewnerd/id2coarse_tags.json", "r", encoding="utf-8") as f:
    id2coarse_tag = json.load(f)
print(id2coarse_tag)

with open("../fewnerd/id2fine_tags.json", "r", encoding="utf-8") as f:
    id2fine_tag = json.load(f)
id2fine_tag

{'0': 'O', '1': 'art', '2': 'building', '3': 'event', '4': 'location', '5': 'organization', '6': 'other', '7': 'person', '8': 'product'}


{'0': 'O',
 '1': 'art-broadcastprogram',
 '2': 'art-film',
 '3': 'art-music',
 '4': 'art-other',
 '5': 'art-painting',
 '6': 'art-writtenart',
 '7': 'building-airport',
 '8': 'building-hospital',
 '9': 'building-hotel',
 '10': 'building-library',
 '11': 'building-other',
 '12': 'building-restaurant',
 '13': 'building-sportsfacility',
 '14': 'building-theater',
 '15': 'event-attack/battle/war/militaryconflict',
 '16': 'event-disaster',
 '17': 'event-election',
 '18': 'event-other',
 '19': 'event-protest',
 '20': 'event-sportsevent',
 '21': 'location-GPE',
 '22': 'location-bodiesofwater',
 '23': 'location-island',
 '24': 'location-mountain',
 '25': 'location-other',
 '26': 'location-park',
 '27': 'location-road/railway/highway/transit',
 '28': 'organization-company',
 '29': 'organization-education',
 '30': 'organization-government/governmentagency',
 '31': 'organization-media/newspaper',
 '32': 'organization-other',
 '33': 'organization-politicalparty',
 '34': 'organization-religion',
 '

In [4]:
# Numeric id of the 'location-mountain' fine tag in id2fine_tags.json.
MOUTAIN_TAG = 24

# Positions of the training sentences that contain at least one mountain token.
rows_with_mountain_tag = [
    row_idx
    for row_idx, sentence_tags in enumerate(fewnerd["train"]["fine_tags"])
    if MOUTAIN_TAG in sentence_tags
]
len(rows_with_mountain_tag), rows_with_mountain_tag[:5]

(1502, [46, 75, 98, 138, 284])

<span style="font-size:16px">Примеры данных обучающей выборки датасета:</span>

In [5]:
# Print the first five training sentences that contain a mountain tag,
# each followed by a blank line.
for sample in fewnerd["train"].select(rows_with_mountain_tag[:5]):
    print(sample, "\n")

{'tokens': ['The', 'Eighth', 'Army', 'began', 'to', 'attack', 'Italian', 'units', ',', 'located', 'using', 'information', 'from', 'Ultra', ',', 'at', 'Ruweisat', 'Ridge', 'and', 'from', 'again', 'at', 'Tel', 'El', 'Eisa', 'on', '22', 'July', 'and', 'Miteirya', 'Ridge', 'after', 'which', 'another', 'lull', 'fell', '.'], 'coarse_tags': [0, 5, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 4, 4, 4, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0], 'fine_tags': [0, 32, 32, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 24, 0, 0, 0, 0, 21, 21, 21, 0, 0, 0, 0, 24, 24, 0, 0, 0, 0, 0, 0], 'id': '46'} 

{'tokens': ['Though', 'only', 'in', 'length', ',', 'The', 'Salamander', 'Glacier', 'is', 'about', 'wide', '.'], 'coarse_tags': [0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0], 'fine_tags': [0, 0, 0, 0, 0, 0, 24, 24, 0, 0, 0, 0], 'id': '75'} 

{'tokens': ['Mount', 'Diablo', 'has', 'inspired', 'many', 'artists', 'and', 'writers', '.'], 'coarse_tags': [4, 4, 0, 0, 0, 0, 0, 0, 0], 'fine_tags': [24, 24, 0, 0, 0, 0

### 2. Предобработка данных

<span style="font-size:16px">Замена всех тегов, кроме нужного, на 0, а нужного тега — на 1, и удаление лишних столбцов из новых наборов данных</span>

In [6]:
def tag_map(tag):
    """Binarize one fine tag id: 1 for the mountain tag, 0 for anything else."""
    if tag == MOUTAIN_TAG:
        return 1
    return 0


def tag_list_map(tag_list):
    """Binarize every tag id of one sentence, keeping order and length."""
    return [tag_map(tag) for tag in tag_list]


def fine_tags_map(examples):
    """Add a ``mountain_tags`` column to a batch of examples.

    ``examples["fine_tags"]`` is read as a list of per-sentence tag-id lists.
    The batch is modified in place and returned.
    """
    examples["mountain_tags"] = [
        tag_list_map(sentence_tags) for sentence_tags in examples["fine_tags"]
    ]
    return examples
    
# Add the binary `mountain_tags` column to every split and drop the original
# tag and id columns.
fewnerd_mountains = fewnerd.map(
    fine_tags_map,
    batched=True,
    remove_columns=["coarse_tags", "fine_tags", "id"],
)

Map:   0%|          | 0/131766 [00:00<?, ? examples/s]

Map:   0%|          | 0/18823 [00:00<?, ? examples/s]

Map:   0%|          | 0/37647 [00:00<?, ? examples/s]

<span style="font-size:16px">Вычисление количества меток нужного тега в обработанных наборах данных и их отношения к количеству меток O</span>

In [8]:
def print_mountain_dataset_stat(name, dataset):
    """Print token-level tag counts for one split.

    Args:
        name: Split name; left-aligned and padded to 5 characters in the output.
        dataset: Object whose ``dataset["mountain_tags"]`` yields per-sentence
            lists of 0/1 tags.

    The value printed as "proportion" is the ratio of mountain tags to O tags
    (not the share of all tags). If the split has no O tags, ``nan`` is printed
    instead of raising ZeroDivisionError.
    """
    mountain_tags_num = 0
    tags_num = 0
    for tags in dataset["mountain_tags"]:
        mountain_tags_num += sum(tags)
        tags_num += len(tags)
    o_tags_num = tags_num - mountain_tags_num
    # An empty split (or one without any O token) has no defined ratio.
    proportion = mountain_tags_num / o_tags_num if o_tags_num else float("nan")
    print(f"{name:<5} dataset - mountain tags: {mountain_tags_num}, O tags: {o_tags_num}, proportion: {proportion}")

# Report the tag statistics for every split of the DatasetDict.
for split_name, split_dataset in fewnerd_mountains.items():
    print_mountain_dataset_stat(split_name, split_dataset)

train dataset - mountain tags: 4500, O tags: 3223038, proportion: 0.0013961982452580454
val   dataset - mountain tags: 734, O tags: 462386, proportion: 0.0015874183041874104
test  dataset - mountain tags: 1366, O tags: 919688, proportion: 0.0014852863144892616


<span style="font-size:16px">Загрузка токенизатора DistilBERT для предварительной обработки поля токенов.</span>

In [9]:
from transformers import AutoTokenizer

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

<span style="font-size:16px">Пример работы токенизатора:</span>

In [10]:
# Index the row first: `dataset[75]["tokens"]` reads a single example, whereas
# `dataset["tokens"][75]` goes through the whole column to pick one element.
example = fewnerd_mountains["train"][75]["tokens"]
# The input is already split into words, so the tokenizer only adds special
# tokens and splits words into sub-word pieces.
tokenized_input = tokenizer(example, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

print(example, "\n")
print(tokenized_input, "\n")
print(tokens)

['Though', 'only', 'in', 'length', ',', 'The', 'Salamander', 'Glacier', 'is', 'about', 'wide', '.'] 

{'input_ids': [101, 2295, 2069, 1999, 3091, 1010, 1996, 16183, 23093, 4063, 10046, 2003, 2055, 2898, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 

['[CLS]', 'though', 'only', 'in', 'length', ',', 'the', 'sal', '##aman', '##der', 'glacier', 'is', 'about', 'wide', '.', '[SEP]']


<span style="font-size:16px">Токенизатор добавляет специальные токены ([CLS], [SEP]), а разбиение слов на подслова приводит к несоответствию между входными токенами и метками: одно слово, которому соответствует одна метка, может быть разбито на несколько подслов. Поэтому мы выравниваем метки по токенам (специальные токены получают метку -100, каждое подслово — метку своего слова) и удаляем лишние столбцы из новых наборов данных.</span>

In [None]:
# Label value that CrossEntropyLoss ignores, so it does not affect the gradient
IGNORE_INDEX = -100


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and expand word labels to tokens.

    Args:
        examples: Batch with ``"tokens"`` (list of word lists) and
            ``"mountain_tags"`` (list of per-word 0/1 label lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every token,
        the label is ``IGNORE_INDEX`` when the token belongs to no word
        (``word_ids`` gives ``None``), otherwise the label of the word the
        token was produced from; every sub-word piece of a word therefore
        carries that word's label.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["mountain_tags"]):
        # Maps each token position to the index of the word it came from
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        labels.append([
            IGNORE_INDEX if word_idx is None else label[word_idx]
            for word_idx in word_ids
        ])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# NOTE: this rebinds `fewnerd_mountains` to a version without the "tokens" and
# "mountain_tags" columns that tokenize_and_align_labels reads, so the
# statement cannot be re-run on its own result.
fewnerd_mountains = fewnerd_mountains.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "mountain_tags"],
)

Map:   0%|          | 0/131766 [00:00<?, ? examples/s]

Map:   0%|          | 0/18823 [00:00<?, ? examples/s]

Map:   0%|          | 0/37647 [00:00<?, ? examples/s]

<span style="font-size:16px">Пример обработанного набора данных:</span>

In [12]:
# Same training row (index 75) as in the tokenizer example, after tokenization and label alignment.
print(fewnerd_mountains["train"][75])

{'input_ids': [101, 2295, 2069, 1999, 3091, 1010, 1996, 16183, 23093, 4063, 10046, 2003, 2055, 2898, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, -100]}


<span style="font-size:16px">Настройка коллатора данных (data collator), который формирует батчи для обучения</span>

In [13]:
from transformers import DataCollatorForTokenClassification

# Collator built from the tokenizer; it is handed to the trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer)

### 3. Обучение модели

<span style="font-size:16px">Прежде чем приступить к обучению модели, создаем список меток и словари с идентификаторами меток и самими метками</span>

In [14]:
# Two-class label space: position 0 is the 'O' tag, position 1 is the name of
# the mountain tag.
label_list = [
    id2fine_tag[str(0)],
    id2fine_tag[str(MOUTAIN_TAG)],
]
print(label_list, "\n")

# Label id -> label name
id2label = dict(enumerate(label_list))
print(id2label, "\n")

# Label name -> label id (inverse of the mapping above)
label2id = {name: idx for idx, name in id2label.items()}
print(label2id)

['O', 'location-mountain'] 

{0: 'O', 1: 'location-mountain'} 

{'O': 0, 'location-mountain': 1}


<span style="font-size:16px">Загружаем модель DistilBERT с помощью AutoModelForTokenClassification, указав количество ожидаемых меток и их соответствие</span>

In [15]:
from transformers import AutoModelForTokenClassification

# The classification head size is derived from the label mapping so the two
# can never disagree if the label set changes.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<span style="font-size:16px">Создаем функцию, которая вычисляет метрики на основе прогнозов и меток, игнорируя метки для специальных токенов</span>

In [16]:
import numpy as np
import evaluate

# Metric object used by compute_metrics below (its `compute` method is called there).
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw predictions into seqeval precision / recall / F1 / accuracy.

    Args:
        p: Pair ``(predictions, labels)``; the class is taken as the argmax
            over axis 2 of ``predictions``.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled IGNORE_INDEX (special tokens) are not scored
            if gold_id == IGNORE_INDEX:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=results["overall_precision"],
        recall=results["overall_recall"],
        f1=results["overall_f1"],
        accuracy=results["overall_accuracy"],
    )

Downloading builder script: 0.00B [00:00, ?B/s]

<span style="font-size:16px">Из-за разницы в количестве нужных и обычных тегов в наборах данных следует использовать весовые коэффициенты классов в функции потерь. Для этого нам нужно доработать класс Trainer.</span>

In [17]:
import torch
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over token labels.

    Args:
        tag_weights: Sequence of per-class loss weights, indexed by label id.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, tag_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tag_weights = tag_weights

    def compute_loss(self, model, inputs, num_items_in_batch = None, return_outputs=False):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch mapping that contains ``"labels"``.
            num_items_in_batch: Accepted for compatibility with the Trainer
                call; not used here.
            return_outputs: When true, return ``(loss, outputs)``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Build the class weights on the same device and with the same dtype as
        # the logits: CUDA being available does not mean the model runs on it,
        # and a mismatch would make the loss call fail.
        weight = torch.tensor(self.tag_weights, dtype=logits.dtype, device=logits.device)
        loss_fun = torch.nn.CrossEntropyLoss(weight)
        # Take the class count from the logits themselves so the code also
        # works when `model` is a wrapper without a `.config` attribute.
        loss = loss_fun(logits.view(-1, logits.size(-1)), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

<span style="font-size:16px">Задаем параметры для обучения модели с помощью экземпляра CustomTrainer, обучаем модель и оцениваем ее на тестовом наборе данных</span>

In [18]:
from transformers import TrainingArguments

# Loss weights indexed by label id: [weight of 'O' (id 0), weight of the
# mountain tag (id 1)].
tag_weights = [0.1, 1]

training_args = TrainingArguments(
    output_dir="../train_output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Reload the checkpoint with the best validation F1 once training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    report_to="wandb",
)

trainer = CustomTrainer(
    model=model,
    tag_weights=tag_weights,
    args=training_args,
    train_dataset=fewnerd_mountains["train"],
    eval_dataset=fewnerd_mountains["val"],
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# Final evaluation on the held-out test split
trainer.evaluate(fewnerd_mountains["test"])

  super().__init__(*args, **kwargs)
[34m[1mwandb[0m: Loading settings from /root/.config/wandb/settings
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for http://wandb:8080 from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33m-skorinaka[0m to [32mhttp://wandb:8080[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0162,0.018953,0.463736,0.574932,0.513382,0.998539
2,0.0111,0.023716,0.503632,0.566757,0.533333,0.998656
3,0.0056,0.024027,0.561364,0.673025,0.612144,0.998711
4,0.0031,0.047369,0.662338,0.555858,0.604444,0.998838
5,0.0014,0.044753,0.612987,0.643052,0.62766,0.998698
6,0.0009,0.04692,0.655882,0.607629,0.630835,0.998783
7,0.0006,0.083447,0.719231,0.509537,0.596491,0.998726
8,0.0002,0.080655,0.699324,0.564033,0.624434,0.998862
9,0.0005,0.064416,0.72449,0.580381,0.644478,0.998862
10,0.0002,0.053328,0.695238,0.59673,0.642229,0.998816






{'eval_loss': 0.06592143326997757,
 'eval_precision': 0.6784565916398714,
 'eval_recall': 0.6196769456681351,
 'eval_f1': 0.6477359938603223,
 'eval_accuracy': 0.998946312694164,
 'eval_runtime': 65.2888,
 'eval_samples_per_second': 576.622,
 'eval_steps_per_second': 36.04,
 'epoch': 15.0}

<span style="font-size:16px">Сохраняем модель и токенизатор в директории</span>

In [19]:
# Directory that receives both the model and the tokenizer files; the
# verification section loads from the same path.
save_dir = "../models/fewnerd-mountains-model"

trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

('../models/fewnerd-mountains-model/tokenizer_config.json',
 '../models/fewnerd-mountains-model/special_tokens_map.json',
 '../models/fewnerd-mountains-model/vocab.txt',
 '../models/fewnerd-mountains-model/added_tokens.json',
 '../models/fewnerd-mountains-model/tokenizer.json')

### 4. Выводы

<span style="font-size:16px">Метрики модели для распознавания сущностей в тексте аналогичны метрикам BERT для всех категорий именованных сущностей в supervised-части набора данных [Few-NERD](http://ningding97.github.io/fewnerd). Таким образом, изменяя параметры обучения, мы можем немного повысить производительность модели, но более значительного улучшения можно добиться, только заменив базовую модель DistilBERT на другую языковую модель, например RoBERTa или XLNet.</span>

### 5. Проверка

<span style="font-size:16px">Загружаем модель и токенизатор по указанному пути и определяем функцию, которая помечает каждое слово в тексте нужным тегом или тегом “O”</span>

In [20]:
# Relies on torch, AutoModelForTokenClassification and AutoTokenizer having
# been imported by earlier cells of this notebook.

# Same directory the model and tokenizer were saved to after training.
model_path = "../models/fewnerd-mountains-model"

# Rebinds `model` and `tokenizer` to the versions loaded from disk.
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def get_word_tag_list(text):
    """Return ``(word, tag)`` pairs for ``text`` using the loaded model and tokenizer.

    A word's tag is the tag predicted for its first token. The word itself is
    cut out of ``text`` using the character span reported by the tokenizer.
    """
    encoding = tokenizer(text, return_tensors="pt", truncation=True)

    # Predicted tag name for every token of the (single) input sequence
    with torch.no_grad():
        logits = model(**encoding).logits
    token_tag_ids = torch.argmax(logits, dim=2)[0]
    token_tags = [model.config.id2label[tag_id.item()] for tag_id in token_tag_ids]

    # Position of the first token of each word, in word order; tokens that
    # belong to no word (word id None) are skipped.
    first_token_of_word = []
    for token_pos, word_id in enumerate(encoding.word_ids()):
        if word_id is None:
            continue
        if word_id >= len(first_token_of_word):
            first_token_of_word.append(token_pos)

    # Pair each word's text with the tag of its first token
    word_tag_list = []
    for word_id, token_pos in enumerate(first_token_of_word):
        span = encoding.word_to_chars(word_id)
        word_tag_list.append((text[span.start:span.end], token_tags[token_pos]))

    return word_tag_list

<span style="font-size:16px">Функция, выводящая результат</span>

In [None]:
def print_word_tag_list(text):
    """Print one ``word : tag`` line for every word that get_word_tag_list returns."""
    for word, tag in get_word_tag_list(text):
        print(f"{word} : {tag}")

<span style="font-size:16px">Теперь можно проверить работу модели на примерах текста:</span>

In [None]:
# Negative control: the sentence mentions no mountain.
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
print_word_tag_list(text)

The : O
Golden : O
State : O
Warriors : O
are : O
an : O
American : O
professional : O
basketball : O
team : O
based : O
in : O
San : O
Francisco : O
. : O


In [22]:
# Positive example: two sentences that mention Mont Blanc as a mountain.
text = """ 
The Mont Blanc massif is popular for outdoor activities like hiking, climbing, trail running and winter sports like skiing, and snowboarding.
The most popular climbing route to the summit of Mont Blanc is the Goûter Route, which typically takes two days.
"""
print_word_tag_list(text)

The : O
Mont : location-mountain
Blanc : location-mountain
massif : O
is : O
popular : O
for : O
outdoor : O
activities : O
like : O
hiking : O
, : O
climbing : O
, : O
trail : O
running : O
and : O
winter : O
sports : O
like : O
skiing : O
, : O
and : O
snowboarding : O
. : O
The : O
most : O
popular : O
climbing : O
route : O
to : O
the : O
summit : O
of : O
Mont : location-mountain
Blanc : location-mountain
is : O
the : O
Goûter : O
Route : O
, : O
which : O
typically : O
takes : O
two : O
days : O
. : O


In [23]:
# Context check: here "Mont Blanc" is the name of a cafe, not a mountain.
text = "Mont Blanc is a beautiful rooftop cafe."
print_word_tag_list(text)

Mont : O
Blanc : O
is : O
a : O
beautiful : O
rooftop : O
cafe : O
. : O
