# Showus: NER Training (RoBERTa)

In [1]:
! pip install /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
! pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
! pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
! pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
! pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Processing /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.8.7
    Uninstalling fsspec-0.8.7:
      Successfully uninstalled fsspec-0.8.7
Successfully installed fsspec-2021.4.0
Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.59.0
    Uninstalling tqdm-4.59.0:
      Successf

In [2]:
# Standard library.
import sys
from functools import partial

# Hugging Face libraries (transformers / datasets), installed from the local
# wheels in the first cell.
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_metric
from transformers import TrainingArguments, Trainer
import datasets

# The showus helpers are loaded from /kaggle/input/showus-package, so that
# directory has to be on sys.path before the `from showus import ...` lines run.
sys.path.append('/kaggle/input/showus-package')
from showus import load_ner_datasets, get_ner_classlabel
from showus import tokenize_and_align_labels, create_tokenizer
from showus import compute_metrics

In [3]:
# --- Configuration ----------------------------------------------------------
model_checkpoint = 'roberta-base'
bs = 8  # per-device batch size, shared by training and evaluation

# --- Labels, tokenizer, collator, model ---------------------------------------
classlabel = get_ner_classlabel()

tokenizer = create_tokenizer(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=classlabel.num_classes,
)
# Keep the embedding matrix in step with the tokenizer's vocabulary size
# (the load log reports that special tokens were added to the vocabulary).
model.resize_token_embeddings(len(tokenizer))

metric = load_metric('seqeval')

# --- Pre-tokenized datasets ---------------------------------------------------
tokenized_datasets = datasets.load_from_disk(
    '../input/showusdata-ner-datasets-roberta/datasetdict_roberta-base'
)
train_split = tokenized_datasets['train']
valid_split = tokenized_datasets['valid']

# compute_metrics is bound to the seqeval metric, the label names and the
# validation split's word_ids so the Trainer can call it with predictions only.
word_ids = valid_split['word_ids']
compute_metrics_ = partial(
    compute_metrics,
    metric=metric,
    label_list=classlabel.names,
    word_ids=word_ids,
)

# --- Trainer ------------------------------------------------------------------
# Evaluate and checkpoint once per epoch; at most 6 checkpoints are kept on disk.
args = TrainingArguments(
    output_dir=f'training_results_{model_checkpoint}',
    num_train_epochs=9,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy='epoch',
    logging_steps=4,
    report_to='none',
    save_strategy='epoch',
    save_total_limit=6,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=valid_split,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961.0, style=ProgressStyle(description…




In [4]:
# Continue training from a checkpoint saved by an earlier session instead of
# starting over; call trainer.train() with no argument to train from scratch.
# NOTE(review): step 63645 looks like the end of epoch 5 (the run below logs
# 114561 steps over 9 epochs and reports epochs 6-9) - confirm.
resume_checkpoint = '../input/showusdata-roberta-base-ner/training_results_roberta-base/checkpoint-63645'
trainer.train(resume_from_checkpoint=resume_checkpoint)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
6,0.0006,0.001035,0.947745,0.977122,0.962209,0.999811,165.8457,33.411
7,0.0009,0.001044,0.952844,0.976753,0.96465,0.999788,166.3964,33.3
8,0.0,0.001016,0.956616,0.976384,0.966399,0.999807,166.6213,33.255
9,0.0,0.001034,0.957523,0.98155,0.969388,0.999818,166.1433,33.351


TrainOutput(global_step=114561, training_loss=0.0002743813999457143, metrics={'train_runtime': 29812.7759, 'train_samples_per_second': 3.843, 'total_flos': 3.451104776025618e+17, 'epoch': 9.0, 'init_mem_cpu_alloc_delta': 1616920576, 'init_mem_gpu_alloc_delta': 497527808, 'init_mem_cpu_peaked_delta': 380284928, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1879769088, 'train_mem_gpu_alloc_delta': 2017684480, 'train_mem_cpu_peaked_delta': 462897152, 'train_mem_gpu_peaked_delta': 6504682496})