# Setup

In [1]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip3 install seqeval
!pip3 install transformers[torch]
!pip3 install accelerate -U

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

# Running the Roberta Model

Let's try annotating some Indian language sentences and get the named entities

In [2]:
# Import all the necessary classes and initialize the tokenizer and model.
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# The tokenizer and the token-classification model come from the same Hub checkpoint.
ner_checkpoint = "Yaxin/xlm-roberta-base-conll2003-ner"

tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [3]:
def get_predictions( sentence, tokenizer, model ):
  """Return one predicted NER label per word of ``sentence``.

  Args:
    sentence: raw text to annotate.
    tokenizer: callable returning an encoding that can be unpacked into the
      model call and that exposes ``word_ids()``.
    model: token-classification model whose output has ``.logits`` and whose
      ``config.id2label`` maps class ids to label strings.

  Returns:
    A list of label strings, one per word reported by ``word_ids()``. Each
    word takes the label predicted for its first sub-word; special tokens
    (word id ``None``) and continuation sub-words are skipped.
  """
  # Tokenize the sentence - this splits words into sub-words
  tok_sentence = tokenizer(sentence, return_tensors='pt')

  # Keep the inputs on the same device as the model (e.g. after model.to("cuda")).
  device = getattr(model, 'device', None)
  if device is not None and hasattr(tok_sentence, 'to'):
    tok_sentence = tok_sentence.to(device)

  with torch.no_grad():
    # Highest-scoring class id for every sub-word token
    class_ids = model(**tok_sentence).logits.argmax(-1)

  # Map each predicted class id to its label string
  token_labels = [model.config.id2label[t.item()] for t in class_ids[0]]

  predicted_labels = []
  # Start from None (not 0): otherwise the first word would be dropped whenever
  # the tokenizer does not emit a leading special token.
  previous_word_id = None
  for token_index, word_id in enumerate(tok_sentence.word_ids()):
    if word_id is not None and word_id != previous_word_id:
      predicted_labels.append(token_labels[token_index])
    previous_word_id = word_id

  return predicted_labels

In [4]:
# let us try with some example sentences here
sentence = 'लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं'

predicted_labels = get_predictions(sentence=sentence,
                                   tokenizer=tokenizer,
                                   model=model
                                   )

# Pair every space-separated word with its predicted label. Splitting once and
# zipping avoids re-splitting the sentence on each iteration and cannot index
# past the end of the label list.
for word, label in zip(sentence.split(' '), predicted_labels):
  print( word + '\t' + label )

लगातार	O
हमलावर	O
हो	O
रहे	O
शिवपाल	B-PER
और	O
राजभर	O
को	O
सपा	B-ORG
की	O
दो	O
टूक,	O
चिट्ठी	O
जारी	O
कर	O
कहा-	O
जहां	O
जाना	O
चाहें	O
जा	O
सकते	O
हैं	O


In [5]:
sentence = 'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು'

predicted_labels = get_predictions(sentence=sentence,
                                   tokenizer=tokenizer,
                                   model=model
                                   )

# Pair every space-separated word with its predicted label. Splitting once and
# zipping avoids re-splitting the sentence on each iteration and cannot index
# past the end of the label list.
for word, label in zip(sentence.split(' '), predicted_labels):
  print( word + '\t' + label )

ಶರಣ್	B-PER
ರ	O
ನೀವು	O
ನೋಡಲೇಬೇಕಾದ	O
ಟಾಪ್	O
5	O
ಕಾಮಿಡಿ	O
ಚಲನಚಿತ್ರಗಳು	O


# WikiANN Dataset

In [6]:
# Download the 'or' configuration of the WikiANN NER dataset
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang = 'or'
raw_datasets = load_dataset(path='wikiann', name=lang)

Downloading readme:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [7]:
# let's now print how the Dataset looks like
raw_datasets

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 100
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 100
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 100
    })
})

In [8]:
raw_datasets.column_names

{'validation': ['tokens', 'ner_tags', 'langs', 'spans'],
 'test': ['tokens', 'ner_tags', 'langs', 'spans'],
 'train': ['tokens', 'ner_tags', 'langs', 'spans']}

In [9]:
# Show one training example as "token<TAB>tag id", one pair per line
idx = 99
rec = raw_datasets['train'][idx]
for w, t in zip(rec['tokens'], rec['ner_tags']):
  print(w, t, sep='\t')


ମଝିଘରିଆଣୀ	3
ମନ୍ଦିରର	4
ମ୍ୟାନେଜିଂ	0
ଟ୍ରଷ୍ଟି	0
,	0


In [10]:
# Column names and feature schema of the training split
train_split = raw_datasets["train"]
column_names, features = train_split.column_names, train_split.features

print(column_names)
print(features)

['tokens', 'ner_tags', 'langs', 'spans']
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


In [11]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [12]:
# The tag column is a sequence of ClassLabel values, so the label names and
# their integer ids can be read straight from the dataset features.
label_feature = features[label_column_name].feature
label_list = label_feature.names
label_to_id = {name: label_feature.str2int(name) for name in label_list}

print(label_to_id)

num_labels = len(label_list)


{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


# Training an NER Model with the dataset

We have already seen how to get predictions from a fine-tuned NER model. We will now take the `Yaxin/xlm-roberta-base-conll2003-ner` checkpoint and fine-tune it for the NER task on this dataset.

Let us download a pre-trained model and fine-tune it for the task of NER. We will have to use the `AutoModelForTokenClassification` class to fine-tune the model

**Load Pre-trained Model**

In [13]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

# Reuse the checkpoint's weights but size the classification head for this
# dataset's label set. The label names are stored in the config so that the
# fine-tuned model reports real tag names instead of generic LABEL_<n> ids.
checkpoint = 'Yaxin/xlm-roberta-base-conll2003-ner'
config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id=label_to_id,
    finetuning_task='ner',
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# `ignore_mismatched_sizes` only applies when loading model weights: it lets the
# classifier be re-initialised when its shape differs from the checkpoint's.
# The config built above is passed in explicitly so that it is actually used.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    config=config,
    ignore_mismatched_sizes=True,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Yaxin/xlm-roberta-base-conll2003-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Move the model to the GPU when one is available, otherwise keep it on the CPU.
# For GPU training make sure that the Colab runtime is set accordingly.

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

**Tokenize all texts and align the labels with them**

In [14]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-words.

    Reads the module-level ``tokenizer``, ``text_column_name``,
    ``label_column_name`` and ``padding``.

    Args:
        examples: batch mapping column names to lists; the text column holds
            lists of words and the label column holds one label id per word.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: for every
        sequence, the word's label on the first sub-word of each word and
        -100 on every other position.
    """
    encodings = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # The inputs are already lists of words (one label per word).
        is_split_into_words=True,
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples[label_column_name]):
        sequence_labels = []
        last_word = None
        for word in encodings.word_ids(batch_index=batch_idx):
            # Only the first sub-word of a word carries that word's label.
            # Positions with no word id and continuation sub-words get -100,
            # the value compute_metrics later filters out.
            if word is not None and word != last_word:
                sequence_labels.append(word_labels[word])
            else:
                sequence_labels.append(-100)
            last_word = word
        aligned_labels.append(sequence_labels)

    encodings["labels"] = aligned_labels
    return encodings

In [15]:
# Tokenize the training split in batches and attach the aligned labels
train_dataset = raw_datasets["train"].map(
    tokenize_and_align_labels,
    desc="Running tokenizer on train dataset",
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
)

  self.pid = os.fork()


Running tokenizer on train dataset (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [16]:
# Tokenize the validation split in batches and attach the aligned labels
eval_dataset = raw_datasets["validation"].map(
    tokenize_and_align_labels,
    desc="Running tokenizer on Validation dataset",
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
)

Running tokenizer on Validation dataset (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

**Create Data Collator, Metrics**

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [18]:
# Metrics
# Opt in to the metric's loading script explicitly; this is what the warning
# printed by `load_metric` asks for.
metric = load_metric("seqeval", trust_remote_code=True)

def compute_metrics(p):
    """Turn raw model scores into a flat dictionary of metric values.

    Reads the module-level ``metric`` and ``label_list``.

    Args:
        p: pair ``(predictions, labels)``; the predictions are arg-maxed over
            axis 2 and the labels use -100 for positions to ignore.

    Returns:
        The result of ``metric.compute`` with nested dictionaries flattened
        into ``"<outer>_<inner>"`` keys.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Keep only positions whose gold label is not the ignore index
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Flatten one level of nesting: {"A": {"b": 1}} -> {"A_b": 1}
    final_results = {}
    for key, value in results.items():
        if not isinstance(value, dict):
            final_results[key] = value
            continue
        for n, v in value.items():
            final_results[f"{key}_{n}"] = v
    return final_results

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

**Set Training Arguments**

In [None]:
# args=TrainingArguments(output_dir='output_dir',max_steps=5)
# Only the output directory and the per-device batch sizes are set explicitly.
args = TrainingArguments(
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    output_dir='output_dir',
)




**Training**

In [None]:
# Build the Trainer from the objects prepared in the cells above.
# Optional early stopping (left disabled):
# early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)
# args.metric_for_best_model = "f1"
# args.load_best_model_at_end = True
# args.evaluation_strategy = IntervalStrategy.STEPS
# args.eval_steps = args.save_steps
# args.greater_is_better = True

trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    # callbacks=[early_stopping_callback],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_la

In [None]:
train_result = trainer.train()
metrics = train_result.metrics

Step,Training Loss


In [None]:

metrics = trainer.evaluate()

trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_LOC_f1             =     0.2771
  eval_LOC_number         =         32
  eval_LOC_precision      =     0.1716
  eval_LOC_recall         =     0.7188
  eval_ORG_f1             =        0.2
  eval_ORG_number         =         27
  eval_ORG_precision      =     0.2174
  eval_ORG_recall         =     0.1852
  eval_PER_f1             =     0.2514
  eval_PER_number         =         86
  eval_PER_precision      =     0.2371
  eval_PER_recall         =     0.2674
  eval_loss               =     0.9673
  eval_overall_accuracy   =      0.712
  eval_overall_f1         =     0.2556
  eval_overall_precision  =     0.2008
  eval_overall_recall     =     0.3517
  eval_runtime            = 0:00:02.89
  eval_samples_per_second =     34.496
  eval_steps_per_second   =      4.485


# Multilingual Fine-Tuning

We now present a short tutorial to fine-tune the model on the combined data of all Indic languages

The _Naampadam_ Dataset is a large dataset for Named Entity Recognition in 11 Indian languages.  _Naampadam_ means "named entity" in Sanskrit. Note that the cells below load the `wikiann` dataset for the selected languages.

In [19]:
# Download the WikiANN NER data for every language we want to train on
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

# languages=['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']

# For demo purpose we will only choose 'Assamese' and 'Odiya' datasets
languages=['as', 'or']

# No access token is passed here: credentials must never be hard-coded in a
# notebook, and this dataset is loaded without authentication earlier in the
# notebook. If authentication is ever needed, provide the token through the
# environment / Colab secrets instead.
# NOTE(review): the token that used to be committed in this cell should be revoked.
raw_datasets = {}

for lang in languages:
  raw_datasets[lang] = load_dataset('wikiann', lang,
                            download_mode=DownloadMode.FORCE_REDOWNLOAD)




Downloading readme:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.68k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

We now concatenate all the datasets so that we could fine-tune a multilingual NER model

In [20]:
# Let's look at how raw_dataset looks like
raw_datasets

{'as': DatasetDict({
     validation: Dataset({
         features: ['tokens', 'ner_tags', 'langs', 'spans'],
         num_rows: 100
     })
     test: Dataset({
         features: ['tokens', 'ner_tags', 'langs', 'spans'],
         num_rows: 100
     })
     train: Dataset({
         features: ['tokens', 'ner_tags', 'langs', 'spans'],
         num_rows: 100
     })
 }),
 'or': DatasetDict({
     validation: Dataset({
         features: ['tokens', 'ner_tags', 'langs', 'spans'],
         num_rows: 100
     })
     test: Dataset({
         features: ['tokens', 'ner_tags', 'langs', 'spans'],
         num_rows: 100
     })
     train: Dataset({
         features: ['tokens', 'ner_tags', 'langs', 'spans'],
         num_rows: 100
     })
 })}

In [21]:
# Label names and their integer ids are read from the tag feature of the
# Assamese training split.
label_column_name = 'ner_tags'

ner_feature = raw_datasets['as']["train"].features[label_column_name].feature
label_list = ner_feature.names
label_to_id = {name: ner_feature.str2int(name) for name in label_list}

print(label_to_id)

num_labels = len(label_list)


{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


To concatenate dataset let's combine all `train` and `validation` splits of all languages together

In [22]:
# One entry per language, in the order the languages were loaded
pre_concatenated_train_split = [raw_datasets[lang]['train'] for lang in raw_datasets]

pre_concatenated_validation_split = [raw_datasets[lang]['validation'] for lang in raw_datasets]

Let us concatenate the dataset now

In [23]:
from datasets import concatenate_datasets, DatasetDict

# Merge the per-language splits into a single train / validation DatasetDict
concatenated_dataset = DatasetDict(
    {
        "train": concatenate_datasets(pre_concatenated_train_split),
        "validation": concatenate_datasets(pre_concatenated_validation_split),
    }
)

### Let us now load the Pre-trained model

In [24]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np

# Reuse the checkpoint's weights but size the classification head for this
# dataset's label set. The label names are stored in the config so that the
# fine-tuned model reports real tag names instead of generic LABEL_<n> ids.
checkpoint = 'Yaxin/xlm-roberta-base-conll2003-ner'
config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id=label_to_id,
    finetuning_task='ner',
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# `ignore_mismatched_sizes` only applies when loading model weights: it lets the
# classifier be re-initialised when its shape differs from the checkpoint's.
# The config built above is passed in explicitly so that it is actually used.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    config=config,
    ignore_mismatched_sizes=True,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Yaxin/xlm-roberta-base-conll2003-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Move the model to the GPU when one is available, otherwise keep it on the CPU.
# For GPU training make sure that the Colab runtime is set accordingly.

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

Tokenize all the datasets and align them

In [25]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
text_column_name = 'tokens'
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and produce per-token label ids.

    Uses the module-level ``tokenizer``, ``text_column_name``,
    ``label_column_name`` and ``padding``.

    Args:
        examples: batch mapping column names to lists; the text column holds
            lists of words and the label column one label id per word.

    Returns:
        The tokenizer output extended with ``"labels"``: the word's label on
        the first sub-word of every word, -100 everywhere else.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # Sentences arrive as lists of words, each with its own label.
        is_split_into_words=True,
    )

    def _align(word_ids, word_labels):
        # First sub-word of a word -> that word's label; positions without a
        # word id and the remaining sub-words -> -100.
        aligned = []
        previous = None
        for current in word_ids:
            starts_new_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_new_word else -100)
            previous = current
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples[label_column_name])
    ]
    return tokenized_inputs

Let us now `tokenize` the `train` and `validation` splits

In [27]:
# Tokenize the concatenated training split.
# 4 worker processes, matching the other tokenization cells: spawning 32
# processes for a few hundred examples only adds start-up overhead.
train_dataset = concatenated_dataset["train"]
train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

  self.pid = os.fork()


Running tokenizer on train dataset (num_proc=32):   0%|          | 0/200 [00:00<?, ? examples/s]

In [28]:
# Tokenize the concatenated validation split in batches
validation_dataset = concatenated_dataset["validation"].map(
    tokenize_and_align_labels,
    desc="Running tokenizer on validation dataset",
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
)

Running tokenizer on validation dataset (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

## Create DataCollator and Metrics

In [29]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [30]:
# Metrics
# Opt in to the metric's loading script explicitly; this is what the warning
# printed by `load_metric` asks for.
metric = load_metric("seqeval", trust_remote_code=True)

def compute_metrics(p):
    """Score predictions with the module-level ``metric``.

    Args:
        p: pair ``(predictions, labels)``. The predictions are reduced with
            an arg-max over axis 2; label positions equal to -100 are ignored.

    Returns:
        A flat dictionary: scalar results keep their key, nested dictionaries
        are expanded into ``"<outer>_<inner>"`` entries.
    """
    raw_scores, references = p
    best_ids = np.argmax(raw_scores, axis=2)

    true_predictions, true_labels = [], []
    for guess_row, gold_row in zip(best_ids, references):
        guess_names, gold_names = [], []
        for guess, gold in zip(guess_row, gold_row):
            # -100 marks positions that must not be scored
            if gold == -100:
                continue
            guess_names.append(label_list[guess])
            gold_names.append(label_list[gold])
        true_predictions.append(guess_names)
        true_labels.append(gold_names)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Expand nested result dictionaries into prefixed top-level keys
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


## Set Training Arguments

In [31]:
# A single epoch; batch size 8 per device for both training and evaluation.
args = TrainingArguments(
    num_train_epochs=1,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    output_dir='output_dir',
)

# !pip3 install transformers[torch]
# !pip3 install accelerate -U

In [32]:
# Build the Trainer for the multilingual run from the objects prepared above
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Train the model

In [33]:
# Train the model
train_result = trainer.train()
metrics = train_result.metrics

Step,Training Loss


## Evaluate the Trained Model

Let us now evaluate the trained model on the test sets of all languages

We need to first tokenize the test sets

In [34]:
# Tokenize the test split of every language separately so that each language
# can be scored on its own.
tokenized_test_set = {}

for lang in raw_datasets:
  tokenized_test_set[lang] = raw_datasets[lang]['test'].map(
      tokenize_and_align_labels,
      batched=True,
      # 4 workers, matching the other tokenization cells; 32 processes for a
      # split of this size only adds start-up overhead.
      num_proc=4,
      load_from_cache_file=True,
      desc="Running tokenizer on test dataset of language {0}".format(lang),
  )

  self.pid = os.fork()


Running tokenizer on test dataset of language as (num_proc=32):   0%|          | 0/100 [00:00<?, ? examples/s]

Running tokenizer on test dataset of language or (num_proc=32):   0%|          | 0/100 [00:00<?, ? examples/s]

Run prediction on the test set of each language separately and extract the overall `Precision`, `Recall` and `F-Score` for each

In [35]:
# Per-language summary: language code -> {'Precision', 'Recall', 'F1'}
final_metrics = {}

# (substring looked for in the metric key, name used in the summary)
summary_names = (
    ('overall_precision', 'Precision'),
    ('overall_recall', 'Recall'),
    ('overall_f1', 'F1'),
)

for lang in tokenized_test_set:
  predictions, labels, metrics = trainer.predict(tokenized_test_set[lang], metric_key_prefix=lang)

  lang_specific_results = {}
  for key in metrics:
    # The first matching substring wins
    for fragment, display_name in summary_names:
      if fragment in key:
        lang_specific_results[display_name] = metrics[key]
        break
  final_metrics[lang] = lang_specific_results

Print the individual results for each language

In [38]:
# Per-language test-set scores collected in `final_metrics`; without this the
# results computed in the previous cell are never shown.
for lang_code, scores in final_metrics.items():
  print(lang_code + '\t' + '\t'.join('{}={:.4f}'.format(name, value) for name, value in scores.items()))

# Scores on the trainer's evaluation dataset, for reference
metrics = trainer.evaluate()

trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        1.0
  eval_LOC_f1             =     0.1325
  eval_LOC_number         =         76
  eval_LOC_precision      =     0.1333
  eval_LOC_recall         =     0.1316
  eval_ORG_f1             =     0.1125
  eval_ORG_number         =         62
  eval_ORG_precision      =     0.0918
  eval_ORG_recall         =     0.1452
  eval_PER_f1             =     0.6583
  eval_PER_number         =        118
  eval_PER_precision      =     0.5224
  eval_PER_recall         =     0.8898
  eval_loss               =     1.0396
  eval_overall_accuracy   =     0.6669
  eval_overall_f1         =     0.3937
  eval_overall_precision  =     0.3316
  eval_overall_recall     =     0.4844
  eval_runtime            = 0:00:05.92
  eval_samples_per_second =     33.745
  eval_steps_per_second   =      4.218


# Misc

In [None]:
torch.__version__

In [None]:
!nvidia-smi

In [None]:
# Utility cell: try to free GPU memory held by this kernel.
import torch
import gc
# Uncomment the objects you want to drop before collecting:
# del trainer
# del model
# del train_dataset
# del eval_dataset
# del data_collator
# del tokenizer
# Run Python garbage collection, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# NOTE(review): this closes and re-selects device 0 through numba, presumably as
# a hard reset of the GPU context. Confirm that PyTorch is still usable in this
# kernel afterwards; a kernel restart may be required.
from numba import cuda
cuda.select_device(0)
cuda.close()
cuda.select_device(0)