### Importing Library

In [1]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import json
from datasets import Dataset
import random
import json
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_metric

In [3]:
random.seed(1)

In [4]:
# Selecting the device
device = torch.device("cuda:1")
print(device)

cuda:1


### Model Loading

In [7]:
# IndicNER
num_labels = 9
text_name = "words"
label_name = "ner"

config = AutoConfig.from_pretrained('ai4bharat/indicNER', num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indicNER")
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indicNER', num_labels=num_labels )
model=model.to(device)

RuntimeError: Error(s) in loading state_dict for BertForTokenClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([7, 768]) from checkpoint, the shape in current model is torch.Size([9, 768]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([7]) from checkpoint, the shape in current model is torch.Size([9]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

The above code explains that we cannot modify the number of labels for IndicNER, as it is already fine-tuned with NER task, and does not add any additional layers.

In [5]:
# IndicNER
num_labels = 7
text_name = "words"
label_name = "ner"

config = AutoConfig.from_pretrained('ai4bharat/indicNER')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indicNER")
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indicNER' )
model=model.to(device)

### Dataset Preparation

In [6]:
label_to_id_dict = {
    'O' : 0,
    'B-PER' : 1,
    'I-PER' : 2,
    'B-ORG' : 3,
    'I-ORG' : 4,
    'B-LOC' : 5,
    'I-LOC' : 6,
    'B-MISC': 7, # Adding the extra two label will help to use the compute metric of the trainer
    'I-MISC' : 8 
}

In [7]:
id_2_label = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
def make_dataset( file_path, text_name = text_name, label_name = label_name, dataset_type = None, 
                 num_samples_to_take = None ):
    
    data_dir = { text_name : [ ], label_name : [ ] }
    with open(file_path, 'r') as f:
        for line in f:
            data_dir[text_name].append(json.loads(line)[text_name])
            data_dir[label_name].append( [ label_to_id_dict[i] for i in ( json.loads(line)[label_name] ) ] )
    if dataset_type == 'train':
        # For sampling the initial 20000 samples
        data_dir[text_name] = data_dir[text_name][0:num_samples_to_take]
        data_dir[label_name] = data_dir[label_name][ 0:num_samples_to_take]
    data = Dataset.from_dict(data_dir)
    return data

In [9]:
train_data = make_dataset("./bn_IndicNER_v1.0/bn_train.json", dataset_type = "train", num_samples_to_take=20000)

In [10]:
val_data = make_dataset("./bn_IndicNER_v1.0/bn_val.json", num_samples_to_take=100)
test_data = make_dataset("./bn_IndicNER_v1.0/bn_test.json", num_samples_to_take=100)

In [11]:
train_data

Dataset({
    features: ['words', 'ner'],
    num_rows: 20000
})

In [12]:
val_data

Dataset({
    features: ['words', 'ner'],
    num_rows: 4859
})

In [13]:
padding = "max_length"
def tokenize_and_align_labels(examples):
    """
    Tokenize the text and match the target label with the text token
    Input: A batch of sentence
        A dict:
            'words': A list of sentences denoted as a list of words
                        Desired Input to the model
            'ner': A list of desired labels
    Output: Tokenized Sentence
        A dict: dict:
            'words': A list of sentences denoted as a list of words
                        Desired Input to the model
            'ner': A list of desired labels
            'input_ids': Id of the tokenized words
                            A list of sentences denoted as 
                            a list of Id s of the consitituted word tokens
                            # Input to the Transformer model
            'attention_masks': A binary list
                                1 indicates the text token, model will attend to
                                0 indicates the padding token and special token, the model will not attend to.
            'labels': A list of numbers
                        Denting the target ground truth label
                        generated from the 'ner' field of Input
    """
    
    tokenized_inputs = tokenizer(
        examples[text_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[label_name]):
        
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [14]:
train_data = train_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on train dataset",
)

Running tokenizer on train dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [00:05<00:00, 3432.94 examples/s]


In [15]:
# Visualizing the toenized data
train_data

Dataset({
    features: ['words', 'ner', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [16]:
val_data = val_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on validation dataset",
)
test_data = test_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on test dataset",
)

Running tokenizer on validation dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4859/4859 [00:01<00:00, 3517.49 examples/s]
Running tokenizer on test dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 607/607 [00:00<00:00, 3602.10 examples/s]


In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)

### Metric

In [18]:
# Metrics
metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id_2_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_2_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    return final_results

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### Trainning

In [19]:
def training_model(batch_size, lr , model, train_data = train_data, val_data = val_data,
                  tokenizer = tokenizer, data_collator = data_collator, compute_metrics = compute_metrics):
                    
    
    
    # Setting the TrainingArguments
    args=TrainingArguments(
        output_dir='output_dir',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        learning_rate = lr,
        num_train_epochs = 3 # As asked in the question
        )
    
    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        args=args,
    )
    
    train_result = trainer.train()
    
    eval_metric = trainer.evaluate(val_data)
    
    return (trainer, train_result, eval_metric)
    

#### Hyper-parameter Search

We have tuned the batch size and learning-rate as two are the most important hyper-parameters. 
Batch Size: Larger Batch size might result in a smoother convergence during Gradient descent, with a huge cost of calculating the gradients. Whereas smaller batch size might result ina more haphazard convergence but each step of Gradient Descent is fast. Therefore a we need to fine tune the optimum.

Learning Rate: A Higher Learning Rate indicates larger step, which might lead the model to go in bad optima, and lower earning rate might not help the model to converge.

We use a random sapling approach and done only 3 experiments due to limited resources for Hyper-parameter search.


In [25]:
def hyper_param_search():
    
    model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indicNER' )
    model=model.to(device)
    
    tunable_hyper_param = {'batch_size' : [8, 16, 32],
                      'lr': [1e-2, 1e-3, 1e-4, 1e-5]}
    batch_size = tunable_hyper_param['batch_size'][random.randint(0,2)] 
                # Generating a random number between 0 and 2 (both included)
    lr = tunable_hyper_param['lr'][random.randint(0,3)]
                # Generating a random number between 0 and 3 (both included)
    
    print('Batch size Learning Rate')
    print(batch_size, lr)
    
    (trainer, train_result, eval_metric) = training_model(batch_size, lr, model)
    
    return  (trainer, train_result, eval_metric, batch_size, lr)

##### Hyper-parameter search Experiment 1

In [26]:
(trainer, train_result, eval_metric, batch_size, lr) = hyper_param_search()

Batch size Learning Rate
32 0.01




Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size, lr))
print('The Evaluation Metric on the validation Data: ')
eval_metric

For Batch Size = 32, and Learning Rate = 0.01 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.7981597781181335,
 'eval_LOC_precision': 0.0,
 'eval_LOC_recall': 0.0,
 'eval_LOC_f1': 0.0,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.0,
 'eval_ORG_recall': 0.0,
 'eval_ORG_f1': 0.0,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.0,
 'eval_PER_recall': 0.0,
 'eval_PER_f1': 0.0,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.0,
 'eval_overall_recall': 0.0,
 'eval_overall_f1': 0.0,
 'eval_overall_accuracy': 0.8187198114551364,
 'eval_runtime': 24.6876,
 'eval_samples_per_second': 196.82,
 'eval_steps_per_second': 6.157,
 'epoch': 3.0}

##### Hyper-parameter search experiment 2

In [28]:
(trainer_2, train_result_2, eval_metric_2, batch_size_2, lr_2) = hyper_param_search()

Batch size Learning Rate
32 0.01


Step,Training Loss


In [29]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_2, lr_2))
print('The Evaluation Metric on the validation Data: ')
eval_metric_2

For Batch Size = 32, and Learning Rate = 0.01 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.7981597781181335,
 'eval_LOC_precision': 0.0,
 'eval_LOC_recall': 0.0,
 'eval_LOC_f1': 0.0,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.0,
 'eval_ORG_recall': 0.0,
 'eval_ORG_f1': 0.0,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.0,
 'eval_PER_recall': 0.0,
 'eval_PER_f1': 0.0,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.0,
 'eval_overall_recall': 0.0,
 'eval_overall_f1': 0.0,
 'eval_overall_accuracy': 0.8187198114551364,
 'eval_runtime': 24.9677,
 'eval_samples_per_second': 194.611,
 'eval_steps_per_second': 6.088,
 'epoch': 3.0}

##### Hyper-parameter search Experiment 3

In [36]:
(trainer_3, train_result_3, eval_metric_3, batch_size_3, lr_3) = hyper_param_search()

Batch size Learning Rate
32 0.01




Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_3, lr_3))
print('The Evaluation Metric on the validation Data: ')
eval_metric_3

For Batch Size = 32, and Learning Rate = 0.01 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.7981597781181335,
 'eval_LOC_precision': 0.0,
 'eval_LOC_recall': 0.0,
 'eval_LOC_f1': 0.0,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.0,
 'eval_ORG_recall': 0.0,
 'eval_ORG_f1': 0.0,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.0,
 'eval_PER_recall': 0.0,
 'eval_PER_f1': 0.0,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.0,
 'eval_overall_recall': 0.0,
 'eval_overall_f1': 0.0,
 'eval_overall_accuracy': 0.8187198114551364,
 'eval_runtime': 25.1842,
 'eval_samples_per_second': 192.939,
 'eval_steps_per_second': 6.036,
 'epoch': 3.0}

In [20]:
def hyper_param_setting():
    
    model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indicNER' )
    model=model.to(device)
    
    batch_size = 8
    lr = 1e-5
    print('Batch size Learning Rate')
    print(batch_size, lr)
    
    (trainer, train_result, eval_metric) = training_model(batch_size, lr, model)
    
    return  (trainer, train_result, eval_metric, batch_size, lr)

In [21]:
(trainer_4, train_result_4, eval_metric_4, batch_size_4, lr_4) = hyper_param_setting()

Batch size Learning Rate
8 1e-05




Step,Training Loss
500,0.7127
1000,0.1564
1500,0.1393


Checkpoint destination directory output_dir/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_dir/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_dir/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [27]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_4, lr_4))
print('The Evaluation Metric on the validation Data: ')
eval_metric_4

For Batch Size = 8, and Learning Rate = 1e-05 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.19311270117759705,
 'eval_LOC_precision': 0.7546480743691899,
 'eval_LOC_recall': 0.8086090359302739,
 'eval_LOC_f1': 0.7806972351021809,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.6146688560481663,
 'eval_ORG_recall': 0.6413478012564249,
 'eval_ORG_f1': 0.6277249860257126,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.8009319181982915,
 'eval_PER_recall': 0.8366684694429422,
 'eval_PER_f1': 0.8184102631926993,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.745805561939784,
 'eval_overall_recall': 0.7857142857142857,
 'eval_overall_f1': 0.7652399481193256,
 'eval_overall_accuracy': 0.9458969478005258,
 'eval_runtime': 25.2989,
 'eval_samples_per_second': 192.063,
 'eval_steps_per_second': 6.008,
 'epoch': 3.0}

In [44]:
def hyper_param_setting_2():
    
    model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )
    model=model.to(device)
    
    batch_size = 16
    lr = 1e-6
    print('Batch size Learning Rate')
    print(batch_size, lr)
    
    (trainer, train_result, eval_metric) = training_model(batch_size, lr, model)
    
    return  (trainer, train_result, eval_metric, batch_size, lr)

In [45]:
(trainer_5, train_result_5, eval_metric_5, batch_size_5, lr_5) = hyper_param_setting_2()
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_5, lr_5))
print('The Evaluation Metric on the validation Data: ')
eval_metric_5

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size Learning Rate
16 1e-06




Step,Training Loss
500,0.9335


Checkpoint destination directory output_dir/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


For Batch Size = 16, and Learning Rate = 1e-06 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.7791699171066284,
 'eval_LOC_precision': 0.0,
 'eval_LOC_recall': 0.0,
 'eval_LOC_f1': 0.0,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.0,
 'eval_ORG_recall': 0.0,
 'eval_ORG_f1': 0.0,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.0,
 'eval_PER_recall': 0.0,
 'eval_PER_f1': 0.0,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.0,
 'eval_overall_recall': 0.0,
 'eval_overall_f1': 0.0,
 'eval_overall_accuracy': 0.8187198114551364,
 'eval_runtime': 17.4745,
 'eval_samples_per_second': 278.062,
 'eval_steps_per_second': 8.698,
 'epoch': 3.0}

In [23]:
trainer_4.model.save_pretrained("./IndicNER_trainer_4", from_pt = True)

In [47]:
trainer_5.model.save_pretrained("./IndicNER_trainer_5", from_pt = True)

### Calculating Macro-F1 for the best model

In [28]:
best_model = trainer_4

In [29]:
test_metric = best_model.evaluate(test_data)

In [30]:
best_model.model.save_pretrained("./IndicBERT_best", from_pt = True)

In [31]:
test_metric

{'eval_loss': 0.16460365056991577,
 'eval_LOC_precision': 0.7739938080495357,
 'eval_LOC_recall': 0.7552870090634441,
 'eval_LOC_f1': 0.7645259938837919,
 'eval_LOC_number': 331,
 'eval_ORG_precision': 0.7116279069767442,
 'eval_ORG_recall': 0.7391304347826086,
 'eval_ORG_f1': 0.7251184834123222,
 'eval_ORG_number': 207,
 'eval_PER_precision': 0.8315789473684211,
 'eval_PER_recall': 0.8605664488017429,
 'eval_PER_f1': 0.8458244111349036,
 'eval_PER_number': 459,
 'eval_overall_precision': 0.7877591312931885,
 'eval_overall_recall': 0.8004012036108324,
 'eval_overall_f1': 0.7940298507462685,
 'eval_overall_accuracy': 0.9548401774542146,
 'eval_runtime': 3.2695,
 'eval_samples_per_second': 185.657,
 'eval_steps_per_second': 5.811,
 'epoch': 3.0}

In [32]:
val_metric = best_model.evaluate(val_data)
val_metric

{'eval_loss': 0.19311270117759705,
 'eval_LOC_precision': 0.7546480743691899,
 'eval_LOC_recall': 0.8086090359302739,
 'eval_LOC_f1': 0.7806972351021809,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.6146688560481663,
 'eval_ORG_recall': 0.6413478012564249,
 'eval_ORG_f1': 0.6277249860257126,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.8009319181982915,
 'eval_PER_recall': 0.8366684694429422,
 'eval_PER_f1': 0.8184102631926993,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.745805561939784,
 'eval_overall_recall': 0.7857142857142857,
 'eval_overall_f1': 0.7652399481193256,
 'eval_overall_accuracy': 0.9458969478005258,
 'eval_runtime': 24.9836,
 'eval_samples_per_second': 194.487,
 'eval_steps_per_second': 6.084,
 'epoch': 3.0}

In [33]:
train_metric = best_model.evaluate(train_data)
train_metric

{'eval_loss': 0.11506272107362747,
 'eval_LOC_precision': 0.8161290322580645,
 'eval_LOC_recall': 0.8566591422121896,
 'eval_LOC_f1': 0.8359030837004405,
 'eval_LOC_number': 11518,
 'eval_ORG_precision': 0.7471780225758194,
 'eval_ORG_recall': 0.7658210203512684,
 'eval_ORG_f1': 0.7563846630412335,
 'eval_ORG_number': 7174,
 'eval_PER_precision': 0.8539766157008866,
 'eval_PER_recall': 0.8851967769860825,
 'eval_PER_f1': 0.8693064774547953,
 'eval_PER_number': 15017,
 'eval_overall_precision': 0.8184752492216287,
 'eval_overall_recall': 0.8500400486516954,
 'eval_overall_f1': 0.8339590791350155,
 'eval_overall_accuracy': 0.9645699033037872,
 'eval_runtime': 103.5324,
 'eval_samples_per_second': 193.176,
 'eval_steps_per_second': 6.037,
 'epoch': 3.0}

### Load the Manually Annotated Data and Calc the Macro-F1 score

In [34]:
def form_corpus_label_from_manual_anno(text):
    """
    Modify the annotated label, such that model can use it as Tags
    """
    text_1 = text.split(' ')
    new_text = []
    new_id = []
    for word in text_1:
        if word == '[' or word == ']' or word == ',' or word == '"' or word == '\n' or word == ' ' or word == '':
            pass
        else:
            n_w = ''
            for i in word:
                if i == '[' or i == ']' or i == ',' or i == '"' or i == '\n' or i == '\'':
                    pass
                else:
                    n_w = n_w + i
            if n_w != '':
                new_text.append(n_w)
                id1 = label_to_id_dict[n_w]
                if id1 == 7 or id1 == 8:
                    id1 = 0
                new_id.append(id1)
    return (new_id, new_text)


i = 0
text_corpus = {'words':[],
              'ner' : []}
with open("annotated_text.txt", 'r') as f:
    for line in f.readlines():
        if i % 3 == 0:
            text = line
            text_1 = text.split(" ")[1:]
            if text_1[-1] == '\n':
                text_1 = text_1[0:-1]
            text_corpus['words'].append(text_1)
        elif i % 3 == 1:
            label = line
            ( new_label_id, new_label ) = form_corpus_label_from_manual_anno(label)
            text_corpus['ner'].append(new_label_id)
        i = i + 1

In [35]:
text_corpus

{'words': [['রাত',
   'সাড়ে',
   'আটটাতেও',
   'ওকে',
   'একা',
   'রিকশায়',
   'ছেড়ে',
   'দিতে',
   'তিনি',
   'রাজি',
   'নন।\n'],
  ['একাধিক',
   'জঙ্গি',
   'ঘাঁটি',
   'ধ্বংস',
   'করে',
   'দেওয়া',
   'হয়েছে',
   'বলে',
   'দাবি',
   'করে',
   'সরকার।\n'],
  ['এ',
   'সম্মেলনে',
   'বিশ্বের',
   'প্রায়',
   'দেড়',
   'হাজার',
   'নারী',
   'নেতৃত্ব',
   'যোগদান',
   'করেন।\n'],
  ['আব্দুল',
   'মালেক',
   'হিমু,',
   'অপু',
   'আলম,',
   'দবির',
   'মোহাম্মদ',
   'সহ',
   'আরো',
   'অনেকে।\n'],
  ['মিথিলা',
   'যে',
   'অনিকের',
   'এই',
   'জিনিসগুলো',
   'জানে',
   'না',
   'তা',
   'নয়।',
   'কিন্তু',
   'সে',
   'চায়',
   'যে',
   'অনিক',
   'সব',
   'সময়',
   'তার',
   'কাছে',
   'সত্যি',
   'কথা',
   'বলুক।'],
  ['লোকসভার',
   'স্পিকার',
   'ওম',
   'বিড়লার',
   'প্রস্তাবে',
   'সম্মতি',
   'জানিয়েছেন',
   'সব',
   'দলের',
   'সাংসদরা।\n'],
  ['মিথিলা', 'চুপ', 'করে', 'শুনে', 'গেলেও', 'মুখে', 'কিছু', 'বলল', 'না।'],
  ['জানা',
   'যায়,',
   'জগন্নাথপুর',
   'পৌর',
   'শহরের

In [36]:
print(len(text_corpus['ner']), len(text_corpus['words']))

25 25


In [37]:
for i in range(len(text_corpus['ner'])):
    (j, k) = (len(text_corpus['words'][i]), len(text_corpus['ner'][i]))
    if j > k:
        for _ in range(j - k):
            text_corpus['ner'][i].append(0)
    elif j < k :
        text_corpus['ner'][i] = text_corpus['ner'][i][0 : k - (k - j)]        

In [38]:
manual_anno_data = Dataset.from_dict(text_corpus)

In [39]:
manual_anno_data

Dataset({
    features: ['words', 'ner'],
    num_rows: 25
})

In [40]:
manual_anno_data = manual_anno_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

Running tokenizer on validation dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 1180.48 examples/s]


In [41]:
result_manual_anno = best_model.evaluate(manual_anno_data)

In [42]:
result_manual_anno

{'eval_loss': 0.0887410044670105,
 'eval_LOC_precision': 0.8,
 'eval_LOC_recall': 1.0,
 'eval_LOC_f1': 0.888888888888889,
 'eval_LOC_number': 4,
 'eval_ORG_precision': 0.4,
 'eval_ORG_recall': 1.0,
 'eval_ORG_f1': 0.5714285714285715,
 'eval_ORG_number': 2,
 'eval_PER_precision': 0.9333333333333333,
 'eval_PER_recall': 0.875,
 'eval_PER_f1': 0.9032258064516129,
 'eval_PER_number': 16,
 'eval_overall_precision': 0.8,
 'eval_overall_recall': 0.9090909090909091,
 'eval_overall_f1': 0.8510638297872342,
 'eval_overall_accuracy': 0.9792207792207792,
 'eval_runtime': 0.2025,
 'eval_samples_per_second': 123.481,
 'eval_steps_per_second': 4.939,
 'epoch': 3.0}