### Importing Library

In [35]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import json
from datasets import Dataset
import random
import json
import numpy as np
import torch

In [36]:
from datasets import load_metric

In [37]:
random.seed(1)

In [38]:
# Selecting the device
device = torch.device("cuda:0")
print(device)

cuda:0


### Model Loading

In [39]:
# IndicBERT
num_labels = 9
text_name = "words"
label_name = "ner"

config = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels, finetunning_task = 'ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )
model=model.to(device)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Dataset Preparation

In [40]:
label_to_id_dict = {
    'O' : 0,
    'B-PER' : 1,
    'I-PER' : 2,
    'B-ORG' : 3,
    'I-ORG' : 4,
    'B-LOC' : 5,
    'I-LOC' : 6,
    'B-MISC' : 7,
    'I-MISC' : 8
}

In [41]:
id_2_label = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [42]:
def make_dataset( file_path, text_name = text_name, label_name = label_name, dataset_type = None, 
                 num_samples_to_take = None ):
    
    data_dir = { text_name : [ ], label_name : [ ] }
    with open(file_path, 'r') as f:
        for line in f:
            data_dir[text_name].append(json.loads(line)[text_name])
            data_dir[label_name].append( [ label_to_id_dict[i] for i in ( json.loads(line)[label_name] ) ] )
    if dataset_type == 'train':
        # For sampling the initial 20000 samples
        data_dir[text_name] = data_dir[text_name][0:num_samples_to_take]
        data_dir[label_name] = data_dir[label_name][ 0:num_samples_to_take]
    data = Dataset.from_dict(data_dir)
    return data

In [43]:
train_data = make_dataset("./bn_IndicNER_v1.0/bn_train.json", dataset_type = "train", num_samples_to_take=20000)

In [44]:
val_data = make_dataset("./bn_IndicNER_v1.0/bn_val.json", num_samples_to_take=100)
test_data = make_dataset("./bn_IndicNER_v1.0/bn_test.json", num_samples_to_take=100)

In [45]:
train_data

Dataset({
    features: ['words', 'ner'],
    num_rows: 20000
})

In [46]:
val_data

Dataset({
    features: ['words', 'ner'],
    num_rows: 4859
})

In [47]:
padding = "max_length"
def tokenize_and_align_labels(examples):
    """
    Tokenize the text and match the target label with the text token
    Input: A batch of sentence
        A dict:
            'words': A list of sentences denoted as a list of words
                        Desired Input to the model
            'ner': A list of desired labels
    Output: Tokenized Sentence
        A dict: dict:
            'words': A list of sentences denoted as a list of words
                        Desired Input to the model
            'ner': A list of desired labels
            'input_ids': Id of the tokenized words
                            A list of sentences denoted as 
                            a list of Id s of the consitituted word tokens
                            # Input to the Transformer model
            'attention_masks': A binary list
                                1 indicates the text token, model will attend to
                                0 indicates the padding token and special token, the model will not attend to.
            'labels': A list of numbers
                        Denting the target ground truth label
                        generated from the 'ner' field of Input
    """
    
    tokenized_inputs = tokenizer(
        examples[text_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples[label_name]):
        
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [48]:
train_data = train_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on train dataset",
)

Running tokenizer on train dataset: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [00:05<00:00, 3371.69 examples/s]


In [49]:
# Visualizing the toenized data
train_data

Dataset({
    features: ['words', 'ner', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [50]:
val_data = val_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on validation dataset",
)
test_data = test_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on test dataset",
)

Running tokenizer on validation dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4859/4859 [00:01<00:00, 3427.44 examples/s]
Running tokenizer on test dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 607/607 [00:00<00:00, 3391.59 examples/s]


In [51]:
data_collator = DataCollatorForTokenClassification(tokenizer)

### Metric

In [52]:
# Metrics
metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id_2_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_2_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    return final_results

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### Trainning

In [53]:
def training_model(batch_size, lr , model, train_data = train_data, val_data = val_data,
                  tokenizer = tokenizer, data_collator = data_collator, compute_metrics = compute_metrics):
                    
    
    
    # Setting the TrainingArguments
    args=TrainingArguments(
        output_dir='output_dir',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        learning_rate = lr,
        num_train_epochs = 3 # As asked in the question
        )
    
    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        args=args,
    )
    
    train_result = trainer.train()
    
    eval_metric = trainer.evaluate(val_data)
    
    return (trainer, train_result, eval_metric)
    

#### Hyper-parameter Search

We have tuned the batch size and learning-rate as two are the most important hyper-parameters. 
Batch Size: Larger Batch size might result in a smoother convergence during Gradient descent, with a huge cost of calculating the gradients. Whereas smaller batch size might result ina more haphazard convergence but each step of Gradient Descent is fast. Therefore a we need to fine tune the optimum.

Learning Rate: A Higher Learning Rate indicates larger step, which might lead the model to go in bad optima, and lower earning rate might not help the model to converge.

We use a random sapling approach and done only 3 experiments due to limited resources for Hyper-parameter search.


In [54]:
def hyper_param_search():
    
    model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )
    model=model.to(device)
    
    tunable_hyper_param = {'batch_size' : [8, 16, 32],
                      'lr': [1e-2, 1e-3, 1e-4, 1e-5]}
    batch_size = tunable_hyper_param['batch_size'][random.randint(0,2)] 
                # Generating a random number between 0 and 2 (both included)
    lr = tunable_hyper_param['lr'][random.randint(0,3)]
                # Generating a random number between 0 and 2 (both included)
    
    print('Batch size Learning Rate')
    print(batch_size, lr)
    
    (trainer, train_result, eval_metric) = training_model(batch_size, lr, model)
    
    return  (trainer, train_result, eval_metric, batch_size, lr)

##### Hyper-parameter search Experiment 1

In [55]:
(trainer, train_result, eval_metric, batch_size, lr) = hyper_param_search()

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size Learning Rate
8 0.01




Step,Training Loss
500,0.8134
1000,0.7738
1500,0.7818


Checkpoint destination directory output_dir/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_dir/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_dir/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size, lr))
print('The Evaluation Metric on the validation Data: ')
print(eval_metric)

For Batch Size = 8, and Learning Rate = 0.01 :
The Evaluation Metric on the validation Data: 
{'eval_loss': 0.7926539778709412, 'eval_LOC_precision': 0.0, 'eval_LOC_recall': 0.0, 'eval_LOC_f1': 0.0, 'eval_LOC_number': 2811, 'eval_ORG_precision': 0.0, 'eval_ORG_recall': 0.0, 'eval_ORG_f1': 0.0, 'eval_ORG_number': 1751, 'eval_PER_precision': 0.0, 'eval_PER_recall': 0.0, 'eval_PER_f1': 0.0, 'eval_PER_number': 3698, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.8187221589143272, 'eval_runtime': 16.9598, 'eval_samples_per_second': 286.501, 'eval_steps_per_second': 8.962, 'epoch': 3.0}


##### Hyper-parameter search experiment 2

In [57]:
(trainer_2, train_result_2, eval_metric_2, batch_size_2, lr_2) = hyper_param_search()

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size Learning Rate
32 0.01




Step,Training Loss


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_2, lr_2))
print('The Evaluation Metric on the validation Data: ')
eval_metric_2

For Batch Size = 32, and Learning Rate = 0.01 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.7922132611274719,
 'eval_LOC_precision': 0.0,
 'eval_LOC_recall': 0.0,
 'eval_LOC_f1': 0.0,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.0,
 'eval_ORG_recall': 0.0,
 'eval_ORG_f1': 0.0,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.0,
 'eval_PER_recall': 0.0,
 'eval_PER_f1': 0.0,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.0,
 'eval_overall_recall': 0.0,
 'eval_overall_f1': 0.0,
 'eval_overall_accuracy': 0.8187221589143272,
 'eval_runtime': 17.2603,
 'eval_samples_per_second': 281.513,
 'eval_steps_per_second': 8.806,
 'epoch': 3.0}

##### Hyper-parameter search Experiment 3

In [59]:
(trainer_3, train_result_3, eval_metric_3, batch_size_3, lr_3) = hyper_param_search()

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size Learning Rate
32 0.01


Step,Training Loss


In [61]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_3, lr_3))
print('The Evaluation Metric on the validation Data: ')
eval_metric_3

For Batch Size = 32, and Learning Rate = 0.01 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.7922132611274719,
 'eval_LOC_precision': 0.0,
 'eval_LOC_recall': 0.0,
 'eval_LOC_f1': 0.0,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.0,
 'eval_ORG_recall': 0.0,
 'eval_ORG_f1': 0.0,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.0,
 'eval_PER_recall': 0.0,
 'eval_PER_f1': 0.0,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.0,
 'eval_overall_recall': 0.0,
 'eval_overall_f1': 0.0,
 'eval_overall_accuracy': 0.8187221589143272,
 'eval_runtime': 17.0933,
 'eval_samples_per_second': 284.264,
 'eval_steps_per_second': 8.892,
 'epoch': 3.0}

##### Hyper-parameter Search Experiment with lower Learning Rates

In [62]:
def hyper_param_setting():
    
    model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )
    model=model.to(device)
    
    batch_size = 8
    lr = 1e-5
    print('Batch size Learning Rate')
    print(batch_size, lr)
    
    (trainer, train_result, eval_metric) = training_model(batch_size, lr, model)
    
    return  (trainer, train_result, eval_metric, batch_size, lr)

In [64]:
(trainer_4, train_result_4, eval_metric_4, batch_size_4, lr_4) = hyper_param_setting()

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size Learning Rate
8 1e-05




Step,Training Loss
500,0.6567
1000,0.4537
1500,0.4092


Checkpoint destination directory output_dir/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_dir/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory output_dir/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [65]:
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_4, lr_4))
print('The Evaluation Metric on the validation Data: ')
eval_metric_4

For Batch Size = 8, and Learning Rate = 1e-05 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.4211013615131378,
 'eval_LOC_precision': 0.49778172138420584,
 'eval_LOC_recall': 0.3991462113127001,
 'eval_LOC_f1': 0.443040473840079,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.2852233676975945,
 'eval_ORG_recall': 0.14220445459737294,
 'eval_ORG_f1': 0.18978658536585366,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.560645347162201,
 'eval_PER_recall': 0.5262303948080044,
 'eval_PER_f1': 0.5428930115776259,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.5027280994240679,
 'eval_overall_recall': 0.40157384987893463,
 'eval_overall_f1': 0.44649347153048863,
 'eval_overall_accuracy': 0.8746115197347974,
 'eval_runtime': 17.2133,
 'eval_samples_per_second': 282.282,
 'eval_steps_per_second': 8.83,
 'epoch': 3.0}

##### Hyper-parameter search experiment with lower learning rate 2

In [71]:
def hyper_param_setting_2():
    
    model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )
    model=model.to(device)
    
    batch_size = 16
    lr = 1e-5
    print('Batch size Learning Rate')
    print(batch_size, lr)
    
    (trainer, train_result, eval_metric) = training_model(batch_size, lr, model)
    
    return  (trainer, train_result, eval_metric, batch_size, lr)

In [72]:
(trainer_5, train_result_5, eval_metric_5, batch_size_5, lr_5) = hyper_param_setting_2()
print ('For Batch Size = {}, and Learning Rate = {} :'.format(batch_size_5, lr_5))
print('The Evaluation Metric on the validation Data: ')
eval_metric_5

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size Learning Rate
16 1e-05




Step,Training Loss
500,0.6182


Checkpoint destination directory output_dir/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


For Batch Size = 16, and Learning Rate = 1e-05 :
The Evaluation Metric on the validation Data: 


{'eval_loss': 0.4501913785934448,
 'eval_LOC_precision': 0.4909437559580553,
 'eval_LOC_recall': 0.3664176449662042,
 'eval_LOC_f1': 0.4196374006926054,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.2288135593220339,
 'eval_ORG_recall': 0.09251856082238721,
 'eval_ORG_f1': 0.13176087840585604,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.550876114355979,
 'eval_PER_recall': 0.48458626284478096,
 'eval_PER_f1': 0.5156092648539778,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.49249050998514604,
 'eval_overall_recall': 0.3612590799031477,
 'eval_overall_f1': 0.416788881905161,
 'eval_overall_accuracy': 0.8689785558893608,
 'eval_runtime': 17.3408,
 'eval_samples_per_second': 280.206,
 'eval_steps_per_second': 8.765,
 'epoch': 3.0}

### Caculating the Macro F1

In [73]:
best_model = trainer_4
test_metric = best_model.evaluate(test_data)



In [91]:
best_model.model.save_pretrained("./IndicBERT_best", from_pt = True)

In [75]:
test_metric

{'eval_loss': 0.3632481098175049,
 'eval_LOC_precision': 0.5439330543933054,
 'eval_LOC_recall': 0.39274924471299094,
 'eval_LOC_f1': 0.45614035087719296,
 'eval_LOC_number': 331,
 'eval_ORG_precision': 0.42990654205607476,
 'eval_ORG_recall': 0.2222222222222222,
 'eval_ORG_f1': 0.2929936305732484,
 'eval_ORG_number': 207,
 'eval_PER_precision': 0.6148491879350348,
 'eval_PER_recall': 0.5773420479302832,
 'eval_PER_f1': 0.595505617977528,
 'eval_PER_number': 459,
 'eval_overall_precision': 0.5675675675675675,
 'eval_overall_recall': 0.44232698094282846,
 'eval_overall_f1': 0.49718151071025923,
 'eval_overall_accuracy': 0.8896598794221363,
 'eval_runtime': 4.0598,
 'eval_samples_per_second': 149.513,
 'eval_steps_per_second': 4.68,
 'epoch': 3.0}

In [74]:
val_metric = best_model.evaluate(val_data)
val_metric



{'eval_loss': 0.4211013615131378,
 'eval_LOC_precision': 0.49778172138420584,
 'eval_LOC_recall': 0.3991462113127001,
 'eval_LOC_f1': 0.443040473840079,
 'eval_LOC_number': 2811,
 'eval_ORG_precision': 0.2852233676975945,
 'eval_ORG_recall': 0.14220445459737294,
 'eval_ORG_f1': 0.18978658536585366,
 'eval_ORG_number': 1751,
 'eval_PER_precision': 0.560645347162201,
 'eval_PER_recall': 0.5262303948080044,
 'eval_PER_f1': 0.5428930115776259,
 'eval_PER_number': 3698,
 'eval_overall_precision': 0.5027280994240679,
 'eval_overall_recall': 0.40157384987893463,
 'eval_overall_f1': 0.44649347153048863,
 'eval_overall_accuracy': 0.8746115197347974,
 'eval_runtime': 31.6466,
 'eval_samples_per_second': 153.539,
 'eval_steps_per_second': 4.803,
 'epoch': 3.0}

In [76]:
train_metric = best_model.evaluate(train_data)
train_metric



{'eval_loss': 0.3800722658634186,
 'eval_LOC_precision': 0.5349235212322173,
 'eval_LOC_recall': 0.434189963535336,
 'eval_LOC_f1': 0.4793214165907893,
 'eval_LOC_number': 11518,
 'eval_ORG_precision': 0.33901277875219243,
 'eval_ORG_recall': 0.18859771396710343,
 'eval_ORG_f1': 0.24236453201970445,
 'eval_ORG_number': 7174,
 'eval_PER_precision': 0.591568225224588,
 'eval_PER_recall': 0.5569021775321302,
 'eval_PER_f1': 0.5737120120738148,
 'eval_PER_number': 15017,
 'eval_overall_precision': 0.5356116024311243,
 'eval_overall_recall': 0.43658963481562785,
 'eval_overall_f1': 0.48105775831072467,
 'eval_overall_accuracy': 0.8876350736708247,
 'eval_runtime': 114.6004,
 'eval_samples_per_second': 174.519,
 'eval_steps_per_second': 5.454,
 'epoch': 3.0}

### Load the Manually Annotated Data and Calc the Macro-F1 score

In [204]:
def form_corpus_label_from_manual_anno(text):
    """
    Modify the annotated label, such that model can use it as Tags
    """
    text_1 = text.split(' ')
    new_text = []
    new_id = []
    for word in text_1:
        if word == '[' or word == ']' or word == ',' or word == '"' or word == '\n' or word == ' ' or word == '':
            pass
        else:
            n_w = ''
            for i in word:
                if i == '[' or i == ']' or i == ',' or i == '"' or i == '\n' or i == '\'':
                    pass
                else:
                    n_w = n_w + i
            if n_w != '':
                new_text.append(n_w)
                new_id.append(label_to_id_dict[n_w])
    return (new_id, new_text)


i = 0
text_corpus = {'words':[],
              'ner' : []}
with open("annotated_text.txt", 'r') as f:
    for line in f.readlines():
        if i % 3 == 0:
            text = line
            text_1 = text.split(" ")[1:]
            if text_1[-1] == '\n':
                text_1 = text_1[0:-1]
            text_corpus['words'].append(text_1)
        elif i % 3 == 1:
            label = line
            ( new_label_id, new_label ) = form_corpus_label_from_manual_anno(label)
            text_corpus['ner'].append(new_label_id)
        i = i + 1

In [205]:
text_corpus

{'words': [['রাত',
   'সাড়ে',
   'আটটাতেও',
   'ওকে',
   'একা',
   'রিকশায়',
   'ছেড়ে',
   'দিতে',
   'তিনি',
   'রাজি',
   'নন।\n'],
  ['একাধিক',
   'জঙ্গি',
   'ঘাঁটি',
   'ধ্বংস',
   'করে',
   'দেওয়া',
   'হয়েছে',
   'বলে',
   'দাবি',
   'করে',
   'সরকার।\n'],
  ['এ',
   'সম্মেলনে',
   'বিশ্বের',
   'প্রায়',
   'দেড়',
   'হাজার',
   'নারী',
   'নেতৃত্ব',
   'যোগদান',
   'করেন।\n'],
  ['আব্দুল',
   'মালেক',
   'হিমু,',
   'অপু',
   'আলম,',
   'দবির',
   'মোহাম্মদ',
   'সহ',
   'আরো',
   'অনেকে।\n'],
  ['মিথিলা',
   'যে',
   'অনিকের',
   'এই',
   'জিনিসগুলো',
   'জানে',
   'না',
   'তা',
   'নয়।',
   'কিন্তু',
   'সে',
   'চায়',
   'যে',
   'অনিক',
   'সব',
   'সময়',
   'তার',
   'কাছে',
   'সত্যি',
   'কথা',
   'বলুক।'],
  ['লোকসভার',
   'স্পিকার',
   'ওম',
   'বিড়লার',
   'প্রস্তাবে',
   'সম্মতি',
   'জানিয়েছেন',
   'সব',
   'দলের',
   'সাংসদরা।\n'],
  ['মিথিলা', 'চুপ', 'করে', 'শুনে', 'গেলেও', 'মুখে', 'কিছু', 'বলল', 'না।'],
  ['জানা',
   'যায়,',
   'জগন্নাথপুর',
   'পৌর',
   'শহরের

In [165]:
print(len(text_corpus['ner']), len(text_corpus['words']))

25 25


In [210]:
for i in range(len(text_corpus['ner'])):
    (j, k) = (len(text_corpus['words'][i]), len(text_corpus['ner'][i]))
    if j > k:
        for _ in range(j - k):
            text_corpus['ner'][i].append(0)
    elif j < k :
        text_corpus['ner'][i] = text_corpus['ner'][i][0 : k - (k - j)]        

In [207]:
manual_anno_data = Dataset.from_dict(text_corpus)

In [208]:
manual_anno_data

Dataset({
    features: ['words', 'ner'],
    num_rows: 25
})

In [209]:
manual_anno_data = manual_anno_data.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

Running tokenizer on validation dataset: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 1290.36 examples/s]


In [211]:
result_manual_anno = best_model.evaluate(manual_anno_data)

  _warn_prf(average, modifier, msg_start, len(result))


In [212]:
result_manual_anno

{'eval_loss': 0.8164288401603699,
 'eval_LOC_precision': 1.0,
 'eval_LOC_recall': 0.5,
 'eval_LOC_f1': 0.6666666666666666,
 'eval_LOC_number': 4,
 'eval_MISC_precision': 0.0,
 'eval_MISC_recall': 0.0,
 'eval_MISC_f1': 0.0,
 'eval_MISC_number': 20,
 'eval_ORG_precision': 1.0,
 'eval_ORG_recall': 0.5,
 'eval_ORG_f1': 0.6666666666666666,
 'eval_ORG_number': 2,
 'eval_PER_precision': 0.1875,
 'eval_PER_recall': 0.1875,
 'eval_PER_f1': 0.1875,
 'eval_PER_number': 16,
 'eval_overall_precision': 0.3157894736842105,
 'eval_overall_recall': 0.14285714285714285,
 'eval_overall_f1': 0.19672131147540983,
 'eval_overall_accuracy': 0.8363636363636363,
 'eval_runtime': 0.1547,
 'eval_samples_per_second': 161.627,
 'eval_steps_per_second': 6.465,
 'epoch': 3.0}