# Loading Marathi Dataset

The dataset for marathi language is directly fetched from the website.

In [1]:
!pip install datasets
from datasets import ClassLabel, load_dataset, load_metric

mr_dataset = load_dataset('ai4bharat/naamapadam','mr')



Downloading builder script:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

Downloading and preparing dataset naamapadam_pr/mr to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/mr/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20...


Downloading data:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset naamapadam_pr downloaded and prepared to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/mr/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
features = mr_dataset["train"].features
label_list = features['ner_tags'].feature.names
label_to_id = {label_list[i]: features['ner_tags'].feature.str2int( label_list[i] ) for i in range(len(label_list))}
print(label_to_id)
num_labels = len(label_list)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


# FineTuning IndicNER for NER classification

Firstly, we need to tokenize the training and validation dataset and then map the words to their NER labels.

In [3]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np

config = AutoConfig.from_pretrained('ai4bharat/IndicNER', num_labels=num_labels, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER",num_labels=num_labels )

2024-03-12 19:26:19.861344: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-12 19:26:19.861459: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-12 19:26:19.994964: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [4]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0


In [5]:
model=model.to("cuda")

In [6]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples): 
    tokenized_inputs = tokenizer(
        examples['tokens'],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):       
        word_ids = tokenized_inputs.word_ids(batch_index=i) 

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:       
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



## Tokenizing train_dataset

In [7]:
train_dataset = mr_dataset["train"].select(range(20000))   #sampling

train_dataset = train_dataset.map(            
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

       

Running tokenizer on train dataset #0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

Running tokenizer on train dataset #1:   0%|          | 0/5 [00:00<?, ?ba/s]

Running tokenizer on train dataset #3:   0%|          | 0/5 [00:00<?, ?ba/s]

Running tokenizer on train dataset #2:   0%|          | 0/5 [00:00<?, ?ba/s]

## Tokenizing validation_dataset




In [8]:
eval_dataset = mr_dataset["validation"]        #no-sampling
eval_dataset = eval_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Validation dataset",
)

        

Running tokenizer on Validation dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
data_collator = DataCollatorForTokenClassification(tokenizer)

## Designing a compute metrics

We require compute metrics to evaluate the model. I have retrieved precision, recall, f1-score and accuracy here. Using these values, I will calculate the macro-f1 score.

In [10]:
!pip install seqeval

def all_classes(true_tags, pred_tags):
    classes = set(tag for row in true_tags + pred_tags for tag in row)

    results = []
    true_positives, false_positives, false_negatives = 0, 0, 0
    for class_label in sorted(classes):
        true_indices = [(i, j) for i, row in enumerate(true_tags) for j, tag in enumerate(row) if tag == class_label]
        pred_indices = [(i, j) for i, row in enumerate(pred_tags) for j, tag in enumerate(row) if tag == class_label]
        
        true_positive = len(set(true_indices) & set(pred_indices))
        false_positive = len(set(pred_indices) - set(true_indices))
        false_negative = len(set(true_indices) - set(pred_indices))

        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        results.append((class_label, precision, recall, f1_score))

        true_positives += true_positive
        false_positives += false_positive
        false_negatives += false_negative

    macro_f1_score = 2 * (true_positives) / (2 * true_positives + false_positives + false_negatives) if (2 * true_positives + false_positives + false_negatives) > 0 else 0
    results.append(('macro f1:', 0, 0, macro_f1_score))

    return results




Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=7267398d720461c551d5c413915b845cb70befd915309c9191e32a0d50832e90
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [11]:
# Metrics function
#!pip install seqeval
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    r1 = all_classes(true_labels, true_predictions)
    print("Class\tPrecision\tRecall\tF1-Score")
    for class_label, precision, recall, f1_score in r1:
        print(f"{class_label}\t{precision:.2f}\t\t{recall:.2f}\t{f1_score:.2f}")
    
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return results

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

## Finetuning the hyper-parameters

For fine-tuning a pre-trained model, we need to tune the hyper-parameters which are mentioned in the TrainingArguments. If a particular hyper parameter is not mentioned then it will take the default value.

In [12]:
args=TrainingArguments(
    output_dir='Trained Models/IndicNER/Fine_tuned_IndicNER',
    num_train_epochs=3,
    logging_steps=500, #500
    learning_rate=7e-5,
    save_steps=1000, #1000
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,)

## Training the model

As training was taking more time and due to limited resources, I sampled off the training dataset. Also, I discussed this issue with my colleagues. They were getting different EFT (Expected Finish Time).

In [13]:
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,  #.select(range(20000)),   #sampling
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    args=args,
)

In [14]:
#GPU 100,000 -> 12h , 10k -> 3h, entire dataset -> 61h, 25% -> 15h
#GPU Kaggle p100: 100,000 -> 7h,
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.404
1000,0.2677
1500,0.2302
2000,0.1845
2500,0.1905
3000,0.1219
3500,0.1187


Checkpoint destination directory Trained Models/IndicNER/Fine_tuned_IndicNER/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Trained Models/IndicNER/Fine_tuned_IndicNER/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory Trained Models/IndicNER/Fine_tuned_IndicNER/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3750, training_loss=0.21019397430419923, metrics={'train_runtime': 3340.4443, 'train_samples_per_second': 17.962, 'train_steps_per_second': 1.123, 'total_flos': 1.567851411456e+16, 'train_loss': 0.21019397430419923, 'epoch': 3.0})

In [15]:
trainer.evaluate(train_dataset)
#takes time

Class	Precision	Recall	F1-Score
B-LOC	0.93		0.96	0.94
B-ORG	0.94		0.92	0.93
B-PER	0.96		0.96	0.96
I-LOC	0.88		0.84	0.86
I-ORG	0.92		0.94	0.93
I-PER	0.94		0.96	0.95
O	0.99		0.99	0.99
macro f1:	0.00		0.00	0.97


{'eval_loss': 0.07519374042749405,
 'eval_precision': 0.9235974700467938,
 'eval_recall': 0.934593250148297,
 'eval_macro-f1': 0.929062826521006,
 'eval_accuracy': 0.9745609859966432,
 'eval_runtime': 417.0117,
 'eval_samples_per_second': 47.96,
 'eval_steps_per_second': 2.998,
 'epoch': 3.0}

In [16]:
#GPU
trainer.evaluate()

Class	Precision	Recall	F1-Score
B-LOC	0.81		0.86	0.84
B-ORG	0.76		0.76	0.76
B-PER	0.86		0.88	0.87
I-LOC	0.69		0.65	0.67
I-ORG	0.71		0.71	0.71
I-PER	0.88		0.88	0.88
O	0.96		0.95	0.96
macro f1:	0.00		0.00	0.92


{'eval_loss': 0.31304654479026794,
 'eval_precision': 0.7948739355485056,
 'eval_recall': 0.8211297973264338,
 'eval_macro-f1': 0.8077885716709794,
 'eval_accuracy': 0.9185480702390667,
 'eval_runtime': 47.9606,
 'eval_samples_per_second': 47.956,
 'eval_steps_per_second': 3.002,
 'epoch': 3.0}

In [17]:
trainer.save_model("Fine_tuned_IndicNER")

# Test Phase

In [29]:
#TEST
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/Fine_tuned_IndicNER")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/Fine_tuned_IndicNER")

test_dataset = mr_dataset["test"]
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/Fine_tuned_IndicNER and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


        

Running tokenizer on test dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
x = trainer.predict(test_dataset)
x.metrics

Class	Precision	Recall	F1-Score
B-LOC	0.88		0.81	0.85
B-ORG	0.75		0.73	0.74
B-PER	0.90		0.88	0.89
I-LOC	0.67		0.68	0.68
I-ORG	0.65		0.60	0.63
I-PER	0.94		0.93	0.93
O	0.95		0.96	0.96
macro f1:	0.00		0.00	0.92


{'test_loss': 0.3161526620388031,
 'test_precision': 0.8255954784012919,
 'test_recall': 0.7971155720132528,
 'test_macro-f1': 0.8111056023797718,
 'test_accuracy': 0.92080751104019,
 'test_runtime': 22.5027,
 'test_samples_per_second': 47.994,
 'test_steps_per_second': 3.022}

In [20]:
macro_f1_test = x.metrics['test_macro-f1']
print("Macro f1 score on test dataset: ",macro_f1_test)

Macro f1 score on test dataset:  0.8111056023797718


In [21]:
!zip -r file.zip /kaggle/working

updating: kaggle/working/ (stored 0%)
updating: kaggle/working/Trained Models/ (stored 0%)
updating: kaggle/working/Trained Models/IndicNER/ (stored 0%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/ (stored 0%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/runs/ (stored 0%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/runs/Mar12_16-46-10_83339e7580ca/ (stored 0%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/runs/Mar12_16-46-10_83339e7580ca/events.out.tfevents.1710265729.83339e7580ca.34.1 (deflated 49%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/runs/Mar12_16-46-10_83339e7580ca/events.out.tfevents.1710261971.83339e7580ca.34.0 (deflated 60%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/checkpoint-2000/ (stored 0%)
updating: kaggle/working/Trained Models/IndicNER/Fine_tuned_IndicNER/checkpoint-2000/tokenizer.json (deflated 69%)
updating: kaggle/w

In [22]:
!ls

 Fine_tuned_IndicNER  'Trained Models'	 file.zip   state.db   wandb


In [23]:
from IPython.display import FileLink
FileLink(r'file.zip')

# Q4 Appended.

The part of Question 4 wherein we have to compute precision recall f1-score on the fine-tuned IndicNER has been appended here.

In [24]:
from datasets import ClassLabel, Sequence, Dataset, Value, Features, load_dataset, load_metric
Q1_data = {
    'tokens':[["कार्यकर्त्यांमधील", "वाद", "बघून", "संतापलेल्या", "सुप्रिया", "सुळे", "यांनी", "कार्यक्रमामध्येच", "राडेबाज", "कार्यकर्त्यांना","दम", "दिला", "."],
              ["अर्थात", "यात", "काही", "चांगल्या", "आणि", "व्हिडिओला", "पाठिंबा", "देणाऱ्या", "कमेंटही", "होत्या", "."],
              ["भारतीय", "वायुसेनेनं", "केलेल्या", "कारवाईचं", "संपुर्ण", "देशभरात", "कौतूक", "केलं", "जात", "आहे", "."],
              ["24)", "बैठक", "बोलावली", "असल्याची", "माहिती", "आमदार", "अनिल", "बाबर", "यांनी",".", ".", ".", "ढवळीत","पुरग्रस्तांच्या","मदत", "वाटपाची", "चौकशी", "सुरु"],
              ["यावेळी", "सुरेश", "प्रभुनी", "सुधीर", "सावंत", "व", "मधु", "दंडवते", "या", "दोघांचाही", "पराभव", "केला", ".", "त्यावेळी", "निकालादिवशी", "सुरेश", "प्रभुंना", "विजयी", "घोषित" "केल्यानंतर", "त्यांनी", "लगेचच", "शेजारी", "उभे", "असलेल्या", "मधु", "दंडवतेंची", "भेट", "घेत", "वाकुन", "त्यांच्या", "चरणांना", "स्पर्श", "केला", ".", "मतांच्या", "गोळाबेरजेत", "आपण", "विजयी", "झालेलो", "असलो", "तरी", "मधु", "दंडवतें", "या", "व्यक्तीचे", "कोकणप्रती", "असलेले", "योगदान", "प्रभुना", "चांगलेच", "ठाऊक", "होते", "."],
              ["सव्वाशे", "वर्षे", "अखंड", "पाणीपुरवठा", ":", "संस्थानकालीन", "योजना", ":", "सुधारित", "45", "कोटींचा", "आराखडा", "लवकरच", "मंजूर"],
              ["कार्यक्रम", "सर्वांना", "खुला", "असून", ",", "रसिकांनी", "उपस्थित", "रहावे", "असे", "आवाहन", "लोकमान्य", "ग्रंथालयाने", "केले", "आहे", "."],
              ["विमोचन", "आणि", "तो", "एक", "पादचारी", "दाबा", "आणि", "एक", "कार", "दाबा", "होते", "तेव्हा", ",", "उदाहरणार्थ", ",", "गुन्हा", "मागोवा", "लपविण्यासाठी", "मानले", "जाते", "."],
              ["निसर्गराजा", "मित्र", "जीवांचे", "ही", "संस्था", "गेली", "दहा", "वर्षे", "पिंपरी", "-", "चिंचवड", "परिसरातील", "तरुणांना", "एकत्र", "करून", "निसर्ग", "संवर्धनाची", "कार्ये", "विविध", "उपक्रमांद्वारे", "करत", "आहे", "."],
              ["कल्याण", "-", "डोंबिवली", "मध्ये", "माथाडी", "कामगारही", "उद्या", "बंदमध्ये", "सहभागी", "होणार", "असून", "एक", "भव्य", "मोर्चाही", "काढण्यात", "येणार", "आहे", "."],
              ["दोन्ही", "ट्रकचा", "अपघात", "एवढा", "भीषण", "होता", "की", "ट्रकच्या", "समोरील", "बाजूचा", "चक्काचुर", "होऊन", "त्यामध्ये", "ट्रकचालक", "अडकले", "होते", "."],
              ["सिबिल", "या", "संस्थेने", "प्रसिद्ध", "केलेल्या", "माहितीनुसार", ",", "2018", "मध्ये", "जाणूनबुजून", "कर्ज", "बुडविणाऱ्या", "मंडळींच्या", "संख्येत", "मोठ्या", "प्रमाणावर", "वाढ", "झाली", "आहे", "."],
              ["श्रीराम", "लागू", "-", "मराठी", "रंगभूमीचे", "अनभिषिक्त", "सम्राट", ",", "चतुरस्र", "अभिनेते", ",", "परखड", ".", ".", ".", "30", "डिसेंबरला", "होणाऱ्या", "शपथविधी", "सोहळ्याची", "विधानभवनात", "जोरदार", "तयारी", "सुरू"],
              ["कडाप्पे", "अंगावर", "पडून", "दोन", "कामगारांचा", "जागीच", "मृत्यू", ",", "चार", "जण", "गंभीर"],
              ["मात्र", "सत्ताधा-यांनी", "या", "विरोधाला", "न", "जुमानता", "हा", "प्रस्ताव", "मंजूर", "केला", "."],
              ["रजनीकांत", "यांच्यावर", "प्रेम", "करणाऱ्या", "त्याच्या", "चाहत्यांची", "संख्या", "खूप", "मोठी", "आहे", "."],
              ["एक", "हजार", "आणि", "एक", "आहेत", "नवीन", "वर्ष", "परिस्थिती", "प्रौढांसाठी", "घरी", "सर्जनशीलता", "आणि", "कल्पकता", "प्रदर्शित", ",", "आपण", "अविस्मरणीय", "काहीतरी", "अनुकूल", "कंपनी", "सामान्य", "साधना", "चालू", "शकत", "नाही", ",", "म्हणून", "."],
              ["नेपाळच्या", "सौनाली", "जवळील", "पडासरी", "भागातील", "एका", "कारखान्यात", "झालेल्या", "प्रचंड", "स्फोटांच्या", "झळा", "भारतीय", "सीमेपर्यंतही", "पोहोचल्या", "असून", "या", "भागात", "सीमा", "सुरक्षा", "दलाची", "गस्त", "वाढविली", "गेली", "आहे", "."],
              ["तसेच", "शाळांना", "लागलेल्या", "सुट्टय़ा", ",", "लग्न", "सराई", ",", "जत्रा", ",", "यात्रा", ".", ".", ".", "तीन", "ग्रामपंचायतीसाठी", "आज", "मतदान"],
              ["जग", "आपण", "सुमारे", "गोल", "फिरणे", "नाही", ",", "तो", "एक", "तारीख", "लक्षात", "विशेषतः", "महत्वाचे", "आहे", "."],
              ["देशाचे", "विभाजन", "करणाऱ्यांनाच", "नव्हे", "तर", "अशा", "लोकांशी", "हातमिळवणी", "करणाऱ्यांनाही", "देशाच्या", "सत्ताकारणात", "यापुढे", "थारा", "मिळू", "नये", "."],
              ["माजी", "जागतिक", "उपविजेता", "मुंबईचा", "दुसरा", "मानांकित", "मोहम्मद", "गुफरानने", "मुबंइ", "उपनगरच्या", "इश्तियाक", "अन्सारीचे", "25", "-", "6", ",", "25", "-", "8", "असे", "आव्हान", "परतवून", "लावले", "."],
              ["तथापि", ",", "एक", "दीर्घ", "परंपरा", "आणि", "काम", "नीतिविषयक", "लोभ", "उद्ध्वस्त", "होते", "."],
              ["सरकारने", "या", "अर्थसंकल्पीय", "अधिवेशनात", "शेतकऱ्यांना", "संपूर्ण", "कर्जमाफीची", "घोषणा", "करावी", ",", "या", "मागणीसाठी", "विरोधक", "आज", "आक्रमक", "झाले", "."],
              ["काँग्रेसचे", "विरोधक", "हे", "काँग्रेस", "मुक्त", "भारताची", "भाषा", "करतात", "तर", "संघाचे", "विरोधक", "संघ", "मुक्त", "भारताची", "."]],

   'ner_tags':[["O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], 
             ["B-ORG", "I-ORG", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O"], #4
             ["O", "B-PER", "I-PER", "B-PER", "I-PER", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "O", "B-LOC", "O", "O", "B-PER", "O", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #6
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-ORG", "I-ORG", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["B-ORG", "I-ORG", "I-ORG", "O", "O", "O", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #9
             ["B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #10
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #11
             ["B-ORG", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #12
             ["B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #13
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #14
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["B-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #17
             ["B-LOC", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "I-LOC", "O", "O", "O", "O", "B-ORG", "I-ORG", "I-ORG", "O", "O", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-ORG", "O", "O"], #19
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["O", "O", "O", "B-LOC", "O", "O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-PER", "I-PER", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #22
             ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
             ["B-ORG", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"], #24
             ["B-ORG", "O", "O", "B-ORG", "O", "B-LOC", "O", "O", "O", "B-ORG", "O", "B-ORG", "O", "B-LOC", "O"]]
    }

In [25]:
#NER_labels = ['B-ORG', 'B-MISC', 'I-ORG', 'I-LOC', 'B-LOC', 'I-MISC', 'I-PER', 'O', 'B-PER']
NER_labels = ['B-ORG', 'I-ORG', 'I-LOC', 'B-LOC', 'I-PER', 'O', 'B-PER']
label = ClassLabel(names=NER_labels,num_classes=7)
sequence_feature = Sequence(feature=label, length=-1)
print(sequence_feature)

Sequence(feature=ClassLabel(num_classes=7, names=['B-ORG', 'I-ORG', 'I-LOC', 'B-LOC', 'I-PER', 'O', 'B-PER'], id=None), length=-1, id=None)


In [26]:
features = Features({"tokens": Sequence(feature=Value(dtype='string', id=None)), "ner_tags": sequence_feature})
dataset = Dataset.from_dict(Q1_data, features=features)

In [27]:
Q1_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Q1 dataset",
)

        

Running tokenizer on Q1 dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Q1 dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Q1 dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Q1 dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
a = trainer.predict(Q1_dataset)
a.metrics

Class	Precision	Recall	F1-Score
B-LOC	0.18		0.01	0.01
B-ORG	0.00		0.00	0.00
B-PER	0.00		0.00	0.00
I-LOC	0.00		0.00	0.00
I-ORG	0.00		0.00	0.00
I-PER	0.00		0.00	0.00
O	0.01		0.50	0.03
macro f1:	0.00		0.00	0.02


{'test_loss': 9.208431243896484,
 'test_precision': 0.06593406593406594,
 'test_recall': 0.006060606060606061,
 'test_macro-f1': 0.011100832562442183,
 'test_accuracy': 0.01858736059479554,
 'test_runtime': 0.5754,
 'test_samples_per_second': 43.452,
 'test_steps_per_second': 3.476}