# Indic-BERT Model 

In [15]:
import numpy as np
import torch
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy

# Directories (relative to the working directory) holding the saved
# IndicBERT tokenizer and the saved token-classification model.
indicbert_tokenizer_dir = "tokenizer_indicBERT/kaggle/working/tokenizer_indicBERT"
indicbert_model_dir = "my_indicBERT/kaggle/working/my_indicBERT"

tokenizer = AutoTokenizer.from_pretrained(indicbert_tokenizer_dir)
model_fine_tuned = AutoModelForTokenClassification.from_pretrained(indicbert_model_dir)

In [16]:
# Let's download the Naamapadam (Indic NER) dataset

from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

# Dataset configuration to load; 'hi' is the ISO 639-1 code for Hindi.
lang = 'hi'

# The configuration name is passed by keyword to make its role explicit.
raw_datasets = load_dataset('ai4bharat/naamapadam', name=lang)

In [17]:
# Sanity check: show which container type load_dataset returned.
print(type(raw_datasets))

<class 'datasets.dataset_dict.DatasetDict'>


In [18]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples, text_column_name="tokens", label_column_name="ner_tags"):
    """Tokenize a batch of pre-split sentences and build per-token labels.

    Args:
        examples: Batch mapping column names to lists. The text column holds
            one list of words per sentence and the label column holds one
            list of word-level label ids per sentence.
        text_column_name: Column containing the word lists. Defaults to
            "tokens", the column this notebook reads the words from.
        label_column_name: Column containing the word-level label ids.
            Defaults to "ner_tags", the column this notebook reads the tags
            from.

    Returns:
        The tokenizer output with an added "labels" entry: one list per
        sentence, aligned with the produced tokens. Only the first token of
        each word carries that word's label; every other position is -100.
    """
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    labels = []
    for batch_index, word_labels in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special tokens (word id None) and the non-first tokens of a
                # word are set to -100 so they are ignored in the loss function.
                label_ids.append(-100)
            else:
                # First token of a new word: use the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
# NOTE: despite the "train_" prefix, this variable holds the "test" split.
train_dataset_test  = raw_datasets["test"]
# print(type(train_dataset))




In [85]:

# print(train_dataset_test[0])

In [67]:
# Integer tag id -> BIO label string; the id is the position in the tuple.
label_mapping = {
    tag_id: tag_name
    for tag_id, tag_name in enumerate(
        ('O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC')
    )
}


In [70]:
# Split to evaluate on (selected in an earlier cell).
sample = train_dataset_test
# print(sample)

# Each sentence's words joined into one space-separated string.
test_data = [' '.join(words) for words in sample['tokens']]

# Each sentence's integer tag ids translated to label strings.
ground_truth = [
    [label_mapping[tag_id] for tag_id in tag_ids]
    for tag_ids in sample['ner_tags']
]

# print(test_data)
# print(ground_truth)

In [63]:


def get_ner(sentence, ner_tokenizer=None, ner_model=None):
    """Predict one NER label per space-separated word of ``sentence``.

    Args:
        sentence: Text whose words are separated by single spaces.
        ner_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer``.
        ner_model: Token-classification model to use. Defaults to the
            notebook-level ``model_fine_tuned``.

    Returns:
        A list of ``(word, label)`` tuples in sentence order. A word's label
        is the prediction for its first token. Words for which the tokenizer
        reports no token get no entry.
    """
    # Resolve the defaults at call time, so the notebook-level objects are
    # looked up only when the caller did not supply their own.
    if ner_tokenizer is None:
        ner_tokenizer = tokenizer
    if ner_model is None:
        ner_model = model_fine_tuned

    words = sentence.split(' ')
    # Tokenize the pre-split words so that word_ids() indexes directly into
    # ``words``. Tokenizing the raw string instead lets the tokenizer choose
    # its own word boundaries, which need not match ``split(' ')`` and can
    # shift labels onto the wrong words.
    encoding = ner_tokenizer(words, is_split_into_words=True, return_tensors='pt')

    with torch.no_grad():
        predicted_ids = ner_model(**encoding).logits.argmax(-1)[0]

    id2label = ner_model.config.id2label
    ner_output = []
    # Start from None (not 0) so word 0 is still emitted when the encoding
    # does not begin with a special token.
    previous_word_id = None
    for position, word_id in enumerate(encoding.word_ids()):
        # Keep only the first token of each word; skip special tokens (None).
        if word_id is not None and word_id != previous_word_id:
            label = id2label[predicted_ids[position].item()]
            ner_output.append((words[word_id], label))
        previous_word_id = word_id
    return ner_output



In [64]:
# str = "दरअसल , जनवरी से चीन और नेपाल के सीमावर्ती क्षेत्रों को संचार सुविधा के लिए अब बैलून नेटवर्क सिस्टम की शुरूआत की जा रही है, जिसके साथ ही उत्तराखंड बैलून से नेटवर्क सुविधा देने वाला पहला राज्य बनेगा।  "
# Tag every test sentence, keeping the (word, label) pairs ...
labeled_output = [get_ner(sentence_text) for sentence_text in test_data]
# ... and, separately, only the label sequence of each sentence.
output = [[label for _, label in word_label_pairs] for word_label_pairs in labeled_output]




In [76]:
# print(labeled_output[0])

In [75]:
# print(output[0])

In [73]:

# One line per sentence: its predicted tags separated by single spaces.
with open('tags_output_BERT_test.txt', 'w') as tags_file:
    tags_file.writelines(' '.join(sentence_tags) + '\n' for sentence_tags in output)

In [74]:
# One line per sentence: its reference tags separated by single spaces.
with open('tags_output_BERT_ground_truth.txt', 'w') as tags_file:
    tags_file.writelines(' '.join(sentence_tags) + '\n' for sentence_tags in ground_truth)

# Indic-NER

In [80]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

# Directories (relative to the working directory) holding the saved
# IndicNER tokenizer and the saved token-classification model.
indicner_tokenizer_dir = "tokenizer_indicNER/kaggle/working/tokenizer_indicNER"
indicner_model_dir = "my_indicNER/kaggle/working/my_indicNER"

tokenizer_ner = AutoTokenizer.from_pretrained(indicner_tokenizer_dir)
model_fine_tuned_ner = AutoModelForTokenClassification.from_pretrained(indicner_model_dir)

In [82]:


def get_ner(sentence, ner_tokenizer=None, ner_model=None):
    """Predict one NER label per space-separated word of ``sentence``.

    From this cell onward this definition replaces the earlier ``get_ner``:
    by default it uses the IndicNER objects instead of the IndicBERT ones.

    Args:
        sentence: Text whose words are separated by single spaces.
        ner_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer_ner``.
        ner_model: Token-classification model to use. Defaults to the
            notebook-level ``model_fine_tuned_ner``.

    Returns:
        A list of ``(word, label)`` tuples in sentence order. A word's label
        is the prediction for its first token. Words for which the tokenizer
        reports no token get no entry.
    """
    # Resolve the defaults at call time, so the notebook-level objects are
    # looked up only when the caller did not supply their own.
    if ner_tokenizer is None:
        ner_tokenizer = tokenizer_ner
    if ner_model is None:
        ner_model = model_fine_tuned_ner

    words = sentence.split(' ')
    # Tokenize the pre-split words so that word_ids() indexes directly into
    # ``words``. Tokenizing the raw string instead lets the tokenizer choose
    # its own word boundaries, which need not match ``split(' ')`` and can
    # shift labels onto the wrong words.
    encoding = ner_tokenizer(words, is_split_into_words=True, return_tensors='pt')

    with torch.no_grad():
        predicted_ids = ner_model(**encoding).logits.argmax(-1)[0]

    id2label = ner_model.config.id2label
    ner_output = []
    # Start from None (not 0) so word 0 is still emitted when the encoding
    # does not begin with a special token.
    previous_word_id = None
    for position, word_id in enumerate(encoding.word_ids()):
        # Keep only the first token of each word; skip special tokens (None).
        if word_id is not None and word_id != previous_word_id:
            label = id2label[predicted_ids[position].item()]
            ner_output.append((words[word_id], label))
        previous_word_id = word_id
    return ner_output



In [83]:
# str = "दरअसल , जनवरी से चीन और नेपाल के सीमावर्ती क्षेत्रों को संचार सुविधा के लिए अब बैलून नेटवर्क सिस्टम की शुरूआत की जा रही है, जिसके साथ ही उत्तराखंड बैलून से नेटवर्क सुविधा देने वाला पहला राज्य बनेगा।  "
# Tag every test sentence, keeping the (word, label) pairs ...
labeled_output_ner = [get_ner(sentence_text) for sentence_text in test_data]
# ... and, separately, only the label sequence of each sentence.
output_ner = [[label for _, label in word_label_pairs] for word_label_pairs in labeled_output_ner]



In [84]:

# One line per sentence: its predicted tags separated by single spaces.
with open('tags_output_NER_test.txt', 'w') as tags_file:
    tags_file.writelines(' '.join(sentence_tags) + '\n' for sentence_tags in output_ner)