In [None]:
!pip3 install datasets
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install protobuf
!pip3 install torch
!pip3 install hindi-stemmer
!pip3 install indic-nlp-library
!pip3 install snowballstemmer

In [2]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode
import torch
from indicnlp.tokenize import indic_tokenize
from indicnlp import common
from indicnlp import loader
from indicnlp.morph import unsupervised_morph
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from snowballstemmer import stemmer
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
lang='as'
raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

In [4]:
column_names = raw_datasets["train"].column_names
print(column_names)

features = raw_datasets["train"].features
print(features)

['tokens', 'ner_tags']
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [5]:
train_data = raw_datasets['train']
val_data = raw_datasets['validation']
test_data = raw_datasets['test']

In [7]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [8]:
label_list = features[label_column_name].feature.names

label_to_id = {label_list[i]: features[label_column_name].feature.str2int( label_list[i] ) for i in range(len(label_list))}

print(label_to_id)

num_labels = len(label_list)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


## Performing tokenization on our datset

In [9]:
config = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
padding = "max_length"

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # Ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
print(raw_datasets["train"])

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 10266
})


In [24]:
train_dataset = raw_datasets["train"]
train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

In [25]:
test_dataset = raw_datasets["test"]
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
)

In [14]:
print(train_dataset)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10266
})


In [16]:
df = pd.DataFrame(train_dataset)
df.to_csv('assamese_train_data.csv', index=False)

df = pd.DataFrame(test_data)
df.to_csv('assamese_test.csv', index=False)

In [17]:
print("Train dataset after tokenization")
print(train_dataset)

Train dataset after tokenization
Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10266
})


## Performing Normalization on our dataset

In [26]:
normalizer_factory = IndicNormalizerFactory()
normalizer = normalizer_factory.get_normalizer("as",remove_nuktas=False)
for entry in train_dataset:
    tokens = entry['tokens']
    normalized_tokens = [normalizer.normalize(token) for token in tokens]
    entry['tokens'] = normalized_tokens

In [27]:
for entry in test_dataset:
    tokens = entry['tokens']
    normalized_tokens = [normalizer.normalize(token) for token in tokens]
    entry['tokens'] = normalized_tokens

## Performing Morph Analyzer on tokens to see their root form

In [30]:
INDIC_NLP_RESOURCES = r"../../indic_nlp_resources-master/"
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()

analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('bn')

In [31]:
sample_dataset = train_dataset.select(range(10))

def morph_tokens(tokens_list):
    stemmed_tokens = []
    for tokens in tokens_list:
        stemmed_tokens_per_list = []
        for token in tokens:
            stem = analyzer.morph_analyze(token)
            stemmed_tokens_per_list.append(" ".join([s[0] for s in stem]))
        stemmed_tokens.append(stemmed_tokens_per_list)
    return stemmed_tokens

morphed_dataset = sample_dataset.map(
    lambda example: {
        'tokens': morph_tokens(example['tokens']),
        'ner_tags': example['ner_tags'],
        'input_ids': example['input_ids'],
        'token_type_ids': example['token_type_ids'],
        'attention_mask': example['attention_mask'],
        'labels': example['labels'],
    },
    batched=True
)

print(morphed_dataset)

Map: 100%|██████████| 10/10 [00:00<00:00, 218.86 examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10
})





In [32]:
print(sample_dataset[0])
print(morphed_dataset[0])

{'tokens': ['কিন্তু', 'বিচাৰি', 'পায়', 'ক', "'", 'ত', '?'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0], 'input_ids': [2, 18788, 308, 84043, 144, 3821, 13195, 1849, 27, 5566, 57, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
df = pd.DataFrame({
    'tokens': train_dataset['tokens'],
    'ner_tags': train_dataset['ner_tags'],
    'input_ids': train_dataset['input_ids'],
    'token_type_ids': train_dataset['token_type_ids'],
    'attention_mask': train_dataset['attention_mask'],
    'labels': train_dataset['labels'],
})

In [37]:
stemmed_dataset = {
    'tokens': df['tokens'],
    'ner_tags': df['ner_tags'],
    'input_ids': df['input_ids'],
    'token_type_ids': df['token_type_ids'],
    'attention_mask': df['attention_mask'],
    'labels': df['labels'],
}

In [38]:
print(stemmed_dataset)
print(test_data[10])

{'tokens': 0                       [কিন্তু, বিচাৰি, পায়, ক, ', ত, ?]
1                                        [আলেখ্য, সম্পাদক]
2                             [এটা, VPN, সংযোগ, ধৰণ, বাছক]
3        [এটা, সেৱাক, &, brandShortName, ., লৈ, যোগ, কৰ...
4        [", এই, কি, ', য়ে, সন্ধান, সঁজুলি, আৰম্ভ, হওত...
                               ...                        
10261    [যদি, প্ৰাৰ্থীসকলৰ, কোনা, ঠিকনা, সলনি, হৈছে, ,...
10262    [", ", ", নামত, অনুপস্থিত, ", ", অনুসন্ধান, বি...
10263    [এই, কথা, প্ৰকাশ, কৰিলে, ৰাজ্যৰ, স্বাস্থ্যমন্ত...
10264    [৭০, সংখ্যক, স্বাধীনতা, দিৱস, হৈছে, দেশবাসীয়ে...
10265                                     [লিখনী, দৈৰ্ঘ্য]
Name: tokens, Length: 10266, dtype: object, 'ner_tags': 0                                    [0, 0, 0, 0, 0, 0, 0]
1                                                   [0, 0]
2                                          [0, 0, 0, 0, 0]
3                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Removal of noisy words from input data

In [40]:
special_characters = ['nn', 'n', '।', '/', '`', '+', '\\', '"', '?', '▁(', '$', '@', '[', '_', "'", '!', ',', ':', '^', '|', ']', '=', '%', '&', '.', ')', '(', '#', '*', '', ';', '-', '}', '|', '"']

pattern = '|'.join(re.escape(char) for char in special_characters)
pattern = f"[{pattern}]"

In [41]:
def clean_assamese_tokenized_array(tokenized_array):
    cleaned_tokens = []
    for token in tokenized_array:
        token = re.sub(r'<[^>]+>', '', token) 
        token = re.sub(pattern, '', token) 
        token = ' '.join(token.split())
        cleaned_tokens.append(token)

    return cleaned_tokens

In [42]:
cleaned_data_test = []

for line in test_dataset:
    cleaned_line = clean_assamese_tokenized_array(line['tokens'])
    line['tokens'] = cleaned_line
    cleaned_data_test.append(line)

  token = re.sub(pattern, '', token)


In [43]:
cleaned_data = []

for line in train_dataset:
    cleaned_line = clean_assamese_tokenized_array(line['tokens'])
    line['tokens'] = cleaned_line
    cleaned_data.append(line)

In [47]:
cleaned_val_data = []

for line in val_data:
    cleaned_line = clean_assamese_tokenized_array(line['tokens'])
    line['tokens'] = cleaned_line
    cleaned_val_data.append(line)

In [46]:
print(train_dataset[5])
print(cleaned_data[5])

{'tokens': ['দুখন', 'প্ৰধান', 'নদী', ',', 'টাইগ্ৰিছ', 'আৰু', 'ইউফ্ৰেটিছ', ',', 'ইৰাকৰ', 'মধ্যভাগেদি', 'দক্ষিণলৈ', 'পাৰ্ছিয়ান', 'গ', "'", 'ল্ফৰ', 'কাষত', 'ছাট', 'আল', '-', 'আৰৱলৈ', 'বৈ', 'গৈছে', '।'], 'ner_tags': [0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 6, 6, 6, 0, 0, 0], 'input_ids': [2, 12801, 6109, 41864, 60703, 93695, 6, 11972, 152, 81081, 4695, 17159, 11528, 78465, 182441, 6, 2267, 144, 23550, 2784, 14314, 13195, 5608, 185249, 12801, 442, 80276, 651, 41864, 4695, 13195, 382, 3531, 27, 4085, 78465, 72656, 308, 8, 117954, 3811, 13, 17159, 164981, 4083, 65600, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [48]:
df = pd.DataFrame(cleaned_data)
df.to_csv('assamese_train_data.csv', index=False)

df = pd.DataFrame(cleaned_data_test)
df.to_csv('assamese_test_data.csv', index=False)

df = pd.DataFrame(cleaned_val_data)
df.to_csv('assamese_val_data.csv', index=False)