In [None]:
!pip3 install datasets
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install protobuf
!pip3 install torch
!pip3 install hindi-stemmer
!pip3 install indic-nlp-library
!pip3 install snowballstemmer

In [3]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode
import torch
from indicnlp.tokenize import indic_tokenize
from indicnlp import common
from indicnlp import loader
from indicnlp.morph import unsupervised_morph
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from snowballstemmer import stemmer
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Language configuration of the 'ai4bharat/naamapadam' dataset to load.
lang = 'gu'
raw_datasets = load_dataset('ai4bharat/naamapadam', name=lang)

In [5]:
# Inspect the schema of the training split: column names first, then the
# feature definitions (which include the NER label names).
column_names = raw_datasets["train"].column_names
features = raw_datasets["train"].features

print(column_names)
print(features)

['tokens', 'ner_tags']
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [6]:
# Bind each raw (untokenized) split to its own name, then peek at one training record.
train_data, val_data, test_data = (
    raw_datasets[split] for split in ('train', 'validation', 'test')
)
print(train_data[0])

{'tokens': ['લક્ઝરી', 'સેન્ટ', 'એન્ડ્રુ', 'માતાનો', 'ચર્ચ'], 'ner_tags': [0, 5, 6, 6, 6]}


In [7]:
# Names of the dataset columns holding the word lists and their NER tag ids.
text_column_name, label_column_name = "tokens", "ner_tags"

In [8]:
# Label names in the order declared by the dataset's ClassLabel feature.
label_list = features[label_column_name].feature.names

# Map every label name to the integer id the dataset uses for it.
label_to_id = {
    name: features[label_column_name].feature.str2int(name)
    for name in label_list
}

print(label_to_id)

# Number of distinct labels; passed to the model config further down.
num_labels = len(label_list)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


## Performing tokenization on our dataset

In [9]:
# Load the tokenizer and a token-classification config for the same
# 'ai4bharat/indic-bert' checkpoint, sized for the label set computed above.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
config = AutoConfig.from_pretrained(
    'ai4bharat/indic-bert',
    finetuning_task='ner',
    num_labels=num_labels,
)

In [10]:
# Padding strategy handed to the tokenizer: every sequence is padded to `max_length`.
padding = "max_length"


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: a batch mapping column names to lists; the columns named by
            ``text_column_name`` and ``label_column_name`` are read.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        Per sequence, the first sub-token of each word carries that word's
        label; special tokens (word id ``None``) and the remaining sub-tokens
        of a word carry ``-100``.
    """
    encoded = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples[label_column_name]):
        sequence_labels = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            # Only the first sub-token of a real word gets the word's label.
            starts_new_word = current_word is not None and current_word != last_word
            sequence_labels.append(word_labels[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned_labels.append(sequence_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [11]:
# Show the raw training split (columns and row count) before tokenization.
print(raw_datasets["train"])

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 472845
})


In [12]:
# Tokenize the training split in batches across 4 worker processes,
# reusing a cached result when one is available.
train_dataset = raw_datasets["train"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

In [13]:
# Tokenize the test split with the same settings used for the training split.
test_dataset = raw_datasets["test"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
)

In [14]:
# Confirm the tokenizer columns were added to the training split.
print(train_dataset)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 472845
})


In [15]:
# Snapshot the tokenized splits to CSV.
# Both files are built from the tokenized datasets so they share one schema
# (tokens, ner_tags, input_ids, token_type_ids, attention_mask, labels);
# the raw `test_data` split would only contain 'tokens' and 'ner_tags'.
df = pd.DataFrame(train_dataset)
df.to_csv('gujarati_train_data.csv', index=False)

df = pd.DataFrame(test_dataset)
df.to_csv('gujarati_test_data.csv', index=False)

In [16]:
# Compare record 10 after tokenization with the same record in the raw split.
print("Train dataset after tokenization")
print(train_dataset[10])
print(raw_datasets["train"][10]) 

Train dataset after tokenization
{'tokens': ['ટીમ', 'ઇન્ડિયાના', 'ફાસ્ટ', 'બોલર', 'મોહંમદ', 'શમીની', 'ભૂતપૂર્વ', 'પત્ની', 'હસીન', 'જહાં', 'ફરી', 'એક', 'વાર', 'સમાચારોમાં', 'ઝળકી', 'છે', '.'], 'ner_tags': [0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [2, 8, 140884, 14473, 3437, 28023, 132034, 1838, 58955, 772, 3740, 67477, 9320, 20962, 9864, 3077, 10034, 4313, 77670, 29421, 1295, 165, 8224, 20101, 110, 18969, 11798, 13429, 1609, 8443, 120642, 4205, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Performing Normalization on our dataset

In [17]:
# Build the normalizer for language code "gu", keeping nuktas.
normalizer_factory = IndicNormalizerFactory()
normalizer = normalizer_factory.get_normalizer("gu",remove_nuktas=False)


def normalize_token_batch(batch):
    """Return the batch's 'tokens' column with every token normalized.

    Args:
        batch: a batched mapping whose 'tokens' entry is a list of token lists.

    Returns:
        A dict with a single 'tokens' key, parallel to the input (one
        normalized string per original token).
    """
    return {
        'tokens': [
            [normalizer.normalize(token) for token in tokens]
            for tokens in batch['tokens']
        ]
    }


# Iterating a Dataset yields a new dict per row, so assigning into that dict
# never reaches the dataset. `map` returns a dataset that actually holds the
# normalized tokens.
train_dataset = train_dataset.map(
    normalize_token_batch,
    batched=True,
    desc="Normalizing train tokens",
)

In [18]:
# Normalize the test tokens with `map`: assigning into the per-row dicts
# produced by iterating the dataset would leave `test_dataset` unchanged.
test_dataset = test_dataset.map(
    lambda batch: {
        'tokens': [
            [normalizer.normalize(token) for token in tokens]
            for tokens in batch['tokens']
        ]
    },
    batched=True,
    desc="Normalizing test tokens",
)

## Running the morphological analyzer on tokens to see their root forms

In [19]:
# Location of the Indic NLP resources directory, relative to the notebook's
# working directory. NOTE(review): this path is machine-specific — confirm it
# exists before running.
INDIC_NLP_RESOURCES = r"../../indic_nlp_resources-master/"
# The resources path is registered first, then the library loader is run;
# keep this order.
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()

# Unsupervised morphological analyzer for language code 'gu'.
analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('gu')

In [20]:
# Work on the first 10 training rows only for the morphological-analysis demo.
sample_dataset = train_dataset.select(range(10))

def morph_tokens(tokens_list):
    """Segment every token of every sentence into its morphemes.

    Args:
        tokens_list: a batch of sentences, each a list of token strings.

    Returns:
        A list parallel to ``tokens_list``. Each token is replaced by its
        morphemes joined with single spaces, so every sentence keeps exactly
        one output string per input token.
    """
    # Per the Indic NLP Library docs, `morph_analyze` returns the token's
    # morphemes as a list of strings. Join the whole morphemes: indexing each
    # morpheme with [0] would keep only its first character.
    return [
        [" ".join(analyzer.morph_analyze(token)) for token in tokens]
        for tokens in tokens_list
    ]

def _morph_batch(batch):
    """Replace the batch's tokens with their morph-analyzed form; pass other columns through."""
    updated = {'tokens': morph_tokens(batch['tokens'])}
    for column in ('ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'):
        updated[column] = batch[column]
    return updated


# Apply the morphological analysis to the sample rows, batch by batch.
morphed_dataset = sample_dataset.map(_morph_batch, batched=True)

print(morphed_dataset)

Map: 100%|██████████| 10/10 [00:00<00:00, 181.13 examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10
})





In [21]:
# Compare the first sample row before and after `morph_tokens` was applied.
print(sample_dataset[0])
print(morphed_dataset[0])

{'tokens': ['લક્ઝરી', 'સેન્ટ', 'એન્ડ્રુ', 'માતાનો', 'ચર્ચ'], 'ner_tags': [0, 5, 6, 6, 6], 'input_ids': [2, 108114, 20641, 31893, 1838, 8555, 35192, 14006, 1295, 76810, 3968, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Performing stemming on our dataset

In [22]:
# Gather the training columns into a DataFrame, one DataFrame column per dataset column.
df = pd.DataFrame({
    column: train_dataset[column]
    for column in ('tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels')
})


# Dict of column name -> pandas Series taken from `df`.
# No stemming transformation is applied in this cell; every column is carried over unchanged.
stemmed_dataset = {column: df[column] for column in df.columns}

In [23]:
# Dump the column dict built above, then one raw (untokenized) test record.
print(stemmed_dataset)
print(test_data[10])

{'tokens': 0                    [લક્ઝરી, સેન્ટ, એન્ડ્રુ, માતાનો, ચર્ચ]
1         [આ, દિવસે, ભગવાન, ગણેશની, પૂજાનું, વધારે, મહત્...
2                               [આદુ, પાવડર, -, 1, ટીસ્પૂન]
3         [તે, સમયે, સુભાષ, કાલાબુરાગી, શહેરમાં, MR, મેડ...
4                                             [ઓરેન્જ, ઓરા]
                                ...                        
472840    [તેઓ, ભારતીય, રિઝર્વ, બેન્કની, પેમેન્ટ, અને, સ...
472841                               [આ, પ્રથમ, ખોજ, છે, .]
472842    [ગૌહાટી, હાઇકોર્ટે, સુપર, 30ના, સ્થાપક, આનંદકુ...
472843    [ફ્લાઇંગ, રાની, (, ૧૨૯૨૧, /, ૧૨૯૨૨, ), ભારતીય,...
472844    [તેમાં, ગુલબર્ગ, સોસાયટી, ,, ઓડ, ,, સરદારપુરા,...
Name: tokens, Length: 472845, dtype: object, 'ner_tags': 0                                           [0, 5, 6, 6, 6]
1                               [0, 0, 0, 1, 0, 0, 0, 0, 0]
2                                           [0, 0, 0, 0, 0]
3                   [0, 0, 1, 1, 0, 3, 4, 4, 0, 0, 0, 0, 0]
4                               

## Removal of noisy words from input data

In [25]:
# Characters to strip from tokens. An entry longer than one character
# (e.g. '▁(') contributes each of its characters to the set.
# The newline entries are written as real escapes: a bare 'n' entry would make
# the regex delete the ASCII letter n from every token (e.g. "Orange" -> "Orage").
special_characters = ['\n\n', '\n', '।', '/', '`', '+', '\\', '"', '?', '▁(', '$', '@', '[', '_', "'", '!', ',', ':', '^', '|', ']', '=', '%', '&', '.', ')', '(', '#', '*', ';', '-', '}']

# Build one regex character class from the distinct characters.
# Inside [...] a '|' is a literal, not alternation, so the characters are simply
# concatenated; this also avoids the '||' sequence (produced by joining an empty
# entry with '|') that makes `re` emit a FutureWarning when compiling a set.
_unique_special_chars = sorted(set(''.join(special_characters)))
pattern = '[' + ''.join(re.escape(char) for char in _unique_special_chars) + ']'

In [26]:
def clean_guj_tokenized_array(tokenized_array):
    """Return a cleaned copy of a token list, one output string per input token.

    For every token: anything shaped like ``<...>`` is removed, then every
    match of the module-level ``pattern`` is removed, and finally runs of
    whitespace are collapsed to single spaces (leading/trailing whitespace is
    dropped). Tokens that end up empty are kept as ``''``, so the result has
    the same length as the input.
    """
    def _scrub(raw_token):
        without_tags = re.sub(r'<[^>]+>', '', raw_token)
        without_specials = re.sub(pattern, '', without_tags)
        return ' '.join(without_specials.split())

    return [_scrub(raw_token) for raw_token in tokenized_array]

In [27]:
# Build one plain dict per test row with its 'tokens' replaced by the cleaned
# tokens; every other column of the row is copied as-is.
cleaned_data_test = [
    {**line, 'tokens': clean_guj_tokenized_array(line['tokens'])}
    for line in test_dataset
]

  token = re.sub(pattern, '', token)


In [None]:
# Build one plain dict per training row with its 'tokens' replaced by the
# cleaned tokens; every other column of the row is copied as-is.
cleaned_data = [
    {**line, 'tokens': clean_guj_tokenized_array(line['tokens'])}
    for line in train_dataset
]

In [29]:
# Build one plain dict per validation row with its 'tokens' replaced by the
# cleaned tokens.
# NOTE(review): this iterates the raw `val_data` split, which is not passed
# through `tokenize_and_align_labels` in the cells shown, so these records carry
# only the raw columns (no input_ids/labels) — confirm that is intended.
cleaned_val_data = [
    {**line, 'tokens': clean_guj_tokenized_array(line['tokens'])}
    for line in val_data
]

In [None]:
# Write each cleaned split to its CSV file (no index column), in the order
# train, test, validation. `df` ends up bound to the validation frame.
for records, csv_path in (
    (cleaned_data, 'gujarati_train_data.csv'),
    (cleaned_data_test, 'gujarati_test_data.csv'),
    (cleaned_val_data, 'gujarati_val_data.csv'),
):
    df = pd.DataFrame(records)
    df.to_csv(csv_path, index=False)