In [None]:
!pip3 install datasets
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install protobuf
!pip3 install torch
!pip3 install hindi-stemmer
!pip3 install indic-nlp-library
!pip3 install snowballstemmer

In [2]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode
import torch
from indicnlp.tokenize import indic_tokenize
from indicnlp import common
from indicnlp import loader
from indicnlp.morph import unsupervised_morph
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from snowballstemmer import stemmer
import pandas as pd
import re

In [3]:
lang='ta'
raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

In [4]:
column_names = raw_datasets["train"].column_names
print(column_names)

features = raw_datasets["train"].features
print(features)

['tokens', 'ner_tags']
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [5]:
train_data = raw_datasets['train']
val_data = raw_datasets['validation']
test_data = raw_datasets['test']

In [6]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [7]:
label_list = features[label_column_name].feature.names

label_to_id = {label_list[i]: features[label_column_name].feature.str2int( label_list[i] ) for i in range(len(label_list))}

print(label_to_id)

num_labels = len(label_list)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


## Performing tokenization on our datset

In [8]:
config = AutoConfig.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/indic-bert', num_labels=num_labels )

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
padding = "max_length"

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # Ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
print(raw_datasets["train"])

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 497882
})


In [11]:
train_dataset = raw_datasets["train"]
train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on train dataset",
)

In [12]:
test_dataset = raw_datasets["test"]
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on test dataset",
)

In [13]:
print("Train dataset after tokenization")
print(train_dataset)

Train dataset after tokenization
Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 497882
})


## Performing Normalization on our dataset

In [15]:
normalizer_factory = IndicNormalizerFactory()
normalizer = normalizer_factory.get_normalizer("ta",remove_nuktas=False)

for entry in train_dataset:
    tokens = entry['tokens']
    normalized_tokens = [normalizer.normalize(token) for token in tokens]
    entry['tokens'] = normalized_tokens

In [21]:
for entry in test_dataset:
    tokens = entry['tokens']
    normalized_tokens = [normalizer.normalize(token) for token in tokens]
    entry['tokens'] = normalized_tokens

## Performing Morph Analyzer on tokens to see their root form

In [28]:
INDIC_NLP_RESOURCES = r"../../indic_nlp_resources-master/"
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()

analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('ta')

In [29]:
sample_dataset = train_dataset.select(range(10))

def morph_tamil_tokens(tokens_list):
    stemmed_tokens = []
    for tokens in tokens_list:
        stemmed_tokens_per_list = []
        for token in tokens:
            stem = analyzer.morph_analyze(token)
            stemmed_tokens_per_list.append(" ".join([s[0] for s in stem]))
        stemmed_tokens.append(stemmed_tokens_per_list)
    return stemmed_tokens

morphed_dataset = sample_dataset.map(
    lambda example: {
        'tokens': morph_tamil_tokens(example['tokens']),
        'ner_tags': example['ner_tags'],
        'input_ids': example['input_ids'],
        'token_type_ids': example['token_type_ids'],
        'attention_mask': example['attention_mask'],
        'labels': example['labels'],
    },
    batched=True
)

print(morphed_dataset)

Map: 100%|██████████| 10/10 [00:00<00:00, 125.01 examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10
})





In [33]:
print(sample_dataset[1])
print(morphed_dataset[1])

{'tokens': ['தெய்வீக', 'ஏவுதலினால்', 'அறிவிக்கப்பட்ட', 'மற்றொரு', 'செய்தியினால்', 'கடுங்கோபம்', 'கொண்ட', 'அரசன்', ',', 'உரியாவை', 'படுகொலை', 'செய்தான்', '.', '-', 'எரேமியா', '26:21', '-', '24', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0], 'input_ids': [2, 8, 114250, 137418, 63641, 67035, 1173, 1471, 20173, 137418, 1385, 3664, 21585, 5448, 48740, 5343, 5798, 8, 99717, 114250, 1173, 1471, 21839, 73270, 1385, 3664, 3534, 26491, 5448, 6425, 1173, 6, 25603, 1299, 3048, 104235, 1471, 8, 99717, 49940, 5, 13, 195120, 56572, 665, 13260, 13, 506, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Performing stemming on our dataset

In [14]:
lang_stemmer = stemmer('tamil')

def stem_tamil_tokens(tokens_list):
    stemmed_tokens = []
    for tokens in tokens_list:
        stemmed_tokens_per_list = []
        for token in tokens:
            stemmed_token = lang_stemmer.stemWord(token)
            stemmed_tokens_per_list.append(stemmed_token)
        stemmed_tokens.append(stemmed_tokens_per_list)
    return stemmed_tokens

In [None]:
df = pd.DataFrame({
    'tokens': train_dataset['tokens'],
    'ner_tags': train_dataset['ner_tags'],
    'input_ids': train_dataset['input_ids'],
    'token_type_ids': train_dataset['token_type_ids'],
    'attention_mask': train_dataset['attention_mask'],
    'labels': train_dataset['labels'],
})

df['tokens'] = df['tokens'].apply(stem_tamil_tokens)

stemmed_dataset = {
    'tokens': df['tokens'],
    'ner_tags': df['ner_tags'],
    'input_ids': df['input_ids'],
    'token_type_ids': df['token_type_ids'],
    'attention_mask': df['attention_mask'],
    'labels': df['labels'],
}

In [17]:
sample_dataset = train_dataset.select(range(10))

df = pd.DataFrame({
    'tokens': sample_dataset['tokens'],
    'ner_tags': sample_dataset['ner_tags'],
    'input_ids': sample_dataset['input_ids'],
    'token_type_ids': sample_dataset['token_type_ids'],
    'attention_mask': sample_dataset['attention_mask'],
    'labels': sample_dataset['labels'],
})
sample_df = df.copy()
print(sample_df['tokens'])
sample_df['tokens'] = sample_df['tokens'].apply(stem_tamil_tokens)
print(sample_df['tokens'])

0    [பைரவருக்கு, தேய்பிறை, அஷ்டமியில், விசேஷ, அபிஷ...
1    [தெய்வீக, ஏவுதலினால், அறிவிக்கப்பட்ட, மற்றொரு,...
2    [ஓடர், (, கன்னியாகுமரி, மாவட்டம், மற்றும், தென...
3    [இது, தொடா்பாக, அவா், தனது, சுட்டுரைப், பக்கத்...
4    [ஜிஎஸ்டி, பற்றாக்குறையை, சமாளிப்பதற்காக, விருப...
5    [நல்கொண்டா, பகுதியிலேயே, இந்த, சம்பவம், பதிவாக...
6    [இந்த, செய்தி, வாட்சப், மற்றும், முகநூல், வழிய...
7                       [இவரது, மனைவி, பழனியம்மாள், .]
8    [TNEA, Counselling, 2020, Rank, List, :, பொறிய...
9    [ஷ்யாம, பிரசாத், முகர்ஜியை, மறந்ததா, பா, ., ஜ,...
Name: tokens, dtype: object
0    [[ப, ை, ர, வ, ர, ு, க, ், க, ு], [த, ே, ய, ், ...
1    [[த, ெ, ய, ், வ, ீ, க], [ஏ, வ, ு, த, ல, ி, ன, ...
2    [[ஓ, ட, ர, ்], [(], [க, ன, ், ன, ி, ய, ா, க, ு...
3    [[இ, த, ு], [த, ொ, ட, ா, ், ப, ா, க], [அ, வ, ா...
4    [[ஜ, ி, எ, ஸ, ், ட, ி], [ப, ற, ், ற, ா, க, ், ...
5    [[ந, ல, ், க, ொ, ண, ், ட, ா], [ப, க, ு, த, ி, ...
6    [[இ, ந, ், த], [ச, ெ, ய, ், த, ி], [வ, ா, ட, ்...
7    [[இ, வ, ர, த, ு], [ம, ன, ை, வ, ி

## Removal of noisy words from input data

In [25]:
special_characters = ['nn', 'n', '।', '/', '`', '+', '\\', '"', '?', '▁(', '$', '@', '[', '_', "'", '!', ',', ':', '^', '|', ']', '=', '%', '&', '.', ')', '(', '#', '*', '', ';', '-', '}', '|', '"']

pattern = '|'.join(re.escape(char) for char in special_characters)
pattern = f"[{pattern}]"

In [26]:
def clean_tamil_tokenized_array(tokenized_array):
    cleaned_tokens = []
    for token in tokenized_array:
        token = re.sub(r'<[^>]+>', '', token) 
        token = re.sub(pattern, '', token) 
        token = ' '.join(token.split())
        cleaned_tokens.append(token)

    return cleaned_tokens

In [29]:
cleaned_data_test = []

for line in test_dataset:
    cleaned_line = clean_tamil_tokenized_array(line['tokens'])
    line['tokens'] = cleaned_line
    cleaned_data_test.append(line)

In [None]:
cleaned_data = []

for line in train_dataset:
    cleaned_line = clean_tamil_tokenized_array(line['tokens'])
    line['tokens'] = cleaned_line
    cleaned_data.append(line)

In [18]:
df = pd.DataFrame(cleaned_data)
df.to_csv('tamil_train_data.csv', index=False)

df = pd.DataFrame(cleaned_data_test)
df.to_csv('tamil_test_data.csv', index=False)

df = pd.DataFrame(val_data)
df.to_csv('tamil_val_data.csv', index=False)