### The purpose of this notebook is to create a corpus of English, Hindi and Kannada data for training a translation model. The saved corpus will be used to train a SentencePiece model and then a transformer model for translation.

In [1]:
import pandas as pd
from pathlib import Path
from torch.nn.utils.rnn import pad_sequence
from en_indic_transformer import TranslationDataset, TranslationDataLoader, Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The notebook treats the parent of the current working directory as the project base.
path = Path()
base_dir = Path.cwd().parent

In [3]:
# Directory the CSV inputs are read from.
load_dir = base_dir.joinpath('data')

Load the saved English–Hindi and English–Kannada data from their respective CSV files.

In [4]:
# Paths of the two CSV files holding the English-Hindi and English-Kannada sentence columns.
en_hindi_file, en_kannada_file = (
    load_dir.joinpath(file_name) for file_name in ('en_hindi.csv', 'en_kannada.csv')
)

In [5]:
en_hindi_file

PosixPath('/Users/sameergururajmathad/en-indic-transformer/data/en_hindi.csv')

In [6]:
en_kannada_file

PosixPath('/Users/sameergururajmathad/en-indic-transformer/data/en_kannada.csv')

In [7]:
# Read each CSV into its own DataFrame.
en_hindi_df, en_kannada_df = map(pd.read_csv, (en_hindi_file, en_kannada_file))

In [8]:
# Extract the sentence columns as plain Python lists:
# "source" is the English column, "target" is the Hindi / Kannada column.
en_hindi_source, en_hindi_target = (
    en_hindi_df[column].tolist() for column in ("english_sentence", "hindi_sentence")
)
en_kannada_source, en_kannada_target = (
    en_kannada_df[column].tolist() for column in ("english_sentence", "kannada_sentence")
)

Combine all the data into a single list to form the corpus.

In [9]:
# Pool the sentences of every language into one flat list for tokenizer training.
# pandas reads empty CSV fields as float NaN, and '\n'.join(corpus) further down only
# accepts strings, so any non-string entry is dropped here instead of failing later.
corpus = [
    sentence
    for sentences in (en_hindi_source, en_hindi_target, en_kannada_source, en_kannada_target)
    for sentence in sentences
    if isinstance(sentence, str)
]

In [10]:
# Preview only the first few sentences; displaying the whole corpus bloats the notebook output.
corpus[:10]

["However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles",
 'Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.',
 'The value of insects in the biosphere is enormous because they outnumber all other living groups in measure of species richness.',
 'Mithali To Anchor Indian Team Against Australia in ODIs',
 'After the assent of the Honble President on 8thSeptember, 2016, the 101thConstitutional Amendment Act, 2016 came into existence',
 'The court has fixed a hearing for February 12',
 'Please select the position where the track should be split.',
 'As per police, armys 22RR, special operation Group (SOG) of police and the Central Reserve Police Force (CRPF) cordoned the village and launched search operation in the area.',
 'Jharkhand chief minister Hemant Soren',
 'Arvind Kumar, SHO of the sector 55/56 police sta

Save the processed data to a text file to be used for training the SentencePiece model.

In [11]:
# Directory that holds the corpus text file and the tokenizer files.
corpus_save_dir = base_dir.joinpath('tokenizer')

Create the directory if it does not already exist.

In [12]:
# mkdir(..., exist_ok=True) is already a no-op when the directory exists, so a separate
# exists() check is redundant (and would open a check-then-create race).
corpus_save_dir.mkdir(parents=True, exist_ok=True)

Save the corpus to a text file if the file is not already present.

In [13]:
# Text file the newline-joined corpus is written to.
save_file = corpus_save_dir.joinpath('tokenizer_corpus.txt')

In [14]:
# Write the newline-joined corpus only when the file is missing; an existing file is left untouched.
if not save_file.exists():
    save_file.write_text('\n'.join(corpus), encoding='utf-8')

Train the tokenizer. It requires a few parameters, such as the input file, model prefix, vocab size, etc.

In [15]:
# Vocabulary size passed to tokenizer training.
vocab_size = 50000
# Path prefix for the tokenizer files; the final component 'tokenizer' is also the name they are stored under.
model_prefix = corpus_save_dir.joinpath('tokenizer')
# Special tokens reserved in the vocabulary. Kept as an ordered list rather than a set:
# iteration order of a set of strings can differ between interpreter runs, which would
# make the order in which these symbols are handed to the trainer non-reproducible.
user_defined_symbols = ['<|endoftext|>', '<|english|>', '<|hindi|>', '<|kannada|>']

In [16]:
# Train the tokenizer only when no trained model is present, so that a fresh checkout can
# run the notebook top to bottom while repeat runs skip the expensive training step.
# NOTE(review): assumes Tokenizer.train writes '<save_path>.model', i.e. the
# 'tokenizer.model' file that the next cell loads — confirm against the Tokenizer class.
if not model_prefix.with_suffix('.model').exists():
    Tokenizer.train(corpus_path=str(save_file),
                    save_path=str(model_prefix),
                    vocab_size=vocab_size,
                    user_defined_symbols=user_defined_symbols,
                    model_type='unigram',
                    split_by_whitespace=False)

In [17]:
# Load the tokenizer from the model file stored in the tokenizer directory.
tokenizer = Tokenizer(str(corpus_save_dir.joinpath('tokenizer.model')))

In [18]:
# One sample sentence per language (English, Hindi, Kannada).
txt_en, txt_hi, txt_kn = (
    "The quick brown fox jumps over the lazy dog.",
    "मुझे हिन्दी बहुत पसंद है।",
    "ನನಗೆ ಕನ್ನಡ ತುಂಬಾ ಇಷ್ಟ.",
)

In [19]:
# English -> Hindi dataset: the language tags are passed as prepend values and
# '<|endoftext|>' as the end-of-text marker.
dataset = TranslationDataset(
    src=en_hindi_source,
    target=en_hindi_target,
    tokenizer=tokenizer,
    src_prepend_value='<|english|>',
    target_prepend_value='<|hindi|>',
    endoftext='<|endoftext|>',
)

In [20]:
def custom_collate_fn(batch, pad_val=None, ignore_index=-100):
    """Pad a batch of ``(source, target_in, target_out)`` tensors to common lengths.

    Args:
        batch: Iterable of ``(source, target_in, target_out)`` tensor triples whose
            lengths along the first dimension may differ.
        pad_val: Token id used to pad ``source`` and ``target_in``. When ``None``
            (the default) the id of ``'<|endoftext|>'`` is looked up on the notebook's
            ``tokenizer``, matching the ``pad_val`` given to ``TranslationDataLoader``.
            The previously hard-coded 50256 lies outside this tokenizer's
            50,000-entry vocabulary.
        ignore_index: Value used to pad ``target_out``.

    Returns:
        Tuple ``(source_padded, target_in_padded, target_out_padded)`` of
        batch-first padded tensors.
    """
    if pad_val is None:
        # Resolved lazily so the function can be defined before the tokenizer is loaded.
        pad_val = tokenizer.get_piece_id('<|endoftext|>')

    sources, target_ins, target_outs = [], [], []

    for source, target_in, target_out in batch:
        sources.append(source)
        target_ins.append(target_in)
        target_outs.append(target_out)

    source_padded = pad_sequence(sources, batch_first=True, padding_value=pad_val)
    target_in_padded = pad_sequence(target_ins, batch_first=True, padding_value=pad_val)
    # Labels are padded with ignore_index rather than with a real token id.
    target_out_padded = pad_sequence(target_outs, batch_first=True, padding_value=ignore_index)

    return source_padded, target_in_padded, target_out_padded
    

In [21]:
# Batches of 16 shuffled samples; the '<|endoftext|>' id is passed as the pad value and
# -100 as the ignore index.
# Alternative: a plain torch DataLoader with collate_fn=custom_collate_fn.
dataloader = TranslationDataLoader(
    dataset=dataset,
    batch_size=16,
    shuffle=True,
    pad_val=tokenizer.get_piece_id('<|endoftext|>'),
    ignore_index=-100,
)

In [22]:
# Create an iterator over the dataloader so single batches can be pulled with next().
data = iter(dataloader)

In [23]:
# Pull the first batch from the iterator.
first = next(data)

In [24]:
# Take the element at index 2 from each of the first three components of the batch.
source, target_in, target_out = first[0][2], first[1][2], first[2][2]

In [25]:
target_in, target_out

(tensor([    3, 17903,    55, 27076, 46439,   898,   894,  3894,     9,     6,
             6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
             6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
             6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
             6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
             6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
             6]),
 tensor([17903,    55, 27076, 46439,   898,   894,  3894,     9,     6,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100]))

In [26]:
tokenizer.decode(target_in), tokenizer.decode(target_out)

('<|hindi|> लेकिन वह भी साल में 194 दिनों के लिए ही है .<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'लेकिन वह भी साल में 194 दिनों के लिए ही है .<|endoftext|>')

Check that the target input and target output have the same length.

In [27]:
len(target_in), len(target_out)

(61, 61)

In [28]:
# tokenizer.decode([id for id in source if id != -100])
tokenizer.decode(source)

'<|english|> But that too is only for 194 days in a year .<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [29]:
# tokenizer.decode([id for id in target_in if id != -100])
tokenizer.decode(target_in)

'<|hindi|> लेकिन वह भी साल में 194 दिनों के लिए ही है .<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [30]:
# tokenizer.decode([id for id in target_out if id != -100])
tokenizer.decode(target_out)

'लेकिन वह भी साल में 194 दिनों के लिए ही है .<|endoftext|>'