# Optional

**Run the cells below if tokenizer needs to be trained. Trained tokenizer can be cloned from huggingface directly.**

### The purpose of this file is to create a corpus of english, hindi and kannada data for training a translation model. The saved corspus will be used to train a sentence piece model and then a transformer model for translation.

In [1]:
import pandas as pd
from pathlib import Path
from torch.nn.utils.rnn import pad_sequence
from en_indic_transformer import TranslationDataset, TranslationDataLoader, Tokenizer

In [2]:
path = Path()
base_dir = path.absolute().parent

In [3]:
load_dir = base_dir / 'data'

Get the saved english to hindi data and english to kannada data from the respective csv files. 

In [4]:
en_hindi_file = load_dir / 'en_hindi.csv'
en_kannada_file = load_dir / 'en_kannada.csv'

In [5]:
en_hindi_file

PosixPath('/Users/sameergururajmathad/en-indic-transformer/data/en_hindi.csv')

In [6]:
en_kannada_file

PosixPath('/Users/sameergururajmathad/en-indic-transformer/data/en_kannada.csv')

In [7]:
en_hindi_df = pd.read_csv(en_hindi_file)
en_kannada_df = pd.read_csv(en_kannada_file)

In [8]:
en_hindi_source = en_hindi_df["english_sentence"].tolist()
en_hindi_target = en_hindi_df["hindi_sentence"].tolist()
en_kannada_source = en_kannada_df["english_sentence"].tolist()
en_kannada_target = en_kannada_df["kannada_sentence"].tolist()

combine all the data into single list to store the corpus.

In [9]:
corpus = []

corpus.extend(en_hindi_source)
corpus.extend(en_hindi_target)
corpus.extend(en_kannada_source)
corpus.extend(en_kannada_target)

In [10]:
len(corpus)

7667100

save the processed data into a text file to be used for training sentence piece model.

In [11]:
corpus_save_dir = base_dir / 'data'
tokenizer_save_dir = base_dir / 'tokenizer'

check if the directory exists, if not create one.

In [12]:
if not corpus_save_dir.exists():
    corpus_save_dir.mkdir(parents=True, exist_ok=True)

if not tokenizer_save_dir.exists():
    tokenizer_save_dir.mkdir(parents=True, exist_ok=True)

save the corpus to a text file if not present.

In [13]:
save_file = corpus_save_dir / 'tokenizer_corpus.txt'

In [14]:
# if not save_file.exists():
#     with open(save_file , 'w', encoding='utf-8') as file:
#         file.write('\n'.join(corpus))

In [None]:
if not save_file.exists():
    with open(save_file , 'w', encoding='utf-8') as file:
        for item in corpus:
            file.write(f'{item}\n') # safer approach.

Train the tokeinzer. It requires few parameters like input file, model prefix, vocab size etc.

In [16]:
vocab_size = 50_000
model_prefix = tokenizer_save_dir / 'tokenizer' # path to store the tokenizer files and also the name to store 'tokenizer'
user_defined_symbols = {'<|endoftext|>', '<|english|>', '<|hindi|>', '<|kannada|>'}

In [17]:
Tokenizer.train(corpus_path=str(save_file),
                save_path=str(model_prefix),
                vocab_size=vocab_size, 
                user_defined_symbols=user_defined_symbols, 
                model_type='unigram', 
                split_by_whitespace=False)

Training SentencePiece on the given data.


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /Users/sameergururajmathad/en-indic-transformer/data/tokenizer_corpus.txt
  input_format: 
  model_prefix: /Users/sameergururajmathad/en-indic-transformer/tokenizer/tokenizer
  model_type: UNIGRAM
  vocab_size: 50000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 0
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <|english|>
  user_defined_symbols: <|kannada|>
  user_defined_symbols: <|endoftext|>
  user_defined_symbols: <|hindi|>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_c

In [18]:
tokenizer = Tokenizer(str(tokenizer_save_dir/'tokenizer.model'))

In [19]:
txt_en = "The quick brown fox jumps over the lazy dog."
txt_hi = "मुझे हिन्दी बहुत पसंद है।"
txt_kn = "ನನಗೆ ಕನ್ನಡ ತುಂಬಾ ಇಷ್ಟ."

In [20]:
dataset = TranslationDataset(src=en_hindi_source, target=en_hindi_target, tokenizer=tokenizer, src_prepend_value='<|english|>', target_prepend_value='<|hindi|>', endoftext='<|endoftext|>')

In [21]:
def custom_collate_fn(batch):
    sources, target_ins, target_outs = [], [], []

    for source, target_in, target_out in batch:
        sources.append(source)
        target_ins.append(target_in)
        target_outs.append(target_out)

    source_padded = pad_sequence(sources, batch_first=True, padding_value=50256)
    target_in_padded = pad_sequence(target_ins, batch_first=True, padding_value=50256)
    target_out_padded = pad_sequence(target_outs, batch_first=True, padding_value=-100)

    return source_padded, target_in_padded, target_out_padded
    

In [22]:
# dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True,collate_fn=custom_collate_fn)
dataloader = TranslationDataLoader(dataset=dataset, batch_size=16, shuffle=True, pad_val=tokenizer.get_piece_id('<|endoftext|>'), ignore_index=-100)

In [23]:
data = iter(dataloader)

In [24]:
first = next(data)

In [25]:
# source = list(first[0][2])
# target_in = list(first[1][2])
# target_out = list(first[2][2])

source = first[0][2]
target_in = first[1][2]
target_out = first[2][2]

In [26]:
target_in, target_out

(tensor([    6,  4924,   206, 18895, 12035,   843,   373,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5]),
 tensor([ 4924,   206, 18895, 12035,   843,   373,     5,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100]))

In [27]:
tokenizer.decode(target_in), tokenizer.decode(target_out)

('<|hindi|> लेखकः तारा अली बेग<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 'लेखकः तारा अली बेग<|endoftext|>')

checking if the length of target input and output are same

In [28]:
len(target_in), len(target_out)

(46, 46)

In [29]:
# tokenizer.decode([id for id in source if id != -100])
tokenizer.decode(source)

'<|english|> Author: Tara Ali Baig<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [30]:
# tokenizer.decode([id for id in target_in if id != -100])
tokenizer.decode(target_in)

'<|hindi|> लेखकः तारा अली बेग<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [31]:
# tokenizer.decode([id for id in target_out if id != -100])
tokenizer.decode(target_out)

'लेखकः तारा अली बेग<|endoftext|>'