## Build Dataset

In [6]:
import pandas as pd
import re
import string
from string import digits
from tensorflow.data import Dataset
import tensorflow as tf

# Pipeline configuration shared by the cells below.
batch_size = 8        # sentence pairs per batch (used by create_dataset)
buffer_size = 2000    # shuffle buffer size passed to Dataset.shuffle
max_tokens = 128      # per-sentence token cap applied in prepare_en_in_batch

def _clean_sentence(text, removal_table):
    """Normalise one sentence: lowercase it, delete unwanted characters, tidy spaces.

    Args:
        text: The raw sentence.
        removal_table: A ``str.maketrans`` table mapping every character that
            should be deleted (punctuation, digits, ...) to ``None``.

    Returns:
        The cleaned sentence; it may be an empty string if nothing survives.
    """
    stripped = text.lower().translate(removal_table).strip()
    # Deleting characters can leave runs of spaces behind; collapse them.
    return re.sub(" +", " ", stripped)


def create_dataset():
    """Load the English-Hindi corpus, clean it and build batched tf.data pipelines.

    Reads ``./data/hind_encorp.csv``, drops rows with missing values, and
    cleans the ``english_sentence`` / ``hindi_sentence`` columns (lowercase,
    ASCII punctuation and digits removed, Devanagari digits removed from the
    Hindi side, whitespace normalised). Pairs in which either side is empty
    after cleaning are discarded.

    The first 80% of the remaining rows become the training set and *all*
    remaining rows become the validation set, so no sample is lost to
    ``int()`` rounding.

    Relies on the notebook globals ``batch_size``, ``buffer_size`` and
    ``prepare_en_in_batch``.

    Returns:
        A ``(train_encorp_ds, val_encorp_ds)`` tuple of shuffled, batched,
        tokenised and prefetched datasets.

    Raises:
        ValueError: If no sentence pair survives cleaning.
    """
    df = pd.read_csv('./data/hind_encorp.csv', encoding='utf-8')
    # Optional: restrict to a single corpus, e.g. df = df[df['source'] == 'ted']
    df = df.dropna()

    # One translate table per column removes every unwanted character in a
    # single pass. string.punctuation already contains the apostrophe.
    ascii_noise = string.punctuation + digits
    removal_tables = {
        'english_sentence': str.maketrans('', '', ascii_noise),
        'hindi_sentence': str.maketrans('', '', ascii_noise + "२३०८१५७९४६"),
    }
    for column, table in removal_tables.items():
        # astype(str) guards against cells that pandas parsed as numbers.
        df[column] = df[column].astype(str).apply(_clean_sentence, args=(table,))

    # A sentence made only of punctuation/digits is now empty; such pairs
    # carry no training signal, so drop them.
    has_text = (df['english_sentence'] != '') & (df['hindi_sentence'] != '')
    df = df[has_text]
    if df.empty:
        raise ValueError('No sentence pairs left after cleaning ./data/hind_encorp.csv')

    # NOTE(review): the split follows CSV row order; if the file is grouped
    # (e.g. by source), consider shuffling rows with a fixed seed first.
    dataset = Dataset.from_tensor_slices(
        (df['english_sentence'].values, df['hindi_sentence'].values))
    train_split = 0.8
    train_size = int(train_split * len(df))

    train_encorp_samples = dataset.take(train_size)
    # Everything after the training slice is validation (about 20%).
    val_encorp_samples = dataset.skip(train_size)

    def to_batches(samples):
        """Shuffle, batch, tokenise and prefetch one split."""
        return (samples
                .shuffle(buffer_size)
                .batch(batch_size)
                .map(lambda en, hi: prepare_en_in_batch((en, hi)), tf.data.AUTOTUNE)
                .prefetch(tf.data.AUTOTUNE))

    train_encorp_ds = to_batches(train_encorp_samples)
    val_encorp_ds = to_batches(val_encorp_samples)
    return train_encorp_ds, val_encorp_ds

In [2]:
from vocabulary.tokenizer import TransformerTokenizer
from vocabulary.gen_vocab import reserved_tokens

def get_tokenizer():
    """Build the English and Hindi tokenizers from their vocabulary files.

    Returns:
        An ``(en_tokenizer, hi_tokenizer)`` tuple of ``TransformerTokenizer``
        instances, both created with the shared ``reserved_tokens``.
    """
    vocab_files = (
        './vocabulary/encorp_en_to_hi/en_encorp_vocab.txt',
        './vocabulary/encorp_en_to_hi/hi_encorp_vocab.txt',
    )
    # English first, then Hindi, matching the order of vocab_files.
    en_tokenizer, hi_tokenizer = [
        TransformerTokenizer(vocab_path, res_tokens=reserved_tokens)
        for vocab_path in vocab_files
    ]
    return en_tokenizer, hi_tokenizer

def prepare_en_in_batch(inputs):
    """Tokenise a batch of sentence pairs into model inputs and labels.

    Reads the notebook globals ``src_tokenizer``, ``target_tokenizer`` and
    ``max_tokens``.

    Args:
        inputs: An ``(english_batch, hindi_batch)`` pair.

    Returns:
        ``((en, hi_inputs), hi_labels)`` where ``hi_inputs`` is the Hindi
        token sequence without its last token and ``hi_labels`` is the same
        sequence without its first token (i.e. shifted by one position).
    """
    english_batch, hindi_batch = inputs

    # Source side: cap each row at max_tokens, then pad to a dense tensor.
    # (assumes tokenize() returns a ragged tensor - it must support
    # 2-D slicing and .to_tensor())
    source_ids = src_tokenizer.tokenize(english_batch)[:, :max_tokens].to_tensor()

    # Target side keeps one extra token so that both shifted views below
    # hold at most max_tokens tokens per row.
    target_ids = target_tokenizer.tokenize(hindi_batch)[:, :max_tokens + 1]
    decoder_inputs = target_ids[:, :-1].to_tensor()
    decoder_labels = target_ids[:, 1:].to_tensor()

    return (source_ids, decoder_inputs), decoder_labels

In [7]:
# Bind the tokenizers to notebook globals: prepare_en_in_batch looks up
# src_tokenizer / target_tokenizer by these exact names.
src_tokenizer, target_tokenizer = get_tokenizer()
# Build the batched training and validation pipelines from the CSV corpus.
train_encorp_ds, val_encorp_ds = create_dataset()

In [8]:
def test_en_in(train_batches, src_tokenizer, target_tokenizer):
    
    for (en_line, hi_line), label in train_batches.take(1):
        break
    #print('en_line\n', en_line, type(en_line))
    #print('hi_line\n', hi_line, type(hi_line))
    #print('label\n', label, type(label))

    en_tokens = src_tokenizer.detokenize(en_line.numpy())
    hi_tokens = target_tokenizer.detokenize(hi_line.numpy())

    print('pt line\n', en_tokens.numpy()[0].decode('utf-8'))
    #print('pt tokens\n', src_tokenizer.lookup(en_line.numpy()))
    print('en line\n', hi_tokens.numpy()[0].decode('utf-8'))
    #print('en tokens\n', target_tokenizer.lookup(hi_line.numpy()))
    label_tokens = target_tokenizer.detokenize(label.numpy())
    print('en label\n', label_tokens.numpy()[0].decode('utf-8'))
    #print('label tokens\n', target_tokenizer.lookup(label.numpy()))

    print(en_line.shape)
    print(hi_line.shape)
    print(label.shape)

In [9]:
# Smoke-test the pipeline: decode and print the first sentence pair of one training batch.
test_en_in(train_encorp_ds,src_tokenizer, target_tokenizer)

2023-07-06 06:43:31.571213: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_19' with dtype int64
	 [[{{node Placeholder/_19}}]]
2023-07-06 06:43:31.572101: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_15' with dtype int64
	 [[{{node Placeholder/_15}}]]


pt line
 what troubled me was that this topdown approach is still around
en line
 मझ यह परशानी थी कि यह ऊपर स नीच की सोच अब भी मौजद ह ।
en label
 मझ यह परशानी थी कि यह ऊपर स नीच की सोच अब भी मौजद ह ।
(8, 90)
(8, 93)
(8, 93)


In [10]:
from train import TransformerTraining

# Configure the training wrapper for the English -> Hindi model and compile it.
training = TransformerTraining(src_tokenizer, 
                               target_tokenizer, 
                               num_layers=4, 
                               # NOTE(review): `d_mode` may be intended as `d_model` - confirm
                               # against the TransformerTraining signature in train.py.
                               d_mode=512, 
                               dff=2048,
                               num_heads=8, 
                               dropout_rate=0.1, 
                               num_epochs=10,
                               # NOTE(review): a fractional value here is unusual; presumably a
                               # fraction of the dataset per epoch - confirm in train.py.
                               steps_per_epochs=0.1,
                               save_freq=5)
training.compile()

In [None]:
# Train on the training pipeline and validate on the validation pipeline.
# NOTE(review): the recorded output of this cell shows `loss: nan` during
# epoch 1, so the run that produced it diverged - investigate the loss /
# learning-rate setup and the input data before trusting the model.
training.fit(train_encorp_ds, val_encorp_ds)

2023-07-06 06:43:48.836191: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_19' with dtype int64
	 [[{{node Placeholder/_19}}]]
2023-07-06 06:43:48.837407: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_16' with dtype int64
	 [[{{node Placeholder/_16}}]]


Epoch 1/10


2023-07-06 06:44:17.873073: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-07-06 06:44:18.343453: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-07-06 06:44:18.838525: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fac9401e5e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-06 06:44:18.838644: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-07-06 06:44:18.957754: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-06 06:44:19.811729: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifeti

 271/2738 [=>............................] - ETA: 18:26 - loss: nan - masked_accuracy: 0.0403

In [None]:
from model.translator import TransformerTranslator
import tensorflow as tf

# Wrap the trained model for inference.
# NOTE(review): the tokenizers are passed as (target, source) - confirm this
# matches the order TransformerTranslator expects.
translator = TransformerTranslator((target_tokenizer, src_tokenizer), training.get_model())

In [None]:
def print_translation(sentence, tokens, ground_truth):
    """Print the input, the model's translation and the reference side by side.

    Args:
        sentence: The source sentence that was translated.
        tokens: The translated text as an object whose ``.numpy()`` returns
            UTF-8 encoded bytes.
        ground_truth: The reference translation.
    """
    # Every label is padded to 15 characters and followed by a single colon
    # so the three rows line up.
    print(f'{"Input":15s}: {sentence}')
    print(f'{"Prediction":15s}: {tokens.numpy().decode("utf-8")}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [None]:
# Example 1: translate an English sentence and compare it with a reference.
sentence = 'politicians do not have permission to do what'
# NOTE(review): this reference appears to be cut off mid-word - confirm the intended text.
ground_truth = 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह क'

# The translator returns the decoded text, the token sequence and the attention weights.
translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

In [None]:
# Example 2: a short greeting.
# NOTE(review): create_dataset lowercases the training text, but this input
# is mixed-case - confirm the tokenizer/translator lowercases its input.
sentence = 'Hello How are you'
ground_truth = 'नमस्ते, आप कैसे हैं'

# The translator returns the decoded text, the token sequence and the attention weights.
translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)