In [2]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Hub identifier of the pretrained checkpoint; later cells pass it to both
# AutoTokenizer.from_pretrained and TFAutoModelForSeq2SeqLM.from_pretrained.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

In [None]:
# Load the `cfilt/iitb-english-hindi` dataset. The result is a DatasetDict with
# train / validation / test splits, each holding a single `translation` column.
raw_datasets = load_dataset("cfilt/iitb-english-hindi")

In [5]:
# Show the available splits, their columns and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [6]:
# Look at one training example: a `translation` dict keyed by language code
# ('en' for the English text, 'hi' for the Hindi text).
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

In [7]:
# Load the tokenizer that ships with the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# Sanity check: tokenize a single English sentence. The result contains the
# token ids (`input_ids`) and a matching `attention_mask`.
tokenizer("Hello, this is a sentence!")

{'input_ids': [12110, 2, 90, 23, 19, 8800, 61, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Passing the text through `text_target=` tokenizes it as target-language text
# (Hindi here; English is the source language). This replaces the deprecated
# `tokenizer.as_target_tokenizer()` context manager.
# The resulting ids are what the model is trained to produce, i.e. they serve
# as the labels for the corresponding source sentence.
print(tokenizer(text_target=["एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[26618, 16155, 346, 33383, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}


In [11]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        examples: A batch of dataset rows; ``examples["translation"]`` is a
            list of dicts keyed by language code (``source_lang`` and
            ``target_lang`` are read from each dict).

    Returns:
        The tokenizer output for the source sentences, extended with a
        ``"labels"`` entry that holds the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    # Source side: truncated to max_input_length tokens.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: `text_target=` tokenizes the sentences as target-language
    # text and replaces the deprecated `as_target_tokenizer()` context manager.
    # A separate call keeps the target length limit independent of the source one.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# The function returns the tokenized inputs together with the tokenized labels;
# mapping it over the dataset adds them as new columns next to `translation`.

In [13]:
# Run `preprocess_function` over every split. With batched=True the function
# receives a batch of examples per call instead of a single example.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

                                                                        

In [16]:
# Confirm that each split now carries `input_ids`, `attention_mask` and `labels`
# in addition to the original `translation` column.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2507
    })
})

In [14]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Load the pretrained TensorFlow seq2seq model from the same checkpoint as the
# tokenizer; this is the starting point for fine-tuning.

Downloading tf_model.h5: 100%|██████████| 306M/306M [00:12<00:00, 24.5MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.
Downloading generation_config.json: 100%|██████████| 293/293 [00:00<?, ?B/s] 


In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# Collator that assembles individual tokenized examples into one batch and
# returns TensorFlow tensors (return_tensors="tf"). Per the transformers
# documentation it pads inputs and labels dynamically within each batch; the
# batching itself is done later by `prepare_tf_dataset`.

In [17]:
# Fine-tuning hyperparameters used by the dataset and optimizer cells.
batch_size = 16         # examples per batch, passed to prepare_tf_dataset
learning_rate = 2e-5    # passed to AdamWeightDecay
weight_decay = 0.01     # passed to AdamWeightDecay as weight_decay_rate
num_train_epochs = 1    # number of training epochs

In [24]:
# Fine-tune on a small random sample of the *train* split instead of the test
# split. The full train split (~1.66M rows) is too large for a quick run, but
# training on "test" would leave no untouched data for a final evaluation.
max_train_samples = 2500   # roughly the size of the test split (2507 rows)
train_subset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)                    # fixed seed: same subset on every run
    .select(range(max_train_samples))    # keep only the first N shuffled rows
)

train_dataset = model.prepare_tf_dataset(
    train_subset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [19]:
# Evaluation batches are built from the validation split and kept in a fixed
# order (shuffle=False); batch size and collator match the training setup.
validation_split = tokenized_datasets["validation"]
validation_dataset = model.prepare_tf_dataset(
    validation_split,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [21]:
# Adam with decoupled weight decay, configured from the hyperparameter cell.
optimizer = AdamWeightDecay(
    learning_rate=learning_rate,
    weight_decay_rate=weight_decay,
)
# No `loss` argument is passed to compile() here.
# NOTE(review): presumably the model falls back to its built-in loss — confirm.
model.compile(metrics=["accuracy"], optimizer=optimizer)

In [25]:
# Take the epoch count from `num_train_epochs` (hyperparameter cell) instead of
# a hard-coded literal, so the configured value actually controls training.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)



<keras.callbacks.History at 0x23a003b0f08>

In [26]:
# Write the fine-tuned model to the local directory "en_hi_translator/".
# NOTE(review): only the model is saved here; the tokenizer is not written to
# this directory.
model.save_pretrained("en_hi_translator/")

In [6]:
# Reload for inference: the tokenizer comes from the original checkpoint id,
# the model from the local directory written by `save_pretrained`.
# NOTE(review): `model_checkpoint` must already be defined in the running
# kernel — confirm when executing this cell in a fresh session.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("en_hi_translator/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at en_hi_translator/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [10]:
input_text = "My name Shiva. I am aspiring to become a doctor"

# Encode the sentence as a one-element batch of NumPy arrays, then let the
# model generate output token ids; max_length=128 caps the generated sequence.
tokenized = tokenizer([input_text], return_tensors="np")
out = model.generate(max_length=128, **tokenized)
print(out)

tf.Tensor(
[[61949   500   179 40095     3   104    38  4891  2671     6    39    38
  10760  5566   254     0 61949]], shape=(1, 17), dtype=int32)


In [11]:
# Turn the generated ids back into text, dropping special tokens.
# Decoding does not need the deprecated `as_target_tokenizer()` context
# manager, which only affects how text is encoded.
print(tokenizer.decode(out[0], skip_special_tokens=True))

मेरा नाम शिवा. मैं एक डॉक्टर बनने के लिए एक उत्साहपूर्ण हूँ
