In [2]:
!pip install datasets transformers[sentencepiece] sacrebleu -q


In [3]:
!pip install datasets




In [None]:
import os
os.kill(os.getpid(), 9)  # Restart the runtime


In [4]:
import datasets
print("Datasets library version:", datasets.__version__)



Datasets library version: 3.2.0


In [5]:
# Ideally, you should group imports from the same library like this:
import os
import sys

import tensorflow as tf
import transformers
from datasets import load_dataset
from transformers import (AdamWeightDecay, AutoTokenizer, DataCollatorForSeq2Seq,
                          TFAutoModelForSeq2SeqLM)

In [6]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

In [7]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [9]:
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

###Preprocessing on dataset

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [11]:
tokenizer("hey , lets go")

{'input_ids': [74, 667, 44, 2, 446, 16, 411, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
tokenizer(["Hello, this is a introduction.", "This is another sintroduction."])

{'input_ids': [[12110, 2, 90, 23, 19, 19394, 3, 0], [239, 23, 414, 946, 9765, 10790, 18057, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[26618, 16155, 346, 33383, 0]], 'attention_mask': [[1, 1, 1, 1, 1]]}




In [14]:
max_input_length = 1000
max_target_length = 1000

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
preprocess_function(raw_datasets["train"][:2])

{'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0], [32643, 28541, 36253, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0], [26618, 16155, 346, 33383, 0]]}

In [16]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [17]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [41]:
batch_size = 32
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 85

In [42]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [43]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [44]:
!pip install datasets



In [45]:
from datasets import load_dataset
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [46]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [47]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [48]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [49]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=85)

Epoch 1/85
Epoch 2/85
Epoch 3/85
Epoch 4/85
Epoch 5/85
Epoch 6/85
Epoch 7/85
Epoch 8/85
Epoch 9/85
Epoch 10/85
Epoch 11/85
Epoch 12/85
Epoch 13/85
Epoch 14/85
Epoch 15/85
Epoch 16/85
Epoch 17/85
Epoch 18/85
Epoch 19/85
Epoch 20/85
Epoch 21/85
Epoch 22/85
Epoch 23/85
Epoch 24/85
Epoch 25/85
Epoch 26/85
Epoch 27/85
Epoch 28/85
Epoch 29/85
Epoch 30/85
Epoch 31/85
Epoch 32/85
Epoch 33/85
Epoch 34/85
Epoch 35/85
Epoch 36/85
Epoch 37/85
Epoch 38/85
Epoch 39/85
Epoch 40/85
Epoch 41/85
Epoch 42/85
Epoch 43/85
Epoch 44/85
Epoch 45/85
Epoch 46/85
Epoch 47/85
Epoch 48/85
Epoch 49/85
Epoch 50/85
Epoch 51/85
Epoch 52/85
Epoch 53/85
Epoch 54/85
Epoch 55/85
Epoch 56/85
Epoch 57/85
Epoch 58/85
Epoch 59/85
Epoch 60/85
Epoch 61/85
Epoch 62/85
Epoch 63/85
Epoch 64/85
Epoch 65/85
Epoch 66/85
Epoch 67/85
Epoch 68/85
Epoch 69/85
Epoch 70/85
Epoch 71/85
Epoch 72/85
Epoch 73/85
Epoch 74/85
Epoch 75/85
Epoch 76/85
Epoch 77/85
Epoch 78/85
Epoch 79/85
Epoch 80/85
Epoch 81/85
Epoch 82/85
Epoch 83/85
Epoch 84/85
E

<tf_keras.src.callbacks.History at 0x7f1c2c3633d0>

In [50]:
model.save_pretrained("tf_model/")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}


In [59]:
tokenizer.save_pretrained("tf_model/")

('tf_model/tokenizer_config.json',
 'tf_model/special_tokens_map.json',
 'tf_model/vocab.json',
 'tf_model/source.spm',
 'tf_model/target.spm',
 'tf_model/added_tokens.json')

###testing

In [51]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [57]:
input_text  = "My name is Raj and I live in Delhi. I go to the park every morning and do yoga. I love reading books and listening to music. In my family, I have my parents and a younger sister. We all eat dinner together in the evening and share our day's experiences. I am very content with my life."

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=1000)
print(out)

tf.Tensor(
[[61949   500   179  1686   130     5     9   104 36034    11  1081   743
     40   104  2911 17429    11   273   254     9  3549 12433   260   273
    254    40   104    63   823     6   116  5223     9  4985  2542     6
     39   176   634   161   254    40     2   104    63   989    69 17254
      9  1775  1699    18   116  2510 58196   161   254     2     9  4330
     24    63   183     6  2538     6   116  4941   161   254    40     0]], shape=(1, 72), dtype=int32)


In [58]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

मेरा नाम राजी है और मैं दिल्ली में रहता हूं। मैं सुबह पार्क में जाता हूँ और गर्गा जाता हूँ। मैं अपने परिवार के साथ किताबें और संगीत सुनने के लिए बहुत प्यार करता हूँ।, मैं अपने माता-पिता और छोटे बहन को साथ मिलकर डिनर करता हूँ, और शाम का अपने दिन के अनुभव के साथ साझा करता हूँ।


In [60]:
# Save both model and tokenizer
save_directory = "english_hindi_translator"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}


('english_hindi_translator/tokenizer_config.json',
 'english_hindi_translator/special_tokens_map.json',
 'english_hindi_translator/vocab.json',
 'english_hindi_translator/source.spm',
 'english_hindi_translator/target.spm',
 'english_hindi_translator/added_tokens.json')

In [61]:
# In Colab, run:
!zip -r english_hindi_translator.zip tf_model/

  adding: tf_model/ (stored 0%)
  adding: tf_model/target.spm (deflated 60%)
  adding: tf_model/special_tokens_map.json (deflated 35%)
  adding: tf_model/tokenizer_config.json (deflated 68%)
  adding: tf_model/vocab.json (deflated 76%)
  adding: tf_model/tf_model.h5 (deflated 7%)
  adding: tf_model/config.json (deflated 61%)
  adding: tf_model/generation_config.json (deflated 43%)
  adding: tf_model/source.spm (deflated 51%)
