In [1]:
from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

In [2]:
!pip install -q transformers[sentencepiece]
!pip install -q datasets
!pip install -q sacrebleu rouge_score
!pip install -q py7zr
!pip install -q --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install -q transformers accelerate
!pip install -q sacremoses
!pip install -q tensorflow

[0mFound existing installation: transformers 4.38.1
Uninstalling transformers-4.38.1:
  Successfully uninstalled transformers-4.38.1
Found existing installation: accelerate 0.27.2
Uninstalling accelerate-0.27.2:
  Successfully uninstalled accelerate-0.27.2
[0m

In [3]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

import warnings
warnings.filterwarnings('ignore')

2024-02-25 14:11:52.478541: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-25 14:11:52.478606: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-25 14:11:52.479775: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-25 14:11:52.486979: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Check if a GPU is available and set the device accordingly
device_name = "gpu" if tf.config.list_physical_devices('GPU') else "cpu"
print(device_name)

gpu


In [5]:
# Model: https://huggingface.co/Helsinki-NLP/opus-mt-en-hi
model = 'Helsinki-NLP/opus-mt-en-hi'

# Dataset: https://huggingface.co/datasets/cfilt/iitb-english-hindi
dataset = load_dataset('cfilt/iitb-english-hindi')

In [6]:
# train, few examples
print('Train, one example ---------')
display(dataset['train'][0])

print('\nValidation, one example ---------')
display(dataset['validation'][0])

print('\nTest, one example ---------')
display(dataset['test'][0])

Train, one example ---------


{'translation': {'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


Validation, one example ---------


{'translation': {'en': 'Students of the Dattatreya city Municipal corporation secondary school demonstrated their imagination power by creating the fictitious fort "Duttgarh".',
  'hi': "महानगर पालिका अंतर्गत दत्तात्रय नगर माध्यमिक स्कूल के विद्यार्थियों ने काल्पनिक किला 'दत्तगढ़' बनाकर अपनी कल्पनाशक्ति का परिचय दिया।"}}


Test, one example ---------


{'translation': {'en': 'A black box in your car?',
  'hi': 'आपकी कार में ब्लैक बॉक्स?'}}

In [7]:
# Initialize the tokenizer with a pre-trained model.
# This prepares the tokenizer to process text according to the model's training.
tokenizer = AutoTokenizer.from_pretrained(model)

# Directly tokenize the input sentence without specifying target tokenizer context.
# This uses the tokenizer in its default mode.
tokenizer(['चाय पार्टी भौचक्की है।'])

{'input_ids': [[44, 4228, 260, 1321, 44, 2703, 260, 428, 1185, 2136, 130, 44, 1, 1185, 716, 130, 44, 1, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [8]:
# Temporarily set the tokenizer to act as a target tokenizer.
# This is specific to sequence-to-sequence tasks.
# Tokenize the input sentence and print the tokenized output.
with tokenizer.as_target_tokenizer():
    print(tokenizer(['चाय पार्टी भौचक्की है।']))

{'input_ids': [[15889, 9384, 61313, 130, 5, 40, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}


In [9]:
# Define maximum input and target sequence lengths.
max_input_length = 128
max_target_length = 128

# Specify the source and target languages for translation.
source_lang = 'en'
target_lang = 'hi'

def process_function(sentence):
  # Extract the input sentences (in the source language) from the provided data structure.
  # The data structure is expected to be a dictionary with a key 'translation' that contains a list of dictionaries,
  # each having keys corresponding to language codes ('en' for English, 'hi' for Hindi).
  inputs = [_[source_lang] for _ in sentence['translation']]

  # Extract the target sentences (in the target language) in a similar manner.
  targets = [_[target_lang] for _ in sentence['translation']]

  # Tokenize the input sentences. This uses the tokenizer in its default configuration,
  # applying truncation to limit the tokenized inputs to 'max_input_length'.
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

  # Temporarily switch the tokenizer to target tokenizer mode.
  # This is useful for sequence-to-sequence tasks where the target text might need different tokenization.
  with tokenizer.as_target_tokenizer():
    # Tokenize the target sentences, applying truncation to limit them to 'max_target_length'.
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

  # Add the tokenized targets (input IDs) as 'labels' to the model inputs.
  # This is typically used for training sequence-to-sequence models, where the labels are the expected outputs.
  model_inputs['labels'] = labels['input_ids']

  # Return the prepared model inputs, which now include both the tokenized inputs and the corresponding labels.
  return model_inputs

In [10]:
# Apply the 'process_function' to the entire dataset to tokenize and format it for training.
# The 'batched=True' argument indicates that the 'process_function'
# should be applied to batches of the dataset rather than individual items.

tokenized_data = dataset.map(process_function, batched=True)
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2507
    })
})

In [11]:
# Load a pre-trained sequence-to-sequence language model using TensorFlow.
# This model is suitable for tasks like translation, summarization, etc.
# Explicitly open a device scope and place the model on the specified device (GPU or CPU)
with tf.device(device_name):
    model = TFAutoModelForSeq2SeqLM.from_pretrained(model)

# Set the training parameters.
batch_size = 32          # Number of samples per batch to be passed to the model during training.
learning_rate = 0.001    # Learning rate for the optimizer.
weight_decay = 0.01      # Weight decay for regularization.
num_train_epoch = 1      # Number of epochs to train the model.

# Create a data collator that will dynamically batch and preprocess the data.
# The data collator ensures that the data fed to the model is properly formatted and tensorized.
# return_tensors='tf'; Specifies that the data should be returned as TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer,
                                       model=model,
                                       return_tensors='tf')

# Prepare the training dataset by applying the data collator.
# This will format the tokenized data for training, including batching and optional shuffling.
train_dataset = model.prepare_tf_dataset(
    tokenized_data['train'],       # The tokenized training data.
    batch_size=batch_size,         # The batch size to use when preparing the dataset.
    shuffle=True,                  # Indicates that the data should be shuffled.
    collate_fn=data_collator       # The function to collate data samples into batches.
)

# Prepare the validation dataset in a similar manner to the training dataset.
# This dataset is used to evaluate the model's performance on unseen data during training.
validation_dataset = model.prepare_tf_dataset(
    tokenized_data['validation'],  # The tokenized validation data.
    batch_size=batch_size,         # The batch size to use when preparing the dataset.
    shuffle=True,                  # Indicates that the data should be shuffled.
    collate_fn=data_collator       # The function to collate data samples into batches.
)

# Initialize the optimizer with the Adam algorithm and weight decay regularization.
# 'learning_rate' and 'weight_decay' are hyperparameters that control the learning process.
optimizer = AdamWeightDecay(learning_rate=learning_rate,
                            weight_decay_rate=weight_decay)

# Compile the model with the specified optimizer.
# This step prepares the model for training by setting up the backpropagation process.
model.compile(optimizer=optimizer)

# Train the model on the training dataset while also evaluating on the validation dataset.
# 'num_train_epoch' specifies the number of times the model will go through the entire training dataset.
model.fit(train_dataset,
          validation_data=validation_dataset,
          epochs=num_train_epoch)

# After training, save the model's weights and configuration to a directory.
# This allows the model to be reloaded or deployed later without retraining.
model.save_pretrained('tf_model/')

2024-02-25 14:12:04.275855: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22287 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4090, pci bus id: 0000:01:00.0, compute capability: 8.9
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.




Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


In [12]:
# Model inferencing
input_text = 'i am learning coding. how about your learning?'

tokenized = tokenizer([input_text], return_tensors='np')
print(f'Tokenized sentence: {tokenized}')

output = model.generate(**tokenized, max_length=128)

# Use the tokenizer as a target tokenizer to decode the output
# and set 'skip_special_tokens=True' to remove padding or any special tokens.
with tokenizer.as_target_tokenizer():
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f'Output in target language: {decoded_output}')

Tokenized sentence: {'input_ids': array([[5556,  489, 3729, 7308,  232,  137,    3,  287,  195,   85, 3729,
          22,    0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Output in target language: 
