# Experiment : LLM: mBART

**Author:** Gloria Isedu

**Description:** Experiments to fine-tune and use a pre-trained mBART.

**References:** https://huggingface.co/docs/transformers/model_doc/marian



In [1]:
%pip install sentencepiece sacremoses Cython



In [2]:
from transformers import MBartTokenizer, MBart50Tokenizer, TFMBartForConditionalGeneration, MBartForConditionalGeneration, TFBartForConditionalGeneration, AdamWeightDecay

In [1]:
# from transformers import MBartTokenizer, TFBartForConditionalGeneration
from transformers import MBartTokenizer, MBart50Tokenizer, TFMBartForConditionalGeneration, MBartForConditionalGeneration, TFBartForConditionalGeneration, AdamWeightDecay
import tensorflow as tf
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# def predict_and_compare(index, testX, testY, model, tokenizer, max_output_length=5):
#     """ Predicts translation for a given index in the test set and compares with the ground truth. """
#     input_seq = testX[index:index+1]

#     # Determine the total max_length (input length + desired output length)
#     total_max_length = len(input_seq[0]) + max_output_length
#     prediction = model.generate(input_seq, max_length=total_max_length, no_repeat_ngram_size=2)

#     # Decode the prediction and input
#     input_text = tokenizer.decode(input_seq[0], skip_special_tokens=True)
#     predicted_text = tokenizer.decode(prediction[0], skip_special_tokens=True)

#     # For ground truth
#     ground_truth_text = tokenizer.decode(testY[index], skip_special_tokens=True)

#     # Return results
#     return input_text, predicted_text, ground_truth_text


if __name__ == '__main__':

#     gpus = tf.config.experimental.list_physical_devices('GPU')
#     if gpus:
#         try:
#             # Set TensorFlow to use only one GPU
#             tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

#             # Enable memory growth
#             tf.config.experimental.set_memory_growth(gpus[0], True)

#             print("Using GPU:", gpus[0])
#         except RuntimeError as e:
#             # Memory growth must be set at program startup
#             print("RuntimeError:", e)
#     else:
#         raise SystemError("GPU device not found")

#     # --- 2. We define the global variable ---


    BATCH_SIZE = 2#16
    EPOCHS = 3#10
    VALIDATION_SPLIT = 0.2

    # --- 3. We open the data and apply tokenization, with data generator ---

    df = pd.read_csv('preprocessed_data.csv')
    df = df[:121]
    source_texts = df['en_tokens'].to_list()
    target_texts = df['fr_tokens'].to_list()

    source_train, source_val, target_train, target_val = train_test_split(source_texts, target_texts, test_size=VALIDATION_SPLIT, random_state=42)
    # print(source_train)
    # # We extract the test set first
    # train_df, test_df = train_test_split(df, test_size=VALIDATION_SPLIT)

    # Tokenize and pad  sequences using tokenizer
    tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", max_length=512)
    tokenized_trainX = tokenizer(source_train, return_tensors="tf", padding=True, truncation=True, max_length=512)
    tokenized_trainY = tokenizer(target_train, return_tensors="tf", padding=True, truncation=True, max_length=512)
    tokenized_testX = tokenizer(source_val, return_tensors="tf", padding=True, truncation=True, max_length=512)
    tokenized_testY = tokenizer(target_val, return_tensors="tf", padding=True, truncation=True, max_length=512)


    # Create TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_trainX), tokenized_trainY['input_ids']))
    val_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_testX), tokenized_testY['input_ids']))

    # Batch and shuffle the datasets
    train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(buffer_size=len(source_train))
    val_dataset = val_dataset.batch(BATCH_SIZE)

    # Load pretrained mBART model
    model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

    # Adjust the model for conditional generation (translation)
    model.resize_token_embeddings(len(tokenizer))

    # Set up optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    # model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy")
    model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

    # Fine-tune the model with validation
    model.trainable = True
    epochs = 3
    # Assuming tokenized_trainX and tokenized_trainY are BatchEncoding objects
    input_ids_trainX = tokenized_trainX["input_ids"].numpy()
    attention_mask_trainX = tokenized_trainX["attention_mask"].numpy()
    input_ids_trainY = tokenized_trainY["input_ids"].numpy()

    # Train the model
    model.fit(
        x={"input_ids": input_ids_trainX, "attention_mask": attention_mask_trainX},
        y=input_ids_trainY,
        epochs=3,
        batch_size=1
    )
    # model.fit(
    # x=tokenized_trainX,
    # y=tokenized_trainY["input_ids"],
    # epochs=3,
    # batch_size=2,
    # validation_split=0.2,
    # # optimizer=optimizer,
    # )

    # # Iterate over the datasets
    # for epoch in range(EPOCHS):
    #     # Training loop
    #     for batch in train_dataset:
    #         with tf.GradientTape() as tape:
    #             outputs = model(**batch[0], labels=batch[1])
    #             loss = outputs.loss
    #         gradients = tape.gradient(loss, model.trainable_variables)
    #         optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    #     # Validation loop
    #     for batch in val_dataset:
    #         val_outputs = model(**batch[0], labels=batch[1])
    #         val_loss = val_outputs.loss

    #     # Print accuracy
    #     print(f"Epoch {epoch + 1}, Training Loss: {loss.numpy()}, Validation Loss: {val_loss.numpy()}")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All PyTorch model weights were used when initializing TFMBartForConditionalGeneration.

All the weights of TFMBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMBartForConditionalGeneration for predictions without further training.


Epoch 1/3


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1674, in train_step
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 2454, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/backend.py", line 5775, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(1, 278) and logits.shape=(1, 265, 250054)


In [None]:

model = MBartForConditionalGeneration.from_pretrained(model_name)

# Example translation
text = "Hello, how are you?"
input_ids = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
output_ids = model.generate(input_ids)
translation = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Input:", text)
print("Translation:", translation)




import tensorflow as tf
from transformers import MBartTokenizer, TFBartForConditionalGeneration
from transformers import AdamWeightDecay

# Example text data
source_texts = ["This is an example sentence.", "Translate this sentence."]
target_texts = ["C'est une phrase exemple.", "Traduisez cette phrase."]

# Tokenize and pad the sequences
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenized_inputs = tokenizer(source_texts, return_tensors="tf", padding=True, truncation=True)
tokenized_outputs = tokenizer(target_texts, return_tensors="tf", padding=True, truncation=True)

# Load pretrained mBART model for TensorFlow
model = TFBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Adjust the model for conditional generation (translation)
model.resize_token_embeddings(len(tokenizer))

# Set up optimizer
optimizer = AdamWeightDecay(model.parameters, learning_rate=5e-5)

# Fine-tune the model
model.train()
for epoch in range(3):  # Adjust the number of epochs as needed
    with tf.GradientTape() as tape:
        outputs = model(**tokenized_inputs, labels=tokenized_outputs['input_ids'])
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(f"Epoch {epoch + 1}, Loss: {loss.numpy()}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_mbart_translation_tf")



In [None]:
# T5
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pretrained T5 model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Example translation
text = "Translate the following sentence: Hello, how are you?"
input_ids = tokenizer.encode("translate English to French: " + text, return_tensors="pt", max_length=1024, truncation=True)
output_ids = model.generate(input_ids)
translation = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Input:", text)
print("Translation:", translation)
