In [1]:
# Cell 1: Load the OpenSubtitles English-French Dataset
from datasets import load_dataset
import torch
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
# Load the dataset
dataset = load_dataset("opus100", "en-fr")

# Inspect the dataset
print(dataset)





README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


test-00000-of-00001.parquet:   0%|          | 0.00/327k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/334k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Cell 2: Preprocess the Data
from transformers import MarianTokenizer

# Load the tokenizer
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')

# Define source and target languages
SRC_LANG = "en"
TGT_LANG = "fr"

# Preprocessing function
def preprocess_function(examples):
    inputs = [ex[SRC_LANG] for ex in examples['translation']]
    targets = [ex[TGT_LANG] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)




Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
# Cell 3: Set Up Evaluation Metrics
import evaluate

# Load BLEU metric
bleu = evaluate.load('sacrebleu')

# Metric computation function
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]
    result = bleu.compute(predictions=decoded_preds, references=labels)
    return {"bleu": result["score"]}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [9]:
del model

In [5]:
# Cell 4: Load the Pre-trained MarianMT Model
from transformers import MarianMTModel

# Load the model
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr')


In [8]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")


Total parameters: 75133952
Trainable parameters: 74609664


In [None]:
# Cell 5: Evaluate the Base Model with GPU


# Check if GPU is available


# Split the data
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=24,
    predict_with_generate=True,
    dataloader_pin_memory=True,  # Ensures better performance with GPUs
)
small_train_dataset = train_dataset.shuffle(seed=42).select(range(30000))
small_eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000))
# Use a data collator to handle padding during evaluation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model.to(device),  # Send the model to GPU
    args=training_args,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Evaluate the base model
base_metrics = trainer.evaluate()
print(f"Base Model BLEU Score: {base_metrics['eval_bleu']:.2f}")


In [None]:
del model

In [None]:
torch.cuda.empty_cache()


In [None]:
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr')

In [None]:
# Cell 6: Fine-tune the Model with Proper Padding
from transformers import DataCollatorForSeq2Seq

# Update training arguments for training
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-9,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=2,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    dataloader_pin_memory=True,  # Helps with GPU training efficiency
)

# Use a data collator to handle padding during training and evaluation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Re-initialize the trainer with training dataset and data collator
trainer = Seq2SeqTrainer(
    model=model.to("cuda"),  # Move model to GPU
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning
trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss




Epoch,Training Loss,Validation Loss,Bleu
1,1.4883,1.343655,38.690481
2,1.4873,1.341634,38.738641


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=14064, training_loss=1.488615716819199, metrics={'train_runtime': 7013.6143, 'train_samples_per_second': 256.644, 'train_steps_per_second': 2.005, 'total_flos': 5.09732086307881e+16, 'train_loss': 1.488615716819199, 'epoch': 2.0})