<h2>Import Libraries

In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorWithPadding,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import evaluate
import numpy as np
import random




<h2>Set Random Seeds for Reproducibility

In [2]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

<h2>Check CUDA Availability

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


<h2>Load the dataset

In [4]:
raw_datasets = load_dataset('wmt14', 'de-en')

<h2>Reduce Dataset Size to Fit RAM

In [5]:
# Determine actual dataset sizes
train_dataset_size = len(raw_datasets['train'])
validation_dataset_size = len(raw_datasets['validation'])

# Adjust sample sizes based on hardware constraints
desired_train_samples = 50000  # Adjust as needed
desired_validation_samples = 3000  # Use full validation set if possible

train_max_samples = min(desired_train_samples, train_dataset_size)
validation_max_samples = min(desired_validation_samples, validation_dataset_size)

# Select subsets
raw_datasets['train'] = raw_datasets['train'].select(range(train_max_samples))
raw_datasets['validation'] = raw_datasets['validation'].select(range(validation_max_samples))

print(f"Training samples: {len(raw_datasets['train'])}")
print(f"Validation samples: {len(raw_datasets['validation'])}")

Training samples: 50000
Validation samples: 3000


<h2>Model and Tokenizer Initialization

In [6]:
model_checkpoint = 'Helsinki-NLP/opus-mt-de-en'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model.to(device)



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

<h2>Preprocess the Data

In [7]:
source_lang = 'de'
target_lang = 'en'

def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples['translation']]
    targets = [ex[target_lang] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

<h2>Data Collator with Padding

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

<h2>Define Evaluation Metrics

In [9]:
metric = evaluate.load('sacrebleu')

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]  # SacreBLEU expects a list of references
    return preds, labels

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    if isinstance(logits, tuple):
        logits = logits[0]
    decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = result['score']
    return {'bleu': bleu_score}


<h2>Set Training Arguments with Optimizations

In [10]:
batch_size = 16  # Adjusted higher to improve training (if VRAM allows)
gradient_accumulation_steps = 2  # Simulate a larger batch size

num_train_epochs = 6  # Increased epochs for better learning
learning_rate = 3e-5  # Slightly reduced learning rate

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    lr_scheduler_type='linear',
    warmup_steps=500,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # Increased steps to simulate larger batch size
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='bleu',
    greater_is_better=True,
    report_to=[],
)



<h2>Initialize the Trainer

In [11]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


<h2>Train the Model

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,1.4299,1.480435,30.436042
2,1.3294,1.519053,29.870248
3,1.2496,1.548196,29.01853
4,1.1935,1.573558,28.479676
5,1.1091,1.597771,28.285297


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=9372, training_loss=1.2425728072175806, metrics={'train_runtime': 6939.5943, 'train_samples_per_second': 43.23, 'train_steps_per_second': 1.351, 'total_flos': 4854561536212992.0, 'train_loss': 1.2425728072175806, 'epoch': 5.99648})

<h2>Evaluate the Best Model

In [13]:
results = trainer.evaluate()
print(f"Final BLEU score: {results['eval_bleu']:.2f}")

Final BLEU score: 30.44


<h2>Test the Model with Custom Sentences

In [14]:
def translate(text):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            max_length=128,
            num_beams=5,  # Using beam search for better translations
            early_stopping=True,
        )
    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translation

test_sentences = [
    "Das Wetter heute ist schön.",
    "Ich lerne gerne neue Sprachen.",
    "Künstliche Intelligenz revolutioniert die Technologiebranche.",
    "Maschinelles Lernen hat enormes Potenzial.",
    "Die globale Zusammenarbeit ist entscheidend für den Fortschritt.",
]

for sentence in test_sentences:
    translated = translate(sentence)
    print(f"German: {sentence}")
    print(f"English: {translated}\n")

German: Das Wetter heute ist schön.
English: The weather today is fine.

German: Ich lerne gerne neue Sprachen.
English: I like to learn new languages.

German: Künstliche Intelligenz revolutioniert die Technologiebranche.
English: Artificial intelligence is revolutionising the technology industry.

German: Maschinelles Lernen hat enormes Potenzial.
English: Machine learning has enormous potential.

German: Die globale Zusammenarbeit ist entscheidend für den Fortschritt.
English: Global cooperation is crucial to progress.



<h2>Save the Fine-Tuned Model

In [15]:
trainer.save_model('fine-tuned-opus-mt-de-en')
tokenizer.save_pretrained('fine-tuned-opus-mt-de-en')

('fine-tuned-opus-mt-de-en\\tokenizer_config.json',
 'fine-tuned-opus-mt-de-en\\special_tokens_map.json',
 'fine-tuned-opus-mt-de-en\\vocab.json',
 'fine-tuned-opus-mt-de-en\\source.spm',
 'fine-tuned-opus-mt-de-en\\target.spm',
 'fine-tuned-opus-mt-de-en\\added_tokens.json')