In [1]:
!pip install -qq torch datasets transformers sacrebleu

# loading the dataset and inspecting it:

In [2]:
from datasets import load_dataset

# Dataset identifier and language-pair configuration passed to load_dataset.
WMT_NAME, WMT_PAIR = "wmt17", "de-en"
dataset = load_dataset(WMT_NAME, WMT_PAIR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Check which container type load_dataset handed back.
container_type = type(dataset)
print(container_type)

<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
# Show the split names stored in the dataset container.
split_keys = dataset.keys()
print("keys:", split_keys)

keys: dict_keys(['train', 'validation', 'test'])


In [5]:
# Look at the first training record to see how a translation pair is laid out.
first_example = dataset["train"][0]
print(first_example)

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}


In [6]:
# Report the number of examples per split, in train / validation / test order.
for split_name in ("train", "validation", "test"):
    print(len(dataset[split_name]))

5906184
2999
3004


In [7]:
# Work with a fixed random sample of the training split: shuffle with a fixed
# seed, then keep the first TRAIN_SUBSET_SIZE examples.
TRAIN_SUBSET_SIZE = 10000
SUBSET_SEED = 1
shuffled_train = dataset["train"].shuffle(seed=SUBSET_SEED)
train = shuffled_train.select(range(TRAIN_SUBSET_SIZE))

# The validation and test splits are used in full.
validation, test = dataset["validation"], dataset["test"]

In [8]:
# Sanity-check the sampled training subset: its type, first record and size.
for summary in (type(train), train[0], len(train)):
    print(summary)

<class 'datasets.arrow_dataset.Dataset'>
{'translation': {'de': 'Für die Erstellung und Verbreitung amtlicher Statistiken ist in Bulgarien das Nationale Institut für Statistik (National Statistics Institute, NSI) zuständig.', 'en': 'The National Statistics Institute (NSI) is the body charged with producing and disseminating official statistics in Bulgaria.'}}
10000


# tokenization:
I'm using WordPiece tokenization.

`bert-base-uncased` adds [CLS] and [SEP] tokens. [CLS] works as SOS (start of sentence) and [SEP] works as EOS (end of sentence).

The maximum length is 128 tokens.
Longer sequences are truncated, shorter ones are padded, and an attention mask is created
to mark which positions are real tokens and which are padding.

In [9]:
from transformers import AutoTokenizer

# Sequence length used for padding and truncation in this notebook.
MAXLEN = 128

# Checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tok = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [10]:
def do_tokenize(texts):
    """Encode ``texts`` with the notebook-level tokenizer ``tok``.

    The tokenizer is asked to pad to, and truncate at, ``MAXLEN`` tokens and to
    return PyTorch tensors (``return_tensors="pt"``).

    Args:
        texts: Forwarded unchanged as the tokenizer's first argument (the
            notebook passes lists of sentences).

    Returns:
        Whatever ``tok`` returns for these options; the notebook reads its
        ``"input_ids"`` and ``"attention_mask"`` entries.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAXLEN,
        "return_tensors": "pt",
    }
    encoded = tok(texts, **encode_options)
    return encoded

def _side(split, lang):
    """Collect the ``lang`` sentence of every translation pair in ``split``."""
    return [pair["translation"][lang] for pair in split]


# NOTE(review): the same bert-base-uncased tokenizer is applied to both the
# German and the English side -- confirm this is intended for German.

# German (source) side: raw sentences first, then padded token tensors.
train_de, valid_de, test_de = [_side(split, "de") for split in (train, validation, test)]
train_de_tokens, valid_de_tokens, test_de_tokens = [
    do_tokenize(sentences) for sentences in (train_de, valid_de, test_de)
]

# English (target) side, processed the same way.
train_en, valid_en, test_en = [_side(split, "en") for split in (train, validation, test)]
train_en_tokens, valid_en_tokens, test_en_tokens = [
    do_tokenize(sentences) for sentences in (train_en, valid_en, test_en)
]

# Source-side attention masks (1 = keep, 0 = mask).
train_de_mask, valid_de_mask, test_de_mask = [
    encoded["attention_mask"]
    for encoded in (train_de_tokens, valid_de_tokens, test_de_tokens)
]

# Show the tensor shapes of the training encodings.
for caption, shown in (
    ("train german ids:", train_de_tokens["input_ids"]),
    ("train english ids:", train_en_tokens["input_ids"]),
    ("train german mask:", train_de_mask),
):
    print(caption, shown.shape)


train german ids: torch.Size([10000, 128])
train english ids: torch.Size([10000, 128])
train german mask: torch.Size([10000, 128])


In [11]:
# Show one training pair next to its token ids and attention masks.
idx = 0
example = train[idx]["translation"]
print("de text:", example["de"])
print("en text:", example["en"])
for lang_code, encoded_side in (("de", train_de_tokens), ("en", train_en_tokens)):
    print(f"{lang_code} token ids:", encoded_side["input_ids"][idx])
    print(f"{lang_code} mask:", encoded_side["attention_mask"][idx])


de text: Für die Erstellung und Verbreitung amtlicher Statistiken ist in Bulgarien das Nationale Institut für Statistik (National Statistics Institute, NSI) zuständig.
en text: The National Statistics Institute (NSI) is the body charged with producing and disseminating official statistics in Bulgaria.
de token ids: tensor([  101,  6519,  3280,  9413, 13473,  3363,  5575,  6151, 12034,  2890,
        28813,  2572, 19646, 17322,  2099, 28093,  2923, 17339,  2078, 21541,
         1999, 20934, 27887, 23144,  8695, 17360, 17126,  6519, 28093,  2923,
         5480,  1006,  2120,  6747,  2820,  1010, 24978,  2072,  1007, 16950,
        21515,  8004,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,   

# data loader

In [12]:
from torch.utils.data import TensorDataset, DataLoader

In [13]:
# Teacher-forcing tensors for the English (target) side:
#   decoder input = every position except the last (starts with [CLS])
#   labels        = every position except the first (the token that follows
#                   each decoder-input position)
# Rows are padded to MAXLEN, so the dropped last position is usually [PAD];
# it is [SEP] only for sentences that fill the whole window.
IGNORE_INDEX = -100        # label value the loss ignores
PAD_ID = tok.pad_token_id  # read from the tokenizer instead of hard-coding 0
BATCH_SIZE = 8

dec_in = train_en_tokens["input_ids"][:, :-1]
# clone() so the masking below does not modify the tokenizer output in place
labels = train_en_tokens["input_ids"][:, 1:].clone()
val_dec_in = valid_en_tokens["input_ids"][:, :-1]
val_labels = valid_en_tokens["input_ids"][:, 1:].clone()

# Replace PAD with IGNORE_INDEX so padded positions do not contribute to the loss.
labels[labels == PAD_ID] = IGNORE_INDEX
val_labels[val_labels == PAD_ID] = IGNORE_INDEX

train_dataset = TensorDataset(
    train_de_tokens["input_ids"],      # de sentences (to encoder)
    train_de_tokens["attention_mask"], # de mask (1 = real, 0 = pad)
    dec_in,                            # en shifted input (to decoder)
    labels                             # en shifted labels (for loss)
)
val_dataset = TensorDataset(
    valid_de_tokens["input_ids"],
    valid_de_tokens["attention_mask"],
    val_dec_in,
    val_labels
)

# Make batches using the data loader; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)




# training loop

In [14]:
from transformers import EncoderDecoderModel
import torch
import torch.nn as nn

In [15]:
# Seed PyTorch's global RNG before building the model: the decoder's
# cross-attention weights are newly initialised (see the warning printed below),
# so without a seed every run starts from different weights. Operations that
# draw from the global RNG later on (e.g. dropout, default DataLoader
# shuffling) become repeatable as well, as far as the hardware allows.
SEED = 1
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"

# BERT2BERT: the same checkpoint initialises both encoder and decoder.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",  # encoder
    "bert-base-uncased",  # decoder
)

# special tokens : pad=0, cls=101, sep=102
# [CLS] is used as the decoder start token and [SEP] as end-of-sequence.
model.config.pad_token_id = tok.pad_token_id
model.config.decoder_start_token_id = tok.cls_token_id
model.config.eos_token_id = tok.sep_token_id

model = model.to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-4)

def run_epoch(data_loader, train=True):
    """Run one pass over ``data_loader`` and return the mean per-batch loss.

    Uses the notebook-level ``model``, ``opt`` and ``device``.

    Args:
        data_loader: Iterable yielding ``(src_ids, src_mask, dec_in, labels)``
            batches of tensors. It does not need to support ``len()``; the
            batches are counted while iterating.
        train: If true, the model is put in training mode and every batch is
            followed by a backward pass, gradient-norm clipping at 1.0 and an
            optimizer step. Otherwise the model is put in eval mode and
            gradients are disabled.

    Returns:
        The unweighted mean of the per-batch losses as a float, or NaN when
        ``data_loader`` yields no batches.
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    num_batches = 0
    for src_ids, src_mask, dec_in, labels in data_loader:
        src_ids  = src_ids.to(device)
        src_mask = src_mask.to(device)
        dec_in   = dec_in.to(device)
        labels   = labels.to(device)

        # forward; model returns loss when labels are given
        with torch.set_grad_enabled(train):
            out = model(
                input_ids=src_ids,
                attention_mask=src_mask,
                decoder_input_ids=dec_in,
                labels=labels,             # has -100 for pads → ignored in loss
            )
            loss = out.loss

            if train:
                opt.zero_grad()
                loss.backward()
                # cap the global gradient norm at 1.0 before the update
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                opt.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing to average: report NaN instead of raising ZeroDivisionError.
        return float("nan")
    return total_loss / num_batches

# Train for a fixed number of epochs; after each one, report the average
# training loss and the average validation loss.
EPOCHS = 3
ep = 0
while ep < EPOCHS:
    ep += 1
    train_loss = run_epoch(train_loader, train=True)
    val_loss = run_epoch(val_loader, train=False)
    print(f"epoch {ep}: train_loss={train_loss:.4f}  val_loss={val_loss:.4f}")


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

epoch 1: train_loss=5.4288  val_loss=5.2827
epoch 2: train_loss=4.3931  val_loss=5.4375
epoch 3: train_loss=3.7759  val_loss=5.5485
