In [None]:
# Shell escape: run the `nvidia-smi` command-line tool from the notebook.
!nvidia-smi

In [None]:
#!pip install datasets transformers[sentencepiece] sacrebleu -q
#!pip install sentencepiece

In [None]:
import os
import sys
import transformers 
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [5]:
# Identifier of the pretrained checkpoint; the tokenizer and the model are both loaded from it below.
model_checkpoint = "Helsinki-NLP/opus-mt-de-en"

# The Dataset

In [6]:
# Load the dataset identified by "bbaaaa/iwslt14-de-en-preprocess" through `datasets.load_dataset`.
raw_datasets = load_dataset("bbaaaa/iwslt14-de-en-preprocess")

In [8]:
# Display the available splits and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 160239
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 6750
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7283
    })
})

In [9]:
# Inspect one training record: a 'translation' dict with a 'de' and an 'en' sentence.
raw_datasets['train'][1]

{'translation': {'de': 'und diese zwei zusammen zu bringen , erscheint vielleicht wie eine gewaltige aufgabe . aber was ich ihnen zu sagen versuche ist , dass es trotz dieser komplexität einige einfache themen gibt , von denen ich denke , wenn wir diese verstehen , können wir uns wirklich weiter entwickeln .',
  'en': 'and bringing those two together might seem a very daunting task , but what i &apos;m going to try to say is that even in that complexity , there &apos;s some simple themes that i think , if we understand , we can really move forward .'}}

# Preprocessing the data

In [10]:
# Load the tokenizer associated with the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [11]:
# Example 1: tokenize a single German string; the result holds `input_ids` and `attention_mask`.
tokenizer("und diese zwei zusammen zu bringen")

{'input_ids': [10, 170, 338, 687, 24, 1812, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Example 2: passing a list of strings yields batched (nested-list) `input_ids` and `attention_mask`.
tokenizer(["und diese zwei zusammen zu bringen , erscheint vielleicht wie eine gewaltige aufgabe . aber was ich ihnen zu sagen versuche ist "])

{'input_ids': [[10, 170, 338, 687, 24, 1812, 17, 2, 6972, 1736, 107, 50, 31961, 37, 11817, 17, 3, 179, 58, 84, 678, 24, 825, 17780, 29, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
# Example 3: an English sentence passed through the same plain tokenizer call as the German examples.
print(tokenizer(["and bringing those two together might seem a very daunting task "]))

{'input_ids': [[8, 1707, 79, 962, 20192, 962, 4774, 12, 377, 4577, 285, 4735, 2952, 418, 14, 258, 132, 310, 5235, 79, 7483, 2317, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [14]:
# Token limits passed as `max_length` when tokenizing source and target sentences.
max_input_length = max_target_length = 128

# Language codes used to pick the source and target sentence of each translation pair.
source_lang, target_lang = "de", "en"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts
            keyed by language code; the `source_lang` and `target_lang`
            entries of each dict are read.

    Returns:
        The tokenizer output for the source sentences, with an additional
        "labels" entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    # Source side: truncated to at most max_input_length tokens.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: `text_target=` asks the tokenizer to encode the sentences
    # as targets. It replaces the deprecated `as_target_tokenizer()` context
    # manager while keeping the separate target length limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
# Smoke-test the preprocessing on the first two training examples.
preprocess_function(raw_datasets["train"][:2])



{'input_ids': [[10, 58, 8282, 17, 36116, 29, 17, 2, 134, 89, 5401, 13747, 167, 17, 3, 0], [10, 170, 338, 687, 24, 1812, 17, 2, 6972, 1736, 107, 50, 31961, 37, 11817, 17, 3, 179, 58, 84, 678, 24, 825, 17780, 29, 17, 2, 87, 65, 5660, 171, 34071, 2193, 861, 6790, 17, 18226, 297, 17, 2, 21, 553, 84, 4087, 17, 2, 176, 100, 170, 3258, 17, 2, 127, 100, 158, 929, 708, 3506, 17, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[8, 56, 85, 43, 14, 256, 16725, 1391, 17, 2, 207, 838, 1146, 19, 514, 0], [8, 9155, 356, 254, 848, 1096, 5438, 14, 256, 310, 5235, 79, 4820, 17, 2, 144, 207, 787, 165, 111, 7962, 77, 90, 751, 12, 1917, 12, 676, 19, 35, 412, 5, 35, 18938, 17, 2, 169, 165, 111, 7962, 77, 6, 286, 2469, 13837, 35, 787, 569, 17, 2, 205, 95, 2104, 17, 2, 95, 85, 859, 2538, 

In [None]:
# Apply preprocess_function to every split; batched=True passes it a batch of examples per call.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [17]:
# Load the pretrained TensorFlow seq2seq model from the same checkpoint as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-de-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [18]:

# Training hyperparameters.
batch_size = 16          # batch size for the training and validation tf.data pipelines
learning_rate = 2e-5     # passed to AdamWeightDecay
weight_decay = 0.01      # passed to AdamWeightDecay as weight_decay_rate
# NOTE(review): not referenced by the visible model.fit call, which passes epochs=1 — confirm which value is intended.
num_train_epochs = 3

In [19]:

# Collator used for the training and validation batches; configured to return TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [20]:
# Second collator, identical except for pad_to_multiple_of=128; it is used for generation_dataset below.
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [21]:
# Build the training and validation tf.data pipelines with one shared recipe.
# The two splits differ only in shuffling: the training split is shuffled,
# the validation split keeps its order. The generator is consumed in order,
# so the training pipeline is still created first.
train_dataset, validation_dataset = (
    model.prepare_tf_dataset(
        tokenized_datasets[split_name],
        batch_size=batch_size,
        shuffle=(split_name == "train"),
        collate_fn=data_collator,
    )
    for split_name in ("train", "validation")
)

In [22]:
# Validation split batched in groups of 8 (unshuffled) with the pad-to-128 collator.
# NOTE(review): generation_dataset is not referenced by any later visible cell — confirm it is still needed.
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [23]:
# AdamWeightDecay optimizer configured from the hyperparameters defined earlier.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No `loss=` is passed to compile — presumably the model falls back to its own built-in loss; TODO confirm.
model.compile(optimizer=optimizer)

In [24]:
#pip install tensorflow-addons


In [26]:
import tensorflow as tf 

# Check for GPU availability
available_gpus = tf.config.list_physical_devices('GPU')
print("GPU Available:", available_gpus)

# Pin training to the first GPU only when one is actually visible; otherwise
# name the CPU explicitly instead of requesting a device that does not exist
# (the recorded run printed an empty GPU list).
device_name = '/GPU:0' if available_gpus else '/CPU:0'


GPU Available: []


In [29]:
# Run training inside the device scope named by device_name.
# NOTE(review): epochs is hard-coded to 1 here while `num_train_epochs = 3` is defined
# earlier and not referenced in the visible cells — confirm which is intended.
with tf.device(device_name):
    model.fit(train_dataset, validation_data=validation_dataset, epochs=1)



In [30]:
# Write the fine-tuned model to a local directory. Only the model is saved here; the
# testing section reloads the tokenizer from `model_checkpoint` instead.
model.save_pretrained("ge_engModel/")

# Model Testing

In [1]:
from transformers import AutoTokenizer 
from transformers import TFAutoModelForSeq2SeqLM 

In [2]:
# Checkpoint id re-declared for the testing section; it is used below to load the tokenizer.
model_checkpoint = "Helsinki-NLP/opus-mt-de-en"

In [3]:

# The tokenizer comes from the original checkpoint id, while the model is loaded from
# the local "ge_engModel/" directory written by save_pretrained.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("ge_engModel/")



In [4]:
# German sentence to translate with the reloaded model.
input_text  = "Sprachübersetzungsmodell abgeschlossen"

# Tokenize as a one-element batch (NumPy tensors) and generate at most 128 output tokens.
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
# Prints the raw generated token ids; they are decoded to text in the next cell.
print(out)

tf.Tensor([[58100  1384  6822  1950  3807   514     0]], shape=(1, 7), dtype=int32)


In [18]:
# Decoding needs no target-tokenizer context: `decode` maps the generated ids
# back to text directly, so the deprecated `as_target_tokenizer()` wrapper is
# dropped. skip_special_tokens=True removes special tokens from the text.
print("Input: ",input_text,"\nOutput: ",tokenizer.decode(out[0], skip_special_tokens=True))

Input:  Sprachübersetzungsmodell abgeschlossen 
Output:  language translation model completed.
