In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq , AutoModelForSeq2SeqLM
import evaluate

checkpoint = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, return_tensors="pt")
metric = evaluate.load("sacrebleu")

In [3]:
dataset = load_dataset('kde4', 'en-fr')

In [4]:
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
dataset['train'][:5]

{'id': ['118328', '75379', '83158', '14769', '12622'],
 'translation': [{'en': 'Show Pathfinder Lander Image',
   'fr': "Afficher l'image de Pathfinder LanderImage/ info menu item (should be translated)"},
  {'en': 'Publisher', 'fr': 'Éditeur'},
  {'en': 'Remove entry', 'fr': "Supprimer l' entrée"},
  {'en': 'Application Preference Order determines which applications will be associated with the specified & MIME; type.',
   'fr': 'Applications par ordre de préférence & #160;: détermine les associations qui sont associées à ce type & MIME;.'},
  {'en': 'CSS Validator', 'fr': 'Validateur CSS'}]}

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

('train', Dataset({
    features: ['id', 'translation'],
    num_rows: 189155
}))
('test', Dataset({
    features: ['id', 'translation'],
    num_rows: 21018
}))


In [5]:
dataset['validation'] = dataset.pop("test")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [6]:
train_ds = dataset['train']
train_ds[:2]

{'id': ['118328', '75379'],
 'translation': [{'en': 'Show Pathfinder Lander Image',
   'fr': "Afficher l'image de Pathfinder LanderImage/ info menu item (should be translated)"},
  {'en': 'Publisher', 'fr': 'Éditeur'}]}

Lets Tokenize the dataset

In [7]:
train_ds['translation'][0]['en'], train_ds['translation'][0]['fr']

('Show Pathfinder Lander Image',
 "Afficher l'image de Pathfinder LanderImage/ info menu item (should be translated)")

In [8]:


english = train_ds[1]['translation']['en']
french = train_ds[1]['translation']['fr']

print ("english: ", english), print ("french: ",french)

token_en = tokenizer(english, text_target=french)
print ("token_en: ", token_en)
print ("detokenizing : ", tokenizer.convert_ids_to_tokens(token_en["input_ids"]), " fr ", tokenizer.convert_ids_to_tokens(token_en["labels"]) )

token_en_test = tokenizer(english)
token_fr_test = tokenizer(french)

print ("token_en_test: ", token_en_test)
print ("token_fr_test: ", token_fr_test)

print("detoken en: ", tokenizer.convert_ids_to_tokens(token_en_test["input_ids"]))
print("detoken fr: ", tokenizer.convert_ids_to_tokens(token_fr_test["input_ids"]))

token_fr = tokenizer(french , text_target= english)
print ("token_fr: ", token_fr)
print ("detokenizing : ", tokenizer.convert_ids_to_tokens(token_fr["input_ids"]), " fr ", tokenizer.convert_ids_to_tokens(token_fr["labels"]) )


english:  Publisher
french:  Éditeur
token_en:  {'input_ids': [28792, 0], 'attention_mask': [1, 1], 'labels': [17963, 955, 227, 705, 0]}
detokenizing :  ['▁Publisher', '</s>']  fr  ['▁É', 'di', 'te', 'ur', '</s>']
token_en_test:  {'input_ids': [28792, 0], 'attention_mask': [1, 1]}
token_fr_test:  {'input_ids': [17963, 25137, 5488, 0], 'attention_mask': [1, 1, 1, 1]}
detoken en:  ['▁Publisher', '</s>']
detoken fr:  ['▁É', 'dit', 'eur', '</s>']
token_fr:  {'input_ids': [17963, 25137, 5488, 0], 'attention_mask': [1, 1, 1, 1], 'labels': [23917, 6, 1279, 0]}
detokenizing :  ['▁É', 'dit', 'eur', '</s>']  fr  ['▁Publi', 's', 'her', '</s>']


In [9]:
tokenizer(train_ds[1]['translation']['en'], train_ds[1]['translation']['fr'])

{'input_ids': [28792, 17963, 25137, 5488, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [10]:
def tokenize_function(examples):
    english = [ex['en'] for ex in examples['translation']]
    french = [ex['fr'] for ex in examples['translation']]
    return tokenizer(english, text_target=french, max_length=128, truncation=True)

tokenize_function (train_ds[:2])    

{'input_ids': [[1482, 16449, 34824, 545, 45, 6910, 0], [28792, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1]], 'labels': [[20586, 2585, 4042, 1181, 22, 685, 1056, 230, 16449, 11224, 45, 545, 45, 4166, 1056, 75, 3058, 3099, 3035, 20, 6, 2277, 10382, 43, 3388, 6860, 108, 27, 0], [17963, 955, 227, 705, 0]]}

In [11]:
checkpoint = 't5-small'
tokenizer1 = AutoTokenizer.from_pretrained(checkpoint, return_tensors="pt")

english = train_ds[1]['translation']['en']
french = train_ds[1]['translation']['fr']

print ("english: ", english), print ("french: ",french)

token_en = tokenizer1(english, text_target=french)
print ("token_en: ", token_en)
print ("detokenizing : ", tokenizer1.convert_ids_to_tokens(token_en["input_ids"]), " fr ", tokenizer1.convert_ids_to_tokens(token_en["labels"]) )

token_en_test = tokenizer1(english)
token_fr_test = tokenizer1(french)

print ("token_en_test: ", token_en_test)
print ("token_fr_test: ", token_fr_test)

print("detoken en: ", tokenizer1.convert_ids_to_tokens(token_en_test["input_ids"]))
print("detoken fr: ", tokenizer1.convert_ids_to_tokens(token_fr_test["input_ids"]))

token_fr = tokenizer1(french , text_target= english)
print ("token_fr: ", token_fr)
print ("detokenizing : ", tokenizer1.convert_ids_to_tokens(token_fr["input_ids"]), " fr ", tokenizer1.convert_ids_to_tokens(token_fr["labels"]) )

english:  Publisher
french:  Éditeur
token_en:  {'input_ids': [19816, 1], 'attention_mask': [1, 1], 'labels': [7983, 10700, 450, 1]}
detokenizing :  ['▁Publisher', '</s>']  fr  ['▁É', 'dite', 'ur', '</s>']
token_en_test:  {'input_ids': [19816, 1], 'attention_mask': [1, 1]}
token_fr_test:  {'input_ids': [7983, 10700, 450, 1], 'attention_mask': [1, 1, 1, 1]}
detoken en:  ['▁Publisher', '</s>']
detoken fr:  ['▁É', 'dite', 'ur', '</s>']
token_fr:  {'input_ids': [7983, 10700, 450, 1], 'attention_mask': [1, 1, 1, 1], 'labels': [19816, 1]}
detokenizing :  ['▁É', 'dite', 'ur', '</s>']  fr  ['▁Publisher', '</s>']


In [12]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns= dataset['train'].column_names,

)

In [13]:
train_token = train_ds.map(
    tokenize_function,
    batched=True,
    remove_columns= train_ds.column_names,
)

train_token

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 189155
})

In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) 
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [16]:
data_collector = DataCollatorForSeq2Seq (tokenizer=tokenizer, model=model ) 

In [17]:
range (1,3)

range(1, 3)

In [18]:
batch = [train_token[i] for i in range (1,3) ]
print ("batch: ", batch)

batch1 = data_collector (batch) 
print("batch1 : ", batch1) 


batch:  [{'input_ids': [28792, 0], 'attention_mask': [1, 1], 'labels': [17963, 955, 227, 705, 0]}, {'input_ids': [9122, 2995, 0], 'attention_mask': [1, 1, 1], 'labels': [124, 641, 16205, 45, 1181, 22, 1170, 7313, 18, 0]}]
batch1 :  {'input_ids': tensor([[28792,     0, 58100],
        [ 9122,  2995,     0]]), 'attention_mask': tensor([[1, 1, 0],
        [1, 1, 1]]), 'labels': tensor([[17963,   955,   227,   705,     0,  -100,  -100,  -100,  -100,  -100],
        [  124,   641, 16205,    45,  1181,    22,  1170,  7313,    18,     0]]), 'decoder_input_ids': tensor([[    0, 17963,   955,   227,   705,     0,     0,     0,     0,     0],
        [    0,   124,   641, 16205,    45,  1181,    22,  1170,  7313,    18]])}


In [19]:
batch1.keys(), batch1['decoder_input_ids']

(dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids']),
 tensor([[    0, 17963,   955,   227,   705,     0,     0,     0,     0,     0],
         [    0,   124,   641, 16205,    45,  1181,    22,  1170,  7313,    18]]))

In [20]:
train_token[1]

{'input_ids': [28792, 0],
 'attention_mask': [1, 1],
 'labels': [17963, 955, 227, 705, 0]}

In [21]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [22]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    
)

In [29]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="steps",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    #fp16=True,
)

In [28]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collector,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [25]:
# trainer.evaluate(max_length=128)

In [30]:
trainer.train()

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

i have trained the model with small dataset as y personal system will take ablout a day to finish training.

### Lets get our model that we have trained and get some translation done.

In [21]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained("marian-finetuned-kde4-en-to-fr")
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

In [22]:
sample_text = "I am a student, Lets Translate it into French"

In [25]:
tokenizer = AutoTokenizer.from_pretrained("marian-finetuned-kde4-en-to-fr")

In [27]:
token_sample = tokenizer(sample_text, return_tensors="pt", )
token_sample

{'input_ids': tensor([[   47,  1010,    15,  6548,     2,  2618,     9, 35742,    61,   208,
          1109,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [34]:
model_out = model.generate(**token_sample)
model_out



tensor([[59513,   131,   558,    34, 16308,     2,    19,    75,     9,  4999,
         10741,    61,    18,  1109,     0]])

In [37]:
model_out.squeeze()


tensor([59513,   131,   558,    34, 16308,     2,    19,    75,     9,  4999,
        10741,    61,    18,  1109,     0])

In [32]:
tokenizer.decode(model_out.squeeze(), skip_special_tokens=True)

'Je suis un étudiant, lets Translate it in French'

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model from a local path
model_path = "marian-finetuned-kde4-en-to-fr"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Define a function for translation
def translate_to_french(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=512, truncation=True)
    translation_ids = model.generate(**inputs)
    translated_text = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
    return translated_text

# Example translation
text_to_translate = "Hello, how are you?"
translated_text = translate_to_french(text_to_translate)
print(translated_text)




Bonjour, comment allez -vous & #160;?
