## Test the model here [Wordfly](https://huggingface.co/Priyanshuchaudhary2425/Wordfly-eng-fr)

**Note:-** Due to less data model may not perform accordingly!!!

## 1. Load the dataset

In [36]:
!pip install transformers -U



In [37]:
!pip install accelerate -U



In [38]:
!pip install datasets



In [39]:
from datasets import load_dataset

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [40]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [41]:
raw_datasets["train"][400]

{'id': '400',
 'translation': {'en': 'When you first view an Album, & digikam; selects the first photograph in the Album as the icon to display in the & quot; Albumsquot; list. This helps you remember what kind of photographs are contained in an Album. Sometimes the first photograph in the Album is not the most representative and you want to select a different one as the Album icon.',
  'fr': "Lorsque vous visualisez un album pour la première fois, & digikam; sélectionne la première photo de l'album comme aperçu pour « & #160; Mes albums & #160; ». Parfois la première photo de l'album n'est pas la plus représentative et vous souhaitez en changer."}}

In [42]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.8, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 168138
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 42035
    })
})

In [43]:
split_datasets["validation"] = split_datasets.pop("test")

In [44]:
split_datasets["train"][1]["translation"]

{'en': 'The project page could be found at: https: / /www. gna. org/ projects/ kvpnc;.',
 'fr': 'Téléchargez le depuis la section Fichiers (http: / /download. gna. org/ kvpnc/).'}

## 2. Processing the data

In [45]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")



In [46]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [35, 49, 17427, 647, 137, 43904, 45, 1231, 8571, 71, 37, 4012, 9, 37, 583, 583, 3390, 3, 49, 19015, 3, 49, 1937, 74, 49, 17427, 9, 74, 2635, 973, 529, 13518, 50, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [14728, 13233, 265, 19, 5, 10717, 900, 8, 669, 304, 34902, 794, 24, 11106, 37, 583, 583, 9507, 10571, 3, 49, 19015, 3, 57, 309, 74, 2635, 973, 529, 364, 222, 74, 102, 0]}

In [47]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁Téléchargez', '▁le', '▁depuis', '▁la', '▁section', '▁Fichiers', '▁(', 'http', ':', '▁/', '▁/', 'down', 'load', '.', '▁', 'gna', '.', '▁', 'org', '/', '▁k', 'v', 'p', 'nc', '/', ').', '</s>']
['▁Télé', 'charge', 'z', '▁le', '▁de', 'pu', 'is', '▁la', '▁section', '▁F', 'ichi', 'ers', '▁(', 'http', ':', '▁/', '▁/', 'down', 'load', '.', '▁', 'gna', '.', '▁or', 'g', '/', '▁k', 'v', 'p', 'n', 'c', '/', ').', '</s>']


In [48]:
max_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [49]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

## 3. Training the model

In [50]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [51]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [52]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [53]:
batch.labels

tensor([[14728, 13233,   265,    19,     5, 10717,   900,     8,   669,   304,
         34902,   794,    24, 11106,    37,   583,   583,  9507, 10571,     3,
            49, 19015,     3,    57,   309,    74,  2635,   973,   529,   364,
           222,    74,   102,     0],
        [13662,    51,   301,   548,     0,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100]])

In [54]:
batch["decoder_input_ids"]

tensor([[59513, 14728, 13233,   265,    19,     5, 10717,   900,     8,   669,
           304, 34902,   794,    24, 11106,    37,   583,   583,  9507, 10571,
             3,    49, 19015,     3,    57,   309,    74,  2635,   973,   529,
           364,   222,    74,   102],
        [59513, 13662,    51,   301,   548,     0, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513]])

In [55]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[14728, 13233, 265, 19, 5, 10717, 900, 8, 669, 304, 34902, 794, 24, 11106, 37, 583, 583, 9507, 10571, 3, 49, 19015, 3, 57, 309, 74, 2635, 973, 529, 364, 222, 74, 102, 0]
[13662, 51, 301, 548, 0]


## 4. Metrics

In [56]:
!pip install sacrebleu



In [57]:
!pip install evaluate



In [58]:
import evaluate

metric = evaluate.load("sacrebleu")

In [59]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [60]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [61]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    "Wordfly-eng-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [62]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [63]:
trainer.train()

Step,Training Loss
500,2.5899
1000,1.9294
1500,1.7182
2000,1.6008
2500,1.5153
3000,1.4211
3500,1.3801
4000,1.3246
4500,1.3048
5000,1.2518


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=15765, training_loss=1.2259371809833888, metrics={'train_runtime': 3391.9381, 'train_samples_per_second': 148.71, 'train_steps_per_second': 4.648, 'total_flos': 1.2916089009340416e+16, 'train_loss': 1.2259371809833888, 'epoch': 3.0})

In [64]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


CommitInfo(commit_url='https://huggingface.co/Priyanshuchaudhary2425/Wordfly-eng-fr/commit/2be01672bfda5011c02cf24b25a9cf35477f1f67', commit_message='Training complete', commit_description='', oid='2be01672bfda5011c02cf24b25a9cf35477f1f67', pr_url=None, pr_revision=None, pr_num=None)