In [1]:
!nvidia-smi

Wed Nov  1 21:25:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# <font color = 'pickle'>**Translation** </font>







## Outline
1. **Setting up the Environment**: Installing necessary libraries and setting up paths.
2. **Exploring and Understanding IMDB Dataset**: Understanding the structure and content of the dataset.

3. **Data Preprocessing**: Techniques to prepare the data for training, including handling different data splits and tokenization
4. **Training the Model**: Feeding data and adjusting weights.
5. **Prediction and Evaluation**: Evaluate model on test set and making predictions.
6. **Experiment**: Experimenting with different tokenizer and models


# <font color = 'pickle'> **Setting up the Environment** </font>

In [2]:
from pathlib import Path
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount("/content/drive")
    !pip install datasets transformers evaluate wandb accelerate -U -qq
    base_folder = Path("/content/drive/MyDrive/Colab_Notebooks/NLP")
else:
    base_folder = Path("/home/harpreet/Insync/google_drive_shaannoor/data")


from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, GenerationConfig, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, pipeline
from datasets import load_dataset, DatasetDict
import evaluate
from evaluate import evaluator

import wandb
import numpy as np
import pandas as pd
import gc
import torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# <font color = 'pickle'> **Exploring and Understanding Dataset**

## <font color = 'pickle'> **English_French_Translation**</font>



## <font color = 'pickle'> **Load Data set**
    


In [3]:
kde_dataset = load_dataset('kde4', lang1='en', lang2='fr')

## <font color = 'pickle'> **Understanding your data**

In [4]:
print(kde_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})


## <font color = 'pickle'> **Exploratory Data Analysis (EDA)**

### <font color = 'pickle'> **Change dataset format to Pandas**  </font>


In [5]:
# This will convert all the splits into Pandas dataframe
kde_dataset.set_format(type="pandas")
# Get all rows from training split
df_train = kde_dataset["train"][:]

In [6]:
df_train['words_per_sent_en'] = df_train['translation'].apply(lambda x: len(x['en'].split()))
df_train['words_per_sent_fr'] = df_train['translation'].apply(lambda x: len(x['fr'].split()))

In [7]:
df_train.head()

Unnamed: 0,id,translation,words_per_sent_en,words_per_sent_fr
0,0,"{'en': 'Lauri Watts', 'fr': 'Lauri Watts'}",2,2
1,1,"{'en': '& Lauri. Watts. mail;', 'fr': '& Lauri...",4,4
2,2,"{'en': 'ROLES_OF_TRANSLATORS', 'fr': '& traduc...",1,2
3,3,"{'en': '2006-02-26 3.5.1', 'fr': '2006-02-26 3...",2,2
4,4,{'en': 'The Babel & konqueror; plugin gives yo...,14,17


#### <font color = 'pickle'> **Plot the distribution of review length** </font>

In [8]:
# Let us check how many sentences ahs more than 500 words

count = (df_train["words_per_sent_en"] > 500).sum()
print(f"Number of sentences with more than 400 words: {count}")

Number of sentences with more than 400 words: 5


In [9]:
# Let us check how many sentences has less than 2 words

count = (df_train["words_per_sent_en"] <2).sum()
print(f"Number of sentences with less than 2 words: {count}")

Number of sentences with less than 2 words: 48165


In [10]:
df_train[df_train["words_per_sent_fr"]<2]

Unnamed: 0,id,translation,words_per_sent_en,words_per_sent_fr
5,5,"{'en': 'KDE', 'fr': 'KDE'}",1,1
6,6,"{'en': 'kdeaddons', 'fr': 'kdeaddons'}",1,1
7,7,"{'en': 'konqueror', 'fr': 'konqueror'}",1,1
9,9,"{'en': 'babelfish', 'fr': 'babelfish'}",1,1
10,10,"{'en': 'translate', 'fr': 'traduction'}",1,1
...,...,...,...,...
210137,210137,"{'en': 'Games', 'fr': 'JeuxPhonon::'}",1,1
210138,210138,"{'en': 'Accessibility', 'fr': 'Accessibilité'}",1,1
210156,210156,"{'en': 'aRts', 'fr': 'aRts'}",1,1
210161,210161,"{'en': 'Volume', 'fr': 'Volume'}",1,1


### <font color = 'pickle'> **Reset dataset format** </font>


In [11]:
kde_dataset.reset_format()

# <font color = 'pickle'> **Data Pre-processing**</font>

## <font color = 'indianred'> **Create train, valid, test splits** </font>

In [12]:
test_val_splits = kde_dataset['train'].train_test_split(test_size=0.4, seed=42)
train_split= test_val_splits['train']
test_val_splits = test_val_splits['test'].train_test_split(test_size=0.5, seed=42,)
val_split = test_val_splits['train']
test_split = test_val_splits['test']

## <font color = 'indianred'> **Create small subset for experimentation** </font>


In [13]:
train_split_small = train_split.shuffle(seed=42).select(range(1000))
val_split_small = val_split.shuffle(seed=42).select(range(500))
test_split_small = test_split.shuffle(seed=42).select(range(500))

In [14]:
# combine train, val splits into one dataset
train_val_subset = DatasetDict({'train': train_split_small, 'val': val_split_small})

# create test dataset from test split
test_subset= DatasetDict({'test': test_split_small})

# **Experiment 1**

## <font color = 'pickle'> **Tokenization**</font>

###  <font color = 'pickle'> **Load pre-trained Tokenizer** </font>

In [15]:
# !pip install sentencepiece

In [16]:
checkpoint = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



### <font color = 'pickle'> **Understanding tokenizer**</font>

In [17]:
text = ["Upper Window: View on Printers, both Real and Virtual", "Fenêtre supérieure & #160;: Vue de l'ensemble des imprimantes, réelles et virtuelles"]

In [18]:
# get the vocab size
print(f"Pretrained tokenizer vocab size {tokenizer.vocab_size}")

Pretrained tokenizer vocab size 59514


In [20]:
encoded_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
tokens_first_sentence = tokenizer.convert_ids_to_tokens(encoded_text.input_ids[0])
tokens_second_sentence = tokenizer.convert_ids_to_tokens(encoded_text.input_ids[1])
tokenizer.convert_tokens_to_string(tokens_first_sentence)
tokenizer.convert_tokens_to_string(tokens_second_sentence)
special_tokens = tokenizer.all_special_tokens
special_tokens_ids = tokenizer.all_special_ids
df = pd.DataFrame({"special_tokens": special_tokens, "special_tokens_ids": special_tokens_ids})
pd.set_option("display.max_rows", None)
df

Unnamed: 0,special_tokens,special_tokens_ids
0,</s>,0
1,<unk>,1
2,<pad>,59513


###  <font color = 'pickle'> **Create function for Tokenizer**

In [25]:
# we do not need to add padding
# padding and conversion to Pytorch Tensor is handled by the data collators
# in more efficient manner

max_length = 128
def tokenize_fn(batch):

    inputs = [example['en'] for example in batch['translation']]
    targets = [example['fr'] for example in batch['translation']]
    model_inputs = tokenizer(text = inputs, text_target=targets, truncation = True, max_length=max_length)

    return model_inputs

###  <font color = 'pickle'> **Use map function to apply tokenization to all splits**

In [22]:
tokenized_dataset = train_val_subset.map(tokenize_fn, batched = True, remove_columns=train_val_subset['train'].column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [23]:
tokenized_dataset.set_format(type="torch")

In [24]:
tokenized_dataset["train"].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

#  <font color = 'pickle'> **Model Training**

##  <font color = 'pickle'> **Model Config File**</font>



In [25]:
config = AutoConfig.from_pretrained(checkpoint)
config

MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-fr",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    

In [27]:
# generation_config = GenerationConfig.from_model_config(config)
generation_config = GenerationConfig.from_pretrained(checkpoint)
generation_config

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513,
  "renormalize_logits": true
}

##  <font color = 'pickle'> **Download pre-trained model**

In [28]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

## <font color = 'pickle'> **Model Input/Collate Function** </font>

In [29]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
features = [tokenized_dataset["train"][i] for i in range(3)]
model_input = data_collator(features)
model_input.keys()

<font color='indianred'>• **Note that 59513 is both the start token and pad token for the decoder input ids**</font>

<font color='indianred'>•  **Also note that decoder input ids are shifted version of the labels**


##  <font color = 'pickle'> **Understanding Model Output**


In [34]:
# model output
model_output = model(**model_input)
model_output.keys()
logits = model_output.logits.detach().cpu().numpy()
preds = np.argmax(logits, axis = -1)
def get_label_strings(labels):

  return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(labels))

labels = [get_label_strings(label) for label in model_input['labels']]
preds = [ get_label_strings(pred) for pred in preds]

In [44]:
final_string_preds=[]
for example in preds:
    string_preds =[]
    for s in example.split():
        string_preds.append(s)
        if '</s>' in s:
            break
    final_string_preds.append(' '.join(string_preds))

final_string_preds

['Identification: #160;:</s>',
 "Fenêtre supérieure : #160;: vue sur l'imprimante des imprimantes, réelles et virtuelles</s>",
 'Aperçu</s>']

In [50]:
# generate prediction from logits using beam search
generated_predictions = model.generate(**model_input, generation_config=generation_config, max_length=128)
tokenizer.batch_decode(generated_predictions, skip_special_tokens=True)
# tokens = tokenizer.convert_ids_to_tokens(generated_predictions[0])
# translation = tokenizer.convert_tokens_to_string(tokens)

['Identifiant & #160;: Identificateur:',
 "Fenêtre supérieure & #160;: Vue de l'ensemble des imprimantes, réelles et virtuelles",
 'Aperçu Aperçu']

##  <font color = 'pickle'> **Evaluation metric(s)** </font>

### <font color = 'pickle'> **Function to compute metric** </font>


In [52]:
# !pip install sacrebleu
# !pip install bert_score

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/118.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m71.7/118.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.3.1
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing col

In [53]:
bleu_metric = evaluate.load("sacrebleu")
bert_metric = evaluate.load('bertscore')

def compute_metrics(preds_and_labels):

    preds, labels = preds_and_labels

    # convert predictions into words
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # for any -100 label, replace with pad token id
    labels = np.where( labels != -100, labels, tokenizer.pad_token_id )

    # convert labels into words
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens= True)

    # get rid of extra whitespace
    # and also, put targets into lists

    decoded_preds_cleaned = [pred.strip() for pred in decoded_preds]
    decoded_labels_cleaned = [label.strip() for label in decoded_labels]

    bleu_score = bleu_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned)
    bert_score = bert_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned, lang='fr')

    return{'bleu_score:': bleu_score['score'], 'bert_score': np.mean(bert_score['f1'])}

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

## <font color = 'pickle'> **Set up Logger for experiments**</font>


In [54]:
wandb.login()
# Set project name for logging
%env WANDB_PROJECT = nlp_course_fall_2023_translation_experiments


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=nlp_course_fall_2023_translation_experiments


## <font color = 'pickle'> **Hyperparameters and Checkpointing**</font>

In [56]:
# Define the directory where model checkpoints will be saved
model_folder = base_folder/ "Models"

# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = Seq2SeqTrainingArguments(
    # Training-specific configurations
    num_train_epochs=1,  # Total number of training epochs
    weight_decay=0.01,  # Apply L2 regularization to prevent overfitting
    learning_rate=5e-5,  # Step size for the optimizer during training
    optim="adamw_torch",  # Optimizer,
    warmup_steps=10,
    predict_with_generate=True,
    generation_config=generation_config,
    # memory and speed related arguments
    # Number of samples per training batch for each device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # Number of samples per eval batch for each device

    gradient_checkpointing=True,  # memory
    # fp16 = True, # Speed
    # bf16=True,
    # tf32=True, # speed
    # evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    evaluation_strategy="steps",  # Evaluate model at specified step intervals
    eval_steps = 40,  # Perform evaluation every 10 training steps
    # Checkpoint settings
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps = 40,  # Save a model checkpoint every 10 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent model checkpoints
    # metric_for_best_model=,
    # greater_is_better=,
    # Experiment logging configurations (commented out in this example)
    logging_strategy="steps",
    logging_steps = 40,
    report_to="wandb",  # Log metrics and results to Weights & Biases platform
    # Experiment name for Weights & Biases
    run_name="translation-exp1",
)


##  <font color = 'pickle'> **Initialize Trainer**</font>

In [57]:
# initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


##  <font color = 'pickle'> **Start Training**

In [59]:
torch.cuda.empty_cache()
gc.collect()

60

In [60]:
trainer.train()  # start training

[34m[1mwandb[0m: Currently logged in as: [33mshremuk[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Bleu Score:,Bert Score
40,1.7097,1.537132,41.174389,0.867369


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]



TrainOutput(global_step=63, training_loss=1.6225579882424974, metrics={'train_runtime': 103.2615, 'train_samples_per_second': 9.684, 'train_steps_per_second': 0.61, 'total_flos': 15072060506112.0, 'train_loss': 1.6225579882424974, 'epoch': 1.0})

##  <font color = 'pickle'> **Evaluation**

### <font color = 'pickle'> **Check performance on validation set**</font>

In [61]:
trainer.evaluate(tokenized_dataset["val"])

{'eval_loss': 1.5371323823928833,
 'eval_bleu_score:': 41.17438898072662,
 'eval_bert_score': 0.8673686439990997,
 'eval_runtime': 64.1217,
 'eval_samples_per_second': 7.798,
 'eval_steps_per_second': 0.499,
 'epoch': 1.0}

In [62]:
wandb.finish()  # stop logging

VBox(children=(Label(value='0.002 MB of 0.023 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.077517…

0,1
eval/bert_score,▁▁
eval/bleu_score:,▁▁
eval/loss,▁▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▁██
train/global_step,▁▁██
train/learning_rate,▁
train/loss,▁

0,1
eval/bert_score,0.86737
eval/bleu_score:,41.17439
eval/loss,1.53713
eval/runtime,64.1217
eval/samples_per_second,7.798
eval/steps_per_second,0.499
train/epoch,1.0
train/global_step,63.0
train/learning_rate,2e-05
train/loss,1.7097


### <font color = 'pickle'> **Check the best saved model**</font>

In [63]:
# After training, let us check the best checkpoint
# We need this for Predioctions and Evaluations
best_model_checkpoint_step = trainer.state.best_model_checkpoint.split("-")[-1]
print(f"The best model was saved at step {best_model_checkpoint_step}.")

The best model was saved at step 40.


#  <font color = 'pickle'> **Inference**

## <font color = 'pickle'> **Test Set Evaluation**

In [65]:
checkpoint = str(model_folder / "checkpoint-40")
test_data_flattened = test_subset["test"].map(lambda example: {'en': example['translation']['en'], 'fr': example['translation']['fr']})
task_evaluator = evaluator("translation")
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
eval_results = task_evaluator.compute(
    model_or_pipeline=checkpoint,
    tokenizer=checkpoint,
    data=test_data_flattened,
    input_column='en',
    label_column='fr',
    generation_kwargs=gen_kwargs,
    device=0,
)
eval_results

Map:   0%|          | 0/500 [00:00<?, ? examples/s]



Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Your input_length: 130 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


{'bleu': 0.42472809265348854,
 'precisions': [0.7061954439003619,
  0.5320467000238266,
  0.4183835182250396,
  0.3278784336645237],
 'brevity_penalty': 0.8913955592851547,
 'length_ratio': 0.8968875310292151,
 'translation_length': 4697,
 'reference_length': 5237,
 'total_time_in_seconds': 110.99704300300073,
 'samples_per_second': 4.504624505956279,
 'latency_in_seconds': 0.22199408600600143}

## <font color = 'pickle'> **Prediction for large dataset**

In [69]:
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 400}
custom_pipeline = pipeline(
    task="translation",
    model=checkpoint,
    tokenizer=checkpoint,
    device=0,
    framework = 'pt'
)
translations = custom_pipeline(test_data_flattened['en'], **gen_kwargs, batch_size=16)

In [74]:
translations[0:10]

[{'translation_text': '& Y en haut & #160;:'},
 {'translation_text': '–'},
 {'translation_text': 'Capitaliser le texte sélectionné ou le mot courant.'},
 {'translation_text': 'Ouvrir & konqueror;.'},
 {'translation_text': 'Chemins de recherche'},
 {'translation_text': 'Réduire à Grayscale'},
 {'translation_text': "L'utilisateur ne peut modifier aucun paramètre sur cette page."},
 {'translation_text': 'Générer'},
 {'translation_text': 'Vietnamien (Vietnam)'},
 {'translation_text': 'Sélectionner le style à importer & #160;:'}]

# **Experiment 2**

## <font color = 'pickle'> **Tokenization**</font>

###  <font color = 'pickle'> **Load pre-trained Tokenizer** </font>

In [None]:
# !pip install sentencepiece

In [79]:
checkpoint = 'Helsinki-NLP/opus-mt-en-romance'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]



### <font color = 'pickle'> **Understanding tokenizer**</font>

In [81]:
# get the vocab size
print(f"Pretrained tokenizer vocab size {tokenizer.vocab_size}")

Pretrained tokenizer vocab size 65001


In [82]:
encoded_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
tokens_first_sentence = tokenizer.convert_ids_to_tokens(encoded_text.input_ids[0])
tokens_second_sentence = tokenizer.convert_ids_to_tokens(encoded_text.input_ids[1])
tokenizer.convert_tokens_to_string(tokens_first_sentence)
tokenizer.convert_tokens_to_string(tokens_second_sentence)
special_tokens = tokenizer.all_special_tokens
special_tokens_ids = tokenizer.all_special_ids
df = pd.DataFrame({"special_tokens": special_tokens, "special_tokens_ids": special_tokens_ids})
pd.set_option("display.max_rows", None)
df

Unnamed: 0,special_tokens,special_tokens_ids
0,</s>,0
1,<unk>,1
2,<pad>,65000


###  <font color = 'pickle'> **Use map function to apply tokenization to all splits**

In [84]:
tokenized_dataset_romance = train_val_subset.map(tokenize_fn, batched = True, remove_columns=train_val_subset['train'].column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [85]:
tokenized_dataset_romance.set_format(type="torch")

In [86]:
tokenized_dataset_romance["train"].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

#  <font color = 'pickle'> **Model Training**

##  <font color = 'pickle'> **Model Config File**</font>



In [87]:
config = AutoConfig.from_pretrained(checkpoint)
config

MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-romance",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 65000,
  "decoder_vocab_size": 65001,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 0,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_s

In [88]:
# generation_config = GenerationConfig.from_model_config(config)
generation_config = GenerationConfig.from_pretrained(checkpoint)
generation_config

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

GenerationConfig {
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 65000,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 65000,
  "renormalize_logits": true
}

##  <font color = 'pickle'> **Download pre-trained model**

In [89]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

## <font color = 'pickle'> **Model Input/Collate Function** </font>

In [90]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
features = [tokenized_dataset_romance["train"][i] for i in range(3)]
model_input = data_collator(features)
model_input.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

##  <font color = 'pickle'> **Understanding Model Output**


In [91]:
# model output
model_output = model(**model_input)
model_output.keys()
logits = model_output.logits.detach().cpu().numpy()
preds = np.argmax(logits, axis = -1)
def get_label_strings(labels):

  return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(labels))

labels = [get_label_strings(label) for label in model_input['labels']]
preds = [ get_label_strings(pred) for pred in preds]

In [92]:
final_string_preds=[]
for example in preds:
    string_preds =[]
    for s in example.split():
        string_preds.append(s)
        if '</s>' in s:
            break
    final_string_preds.append(' '.join(string_preds))

final_string_preds

['Identificadentificado: #160;:</s>',
 "Janeenêtre superior: Vista: Vista de imprim'im de imprimantes, tantos e virtuals</s>",
 'Ante</s>']

In [93]:
# generate prediction from logits using beam search
generated_predictions = model.generate(**model_input, generation_config=generation_config, max_length=128)
tokenizer.batch_decode(generated_predictions, skip_special_tokens=True)
# tokens = tokenizer.convert_ids_to_tokens(generated_predictions[0])
# translation = tokenizer.convert_tokens_to_string(tokens)

['Identifiant & #160;: Identifica Identifica Identifica Identificador:',
 "Fenêtre supérieure & #160;: Vue de l'ensemble des imprimantes, réelles et virtuelles",
 'Aperçu Ante Ante Vista previa']

##  <font color = 'pickle'> **Evaluation metric(s)** </font>

### <font color = 'pickle'> **Function to compute metric** </font>


In [94]:
bleu_metric = evaluate.load("sacrebleu")
bert_metric = evaluate.load('bertscore')

def compute_metrics(preds_and_labels):

    preds, labels = preds_and_labels

    # convert predictions into words
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # for any -100 label, replace with pad token id
    labels = np.where( labels != -100, labels, tokenizer.pad_token_id )

    # convert labels into words
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens= True)

    # get rid of extra whitespace
    # and also, put targets into lists

    decoded_preds_cleaned = [pred.strip() for pred in decoded_preds]
    decoded_labels_cleaned = [label.strip() for label in decoded_labels]

    bleu_score = bleu_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned)
    bert_score = bert_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned, lang='fr')

    return{'bleu_score:': bleu_score['score'], 'bert_score': np.mean(bert_score['f1'])}

## <font color = 'pickle'> **Hyperparameters and Checkpointing**</font>

In [95]:
# Define the directory where model checkpoints will be saved
model_folder = base_folder/ "Models"

# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = Seq2SeqTrainingArguments(
    # Training-specific configurations
    num_train_epochs=2,  # Total number of training epochs
    weight_decay = 0.1,  # Apply L2 regularization to prevent overfitting
    learning_rate=5e-5,  # Step size for the optimizer during training
    optim="adamw_torch",  # Optimizer,
    warmup_steps = 10,
    predict_with_generate=True,
    generation_config=generation_config,
    # memory and speed related arguments
    # Number of samples per training batch for each device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # Number of samples per eval batch for each device

    gradient_checkpointing=True,  # memory
    # fp16 = True, # Speed
    # bf16=True,
    # tf32=True, # speed
    # evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    evaluation_strategy="steps",  # Evaluate model at specified step intervals
    eval_steps = 40,  # Perform evaluation every 10 training steps
    # Checkpoint settings
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps = 40,  # Save a model checkpoint every 10 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent model checkpoints
    # metric_for_best_model=,
    # greater_is_better=,
    # Experiment logging configurations (commented out in this example)
    logging_strategy="steps",
    logging_steps = 40,
    report_to="wandb",  # Log metrics and results to Weights & Biases platform
    # Experiment name for Weights & Biases
    run_name="translation-exp2",
)


##  <font color = 'pickle'> **Initialize Trainer**</font>

In [96]:
# initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_romance["train"],
    eval_dataset=tokenized_dataset_romance["val"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


##  <font color = 'pickle'> **Start Training**

In [97]:
torch.cuda.empty_cache()
gc.collect()

3600

In [98]:
trainer.train()  # start training



Step,Training Loss,Validation Loss,Bleu Score:,Bert Score
40,1.8555,1.467531,38.922111,0.860333
80,1.3106,1.397854,40.894043,0.864249
120,1.088,1.377608,41.275127,0.865127




TrainOutput(global_step=126, training_loss=1.403334799266997, metrics={'train_runtime': 230.0593, 'train_samples_per_second': 8.693, 'train_steps_per_second': 0.548, 'total_flos': 30368697679872.0, 'train_loss': 1.403334799266997, 'epoch': 2.0})

##  <font color = 'pickle'> **Evaluation**

### <font color = 'pickle'> **Check performance on validation set**</font>

In [100]:
trainer.evaluate(tokenized_dataset_romance["val"])

{'eval_loss': 1.3776084184646606,
 'eval_bleu_score:': 41.27512683626905,
 'eval_bert_score': 0.8651271594166756,
 'eval_runtime': 60.3376,
 'eval_samples_per_second': 8.287,
 'eval_steps_per_second': 0.53,
 'epoch': 2.0}

In [101]:
wandb.finish()  # stop logging

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/bert_score,▁▇██
eval/bleu_score:,▁▇██
eval/loss,█▃▁▁
eval/runtime,▄█▁▅
eval/samples_per_second,▄▁█▄
eval/steps_per_second,▄▁█▄
train/epoch,▁▁▄▄▇▇██
train/global_step,▁▁▄▄████
train/learning_rate,█▄▁
train/loss,█▃▁

0,1
eval/bert_score,0.86513
eval/bleu_score:,41.27513
eval/loss,1.37761
eval/runtime,60.3376
eval/samples_per_second,8.287
eval/steps_per_second,0.53
train/epoch,2.0
train/global_step,126.0
train/learning_rate,0.0
train/loss,1.088


### <font color = 'pickle'> **Check the best saved model**</font>

In [102]:
# After training, let us check the best checkpoint
# We need this for Predioctions and Evaluations
best_model_checkpoint_step = trainer.state.best_model_checkpoint.split("-")[-1]
print(f"The best model was saved at step {best_model_checkpoint_step}.")

The best model was saved at step 120.


#  <font color = 'pickle'> **Inference**

## <font color = 'pickle'> **Test Set Evaluation**

In [103]:
checkpoint = str(model_folder / "checkpoint-120")
test_data_flattened = test_subset["test"].map(lambda example: {'en': example['translation']['en'], 'fr': example['translation']['fr']})
task_evaluator = evaluator("translation")
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
eval_results = task_evaluator.compute(
    model_or_pipeline=checkpoint,
    tokenizer=checkpoint,
    data=test_data_flattened,
    input_column='en',
    label_column='fr',
    generation_kwargs=gen_kwargs,
    device=0,
)
eval_results

Your input_length: 125 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


{'bleu': 0.4198958714511424,
 'precisions': [0.6849627174813587,
  0.5113216266173752,
  0.39867279224093927,
  0.312429696287964],
 'brevity_penalty': 0.9187748627201986,
 'length_ratio': 0.9219018522054612,
 'translation_length': 4828,
 'reference_length': 5237,
 'total_time_in_seconds': 113.79663038400031,
 'samples_per_second': 4.393803211156413,
 'latency_in_seconds': 0.2275932607680006}

## <font color = 'pickle'> **Prediction for large dataset**

In [104]:
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 400}
custom_pipeline = pipeline(
    task="translation",
    model=checkpoint,
    tokenizer=checkpoint,
    device=0,
    framework = 'pt'
)
translations = custom_pipeline(test_data_flattened['en'], **gen_kwargs, batch_size=16)

In [105]:
translations[0:10]

[{'translation_text': '& Y & Top & #160;:'},
 {'translation_text': '–'},
 {'translation_text': 'Majuscule le texte sélectionné ou le mot courant.'},
 {'translation_text': 'Ouvrir & konqueror;.'},
 {'translation_text': 'Recherche de chemins'},
 {'translation_text': "Réduire à l'échelle de gris"},
 {'translation_text': "L'utilisateur ne peut modifier aucun réglage sur cette page."},
 {'translation_text': 'Générer'},
 {'translation_text': 'Vietnamien (Vietnam)'},
 {'translation_text': 'Sélectionner le style à importer & #160;:'}]

# **Experiment 3**

## <font color = 'pickle'> **Tokenization**</font>

###  <font color = 'pickle'> **Load pre-trained Tokenizer** </font>

In [None]:
# !pip install sentencepiece

In [16]:
checkpoint = 'PaulineSanchez/autotrain-translation_food_english_to_french-52830124391'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



### <font color = 'pickle'> **Understanding tokenizer**</font>

In [22]:
text = ["Upper Window: View on Printers, both Real and Virtual", "Fenêtre supérieure & #160;: Vue de l'ensemble des imprimantes, réelles et virtuelles"]
# get the vocab size
print(f"Pretrained tokenizer vocab size {tokenizer.vocab_size}")

Pretrained tokenizer vocab size 59514


In [23]:
encoded_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
tokens_first_sentence = tokenizer.convert_ids_to_tokens(encoded_text.input_ids[0])
tokens_second_sentence = tokenizer.convert_ids_to_tokens(encoded_text.input_ids[1])
tokenizer.convert_tokens_to_string(tokens_first_sentence)
tokenizer.convert_tokens_to_string(tokens_second_sentence)
special_tokens = tokenizer.all_special_tokens
special_tokens_ids = tokenizer.all_special_ids
df = pd.DataFrame({"special_tokens": special_tokens, "special_tokens_ids": special_tokens_ids})
pd.set_option("display.max_rows", None)
df

Unnamed: 0,special_tokens,special_tokens_ids
0,</s>,0
1,<unk>,1
2,<pad>,59513


###  <font color = 'pickle'> **Use map function to apply tokenization to all splits**

In [26]:
tokenized_dataset_exp3 = train_val_subset.map(tokenize_fn, batched = True, remove_columns=train_val_subset['train'].column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset_exp3.set_format(type="torch")

In [28]:
tokenized_dataset_exp3["train"].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

#  <font color = 'pickle'> **Model Training**

##  <font color = 'pickle'> **Model Config File**</font>



In [29]:
config = AutoConfig.from_pretrained(checkpoint)
config

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

MarianConfig {
  "_name_or_path": "PaulineSanchez/autotrain-translation_food_english_to_french-52830124391",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id

In [30]:
# generation_config = GenerationConfig.from_model_config(config)
generation_config = GenerationConfig.from_pretrained(checkpoint)
generation_config

Downloading (…)neration_config.json:   0%|          | 0.00/258 [00:00<?, ?B/s]

GenerationConfig {
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 59513,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 59513
}

##  <font color = 'pickle'> **Download pre-trained model**

In [31]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/299M [00:00<?, ?B/s]

## <font color = 'pickle'> **Model Input/Collate Function** </font>

In [33]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
features = [tokenized_dataset_exp3["train"][i] for i in range(3)]
model_input = data_collator(features)
model_input.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

##  <font color = 'pickle'> **Understanding Model Output**


In [35]:
# model output
model_output = model(**model_input)
model_output.keys()
logits = model_output.logits.detach().cpu().numpy()
preds = np.argmax(logits, axis = -1)
def get_label_strings(labels):

  return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(labels))

labels = [get_label_strings(label) for label in model_input['labels']]
preds = [ get_label_strings(pred) for pred in preds]

In [36]:
final_string_preds=[]
for example in preds:
    string_preds =[]
    for s in example.split():
        string_preds.append(s)
        if '</s>' in s:
            break
    final_string_preds.append(' '.join(string_preds))

final_string_preds

['I: #160;:</s>',
 "Fenêtre supérieure : #160;: Vue sur l'imprimante des imprimantes, réelles et virtuelles</s>",
 'Aperçu</s>']

In [37]:
# generate prediction from logits using beam search
generated_predictions = model.generate(**model_input, generation_config=generation_config, max_length=128)
tokenizer.batch_decode(generated_predictions, skip_special_tokens=True)
# tokens = tokenizer.convert_ids_to_tokens(generated_predictions[0])
# translation = tokenizer.convert_tokens_to_string(tokens)

['Identifiant & #160;: Identificateur:',
 "Fenêtre supérieure & #160;: Vue de l'ensemble des imprimantes, réelles et virtuelles",
 'Aperçu Aperçu']

##  <font color = 'pickle'> **Evaluation metric(s)** </font>

### <font color = 'pickle'> **Function to compute metric** </font>


In [38]:
bleu_metric = evaluate.load("sacrebleu")
bert_metric = evaluate.load('bertscore')

def compute_metrics(preds_and_labels):

    preds, labels = preds_and_labels

    # convert predictions into words
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # for any -100 label, replace with pad token id
    labels = np.where( labels != -100, labels, tokenizer.pad_token_id )

    # convert labels into words
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens= True)

    # get rid of extra whitespace
    # and also, put targets into lists

    decoded_preds_cleaned = [pred.strip() for pred in decoded_preds]
    decoded_labels_cleaned = [label.strip() for label in decoded_labels]

    bleu_score = bleu_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned)
    bert_score = bert_metric.compute(predictions=decoded_preds_cleaned, references=decoded_labels_cleaned, lang='fr')

    return{'bleu_score:': bleu_score['score'], 'bert_score': np.mean(bert_score['f1'])}

## <font color = 'pickle'> **Hyperparameters and Checkpointing**</font>

In [40]:
# Define the directory where model checkpoints will be saved
model_folder = base_folder/"Models"

# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = Seq2SeqTrainingArguments(
    # Training-specific configurations
    num_train_epochs=2,  # Total number of training epochs
    weight_decay = 0.01,  # Apply L2 regularization to prevent overfitting
    learning_rate=5e-5,  # Step size for the optimizer during training
    optim="adamw_torch",  # Optimizer,
    warmup_steps = 10,
    predict_with_generate=True,
    generation_config=generation_config,
    # memory and speed related arguments
    # Number of samples per training batch for each device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # Number of samples per eval batch for each device

    gradient_checkpointing=True,  # memory
    # fp16 = True, # Speed
    # bf16=True,
    # tf32=True, # speed
    # evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    evaluation_strategy="steps",  # Evaluate model at specified step intervals
    eval_steps = 40,  # Perform evaluation every 10 training steps
    # Checkpoint settings
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps = 40,  # Save a model checkpoint every 10 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent model checkpoints
    # metric_for_best_model=,
    # greater_is_better=,
    # Experiment logging configurations (commented out in this example)
    logging_strategy="steps",
    logging_steps = 40,
    report_to="wandb",  # Log metrics and results to Weights & Biases platform
    # Experiment name for Weights & Biases
    run_name="translation-exp3",
)


##  <font color = 'pickle'> **Initialize Trainer**</font>

In [41]:
# initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_exp3["train"],
    eval_dataset=tokenized_dataset_exp3["val"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


##  <font color = 'pickle'> **Start Training**

In [42]:
torch.cuda.empty_cache()
gc.collect()

221

In [43]:
trainer.train()  # start training

[34m[1mwandb[0m: Currently logged in as: [33mshremuk[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Bleu Score:,Bert Score
40,1.7602,1.547707,40.15577,0.867111
80,1.3464,1.483206,40.955692,0.862475
120,1.1212,1.468542,41.002584,0.861243




TrainOutput(global_step=126, training_loss=1.397914901612297, metrics={'train_runtime': 388.5359, 'train_samples_per_second': 5.148, 'train_steps_per_second': 0.324, 'total_flos': 30394121453568.0, 'train_loss': 1.397914901612297, 'epoch': 2.0})

##  <font color = 'pickle'> **Evaluation**

### <font color = 'pickle'> **Check performance on validation set**</font>

In [44]:
trainer.evaluate(tokenized_dataset_exp3["val"])

{'eval_loss': 1.4685417413711548,
 'eval_bleu_score:': 41.0025843009921,
 'eval_bert_score': 0.8612429516315461,
 'eval_runtime': 126.0029,
 'eval_samples_per_second': 3.968,
 'eval_steps_per_second': 0.254,
 'epoch': 2.0}

In [45]:
wandb.finish()  # stop logging

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/bert_score,█▂▁▁
eval/bleu_score:,▁███
eval/loss,█▂▁▁
eval/runtime,▃▁▄█
eval/samples_per_second,▆█▅▁
eval/steps_per_second,▆█▅▁
train/epoch,▁▁▄▄▇▇██
train/global_step,▁▁▄▄████
train/learning_rate,█▄▁
train/loss,█▃▁

0,1
eval/bert_score,0.86124
eval/bleu_score:,41.00258
eval/loss,1.46854
eval/runtime,126.0029
eval/samples_per_second,3.968
eval/steps_per_second,0.254
train/epoch,2.0
train/global_step,126.0
train/learning_rate,0.0
train/loss,1.1212


### <font color = 'pickle'> **Check the best saved model**</font>

In [46]:
# After training, let us check the best checkpoint
# We need this for Predioctions and Evaluations
best_model_checkpoint_step = trainer.state.best_model_checkpoint.split("-")[-1]
print(f"The best model was saved at step {best_model_checkpoint_step}.")

The best model was saved at step 120.


#  <font color = 'pickle'> **Inference**

## <font color = 'pickle'> **Test Set Evaluation**

In [47]:
checkpoint = str(model_folder / "checkpoint-120")
test_data_flattened = test_subset["test"].map(lambda example: {'en': example['translation']['en'], 'fr': example['translation']['fr']})
task_evaluator = evaluator("translation")
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
eval_results = task_evaluator.compute(
    model_or_pipeline=checkpoint,
    tokenizer=checkpoint,
    data=test_data_flattened,
    input_column='en',
    label_column='fr',
    generation_kwargs=gen_kwargs,
    device=0,
)
eval_results

Your input_length: 130 is bigger than 0.9 * max_length: 128. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


{'bleu': 0.42791960226397435,
 'precisions': [0.6725288053365677,
  0.5073082977288059,
  0.39910758552305403,
  0.31132332878581176],
 'brevity_penalty': 0.9430637580549445,
 'length_ratio': 0.9446247851823563,
 'translation_length': 4947,
 'reference_length': 5237,
 'total_time_in_seconds': 160.85672499599968,
 'samples_per_second': 3.108356209617188,
 'latency_in_seconds': 0.32171344999199936}

## <font color = 'pickle'> **Prediction for large dataset**

In [48]:
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 400}
custom_pipeline = pipeline(
    task="translation",
    model=checkpoint,
    tokenizer=checkpoint,
    device=0,
    framework = 'pt'
)
translations = custom_pipeline(test_data_flattened['en'], **gen_kwargs, batch_size=16)

In [49]:
translations[0:10]

[{'translation_text': 'Haut de la page & Y & #160;:'},
 {'translation_text': '–'},
 {'translation_text': 'Capitaliser le texte sélectionné ou le mot courant.'},
 {'translation_text': 'Ouvrir & konqueror;.'},
 {'translation_text': 'Chemins de recherche'},
 {'translation_text': "Réduire à l'échelle des gris"},
 {'translation_text': "L'utilisateur ne peut modifier aucun paramètre sur cette page."},
 {'translation_text': 'Générer'},
 {'translation_text': 'Vietnamien (Vietnam)'},
 {'translation_text': 'Sélectionner le style à importer & #160;:'}]

# **Conclusion**
We are using **BLEU** and BERT scores to comapre the models.
**BLEU** score helps us to assess the overlap between model-generated and reference sentences whereas **BERT** helps us to evaluate the semantic similarity between model-generated and reference sentences.

I'd be comparing the above models using BLEU score. The **last model** has a better Test BLEU score hence it becomes my final model.