In [None]:
# Mount Google Drive so files stored there are reachable from this runtime.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

# Drive folder used as the base for every dataset path below.
FOLDER = "/content/drive/My Drive"
original_en = FOLDER + "/news-commentary-v9.fr-en.en"
original_fr = FOLDER + "/news-commentary-v9.fr-en.fr"
noisy_en = FOLDER + "/noisy_dataset.en"

Mounted at /content/drive


In [None]:
!pip install sacrebleu
!pip install jiwer

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.0.0 sacrebleu-2.4.3
Collecting jiwer
  Downloading jiwer

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, MarianMTModel
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch
import json
import sacrebleu
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def load_data(filepath: str) -> list[str]:
    """Read a text file and return its non-blank lines, stripped of surrounding whitespace.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The stripped lines in file order. Lines that are empty or contain only
        whitespace are skipped, so the result can be shorter than the file.
    """
    # Decode explicitly as UTF-8 so accented characters are read the same way
    # whatever the platform's default locale encoding is.
    with open(filepath, "r", encoding="utf-8") as input_file:
        # Iterate the handle lazily; a whitespace-only line strips to "" and is dropped.
        return [stripped for line in input_file if (stripped := line.strip())]

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
def apply_model(model_name: str, texts: list[str], max_length: int = 50, num_beams: int = 5) -> list[str]:
  """Translate English sentences to French with a pretrained seq2seq checkpoint.

  'Helsinki-NLP/opus-mt-tc-big-en-fr' is loaded as a MarianMT model and fed the raw
  sentences. Any other model name is loaded as a T5 model and each sentence is
  prefixed with "translate English to French: ".

  Args:
    model_name: Checkpoint name passed to `from_pretrained`.
    texts: English sentences to translate. They are tokenized as one padded batch.
    max_length: `max_length` forwarded to `generate`; it caps the generated length,
      so longer translations are cut off.
    num_beams: Beam width forwarded to `generate`.

  Returns:
    One decoded translation per input sentence, in input order. An empty input
    yields an empty list without loading any model.
  """
  # Nothing to translate: skip the model download/load and the empty tokenizer batch.
  if not texts:
    return []

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  if model_name == 'Helsinki-NLP/opus-mt-tc-big-en-fr':
    model = MarianMTModel.from_pretrained(model_name).to(device)
    input_texts = texts
  else:
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    # T5 checkpoints select the task from a text prefix.
    input_texts = [f"translate English to French: {english_sentence}" for english_sentence in texts]

  inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)

  # Move the batch to the same device as the model before generating.
  input_ids = inputs['input_ids'].to(device)
  attention_mask = inputs['attention_mask'].to(device)

  generated_ids = model.generate(
      input_ids,
      attention_mask=attention_mask,
      max_length=max_length,
      num_beams=num_beams,
      early_stopping=True,
  )

  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
def calculate_meteor_score(outputs: list[str], references: list[str]) -> float:
  """Return the mean sentence-level METEOR score of outputs against references.

  Each output is paired by position with one reference; both are split with
  `word_tokenize` before being passed to `meteor_score`.

  Args:
    outputs: Candidate sentences to score.
    references: Reference sentences, one per output, in the same order.

  Returns:
    The arithmetic mean of the per-sentence scores, or 0.0 when there is
    nothing to score.

  Raises:
    ValueError: If outputs and references differ in length, since the pairing
      by position would otherwise silently drop the surplus sentences.
  """
  if len(outputs) != len(references):
    raise ValueError(
        f"outputs and references must have the same length, got {len(outputs)} and {len(references)}"
    )
  # Avoid dividing by zero on an empty corpus.
  if not outputs:
    return 0.0

  # NOTE(review): word_tokenize runs with its default language although this notebook
  # scores French text - confirm whether language='french' is wanted.
  all_scores = [
      meteor_score([word_tokenize(reference)], word_tokenize(output))
      for output, reference in zip(outputs, references)
  ]
  return sum(all_scores) / len(all_scores)

In [None]:
# Load the data files, keeping only the first N_SAMPLES sentences of each.
N_SAMPLES = 100
original_en_file = load_data(original_en)[:N_SAMPLES]
original_fr_file = load_data(original_fr)[:N_SAMPLES]
noisy_en_file = load_data(noisy_en)[:N_SAMPLES]

# The three lists are paired by position when scoring, and load_data drops blank
# lines from each file independently, so at least make sure the lengths agree.
assert len(original_en_file) == len(original_fr_file) == len(noisy_en_file), (
    "corpora differ in length: "
    f"{len(original_en_file)} / {len(original_fr_file)} / {len(noisy_en_file)}"
)

# Print a short preview instead of dumping every sentence into the cell output.
for label, sentences in (
    ("original_en", original_en_file),
    ("original_fr", original_fr_file),
    ("noisy_en", noisy_en_file),
):
  print(f"{label}: {len(sentences)} sentences, first 3: {sentences[:3]}")

['$10,000 Gold?', 'SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.', 'Lately, with gold prices up more than 300% over the last decade, it is harder than ever.', 'Just last December, fellow economists Martin Feldstein and Nouriel Roubini each penned op-eds bravely questioning bullish market sentiment, sensibly pointing out gold’s risks.', 'Wouldn’t you know it?', 'Since their articles appeared, the price of gold has moved up still further. Gold prices even hit a record-high $1,300 recently.', 'Last December, many gold bugs were arguing that the price was inevitably headed for $2,000.', 'Now, emboldened by continuing appreciation, some are suggesting that gold could be headed even higher than that.', 'One successful gold investor recently explained to me that stock prices languished for a more than a decade before the Dow Jones index crossed the 1,000 mark in the early 1980’s.', 'Since then, the index has climbed above 10,000.', 'Now that g

In [None]:
reference = original_fr_file
models = ['Helsinki-NLP/opus-mt-tc-big-en-fr', 't5-base', 'google-t5/t5-small']


def _score_translation(hypotheses, references):
  """Return the corpus 'bleu' and 'ter' scores and the mean 'meteor' score as a dict."""
  stripped_hypotheses = [sentence.strip() for sentence in hypotheses]
  # sacrebleu takes a list of reference *streams*, each holding one reference per
  # hypothesis. With a single reference per sentence that is ONE stream of
  # len(hypotheses) strings, not one single-sentence stream per reference.
  reference_streams = [[sentence.strip() for sentence in references]]
  return {
      'bleu': sacrebleu.corpus_bleu(stripped_hypotheses, reference_streams, smooth_value=1).score,
      'ter': sacrebleu.corpus_ter(stripped_hypotheses, reference_streams).score,
      'meteor': calculate_meteor_score(hypotheses, references),
  }


def _sort_by_diff(model_items, diff_key):
  """Sort (model name, info) pairs by ascending info[diff_key], ties broken by model name."""
  return sorted(model_items, key=lambda item: (item[1][diff_key], item[1]['model']))


def _with_rank(sorted_items, id_key):
  """Return a copy of each info dict with its 0-based position stored under id_key."""
  # Build new dicts so the rankings do not mutate each other or model_info_map.
  return [{**info, id_key: rank} for rank, (_, info) in enumerate(sorted_items)]


model_info_map = {}

for model in models:
  original_translation = apply_model(model, original_en_file)
  noisy_translation = apply_model(model, noisy_en_file)
  # Short preview only; the full lists would flood the cell output.
  print(f"{model}: first translations {original_translation[:3]}")

  original_scores = _score_translation(original_translation, reference)
  noisy_scores = _score_translation(noisy_translation, reference)

  # Each *_diff is the absolute gap between the clean-input and noisy-input score.
  model_info_map[model] = {
      'original_bleu_score': original_scores['bleu'],
      'noisy_bleu_score': noisy_scores['bleu'],
      'original_ter_score': original_scores['ter'],
      'noisy_ter_score': noisy_scores['ter'],
      'original_meteor_score': original_scores['meteor'],
      'noisy_meteor_score': noisy_scores['meteor'],
      'bleu_score_diff': abs(original_scores['bleu'] - noisy_scores['bleu']),
      'ter_score_diff': abs(original_scores['ter'] - noisy_scores['ter']),
      'meteor_score_diff': abs(original_scores['meteor'] - noisy_scores['meteor']),
      'model': model
  }

# Rank the models once per metric, smallest clean-vs-noisy gap first.
all_model_info = list(model_info_map.items())
sorted_model_info_by_bleu = _sort_by_diff(all_model_info, 'bleu_score_diff')
sorted_model_info_by_ter = _sort_by_diff(all_model_info, 'ter_score_diff')
sorted_model_info_by_meteor = _sort_by_diff(all_model_info, 'meteor_score_diff')

result_by_bleu = _with_rank(sorted_model_info_by_bleu, 'bleu_id')
result_by_ter = _with_rank(sorted_model_info_by_ter, 'ter_id')
result_by_meteor = _with_rank(sorted_model_info_by_meteor, 'meteor_id')

# Every stored value is a plain float, int or str, so no custom JSON encoder is needed.
with open(f"{FOLDER}/output.json", 'w', encoding='utf-8') as output_file:
  json.dump({
      'result_by_bleu': result_by_bleu,
      'result_by_ter': result_by_ter,
      'result_by_meteor': result_by_meteor
  }, output_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/461M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

["10 000 $ d'or ?", 'SAN FRANCISCO – Il n’a jamais été facile d’avoir une conversation rationnelle sur la valeur de l’or.', "Dernièrement, avec des prix de l'or en hausse de plus de 300% au cours de la dernière décennie, il est plus difficile que jamais.", 'En décembre dernier, les économistes Martin Feldstein et Nouriel Roubini ont chacun écrit des éditoriaux remettant courageusement en question le sentiment haussier du marché, soulignant raisonnablement les risques de l’or.', 'Ne le sauriez-vous pas ?', 'Depuis que leurs articles sont apparus, le prix de l’or a encore augmenté. Les prix de l’or ont même atteint un record de 1 300 $ récemment.', 'En décembre dernier, de nombreux bugs d’or arguaient que le prix se dirigeait inévitablement vers 2 000 $.', "Maintenant, enhardis par l'appréciation continue, certains suggèrent que l'or pourrait être dirigé encore plus haut que cela.", 'Un investisseur en or qui a réussi m’a récemment expliqué que les cours boursiers avaient stagné pendant 

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['10 000 $ Or?', 'SAN FRANCISCO – Il n’a jamais été facile d’avoir une conversation rationnelle sur la valeur de l’or.', 'Récemment, les prix de l’or ayant augmenté de plus de 300 % au cours de la dernière décennie, il est plus difficile que jamais.', 'Tout juste en décembre dernier, les collègues économistes Martin Feldstein et Nouriel Roubini ont rédigé des op-eds mettant courageusement en question le sentiment bull', 'Ne le saviez-vous pas?', 'Depuis leur apparition, le prix de l’or a encore augmenté, atteignant récemment un niveau record de 1 300 $.', 'En décembre dernier, de nombreux bêtes d’or ont fait valoir que le prix de l’or allait inévitablement s’établir à 2 000 $.', 'Aujourd’hui, encouragés par l’appréciation continue, certains suggèrent que l’or pourrait s’élever encore plus haut.', 'Un investisseur aurifère prospère m’a récemment expliqué que les cours des actions ont stagné pendant plus d’une décennie avant que l’indice Dow Jones ne franchisse', "Depuis, l'indice a augm

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

['10 000 $ Or?', 'SAN FRANCISCO – Il n’a jamais été facile d’avoir une conversation rationnelle sur la valeur de l’or.', "Plus tard, les prix de l'or ayant augmenté de plus de 300% au cours de la dernière décennie, c'est plus difficile que jamais.", 'Tout juste en décembre dernier, des collègues économistes Martin Feldstein et Nouriel Roubini ont chacun rédigé des op-eds mettant en doute courageusement le sentiment', 'Ne le savez-vous pas?', "Depuis l'apparition de leurs articles, le prix de l'or a encore augmenté, et les prix de l'or ont même atteint un record de 1 300 $ récemment.", "En décembre dernier, de nombreux bogues d'or étaient d'avis que le prix était inévitablement fixé à 2 000 $.", "Maintenant, encouragés par la continuité de l'appréciation, certains suggèrent que l'or pourrait être plus élevé que celui-ci.", 'Un investisseur d’or réussi m’a récemment expliqué que les cours d’actions n’étaient plus d’une décennie avant que l’indice Dow Jones ne franchisse', "Depuis lors, l