# Downloading dataset 

In [7]:
import datasets
from datasets import load_dataset

In [8]:
# Get user's home directory
import os
home = os.path.expanduser("~")

# Define the path of the cache directory
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Define the name and configuration of the dataset
dataset_name = "wmt14"
config_name = "fr-en"

# Build the path for the specific dataset configuration
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# Check if the dataset configuration is already cached
if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0:
    print("Dataset already downloaded, loading from cache.")
    # If the dataset is already downloaded, load it from the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)
else:
    print("Downloading the dataset.")
    # Download the dataset and specify the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Here, you should adjust the loading of subsets to avoid redundant downloads or loading.
# Load 50k rows of the train dataset
train_dataset = dataset["train"].select(range(100020))
# train_dataset = dataset["train"].select(range(600))

# Keep the full valid and test datasets
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

Checking cache at: /home/paperspace/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [3]:
test_dataset

Dataset({
    features: ['translation'],
    num_rows: 3003
})

In [9]:
texts =[]
labels = []
for element in test_dataset["translation"]:
        # print("element: ", element)
        texts.append(element["en"])
        labels.append(element["fr"])

In [13]:
import evaluate
metric = evaluate.load("sacrebleu")
import os
getpwd = os.getcwd()

In [9]:
# Specify the file path

file_path_en = os.path.join(getpwd, "original_english.txt")
# file_path = "/path/to/translations.txt"

# Open the file in write mode
with open(file_path_en, "w") as file:
    # Write each translation to the file
    for text in texts:
        file.write(text + "\n")

In [10]:
# Specify the file path
# import os
# getpwd = os.getcwd()
file_path_fr = os.path.join(getpwd, "original_french.txt")
# file_path = "/path/to/translations.txt"

# Open the file in write mode
with open(file_path_fr, "w") as file:
    # Write each translation to the file
    for label in labels:
        file.write(label + "\n")

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# google-t5/t5-small

In [17]:
# # Generate multiple translations in batched format


# # Initialize the tokenizer and model
tokenizer_t5_small_pretrained = AutoTokenizer.from_pretrained("google-t5/t5-small")
model_t5_small_pretrained = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

translations_t5_small_pretrained = []
# for text in tqdm(texts):
for idx, text in tqdm(enumerate(texts), desc="Translating", total=len(texts)):
    # print("Original English Sentence: ", text)
    # print("Original French Sentence: ", labels[idx])
    inputs_t5_small_pretrained = tokenizer_t5_small_pretrained(text, return_tensors="pt").input_ids
    outputs_t5_small_pretrained = model_t5_small_pretrained.generate(inputs_t5_small_pretrained, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    translation_t5_small_pretrained = tokenizer_t5_small_pretrained.decode(outputs_t5_small_pretrained[0], skip_special_tokens=True)
    # print("translated french sentences:",translation_t5_small_pretrained)
    translations_t5_small_pretrained.append(translation_t5_small_pretrained)
    # print("\n")

# Specify the file path
file_path_t5_small_pretrained = os.path.join(getpwd, "translated_french_by_t5_small_pretrained.txt")

# Open the file in write mode
with open(file_path_t5_small_pretrained, "w") as file:
    # Write each translation to the file
    for translation in translations_t5_small_pretrained:
        file.write(translation + "\n")


Translating: 100%|██████████| 3003/3003 [23:23<00:00,  2.14it/s]


In [18]:
result_t5_small_pretrained = metric.compute(predictions=translations_t5_small_pretrained, references=labels)
result_t5_small_pretrained = {"bleu": result_t5_small_pretrained["score"]}
result_t5_small_pretrained

{'bleu': 2.041124318007492}

# google-t5/t5-base

In [None]:
# # Initialize the tokenizer and model
tokenizer_t5_base_pretrained = AutoTokenizer.from_pretrained("google-t5/t5-base")
model_t5_base_pretrained = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

translations_t5_base_pretrained = []
# for text in tqdm(texts):
for idx, text in tqdm(enumerate(texts), desc="Translating", total=len(texts)):
    # print("Original English Sentence: ", text)
    # print("Original French Sentence: ", labels[idx])
    inputs_t5_base_pretrained = tokenizer_t5_base_pretrained(text, return_tensors="pt").input_ids
    outputs_t5_base_pretrained = model_t5_base_pretrained.generate(inputs_t5_base_pretrained, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    translation_t5_base_pretrained = tokenizer_t5_base_pretrained.decode(outputs_t5_base_pretrained[0], skip_special_tokens=True)
    # print("translated french sentences:",translation_t5_small_pretrained)
    translations_t5_base_pretrained.append(translation_t5_base_pretrained)
    # print("\n")

# Specify the file path
file_path_t5_base_pretrained = os.path.join(getpwd, "translated_french_by_t5_base_pretrained.txt")

# Open the file in write mode
with open(file_path_t5_base_pretrained, "w") as file:
    # Write each translation to the file
    for translation in translations_t5_base_pretrained:
        file.write(translation + "\n")

In [None]:
result_t5_base_pretrained = metric.compute(predictions=translations_t5_base_pretrained, references=labels)
result_t5_base_pretrained = {"bleu": result_t5_base_pretrained["score"]}
result_t5_base_pretrained

# Helsinki-NLP/opus-mt-en-fr

In [16]:
# # Initialize the tokenizer and model
tokenizer_opus_mt_en_fr_pretrained = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model_opus_mt_en_fr_pretrained = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

translations_opus_mt_en_fr_pretrained = []
# for text in tqdm(texts):
for idx, text in tqdm(enumerate(texts), desc="Translating", total=len(texts)):
    # print("Original English Sentence: ", text)
    # print("Original French Sentence: ", labels[idx])
    inputs_opus_mt_en_fr_pretrained= tokenizer_opus_mt_en_fr_pretrained(text, return_tensors="pt").input_ids
    outputs_opus_mt_en_fr_pretrained = model_opus_mt_en_fr_pretrained.generate(inputs_opus_mt_en_fr_pretrained, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    translation_opus_mt_en_fr_pretrained = tokenizer_opus_mt_en_fr_pretrained.decode(outputs_opus_mt_en_fr_pretrained[0], skip_special_tokens=True)
    # print("translated french sentences:",translation_t5_small_pretrained)
    translations_opus_mt_en_fr_pretrained.append(translation_opus_mt_en_fr_pretrained)
    # print("\n")

# Specify the file path
file_path_opus_mt_en_fr_pretrained = os.path.join(getpwd, "translated_french_by_opus_mt_en_fr_pretrained.txt")

# Open the file in write mode
with open(file_path_opus_mt_en_fr_pretrained, "w") as file:
    # Write each translation to the file
    for translation in translations_opus_mt_en_fr_pretrained:
        file.write(translation + "\n")

Translating: 100%|██████████| 3003/3003 [55:23<00:00,  1.11s/it]  


In [17]:
result_opus_mt_en_fr_pretrained = metric.compute(predictions=translations_opus_mt_en_fr_pretrained, references=labels)
result_opus_mt_en_fr_pretrained = {"bleu": result_opus_mt_en_fr_pretrained["score"]}
result_opus_mt_en_fr_pretrained

{'bleu': 33.13527604394126}

# sriram-sanjeev9s/T5_wmt14_En_Fr_1million

In [19]:
tokenizer_t5_finetuned_wmt14_1mil = AutoTokenizer.from_pretrained("sriram-sanjeev9s/T5_wmt14_En_Fr_1million")
model_t5_finetuned_wmt14_1mil = AutoModelForSeq2SeqLM.from_pretrained("sriram-sanjeev9s/T5_wmt14_En_Fr_1million")

translations_t5_finetuned_wmt14_1mil = []
# for text in tqdm(texts):
for idx, text in tqdm(enumerate(texts), desc="Translating", total=len(texts)):
    # print("english sentence: ", text)
    # print("Original French Sentence: ", labels[texts.index(text)])
    inputs_t5_finetuned_wmt14_1mil = tokenizer_t5_finetuned_wmt14_1mil(text, return_tensors="pt").input_ids
    # outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    outputs_t5_finetuned_wmt14_1mil = model_t5_finetuned_wmt14_1mil.generate(inputs_t5_finetuned_wmt14_1mil, max_length=60, num_beams=5, early_stopping=True)
    translation_t5_finetuned_wmt14_1mil = tokenizer_t5_finetuned_wmt14_1mil.decode(outputs_t5_finetuned_wmt14_1mil[0], skip_special_tokens=True)
    # print("translated french sentences:",translation_t5_finetuned_wmt14_1mil)
    translations_t5_finetuned_wmt14_1mil.append(translation_t5_finetuned_wmt14_1mil)
    # print("\n")

# Specify the file path
file_path_t5_finetuned_wmt14_1mil = os.path.join(getpwd, "translated_french_by_t5_finetuned_wmt14_1mil.txt")

# Open the file in write mode
with open(file_path_t5_finetuned_wmt14_1mil, "w") as file:
    # Write each translation to the file
    for translation in translations_t5_finetuned_wmt14_1mil:
        file.write(translation + "\n")


Translating: 100%|██████████| 3003/3003 [38:09<00:00,  1.31it/s]


# ######################################

In [21]:
result_t5_finetuned_wmt14_1mil = metric.compute(predictions=translations_t5_finetuned_wmt14_1mil, references=labels)
result_t5_finetuned_wmt14_1mil = {"bleu": result_t5_finetuned_wmt14_1mil["score"]}
result_t5_finetuned_wmt14_1mil

{'bleu': 21.71904404108645}

# sriram-sanjeev9s/opus-mt-en-fr_wmt14_En_Fr_1million_20epochs

In [14]:
tokenizer_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = AutoTokenizer.from_pretrained("sriram-sanjeev9s/opus-mt-en-fr_wmt14_En_Fr_1million_20epochs")
model_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = AutoModelForSeq2SeqLM.from_pretrained("sriram-sanjeev9s/opus-mt-en-fr_wmt14_En_Fr_1million_20epochs")

translations_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = []
# for text in tqdm(texts):
for idx, text in tqdm(enumerate(texts), desc="Translating", total=len(texts)):
    # print("english sentence: ", text)
    # print("Original French Sentence: ", labels[texts.index(text)])
    inputs_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = tokenizer_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs(text, return_tensors="pt").input_ids
    # outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    outputs_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = model_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs.generate(inputs_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs, max_length=60, num_beams=5, early_stopping=True)
    translation_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = tokenizer_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs.decode(outputs_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs[0], skip_special_tokens=True)
    # print("translated french sentences:",translation_t5_finetuned_wmt14_1mil)
    translations_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs.append(translation_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs)
    # print("\n")

# Specify the file path
file_path_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = os.path.join(getpwd, "translated_french_by_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs.txt")

# Open the file in write mode
with open(file_path_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs, "w") as file:
    # Write each translation to the file
    for translation in translations_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs:
        file.write(translation + "\n")

Translating: 100%|██████████| 3003/3003 [03:36<00:00, 13.84it/s]


In [15]:
result_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = metric.compute(predictions=translations_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs, references=labels)
result_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs = {"bleu": result_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs["score"]}
result_opus_mt_en_fr_wmt14_En_Fr_1million_20epochs

{'bleu': 0.0}

# sriram-sanjeev9s/T5_base_wmt14_En_Fr_1million

In [24]:
tokenizer_t5_base_finetuned_wmt14_1mil = AutoTokenizer.from_pretrained("sriram-sanjeev9s/T5_base_wmt14_En_Fr_1million")
model_t5_base_finetuned_wmt14_1mil = AutoModelForSeq2SeqLM.from_pretrained("sriram-sanjeev9s/T5_base_wmt14_En_Fr_1million")

translations_t5_base_finetuned_wmt14_1mil = []
# for text in tqdm(texts):
for idx, text in tqdm(enumerate(texts), desc="Translating", total=len(texts)):
    # print("english sentence: ", text)
    # print("Original French Sentence: ", labels[texts.index(text)])
    inputs_t5_base_finetuned_wmt14_1mil = tokenizer_t5_base_finetuned_wmt14_1mil(text, return_tensors="pt").input_ids
    # outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    outputs_t5_base_finetuned_wmt14_1mil = model_t5_base_finetuned_wmt14_1mil.generate(inputs_t5_base_finetuned_wmt14_1mil, max_length=60, num_beams=5, early_stopping=True)
    translation_t5_base_finetuned_wmt14_1mil = tokenizer_t5_base_finetuned_wmt14_1mil.decode(outputs_t5_base_finetuned_wmt14_1mil[0], skip_special_tokens=True)
    # print("translated french sentences:",translation_t5_finetuned_wmt14_1mil)
    translations_t5_base_finetuned_wmt14_1mil.append(translation_t5_base_finetuned_wmt14_1mil)
    # print("\n")

# Specify the file path
file_path_t5_base_finetuned_wmt14_1mil = os.path.join(getpwd, "translated_french_by_t5_base_finetuned_wmt14_1mil.txt")

# Open the file in write mode
with open(file_path_t5_base_finetuned_wmt14_1mil, "w") as file:
    # Write each translation to the file
    for translation in translations_t5_base_finetuned_wmt14_1mil:
        file.write(translation + "\n")

Translating: 100%|██████████| 3003/3003 [1:29:14<00:00,  1.78s/it]


In [25]:
result_t5_base_finetuned_wmt14_1mil = metric.compute(predictions=translations_t5_base_finetuned_wmt14_1mil, references=labels)
result_t5_base_finetuned_wmt14_1mil = {"bleu": result_t5_base_finetuned_wmt14_1mil["score"]}
result_t5_base_finetuned_wmt14_1mil

{'bleu': 23.78058757171012}