# Colab FineTuning

In [1]:
import datasets
# from datasets import load_dataset

In [2]:
import os
# Switch off the W&B logging integration for this session.
# NOTE: the training-arguments cell further down emits a warning that this
# variable is deprecated and that `report_to` should be used instead.
os.environ["WANDB_DISABLED"]="true"

In [3]:
# Hub identifier of the English-to-French checkpoint that is loaded for fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

In [4]:
from datasets import load_dataset, load_metric
import evaluate
# Disabled alternatives (different dataset / older metric loader):
# raw_datasets = load_dataset("wmt16", "ro-en")
# metric = load_metric("sacrebleu")
# SacreBLEU metric object; compute_metrics() reads its "score" field during evaluation.
metric = evaluate.load("sacrebleu")

In [5]:
# Locate the local Hugging Face datasets cache and load WMT14 fr-en through it.
import os

# Number of training pairs used for fine-tuning (the full train split is not used).
TRAIN_SUBSET_SIZE = 1_000_000

home = os.path.expanduser("~")
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Name and configuration of the dataset
dataset_name = "wmt14"
config_name = "fr-en"

# Directory where this dataset configuration is cached
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# load_dataset() is called the same way whether or not the cache is populated,
# so the check only decides which status message is printed.
if os.path.isdir(dataset_config_path) and os.listdir(dataset_config_path):
    print("Dataset already downloaded, loading from cache.")
else:
    print("Downloading the dataset.")
dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Take the first TRAIN_SUBSET_SIZE rows, capped at the split size so that a
# smaller split does not make select() fail on out-of-range indices.
train_dataset = dataset["train"].select(
    range(min(TRAIN_SUBSET_SIZE, len(dataset["train"])))
)

# Keep the full validation and test splits
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

Checking cache at: /root/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [6]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=5):
    """Render `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()`, indexing with a list of row
            indices, and a `features` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If `num_examples` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry loop is needed to
    # keep the picked indices unique.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of the integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))
show_random_elements(dataset["train"])

Unnamed: 0,translation
0,"{'en': 'Pursuant to the Minister of Transport's direction, Cargolux Airlines International, S.A. (hereinafter Cargolux) was originally granted the authorization set out above on August 9, 1984, and renewals of the authorization were granted each year thereafter.', 'fr': 'Conformément à la directive du ministre des Transports, Cargolux Airlines International, S.A. (ci-après Cargolux) s'est vu initialement accorder l'autorisation susmentionnée le 9 août 1984. Cette autorisation a par la suite été renouvelée d'année en année.'}"
1,"{'en': 'See chap.', 'fr': 'Voir chap.'}"
2,"{'en': 'Subject to rule 33, decisions of the Assembly on all matters of substance shall be taken by a two-thirds majority of the representatives present and voting.', 'fr': 'Sous réserve de l'article 33, les décisions de l'Assemblée sur toutes les questions de fond sont prises à la majorité des deux tiers des représentants présents et votants.'}"
3,"{'en': 'The Board reported in paragraph 42 of its previous report (A/57/5, vol. II, chap.', 'fr': 'Au paragraphe 42 de son rapport précédent (A/57/5, vol. II, chap. II), le Comité indiquait qu'un montant de 6,4 millions de dollars des contributions volontaires à recevoir au titre de la Force des Nations Unies chargée du maintien de la paix à Chypre (UNFICYP) était impayé depuis plus de huit ans.'}"
4,"{'en': 'Moreover, dogs, cats, Raccoons, Coyotes, and vehicular traffic also cause Five-lined Skink deaths.', 'fr': 'Enfin, les chiens, les chats, les ratons laveurs, les coyotes et la circulation routiÃ¨re causent Ã©galement la mortalitÃ© des scinques pentalignes.'}"


In [9]:
from transformers import AutoTokenizer, MarianMTModel, AutoModelForSeq2SeqLM

# Language pair that selects the checkpoint (source -> target).
src, trg = "en", "fr"

# Hub identifier assembled from the language pair.
checkpoint_name = "Helsinki-NLP/opus-mt-{}-{}".format(src, trg)

# NOTE: despite its name, `checkpoint` holds the loaded model object.
checkpoint = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)



In [10]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "fr"
def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: Batch whose "translation" entry is a list of dicts keyed by
            language code (`source_lang` and `target_lang` are read from each).

    Returns:
        The tokenizer output for the source sentences, with an added "labels"
        entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [prefix + ex[source_lang] for ex in pairs]
    targets = [ex[target_lang] for ex in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target` tokenizes the sentences in target-language mode; it replaces
    # the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
# Tokenize both splits in batches with preprocess_function. The recorded
# progress bars show 1,000,000 training rows and 3,000 validation rows.
train_tokenized_datasets = train_dataset.map(preprocess_function, batched=True)
valid_tokenized_datasets = valid_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [12]:
# Display the checkpoint identifier that the trainer's model is loaded from.
model_checkpoint

'Helsinki-NLP/opus-mt-en-fr'

In [13]:
from transformers import DataCollatorForSeq2Seq

# `model` has to be a model object, not the checkpoint name: the collator looks
# for `prepare_decoder_input_ids_from_labels` on it, and a plain string has no
# such method, so that step was silently skipped. `checkpoint` is the model
# instance loaded from `checkpoint_name` in an earlier cell.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, max_length=128)

In [14]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Model instance that is handed to the trainer. `model_checkpoint` has the same
# value as `checkpoint_name`, so this loads the same pretrained weights that
# `checkpoint` was created from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)



In [16]:
from huggingface_hub import notebook_login

# Interactive Hub login widget; the training arguments below set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Per-device batch size shared by training and evaluation.
batch_size = 16
# Last path component of the checkpoint id; used to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation runs once per epoch and scores generated text.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Checkpointing / publishing
    save_total_limit=3,
    push_to_hub=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [22]:
import numpy as np
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A pair `(preds, labels)`: a list of stripped predictions, and a list in
        which every stripped reference is wrapped in its own one-element list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels
def compute_metrics(eval_preds):
    """Turn generated token ids and label ids into BLEU and mean generation length.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays; `preds` may be a
            tuple, in which case its first element is used.

    Returns:
        Dict with "bleu" (the metric's "score" field) and "gen_len" (mean number
        of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # It is replaced in the predictions as well as in the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Generation length counts only real tokens; padding is excluded.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [23]:
# Wire the model, arguments, tokenized splits, collator and metric callback together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=valid_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
# Run fine-tuning; `args` requests an evaluation pass at the end of every epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.9379,1.265381,32.3231,29.4617
2,0.9105,1.265185,32.1335,29.421




TrainOutput(global_step=15626, training_loss=0.9355023573413177, metrics={'train_runtime': 5005.5576, 'train_samples_per_second': 399.556, 'train_steps_per_second': 3.122, 'total_flos': 5.245765192935014e+16, 'train_loss': 0.9355023573413177, 'epoch': 2.0})

In [26]:
# Save into the same directory name that was passed to the training arguments.
trainer.save_model( f"{model_name}-finetuned-{source_lang}-to-{target_lang}")

pytorch_model.bin:   0%|          | 0.00/299M [00:00<?, ?B/s]

# Testing the PreTrained model on TEST DATA of WMT14

In [None]:

import random
import numpy as np
from datasets import load_dataset
import evaluate
import os
import torch
import dill
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# seed = 1234
# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

# Resolve the Hugging Face datasets cache under the user's home directory.
import os
home = os.path.expanduser("~")
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Dataset name and language-pair configuration.
dataset_name, config_name = "wmt14", "fr-en"

# Cache directory for this specific dataset configuration.
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# The same load_dataset() call is made in both cases; only the message differs.
print(
    "Dataset already downloaded, loading from cache."
    if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0
    else "Downloading the dataset."
)
dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Validation and test splits are used in full.
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Split the test pairs into two parallel lists: English sources and French references.
translation_pairs = test_dataset["translation"]
texts = [pair["en"] for pair in translation_pairs]
labels = [pair["fr"] for pair in translation_pairs]

# metric = evaluate.load("sacrebleu")
# Metric objects used to score the translations against the references.
bleu_metric = evaluate.load("sacrebleu")
meteor_metric = evaluate.load("meteor")
rouge_metric = evaluate.load("rouge")
ter_metric = evaluate.load("ter")
# chrF and BLEURT are currently disabled.
# chrf_metric = evaluate.load("chrf")
# bleurt_metric = evaluate.load("bleurt")
comet_metric = evaluate.load("comet")
# Working directory captured once; a later cell builds output file paths from it.
getpwd = os.getcwd()

checkpoint_path_tokenizer = 'Helsinki-NLP/opus-mt-en-fr'
# Model and tokenizer are loaded from the same checkpoint identifier.
generator2_checkpoint = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path_tokenizer)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path_tokenizer)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

batch_size = 40  # Adjust this based on your GPU's memory capacity

translations_batch_PreTrained = []

# Put the model on the same device the inputs are sent to. Previously the model
# was wrapped in DataParallel and unwrapped again immediately (a no-op), and
# .cuda() was called unconditionally, which fails on a machine without a GPU
# even though `device` had already been chosen.
generator2_checkpoint.to(device)
generator2_checkpoint.eval()

# Process texts in batches
for i in tqdm(range(0, len(texts), batch_size), desc="Translating batches"):
    batch = texts[i:i + batch_size]
    # Pad only up to the longest sentence in the batch (still truncated at 128
    # tokens) and keep the attention mask, so generate() is told explicitly
    # which positions are padding instead of having to infer it.
    encoded = tokenizer(batch, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
    # NOTE(review): max_length=60 caps the output below the 128-token input
    # limit; long sentences may be cut short. Confirm this is intended.
    outputs = generator2_checkpoint.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=60,
        num_beams=5,
        early_stopping=True,
    )

    # Decode all outputs in the batch
    batch_translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    translations_batch_PreTrained.extend(batch_translation)

# Score the pretrained model's translations against the French references.
# Every metric receives the same predictions/references pair.
scoring_inputs = {"predictions": translations_batch_PreTrained, "references": labels}
result_batch = {
    "bleu": bleu_metric.compute(**scoring_inputs)["score"],
    "meteor": meteor_metric.compute(**scoring_inputs)["meteor"],
    # ROUGE returns several sub-scores; the whole result is kept.
    "rouge": rouge_metric.compute(**scoring_inputs),
    "ter": ter_metric.compute(**scoring_inputs)["score"],
    # COMET additionally takes the English source sentences.
    "comet": comet_metric.compute(sources=texts, **scoring_inputs)["mean_score"],
}


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


Checking cache at: /root/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  warn(f"Failed to load image Python extension: {e}")
  torch.utils._pytree._register_pytree_node(
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 29916.58it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/envs/preprocess_bert_udem_eval/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the m

Using device: cuda


Translating batches: 100%|██████████| 76/76 [00:48<00:00,  1.55it/s]
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX 6000 Ada Generation') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


In [None]:
# Display the scores of the pretrained model.
result_batch

{'bleu': 39.1440990614117,
 'meteor': 0.6451421562380347,
 'rouge': {'rouge1': 0.6806146771227544,
  'rouge2': 0.4905935533562486,
  'rougeL': 0.6468490327553935,
  'rougeLsum': 0.6469465804494383},
 'ter': 48.41090893423906,
 'comet': 0.8455673381165191}

In [None]:
# Write one "<heading><value>" line per metric, in a fixed order.
with open("results_translations_batch_PreTrained", "w") as f:
    f.writelines(
        heading + str(result_batch[metric_key]) + "\n"
        for heading, metric_key in (
            ("BLEU Score: ", "bleu"),
            ("METEOR Score: ", "meteor"),
            ("ROUGE Scores: ", "rouge"),
            ("TER Score: ", "ter"),
            ("COMET Score: ", "comet"),
        )
    )

In [None]:
# Save the translations to a text file, one translation per line.
import os
file_path = os.path.join(os.getcwd(), "translations_batch_PreTrained")
# Explicit UTF-8: the French output contains accented characters, and the
# platform's default encoding is not guaranteed to be able to represent them.
with open(file_path, "w", encoding="utf-8") as file:
    for translation in translations_batch_PreTrained:
        file.write(translation + "\n")


In [None]:
# English source sentences, one per line. UTF-8 is set explicitly so the result
# does not depend on the platform's default encoding.
file_path_en = os.path.join(getpwd, "original_english_translations_batch_PreTrained.txt")
with open(file_path_en, "w", encoding="utf-8") as file:
    for text in texts:
        file.write(text + "\n")


# French reference translations, one per line, in the same order as the sources.
file_path_fr = os.path.join(getpwd, "original_french_translations_batch_PreTrained.txt")
with open(file_path_fr, "w", encoding="utf-8") as file:
    for label in labels:
        file.write(label + "\n")

In [None]:
# Testing the FineTuned model on TEST DATA of WMT14

In [None]:

import random
import numpy as np
from datasets import load_dataset
import evaluate
import os
import torch
import dill
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# seed = 1234
# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)

# Resolve the Hugging Face datasets cache under the user's home directory.
import os
home = os.path.expanduser("~")
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Dataset name and language-pair configuration.
dataset_name, config_name = "wmt14", "fr-en"

# Cache directory for this specific dataset configuration.
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# The same load_dataset() call is made in both cases; only the message differs.
print(
    "Dataset already downloaded, loading from cache."
    if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0
    else "Downloading the dataset."
)
dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Validation and test splits are used in full.
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Split the test pairs into two parallel lists: English sources and French references.
translation_pairs = test_dataset["translation"]
texts = [pair["en"] for pair in translation_pairs]
labels = [pair["fr"] for pair in translation_pairs]

# metric = evaluate.load("sacrebleu")
# Metric objects used to score the translations against the references.
bleu_metric = evaluate.load("sacrebleu")
meteor_metric = evaluate.load("meteor")
rouge_metric = evaluate.load("rouge")
ter_metric = evaluate.load("ter")
# chrF and BLEURT are currently disabled.
# chrf_metric = evaluate.load("chrf")
# bleurt_metric = evaluate.load("bleurt")
comet_metric = evaluate.load("comet")
# Working directory captured once; a later cell builds output file paths from it.
getpwd = os.getcwd()

checkpoint_path_tokenizer = 'sriram-sanjeev9s/opus-mt-en-fr-finetuned-en-to-fr'
# Model and tokenizer are loaded from the same checkpoint identifier.
generator2_checkpoint = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path_tokenizer)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path_tokenizer)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

batch_size = 40  # Adjust this based on your GPU's memory capacity

translations_batch_fineTuned = []

# Put the model on the same device the inputs are sent to. Previously the model
# was wrapped in DataParallel and unwrapped again immediately (a no-op), and
# .cuda() was called unconditionally, which fails on a machine without a GPU
# even though `device` had already been chosen.
generator2_checkpoint.to(device)
generator2_checkpoint.eval()

# Process texts in batches
for i in tqdm(range(0, len(texts), batch_size), desc="Translating batches"):
    batch = texts[i:i + batch_size]
    # Pad only up to the longest sentence in the batch (still truncated at 128
    # tokens) and keep the attention mask, so generate() is told explicitly
    # which positions are padding instead of having to infer it.
    encoded = tokenizer(batch, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
    # NOTE(review): max_length=60 caps the output below the 128-token input
    # limit; long sentences may be cut short. Confirm this is intended.
    outputs = generator2_checkpoint.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=60,
        num_beams=5,
        early_stopping=True,
    )

    # Decode all outputs in the batch
    batch_translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    translations_batch_fineTuned.extend(batch_translation)

# Score the fine-tuned model's translations against the French references.
# Every metric receives the same predictions/references pair.
scoring_inputs = {"predictions": translations_batch_fineTuned, "references": labels}
result_batch_ft = {
    "bleu": bleu_metric.compute(**scoring_inputs)["score"],
    "meteor": meteor_metric.compute(**scoring_inputs)["meteor"],
    # ROUGE returns several sub-scores; the whole result is kept.
    "rouge": rouge_metric.compute(**scoring_inputs),
    "ter": ter_metric.compute(**scoring_inputs)["score"],
    # COMET additionally takes the English source sentences.
    "comet": comet_metric.compute(sources=texts, **scoring_inputs)["mean_score"],
}


Checking cache at: /root/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 46397.17it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/envs/preprocess_bert_udem_eval/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
  return torch.loa

Using device: cuda


Translating batches: 100%|██████████| 76/76 [00:49<00:00,  1.54it/s]
Using default tokenizer.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


In [None]:
# Display the scores of the fine-tuned model.
result_batch_ft

{'bleu': 36.82378117644398,
 'meteor': 0.627017352608584,
 'rouge': {'rouge1': 0.6569234190744335,
  'rouge2': 0.46299227829079226,
  'rougeL': 0.6235961083895429,
  'rougeLsum': 0.6236485242634786},
 'ter': 50.583791701971826,
 'comet': 0.8286577677710867}

In [None]:
# Write one "<heading><value>" line per metric, in a fixed order.
with open("results_translations_batch_fineTuned", "w") as f:
    f.writelines(
        heading + str(result_batch_ft[metric_key]) + "\n"
        for heading, metric_key in (
            ("BLEU Score: ", "bleu"),
            ("METEOR Score: ", "meteor"),
            ("ROUGE Scores: ", "rouge"),
            ("TER Score: ", "ter"),
            ("COMET Score: ", "comet"),
        )
    )

In [10]:
# Save the translations to a text file, one translation per line.
import os
file_path = os.path.join(os.getcwd(), "translations_batch_fineTuned.txt")
# Explicit UTF-8: the French output contains accented characters, and the
# platform's default encoding is not guaranteed to be able to represent them.
with open(file_path, "w", encoding="utf-8") as file:
    for translation in translations_batch_fineTuned:
        file.write(translation + "\n")


In [11]:
# English source sentences, one per line. UTF-8 is set explicitly so the result
# does not depend on the platform's default encoding.
file_path_en = os.path.join(getpwd, "original_english_translations_batch_fineTuned.txt")
with open(file_path_en, "w", encoding="utf-8") as file:
    for text in texts:
        file.write(text + "\n")


# French reference translations, one per line, in the same order as the sources.
file_path_fr = os.path.join(getpwd, "original_french_translations_batch_fineTuned.txt")
with open(file_path_fr, "w", encoding="utf-8") as file:
    for label in labels:
        file.write(label + "\n")

# Evaluating on Different Datasets 

# PreTrained 

In [None]:
# Load the TED talks corpus for out-of-domain evaluation.
# NOTE(review): no output is recorded for this cell — confirm that "en-fr" is a
# configuration name this dataset accepts.
dataset_tedtalks = load_dataset("ted_talks_iwslt", "en-fr", trust_remote_code=True)

# FineTuned

# GAN-KD Best Model

# ProtoTyping

In [2]:
from datasets import list_datasets

# List all available datasets
# NOTE(review): the recorded output shows a warning pointing at this call;
# check whether list_datasets is deprecated in the installed `datasets` version.
datasets_list = list_datasets()

# Print the first 10 datasets
print(datasets_list[:10])


  from .autonotebook import tqdm as notebook_tqdm
  datasets_list = list_datasets()


['amirveyseh/acronym_identification', 'ade-benchmark-corpus/ade_corpus_v2', 'UCLNLP/adversarial_qa', 'Yale-LILY/aeslc', 'nwu-ctext/afrikaans_ner_corpus', 'fancyzhx/ag_news', 'allenai/ai2_arc', 'google/air_dialogue', 'komari6/ajgt_twitter_ar', 'legacy-datasets/allegro_reviews']


In [3]:
# Keep only the dataset identifiers that contain the word "translation".
translation_datasets = list(filter(lambda name: "translation" in name, datasets_list))

# Show the matching identifiers.
print(translation_datasets)

['ignatius/igbo_english_machine_translation', 'microsoft/msr_zhen_translation_parity', 'abidlabs/test-translation-dataset', 'persiannlp/parsinlu_translation_en_fa', 'persiannlp/parsinlu_translation_fa_en', 'shivam/test-translation-2', 'shivam/test-translation', 'svalabs/all-nli-german-translation-wmt19', 'botisan-ai/cantonese-mandarin-translations', 'atenglens/taiwanese_english_translation', 'DigitalUmuganda/kinyarwanda-english-machine-translation-dataset', 'VanessaSchenkel/translation-en-pt', 'Kamrani/en-fa-translation', 'open-source-metrics/translation-checkpoint-downloads', 'KETI-AIR/aihub_spoken_language_translation', 'woctordho/autotrain-data-lojban-translation', 'KETI-AIR/aihub_scitech_translation', 'KETI-AIR/aihub_koenzh_food_translation', 'KETI-AIR/aihub_scitech20_translation', 'KETI-AIR/aihub_socialtech20_translation', 'Sotaro0124/Ainu-Japan_translation_model', 'HuggingFace-CN-community/translation', 'swaption2009/20k-en-zh-translation-pinyin-hsk', 'DigitalUmuganda/monolingual

In [6]:
from datasets import load_dataset
# NOTE(review): the recorded progress bar shows this download at 37% (757 of
# 2048 files) after about 2.5 hours. Consider whether the whole French mC4
# configuration is really needed before re-running this cell.
dataset_ccrawl = load_dataset("mc4", "fr", trust_remote_code=True)
# print(dataset['train'][0])

Downloading data:  37%|███▋      | 757/2048 [2:27:19<3:04:04,  8.55s/files] 