In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
# tokenizer = AutoTokenizer.from_pretrained("sriram-sanjeev9s/T5_wmt14_En_Fr_1million")

In [18]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [19]:
inputs = tokenizer(text, return_tensors="pt").input_ids

In [20]:
from transformers import AutoModelForSeq2SeqLM

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("sriram-sanjeev9s/T5_wmt14_En_Fr_1million")
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [22]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Les lugumes en partagent leurs ressources avec les bactéries fixant l’azote.'

In [24]:
from nltk.translate.bleu_score import sentence_bleu

# Define the reference and candidate sentences
reference = ["translate English to French: Legumes share resources with nitrogen-fixing bacteria."]
candidate = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Compute the BLEU score
bleu_score = sentence_bleu([reference], candidate)

# Print the BLEU score
print("BLEU score:", bleu_score)


BLEU score: 0


# Downloading dataset 

In [25]:
import datasets
from datasets import load_dataset

In [26]:
# Get user's home directory
import os
home = os.path.expanduser("~")

# Define the path of the cache directory
cache_dir = os.path.join(home, ".cache", "huggingface", "datasets")

# Define the name and configuration of the dataset
dataset_name = "wmt14"
config_name = "fr-en"

# Build the path for the specific dataset configuration
dataset_config_path = os.path.join(cache_dir, dataset_name, config_name)

print(f"Checking cache at: {dataset_config_path}")

# Check if the dataset configuration is already cached
if os.path.exists(dataset_config_path) and len(os.listdir(dataset_config_path)) > 0:
    print("Dataset already downloaded, loading from cache.")
    # If the dataset is already downloaded, load it from the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)
else:
    print("Downloading the dataset.")
    # Download the dataset and specify the cache directory
    dataset = load_dataset(dataset_name, config_name, cache_dir=cache_dir)

# Here, you should adjust the loading of subsets to avoid redundant downloads or loading.
# Load 50k rows of the train dataset
# train_dataset = dataset["train"].select(range(100020))
train_dataset = dataset["train"].select(range(600))

# Keep the full valid and test datasets
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

Checking cache at: /home/paperspace/.cache/huggingface/datasets/wmt14/fr-en
Dataset already downloaded, loading from cache.


Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 6.45MB/s]
Downloading data: 100%|██████████| 252M/252M [00:16<00:00, 15.2MB/s] 
Downloading data: 100%|██████████| 241M/241M [00:14<00:00, 16.2MB/s] 
Downloading data: 100%|██████████| 243M/243M [00:15<00:00, 16.1MB/s] 
Downloading data: 100%|██████████| 247M/247M [00:15<00:00, 16.4MB/s] 
Downloading data: 100%|██████████| 242M/242M [00:13<00:00, 18.1MB/s] 
Downloading data: 100%|██████████| 238M/238M [00:16<00:00, 14.7MB/s] 
Downloading data: 100%|██████████| 240M/240M [00:15<00:00, 15.1MB/s] 
Downloading data: 100%|██████████| 241M/241M [00:15<00:00, 15.7MB/s] 
Downloading data: 100%|██████████| 242M/242M [00:16<00:00, 14.5MB/s] 
Downloading data: 100%|██████████| 239M/239M [00:15<00:00, 15.2MB/s] 
Downloading data: 100%|██████████| 239M/239M [00:14<00:00, 16.2MB/s] 
Downloading data: 100%|██████████| 241M/241M [00:14<00:00, 16.5MB/s] 
Downloading data: 100%|██████████| 241M/241M [00:14<00:00, 16.5MB/s] 
Downloading data:

In [27]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [28]:
tokenized_test_datasets = test_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 3003/3003 [00:00<00:00, 5256.25 examples/s]


In [31]:
# tokenized_test_datasets

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3003
})

In [32]:
# Access the 'translation' column
translations = tokenized_test_datasets['translation']

# Access the 'input_ids' column
input_ids = tokenized_test_datasets['input_ids']

# Access the 'labels' column
labels = tokenized_test_datasets['labels']


In [39]:
test_dataset

Dataset({
    features: ['translation'],
    num_rows: 3003
})

In [41]:
texts =[]
labels = []
for element in test_dataset["translation"]:
        # print("element: ", element)
        texts.append(element["en"])
        labels.append(element["fr"])

In [46]:
# # Generate multiple translations in batched format
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# # Assume 'texts' is your list of strings to translate
# # texts = ["Your first text here", "Your second text here", ...]
# texts = test_dataset['translation']['en']

# # Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

translations = []
for text in texts[:100]:
    print("english sentence: ", text)
    print("Original French Sentence: ", labels[texts.index(text)])
    inputs = tokenizer(text, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("translated french sentences:",translation)
    translations.append(translation)
"""   
# # Tokenize the batch of texts
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids

# # Generate translations in batch
outputs = model.generate(input_ids, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

# # Decode each translation and store the results
translations = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# # Optionally, print or process the translations
for translation in translations:
    print(translation)
"""

english sentence:  Spectacular Wingsuit Jump Over Bogota
Original French Sentence:  Spectaculaire saut en "wingsuit" au-dessus de Bogota
translated french sentences: "Gesin' on the other hand"
english sentence:  Sportsman Jhonathan Florez jumped from a helicopter above Bogota, the capital of Colombia, on Thursday.
Original French Sentence:  Le sportif Jhonathan Florez a sauté jeudi d'un hélicoptère au-dessus de Bogota, la capitale colombienne.
translated french sentences: Sportsman Jhonathan Florez, 38, jumped from a helicopter above Bogota, the capital of Colombia, on Thursday.
english sentence:  Wearing a wingsuit, he flew past over the famous Monserrate Sanctuary at 160km/h. The sanctuary is located at an altitude of over 3000 meters and numerous spectators had gathered there to watch his exploit.
Original French Sentence:  Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'altitude

'   \n# # Tokenize the batch of texts\ninputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids\n\n# # Generate translations in batch\noutputs = model.generate(input_ids, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)\n\n# # Decode each translation and store the results\ntranslations = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]\n\n# # Optionally, print or process the translations\nfor translation in translations:\n    print(translation)\n'

In [52]:
# # Generate multiple translations in batched format
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# # Assume 'texts' is your list of strings to translate
# # texts = ["Your first text here", "Your second text here", ...]
# texts = test_dataset['translation']['en']

# # Initialize the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

tokenizer = AutoTokenizer.from_pretrained("sriram-sanjeev9s/T5_wmt14_En_Fr_1million")
model = AutoModelForSeq2SeqLM.from_pretrained("sriram-sanjeev9s/T5_wmt14_En_Fr_1million")

translations = []
for text in texts[:100]:
    print("english sentence: ", text)
    print("Original French Sentence: ", labels[texts.index(text)])
    inputs = tokenizer(text, return_tensors="pt").input_ids
    # outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    outputs = model.generate(inputs, max_length=60, num_beams=5, early_stopping=True)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("translated french sentences:",translation)
    translations.append(translation)


english sentence:  Spectacular Wingsuit Jump Over Bogota
Original French Sentence:  Spectaculaire saut en "wingsuit" au-dessus de Bogota
translated french sentences: Spectacular Wingsuit Jump Over Bogota
english sentence:  Sportsman Jhonathan Florez jumped from a helicopter above Bogota, the capital of Colombia, on Thursday.
Original French Sentence:  Le sportif Jhonathan Florez a sauté jeudi d'un hélicoptère au-dessus de Bogota, la capitale colombienne.
translated french sentences: Sportman Jhonathan Florez jumped from a helicopter above Bogota, the capital of Colombia, on Thursday.
english sentence:  Wearing a wingsuit, he flew past over the famous Monserrate Sanctuary at 160km/h. The sanctuary is located at an altitude of over 3000 meters and numerous spectators had gathered there to watch his exploit.
Original French Sentence:  Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'alt

# ######################################

In [47]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import numpy as np


# Define the postprocess_text function
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Load the metric
metric = evaluate.load("sacrebleu")

# Define the compute_metrics function
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result




# t5-small

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, default_data_collator
from torch.utils.data import DataLoader

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Assuming tokenized_test_datasets is your dataset after tokenization and preprocessing
# tokenized_test_datasets = ...

# DataLoader for handling batches of data
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
test_dataloader = DataLoader(tokenized_test_datasets, batch_size=16, collate_fn=data_collator)

# Prepare the model for evaluation
model.eval()

translations = []  # Store the decoded translations
for batch in test_dataloader:
    # Move batch to GPU if using CUDA
    batch = {k: v.to(model.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']}
    
    # Generate outputs
    with torch.no_grad():
        outputs = model.generate(**batch, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    
    # Decode and store translations
    decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    translations.extend(decoded_outputs)

# Here, `translations` contains the translated texts
# To evaluate translations, you would compare them with the reference translations in your dataset

# outputs = model.generate(inputs.input_ids, max_length=60, num_beams=5, early_stopping=True)

In [53]:
# Example usage
# Ensure eval_preds is a tuple (predictions, labels) with appropriate format
# eval_preds = (outputs, labels)
# eval_preds = (translations, labels[:100])
# metrics = compute_metrics(eval_preds)
# print(metrics)
result = metric.compute(predictions=translations, references=labels[:100])
result = {"bleu": result["score"]}
result

{'bleu': 3.6134314586723937}

# sriram-sanjeev9s/T5_wmt14_En_Fr_1million

In [None]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Assuming you have 'inputs' and 'labels' ready (e.g., from a dataset)
# For example:
# inputs = tokenizer(["Your input text here", ...], return_tensors="pt", padding=True, truncation=True, max_length=512)
# labels = [...]

# Generate predictions (adjust as per your specific setup; this is just an example)
# outputs = model.generate(inputs.input_ids, max_length=60, num_beams=5, early_stopping=True)

In [None]:
# Example usage
# Ensure eval_preds is a tuple (predictions, labels) with appropriate format
eval_preds = (outputs, labels)
metrics = compute_metrics(eval_preds)
print(metrics)