In [1]:
%%capture
!pip install sacrebleu
!pip install evaluate
!pip install datasets

In [2]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the validation and testing datasets
validation_dataset = load_dataset('wmt16', 'de-en', split='validation')
test_dataset = load_dataset('wmt16', 'de-en', split='test')

# Load the t5-small model and tokenizer
model_name = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/8.73k [00:00<?, ?B/s]

Downloading and preparing dataset wmt16/de-en (download: 1.57 GiB, generated: 1.28 GiB, post-processed: Unknown size, total: 2.85 GiB) to /root/.cache/huggingface/datasets/wmt16/de-en/1.0.0/9e0038fe4cc117bd474d2774032cc133e355146ed0a47021b2040ca9db4645c0...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4548885 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2999 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /root/.cache/huggingface/datasets/wmt16/de-en/1.0.0/9e0038fe4cc117bd474d2774032cc133e355146ed0a47021b2040ca9db4645c0. Subsequent calls will reuse this data.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
test_dataset

Dataset({
    features: ['translation'],
    num_rows: 2999
})

In [5]:
  # Function to generate translations

  def generate_translation(batch):
      inputs = tokenizer(["translate English to German" + entry['en'] for entry in batch['translation']], return_tensors='pt', padding=True)
      outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], do_sample=False)
      preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      return {"translations": preds}

  # Generate translations for validation set
  validation_results = validation_dataset.map(generate_translation, batched=True, batch_size=4)

  # Generate translations for test set
  test_results = test_dataset.map(generate_translation, batched=True, batch_size=4)

  0%|          | 0/543 [00:00<?, ?ba/s]



  0%|          | 0/750 [00:00<?, ?ba/s]

In [7]:
# Calculate evaluation metrics
import evaluate

metric = evaluate.load('bleu')

2024-04-03 17:49:00.503787: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 17:49:00.503895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 17:49:00.690998: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [8]:
validation_results[0]

{'translation': {'de': 'Die Premierminister Indiens und Japans trafen sich in Tokio.',
  'en': 'India and Japan prime ministers meet in Tokyo'},
 'translations': 'In Tokio treffen sich Premierminister in Indien und Japan'}

In [9]:
preds_val = [ex['translations'] for ex in validation_results]
references_val = [ex['translation']['de'] for ex in validation_results]

preds_test = [ex['translations'] for ex in test_results]
references_test = [ex['translation']['de'] for ex in test_results]

In [11]:
score_val = metric.compute(predictions=preds_val, references=references_val)
score_test = metric.compute(predictions=preds_test, references=references_test)

In [12]:
print(score_val)
print(score_test)

{'bleu': 0.1121480385157142, 'precisions': [0.5469280831730179, 0.2994789327104483, 0.19122257053291536, 0.1298020630052969], 'brevity_penalty': 0.4441323116836975, 'length_ratio': 0.551988251242657, 'translation_length': 24431, 'reference_length': 44260}
{'bleu': 0.11971767311094333, 'precisions': [0.5625072784441598, 0.3244019138755981, 0.21118713409042816, 0.1442485902440948], 'brevity_penalty': 0.438440455293214, 'length_ratio': 0.548085975522188, 'translation_length': 34348, 'reference_length': 62669}


In [13]:
score_val = metric.compute(predictions=preds_val, references=references_val, max_order=2)
score_test = metric.compute(predictions=preds_test, references=references_test, max_order=2)
print(score_val)
print(score_test)

{'bleu': 0.17974657362098445, 'precisions': [0.5469280831730179, 0.2994789327104483], 'brevity_penalty': 0.4441323116836975, 'length_ratio': 0.551988251242657, 'translation_length': 24431, 'reference_length': 44260}
{'bleu': 0.18729077945943867, 'precisions': [0.5625072784441598, 0.3244019138755981], 'brevity_penalty': 0.438440455293214, 'length_ratio': 0.548085975522188, 'translation_length': 34348, 'reference_length': 62669}


In [14]:
score_val = metric.compute(predictions=preds_val, references=references_val, max_order=3)
score_test = metric.compute(predictions=preds_test, references=references_test, max_order=3)
print(score_val)
print(score_test)

{'bleu': 0.13999878106724378, 'precisions': [0.5469280831730179, 0.2994789327104483, 0.19122257053291536], 'brevity_penalty': 0.4441323116836975, 'length_ratio': 0.551988251242657, 'translation_length': 24431, 'reference_length': 44260}
{'bleu': 0.14809382726383616, 'precisions': [0.5625072784441598, 0.3244019138755981, 0.21118713409042816], 'brevity_penalty': 0.438440455293214, 'length_ratio': 0.548085975522188, 'translation_length': 34348, 'reference_length': 62669}


In [15]:
score_val = metric.compute(predictions=preds_val, references=references_val, max_order=4)
score_test = metric.compute(predictions=preds_test, references=references_test, max_order=4)
print(score_val)
print(score_test)

{'bleu': 0.1121480385157142, 'precisions': [0.5469280831730179, 0.2994789327104483, 0.19122257053291536, 0.1298020630052969], 'brevity_penalty': 0.4441323116836975, 'length_ratio': 0.551988251242657, 'translation_length': 24431, 'reference_length': 44260}
{'bleu': 0.11971767311094333, 'precisions': [0.5625072784441598, 0.3244019138755981, 0.21118713409042816, 0.1442485902440948], 'brevity_penalty': 0.438440455293214, 'length_ratio': 0.548085975522188, 'translation_length': 34348, 'reference_length': 62669}


In [17]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m902.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [18]:
#make a function which will calculate the BERTScore
def calculate_bert_score(preds, refs):
    from bert_score import score
    P, R, F1 = score(preds, refs, lang='de', verbose=True)
    return {"P": P.mean().item(), "R": R.mean().item(), "F1": F1.mean().item()}

# Calculate BERTScore
bert_score_val = calculate_bert_score(preds_val, references_val)
bert_score_test = calculate_bert_score(preds_test, references_test)

print(bert_score_val)
print(bert_score_test)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/68 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/34 [00:00<?, ?it/s]

done in 10.22 seconds, 212.30 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/93 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]



done in 12.79 seconds, 234.50 sentences/sec
{'P': 0.8086380362510681, 'R': 0.7521953582763672, 'F1': 0.7783204317092896}
{'P': 0.8144293427467346, 'R': 0.754437267780304, 'F1': 0.782230794429779}


In [None]:
!pip install evaluate[nltk] 
!pip install bert_score
!pip install nltk -U

In [44]:
meteor = evaluate.load('meteor')
score_val = meteor.compute(predictions=preds_val,references=references_val)
print("The meteor score for validation set: ")
print(score_val)
score_test = meteor.compute(predictions =  preds_test, references = references_test)
print("The meteor score for testing test")
print(score_test)

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


LookupError: 
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************