In [1]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
import pandas as pd

In [3]:
data_train = load_dataset("wmt16","de-en", split="train[:50000]")

In [4]:
print(data_train)

Dataset({
    features: ['translation'],
    num_rows: 50000
})


In [5]:
df_train = pd.DataFrame(data_train)
train_data = []
for i in df_train['translation']:
    de = i['de']
    en = i['en']
    train_data.append((de,en))
print(train_data[:10])

[('Wiederaufnahme der Sitzungsperiode', 'Resumption of the session'), ('Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'), ('Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."), ('Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tag

In [6]:
data_val = load_dataset("wmt16","de-en", split="validation")

In [7]:
df_val = pd.DataFrame(data_val)
val_data = []
for i in df_val['translation']:
    de = i['de']
    en = i['en']
    val_data.append((de,en))
print(val_data[:10])

[('Die Premierminister Indiens und Japans trafen sich in Tokio.', 'India and Japan prime ministers meet in Tokyo'), ('Indiens neuer Premierminister Narendra Modi trifft bei seinem ersten wichtigen Auslandsbesuch seit seinem Wahlsieg im Mai seinen japanischen Amtskollegen Shinzo Abe in Toko, um wirtschaftliche und sicherheitspolitische Beziehungen zu besprechen.', "India's new prime minister, Narendra Modi, is meeting his Japanese counterpart, Shinzo Abe, in Tokyo to discuss economic and security ties, on his first major foreign visit since winning May's election."), ('Herr Modi befindet sich auf einer fünftägigen Reise nach Japan, um die wirtschaftlichen Beziehungen mit der drittgrößten Wirtschaftsnation der Welt zu festigen.', 'Mr Modi is on a five-day trip to Japan to strengthen economic ties with the third largest economy in the world.'), ('Pläne für eine stärkere kerntechnische Zusammenarbeit stehen ganz oben auf der Tagesordnung.', 'High on the agenda are plans for greater nuclear

In [8]:
data_test = load_dataset("wmt16","de-en", split="test")

In [9]:
df_test = pd.DataFrame(data_test)
test_data = []
for i in df_test['translation']:
    de = i['de']
    en = i['en']
    test_data.append((de,en))
print(test_data[:10])

[('Obama empfängt Netanyahu', 'Obama receives Netanyahu'), ('Das Verhältnis zwischen Obama und Netanyahu ist nicht gerade freundschaftlich.', 'The relationship between Obama and Netanyahu is not exactly friendly.'), ('Die beiden wollten über die Umsetzung der internationalen Vereinbarung sowie über Teherans destabilisierende Maßnahmen im Nahen Osten sprechen.', "The two wanted to talk about the implementation of the international agreement and about Teheran's destabilising activities in the Middle East."), ('Bei der Begegnung soll es aber auch um den Konflikt mit den Palästinensern und die diskutierte Zwei-Staaten-Lösung gehen.', 'The meeting was also planned to cover the conflict with the Palestinians and the disputed two state solution.'), ('Das Verhältnis zwischen Obama und Netanyahu ist seit Jahren gespannt.', 'Relations between Obama and Netanyahu have been strained for years.'), ('Washington kritisiert den andauernden Siedlungsbau Israels und wirft Netanyahu mangelnden Willen bei

In [10]:
!pip install sentencepiece
!pip install transformers
!pip install --upgrade transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [20]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
prefix = "translate English to German: "
sentences_val_eng = [prefix + i[1] for i in val_data]
sentences_test_eng = [prefix + i[1] for i in test_data]

inputs = tokenizer(sentences_val_eng, return_tensors="pt", padding=True, truncation=True)
output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], do_sample=False)
output_sequences_val = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(len(output_sequences_val))



2169


In [42]:
print(output_sequences_val[:5])

['In Tokio treffen sich die Premierminister Indiens und Japans', 'Der neue indische Premierminister Narendra Modi trifft sein japanisches Am', 'Herr Modi ist auf einer fünftägigen Reise nach Japan, um die wirtschaftliche', 'Auf der Tagesordnung stehen Pläne für eine verstärkte nukleare Zusammenarbeit.', 'Indien hofft ebenfalls auf eine Vereinbarung über die Zusammenarbeit im Verteidigungs']


In [45]:
inputs = tokenizer(sentences_test_eng, return_tensors="pt", padding=True, truncation=True)
output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], do_sample=False)
output_sequences_test = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)



In [47]:
import evaluate
bleu = evaluate.load('bleu')

sentences_val_ger_bleu = [[i[0]] for i in val_data]
sentences_test_ger_bleu = [[i[0]] for i in test_data]

bleu_score = bleu.compute(predictions=output_sequences_val, references=sentences_val_ger_bleu, max_order=1)
print("Validation BLEU-1 Score:", bleu_score)
bleu_score = bleu.compute(predictions=output_sequences_val, references=sentences_val_ger_bleu, max_order=2)
print("Validation BLEU-2 Score:", bleu_score)
bleu_score = bleu.compute(predictions=output_sequences_val, references=sentences_val_ger_bleu, max_order=3)
print("Validation BLEU-3 Score:", bleu_score)
bleu_score = bleu.compute(predictions=output_sequences_val, references=sentences_val_ger_bleu, max_order=4)
print("Validation BLEU-4 Score:", bleu_score)

bleu_score = bleu.compute(predictions=output_sequences_test, references=sentences_test_ger_bleu, max_order = 1)
print("Test BLEU-1 Score:", bleu_score)
bleu_score = bleu.compute(predictions=output_sequences_test, references=sentences_test_ger_bleu, max_order = 2)
print("Test BLEU-2 Score:", bleu_score)
bleu_score = bleu.compute(predictions=output_sequences_test, references=sentences_test_ger_bleu, max_order = 3)
print("Test BLEU-3 Score:", bleu_score)
bleu_score = bleu.compute(predictions=output_sequences_test, references=sentences_test_ger_bleu, max_order = 4)
print("Test BLEU-4 Score:", bleu_score)

Validation BLEU-1 Score: {'bleu': 0.2661676664462372, 'precisions': [0.5953264548754129], 'brevity_penalty': 0.4470953109280848, 'length_ratio': 0.5540216900135563, 'translation_length': 24521, 'reference_length': 44260}
Validation BLEU-2 Score: {'bleu': 0.20111319933833893, 'precisions': [0.5953264548754129, 0.3398801002147459], 'brevity_penalty': 0.4470953109280848, 'length_ratio': 0.5540216900135563, 'translation_length': 24521, 'reference_length': 44260}
Validation BLEU-3 Score: {'bleu': 0.15829949742184973, 'precisions': [0.5953264548754129, 0.3398801002147459, 0.21935995244228673], 'brevity_penalty': 0.4470953109280848, 'length_ratio': 0.5540216900135563, 'translation_length': 24521, 'reference_length': 44260}
Validation BLEU-4 Score: {'bleu': 0.1275631721885134, 'precisions': [0.5953264548754129, 0.3398801002147459, 0.21935995244228673, 0.14930093209054593], 'brevity_penalty': 0.4470953109280848, 'length_ratio': 0.5540216900135563, 'translation_length': 24521, 'reference_length'

In [53]:
sentences_val_ger = [i[0] for i in val_data]
from datasets import load_metric
metric = load_metric("meteor")
x=metric.compute(predictions=output_sequences_val, references=sentences_val_ger)
print("Validation Meteor Score:",x )

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /home/deb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/deb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/deb/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Validation Meteor Score: {'meteor': 0.3697782590377392}


In [49]:
inputs = tokenizer(sentences_test_eng, return_tensors="pt", padding=True, truncation=True)
output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], do_sample=False)
output_sequences_test = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(len(output_sequences_test))



2999


In [54]:
sentences_test_ger = [i[0] for i in test_data]
from datasets import load_metric
metric = load_metric("meteor")
x = metric.compute(predictions=output_sequences_test, references=sentences_test_ger)
print("Test Meteor Score:", x)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /home/deb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/deb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/deb/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Test Meteor Score: {'meteor': 0.3889479564070394}


In [51]:
from datasets import load_metric
metric = load_metric("bertscore")
bertscore_score_val = metric.compute(predictions=output_sequences_val, references=sentences_val_ger, lang="de")
val_precision = bertscore_score_val["precision"]
avg_val_precision = sum(val_precision) / len(val_precision)
print("Average Validation BERTScore Precision:", avg_val_precision)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Average Validation BERTScore Precision: 0.8268642122704843


In [52]:
bertscore_score_test = metric.compute(predictions=output_sequences_test, references=sentences_test_ger, lang="de")
test_precision = bertscore_score_test["precision"]
avg_test_precision = sum(test_precision) / len(test_precision)
print("Average Test BERTScore Precision:", avg_test_precision)

Average Test BERTScore Precision: 0.8345420370900102


In [55]:
import pandas as pd
df = pd.read_csv('input_2B.csv')
prefix = "translate English to German: "
temp_df = [prefix + i for i in df['en']]
inputs = tokenizer(temp_df, return_tensors="pt", padding=True, truncation=True)
output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], do_sample=False)
output_sequences = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
df['de'] = output_sequences
df.to_csv('output2B.csv', index=False)

