In [1]:
import os
# Switch the working directory to the parent of the current one before the
# project imports below run.
# NOTE(review): presumably this makes the `research` package (and any relative
# resource paths it uses) resolvable from the project root — confirm.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves one more directory up each time.
os.chdir('../')

from research.utils import load_vocabulary, Tokenizer
from research.scorer import Scorer
from research.greedy_optimizer import GreedyOptimizer
from research.beam_optimizer import BeamOptimizer

### Model loading

In [2]:
# Create the three shared helpers used by the cells below. The right-hand side
# is evaluated left to right, so construction order is vocabulary, tokenizer,
# scorer.
vocab, tokenizer, scorer = (
    load_vocabulary(),
    Tokenizer(),
    Scorer(),
)

### Translation scoring

In [3]:
# Score a reference German translation (taken from Google Translate) against
# its English source sentence.
english_sentence = 'I think that machine translation is very interesting subject.'
german_translation_from_google = (
    'Ich denke, dass maschinelle Übersetzung ein sehr interessantes Thema ist.'
)
score = scorer.score_texts(english_sentence, german_translation_from_google)

print('Translation scoring')
# The result of score_texts is indexable; entry 0 is what gets reported for
# this single sentence pair.
report_lines = (
    ('English sentence:', english_sentence),
    ('Translation from google translate to score:', german_translation_from_google),
    ('Log likehood of translation:', score[0]),
)
for label, value in report_lines:
    print(label, value)

Translation scoring
English sentence: I think that machine translation is very interesting subject.
Translation from google translate to score: Ich denke, dass maschinelle Übersetzung ein sehr interessantes Thema ist.
Log likehood of translation: -7.1561403


### Predicting the next word in the translation

In [4]:
# Ask the scorer which tokens are the most likely continuation of a partial
# German translation of `english_sentence`.
TOP_K = 3  # number of candidate next tokens to display

unfinished_translation = 'Ich denke, dass maschinelle Übersetzung ein sehr'
english_tokens = tokenizer.tokenize(english_sentence)
german_tokens = tokenizer.tokenize(unfinished_translation)
# Each token sequence is wrapped in a list, i.e. passed as a batch that holds
# exactly one sentence pair.
next_word_probs = scorer.next_word_probabilities([english_tokens], [german_tokens])
val, ind = next_word_probs.topk(TOP_K)
# Row 0 belongs to the only sentence pair in the batch. Convert its indices to
# plain Python ints once, rather than indexing the vocabulary with a
# one-element tensor per hard-coded position.
top_tokens = [vocab.itos[token_id] for token_id in ind[0].tolist()]
print('English sentence:', english_sentence)
print('Unfinished translation:', unfinished_translation)
print(f'Top {TOP_K} next tokens: ', *top_tokens)

English sentence: I think that machine translation is very interesting subject.
Unfinished translation: Ich denke, dass maschinelle Übersetzung ein sehr
Top 3 next tokens:  ▁interessante ▁interessant ▁Interessant


### Greedy optimization

In [5]:
# Translate the English sentence with the greedy optimizer, then score the
# produced token sequence with the shared scorer.
english_tokens = tokenizer.tokenize(english_sentence)
optimizer = GreedyOptimizer(english_sentence)
print('Optimizer initialized...')

greedy_output = optimizer.optimize()
# The last token of the optimizer's output is dropped before scoring/printing.
# NOTE(review): presumably an end-of-sentence marker — confirm.
german_tokens = greedy_output[:-1]

batch_scores = scorer.score_tokenized_texts([english_tokens], [german_tokens])
score = batch_scores[0]
translated_text = ' '.join(german_tokens)

print('English sentence:', english_sentence)
print('Translated tokens', translated_text)
print(f'Log likehood of translation: {score:.2f}')

Optimizer initialized...
English sentence: I think that machine translation is very interesting subject.
Translated tokens ▁Ich ▁denke , ▁dass ▁die ▁ maschine lle ▁Übersetzung ▁sehr ▁interessant ▁ist .
Log likehood of translation: -6.63


### Beam search optimization

In [6]:
# Translate the English sentence with beam search and list every returned
# candidate together with the value the optimizer reports for it.
n_beams = 10
english_tokens = tokenizer.tokenize(english_sentence)
optimizer = BeamOptimizer(english_sentence, n_beams=n_beams)
print('Optimizer initialized...')
translations, probabilities = optimizer.optimize()
print('English sentence:', english_sentence)
print('Top', n_beams, 'translations:')
for rank, translation in enumerate(translations, start=1):
    # The last token of each candidate is not printed.
    candidate_text = ' '.join(translation[:-1])
    # `probabilities` is indexed in parallel with `translations`.
    # NOTE(review): the printed "p" values look like log-likelihoods — confirm.
    candidate_value = probabilities[rank - 1]
    print(f"{rank}. {candidate_text} [p = {candidate_value:.2f}]")

# Re-score the first candidate independently with the scorer so the reported
# value can be compared against the optimizer's own number.
best_tokens = translations[0][:-1]
score = scorer.score_tokenized_texts([english_tokens], [best_tokens])[0]
print(f'Sanity check: log likehood of the best translation = {score:.2f}')

Optimizer initialized...
English sentence: I think that machine translation is very interesting subject.
Top 10 translations:
1. ▁Ich ▁halte ▁die ▁ maschine lle ▁Übersetzung ▁für ▁ein ▁sehr ▁interessante s ▁Thema . [p = -6.49]
2. ▁Ich ▁glaube , ▁dass ▁die ▁ maschine lle ▁Übersetzung ▁sehr ▁interessant ▁ist . [p = -6.54]
3. ▁Ich ▁denke , ▁dass ▁die ▁ maschine lle ▁Übersetzung ▁ein ▁sehr ▁interessante s ▁Thema ▁ist . [p = -6.57]
4. ▁Ich ▁denke , ▁dass ▁die ▁ maschine lle ▁Übersetzung ▁sehr ▁interessant ▁ist . [p = -6.63]
5. ▁Ich ▁glaube , ▁dass ▁die ▁ maschine lle ▁Übersetzung ▁ein ▁sehr ▁interessante s ▁Thema ▁ist . [p = -6.78]
6. ▁Ich ▁denke , ▁dass ▁die ▁ maschine lle ▁Übersetzung ▁sehr ▁interessant ▁ist ▁Thema . [p = -6.97]
7. ▁Ich ▁denke , ▁dass ▁ maschine lle ▁Übersetzung ▁sehr ▁interessant ▁ist ▁Thema . [p = -7.11]
8. ▁Ich ▁denke , ▁dass ▁ maschine lle ▁Übersetzung ▁ein ▁sehr ▁interessante s ▁Thema ▁ist . [p = -7.16]
9. ▁Ich ▁glaube , ▁dass ▁ maschine lle ▁Übersetzung ▁sehr ▁inter