# Install prerequisites for downloading pre-trained machine translation models

In [None]:
!pip install transformers sentencepiece fasttext sacremoses

In [None]:
import pandas as pd
import tarfile
import gzip
from transformers import MarianMTModel, MarianTokenizer

# The Tatoeba Challenge Dataset

In [None]:
!wget https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test.tar

tar = tarfile.open('/content/test.tar')
tar.extractall()

--2023-04-22 14:59:06--  https://object.pouta.csc.fi/Tatoeba-Challenge-devtest/test.tar
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 164843520 (157M) [application/x-tar]
Saving to: ‘test.tar.1’


2023-04-22 14:59:15 (19.3 MB/s) - ‘test.tar.1’ saved [164843520/164843520]



In [None]:
source_target_path = '/content/data/release/test/v2021-08-07/tatoeba-test-v2021-08-07.eng-fra.txt.gz'
# Read the whole gzipped file; the context manager closes the handle afterwards.
with gzip.GzipFile(source_target_path) as unzipped:
  out = unzipped.read()
# One list of tab-separated fields per line. Empty lines (including the empty
# string left after a trailing newline) are skipped, so the final record is
# kept whether or not the file ends with a newline.
parallel = [x.split("\t") for x in out.decode("utf-8").split("\n") if x]
# Show the last record as a sanity check.
parallel[-1]

['eng',
 'fra',
 '"Zugzwang" is a German word which, with reference to chess, means more or less the following: "obligation to make a move and, consequently, to lose the game".',
 '« Zugzwang » est un mot allemand qui, en référence aux échecs, signifie plus ou moins ce qui suit : « Obligation de faire un mouvement et, par conséquent, de perdre la partie. »']

# Load the models for translating into the target language and for translating back from the target into the source language

In [None]:
def _load_marian(checkpoint):
  """Return the (tokenizer, model) pair loaded from a pretrained checkpoint name."""
  return (
      MarianTokenizer.from_pretrained(checkpoint),
      MarianMTModel.from_pretrained(checkpoint),
  )

# Forward direction: the model used to translate the source sentences.
model_name = "Helsinki-NLP/opus-mt-tc-big-en-fr"
tokenizer, model = _load_marian(model_name)

# Reverse direction: the model used to translate the output back.
backmodel_name = "Helsinki-NLP/opus-mt-tc-big-fr-en"
backtokenizer, backmodel = _load_marian(backmodel_name)

Downloading source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/570M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/570M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
# Keep (row position in `parallel`, third field) for every row whose first
# field is 'eng'.
src_text = [
    (position, sentence)
    for position, (lang_tag, _, sentence, _) in enumerate(parallel)
    if lang_tag == 'eng'
]
# For each kept row, fetch its last field (the sentence in the other language).
other_language = [parallel[position][-1] for position, _ in src_text]

In [None]:
# Tabulate the kept rows: original row position, source sentence, and the
# matching sentence in the other language.
df = pd.DataFrame(src_text, columns=['index', 'source']).assign(target=other_language)

In [None]:
# Preview the first rows of the index/source/target table.
df.head()

Unnamed: 0,index,source,target
0,0,80% of all English words come from other langu...,Quatre-vingt pour cent des mots anglais provie...
1,1,95 years old! God Save the Queen!,95 ans ! Dieu sauve la reine !
2,2,Aah. Now I understand.,"Ah ! Maintenant, je comprends."
3,3,A bad habit is easily acquired.,Il est facile de prendre de mauvaises habitudes.
4,4,A better world for women is a better world for...,Un monde meilleur pour les femmes est un monde...


Sample a small subset of the original dataset, because translating all ~10k sentences would take far too long.

In [None]:
# Draw 120 distinct rows. The fixed random_state makes the draw reproducible,
# so a fresh Restart & Run All translates the same sentences every time.
sample_df = df.sample(n=120, replace=False, random_state=42)
sample_df.head()

Unnamed: 0,index,source,target
2180,2180,He criticized the war after leaving the military.,Il critiqua la guerre après avoir quitté les a...
6891,6891,Mary is a physics student.,Marie est étudiante en physique.
774,774,Christmas is soon.,Noël arrive bientôt.
4357,4357,I knew it was time to stop.,Je savais qu'il était temps d'arrêter.
10098,10098,Tom didn't seem to be aware that he was suppos...,Tom ne semblait pas au courant qu'il était cen...


Translate into target; translate back from target to source

In [None]:
# Round-trip every sampled sentence: translate the source with `model`, then
# translate that output back with `backmodel`, collecting
# (original sentence, back-translated sentence) pairs in `outputs`.
outputs = []
for i, row in sample_df.iterrows():
  sent = row['source']
  # tokenize the source
  tokenized = tokenizer(sent, return_tensors="pt", padding=False)
  # generate the target language; new tokens are capped at input length + 3
  t = model.generate(**tokenized, max_new_tokens=len(tokenized['input_ids'][0]) + 3)[0]
  # produce a string of the target language
  out = tokenizer.decode(t, skip_special_tokens=True)
  # show the reference translation next to the model's translation
  print(row['target'], out)
  # take that string and translate it back into the source
  backtokenized = backtokenizer(out, return_tensors="pt", padding=True)
  # Same token budget as the forward pass. The "+ 3" must sit outside len():
  # inside it, 3 is added to every token id and the length stays unchanged,
  # which leaves the back-translation 3 tokens short of the intended budget.
  backtranslated = backmodel.generate(
      **backtokenized, max_new_tokens=len(backtokenized['input_ids'][0]) + 3)[0]
  # render backtranslation as a string
  backtranslated_string = backtokenizer.decode(backtranslated, skip_special_tokens=True)
  outputs.append((sent, backtranslated_string))

Il critiqua la guerre après avoir quitté les armes. Il a critiqué la guerre après avoir quitté l'armée.
Marie est étudiante en physique. Mary est étudiante en physique.
Noël arrive bientôt. Noël est bientôt.
Je savais qu'il était temps d'arrêter. Je savais qu'il était temps d'arrêter.
Tom ne semblait pas au courant qu'il était censé le faire. Tom n'avait pas l'air d'être au courant qu'il était censé faire
Le robot fait tout ce que son maître lui ordonne. Le robot fait ce que son maître lui dit de faire.
Pour perdre du poids, certaines personnes sautent les repas. Pour perdre du poids, certaines personnes sautent des repas.
Essaie-la encore une fois. Essaie encore une fois.
Je ne vais pas vous aider. Je ne vais pas t'aider.
Quel est ton restaurant de restauration rapide préféré? Quel est votre fast-food préféré?
Je t'ai vue avec lui. Je t'ai vu avec lui.
Cette phrase est écrite en allemand. Cette phrase est écrite en allemand.
Pourquoi ne me dis-tu pas que je suis belle ? Pourquoi ne m'

# Inspect the output

In [None]:
# Pair each original sentence ('gold') with its round-trip back-translation.
result_columns = ['gold', 'backtranslated']
out_df = pd.DataFrame(data=outputs, columns=result_columns)

In [None]:
# Show up to the first 30 gold / back-translated pairs for manual comparison.
out_df.head(30)

Unnamed: 0,gold,backtranslated
0,He criticized the war after leaving the military.,He criticized the war after leaving the army.
1,Mary is a physics student.,Mary is a physics student.
2,Christmas is soon.,Christmas is coming soon
3,I knew it was time to stop.,I knew it was time to stop.
4,Tom didn't seem to be aware that he was suppos...,Tom didn't seem to know what he was supposed t...
5,The robot does whatever its master tells it to...,The robot does what its master tells it to do.
6,In order to lose weight some people skip meals.,"To lose weight, some people skip meals."
7,Try it once again.,Try it again.
8,I'm not going to help you.,I'm not gonna help you.
9,What is your favourite fast-food restaurant?,What is your favorite fast food?
