#Setup

In [1]:
!pip install --no-cache-dir transformers sentencepiece &> /dev/null 
!pip install datasets &> /dev/null 
!pip install evaluate &> /dev/null 

import os

In [2]:
import torch
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-uk-ru")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-uk-ru")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer2 = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-uk")

model2 = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-uk")

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.36M [00:00<?, ?B/s]



Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/297M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/297M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [4]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer3 = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-uk-en")

# model3 = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-uk-en")

# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer4 = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-uk")

# model4 = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-uk")

In [5]:
# src_text = ["Крім того, у документі Британія підтверджує право України досягати власних домовленостей щодо безпеки, включно з майбутнім членством в НАТО."]
# print(src_text)
# translated = model3.generate(**tokenizer3(src_text, return_tensors="pt", padding=True))
# out = [tokenizer3.decode(t, skip_special_tokens=True) for t in translated]
# print(out)

# translated = model4.generate(**tokenizer4(out, return_tensors="pt", padding=True))
# corrupted = [tokenizer4.decode(t, skip_special_tokens=True) for t in translated]
# print(corrupted)

In [6]:
src_text = ["Піт заливав чоло , і водночас озноб пробивав від голови до п'ят ."]
print(src_text)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
out = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
print(out)

translated = model2.generate(**tokenizer(out, return_tensors="pt", padding=True))
corrupted = [tokenizer2.decode(t, skip_special_tokens=True) for t in translated]
print(corrupted)

["Піт заливав чоло , і водночас озноб пробивав від голови до п'ят ."]




['Пит заливал лоб, и в то же время озноб пробивал от головы до пят.']
['Піт залив лоб, и в той же час обноб пробив від голови до пят.']


# Dataset

In [7]:
# clean data FILE to be errorified
input_file = "/content/drive/MyDrive/artem-yushko/data-artem/cleaned/borshch4.txt"

# output FOLDER for the errorified and tagged data, future model input
out_folder = "/content/drive/MyDrive/UNLP/assist-data/5k-round-translation"

In [15]:
# creating the output folder
if not os.path.exists(out_folder):
  os.mkdir(out_folder)

# reading the file
with open(input_file, 'r') as f:
  text = f.read()
  lines = text.split('\n')

lines = lines[500000:505000]

In [17]:
import time
s = time.time()
final_list = []
t0 = time.time()
unprocessed_counter = 0

# traversing through the list
for i in range(len(lines)):
  sentence = lines[i]
  # round-translating the sentence
  translated = model.generate(**tokenizer(sentence, return_tensors="pt", padding=True))
  out = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
  translated = model2.generate(**tokenizer(out, return_tensors="pt", padding=True))
  corrupted = [tokenizer2.decode(t, skip_special_tokens=True) for t in translated]
  # adding the sentence to the list
  final_list.append(corrupted[0])
  # estimating the time left
  if i != 0 and not i % 1000:
      print(f"{i} sentences were processed\nProjected time till the end: {(time.time() - t0)/3600/i*(len(lines)-i):.2} hours")
      print(f"{unprocessed_counter} sentences were not processed.")

text = '\n'.join(final_list)
with open(out_folder + "/source.txt", 'w') as f:
  f.write(text)
print(time.time() - s)



1000 sentences were processed
Projected time till the end: 1.4 hours
0 sentences were not processed.
2000 sentences were processed
Projected time till the end: 1.0 hours
0 sentences were not processed.
3000 sentences were processed
Projected time till the end: 0.68 hours
0 sentences were not processed.
4000 sentences were processed
Projected time till the end: 0.34 hours
0 sentences were not processed.
6066.013456106186


In [19]:
text = '\n'.join(lines)
with open(out_folder + "/target.txt", 'w') as f:
  f.write(text)