In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import unidecode
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Run on the GPU when torch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Release any unoccupied memory currently held by the CUDA caching allocator.
torch.cuda.empty_cache()

In [2]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_id = "t5-small"

# Load the tokenizer and the conditional-generation model for that checkpoint,
# then move the model onto the device selected earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)
model = model.to(device)

In [3]:
# Pull the "train" split of the tapaco dataset and keep only the two columns
# used below, renaming "paraphrase" to "sentence".
train_split = load_dataset("tapaco")["train"]
dataset = pd.DataFrame({
    "sentence": train_split["paraphrase"],
    "language": train_split["language"],
})

# Keep the English rows, then take every 10th one: since this is a rephrasing
# dataset, we fetch every nth row rather than consecutive ones.
is_english = dataset["language"] == "en"
dataset = dataset.loc[is_english].iloc[::10].reset_index(drop=True)
dataset

Unnamed: 0,sentence,language
0,I ate the cheese.,en
1,Today is Monday.,en
2,Does he speak English?,en
3,I'm sort of tired.,en
4,I am indebted to him.,en
...,...,...
15801,It would be a difficult job.,en
15802,I ate a burdock root tempura.,en
15803,You say you've seen a UFO? Come on!,en
15804,"It's a good sentence, anyway.",en


In [4]:
def translate_batch(batch, model=model, tokenizer=tokenizer):
    """Translate a batch of English sentences to Romanian.

    Each sentence is prefixed with the "translate English to Romanian: " task
    prompt, tokenized as one padded batch, run through ``model.generate`` and
    decoded back to text.

    Args:
        batch: Iterable of English sentences (strings).
        model: Model exposing ``generate`` and ``device``. Defaults to the
            notebook-level ``model`` as it existed when this cell was run.
        tokenizer: Tokenizer paired with ``model``. Defaults to the
            notebook-level ``tokenizer`` as it existed when this cell was run.

    Returns:
        A list of decoded strings produced by ``tokenizer.batch_decode``.
        Generation is capped at 50 new tokens per sentence. An empty ``batch``
        yields an empty list without calling the tokenizer or the model.
    """
    prompts = [f"translate English to Romanian: {sentence}" for sentence in batch]
    if not prompts:
        # Nothing to translate; skip tokenization and generation entirely.
        return []

    # Place the inputs on the same device as the model actually being used,
    # so a caller-supplied model on another device does not hit a mismatch.
    encoded = tokenizer(
        prompts, padding=True, return_tensors="pt", truncation=True
    ).to(model.device)

    generated = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_new_tokens=50,
    )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [5]:
torch.cuda.empty_cache()

# Work on a private copy so the frame built in the loading cell stays intact.
df = dataset.copy(deep=True)
batch_size = 256

# Walk the sentence column in consecutive slices of `batch_size` rows and
# collect the translations in the same row order.
sentences = df["sentence"]
translations = []
for start in tqdm(range(0, len(df), batch_size)):
    chunk = sentences.iloc[start:start + batch_size].to_numpy().tolist()
    translations.extend(translate_batch(chunk))

# Attach the results and drop the language column, which is no longer needed.
df = df.assign(translations=translations).drop(columns=["language"])

  0%|          | 0/62 [00:00<?, ?it/s]

In [6]:
# Display the frame of source sentences and their generated translations.
df

Unnamed: 0,sentence,translations
0,I ate the cheese.,Am mâncat brânza.
1,Today is Monday.,Astăzi este ziua de luni.
2,Does he speak English?,Vorbeşte el limba engleză?
3,I'm sort of tired.,Sunt oarecum obosit.
4,I am indebted to him.,Sunt îndatorat acestuia.
...,...,...
15801,It would be a difficult job.,Ar fi o sarcină dificilă.
15802,I ate a burdock root tempura.,Am mâncat o tempura de rădăcini burdice.
15803,You say you've seen a UFO? Come on!,Aţi spus că aţi văzut o UFO?
15804,"It's a good sentence, anyway.","Oricum, este o frază bună."


In [7]:
# Save the sentence/translation pairs to data/<model_id>.csv.
# Create the target directory first: to_csv does not create missing parent
# directories, and a model_id containing "/" adds a further directory level.
output_path = Path("data") / f"{model_id}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)