In [None]:
import os

# Directory that holds the input CSV files read by this notebook.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
# Directory where the augmented CSV is written.
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, without the
# separate exists() check (which can race with another process).
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
import numpy as np
import pandas as pd
import torch
import gc

!pip install --quiet transformers
!pip install --quiet sentencepiece
from transformers import MarianMTModel, MarianTokenizer
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Read train.csv from INPUT_DIR into a DataFrame.
train_csv_path = INPUT_DIR+'train.csv'
train = pd.read_csv(train_csv_path)
# For a quick smoke test, uncomment to keep only 15 randomly sampled rows:
# train = train.sample(n=15, replace=False, ignore_index=True)

In [None]:
def _load_marian(model_name):
    """Load a MarianMT tokenizer/model pair and move the model to `device`.

    Args:
        model_name: checkpoint name passed to `from_pretrained`.

    Returns:
        A `(tokenizer, model)` tuple.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    model.to(device)
    return tokenizer, model


# Forward model, used first in the back-translation loop.
first_model_name = 'Helsinki-NLP/opus-mt-en-es'
first_model_tkn, first_model = _load_marian(first_model_name)

# Reverse model, applied to the output of the first model.
# The cell ends on an assignment so the model repr is not dumped as output.
second_model_name = 'Helsinki-NLP/opus-mt-es-en'
second_model_tkn, second_model = _load_marian(second_model_name)

In [None]:
def format_batch_texts(language_code, batch_texts):
    """Prefix every text with a `>>language_code<<` tag followed by a space.

    Args:
        language_code: code placed between `>>` and `<<`.
        batch_texts: iterable of texts to prefix.

    Returns:
        A new list with one prefixed string per input text, in input order.
    """
    prefixed = []
    for sentence in batch_texts:
        prefixed.append(">>{}<< {}".format(language_code, sentence))
    return prefixed

def perform_translation(batch_texts, model, tokenizer, language="es"):
    """Translate a batch of texts with a seq2seq model and its tokenizer.

    Args:
        batch_texts: list of source strings.
        model: model exposing `generate(**inputs)` and a `device` attribute
            (here a MarianMTModel).
        tokenizer: tokenizer matching `model`; called on the batch and used
            to decode each generated sequence.
        language: target-language code. It is prepended as `>>code<<` only
            when the tokenizer lists that code (or exposes no list at all).

    Returns:
        List of decoded translations, one per input text, in input order.
    """
    # Nothing to translate: skip tokenization and generation.
    if len(batch_texts) == 0:
        return []

    # Per the MarianMT docs the `>>id<<` prefix is meant for multilingual
    # checkpoints, whose tokenizers list their codes in
    # `supported_language_codes`. For a tokenizer that does not know the code
    # the prefix would just be an extra unknown token in the input.
    known_codes = getattr(tokenizer, "supported_language_codes", None)
    if known_codes is None or ">>{}<<".format(language) in known_codes:
        model_inputs = format_batch_texts(language, batch_texts)
    else:
        model_inputs = list(batch_texts)

    # Put the inputs on the model's own device instead of relying on a
    # module-level global.
    encoded = tokenizer(model_inputs, return_tensors="pt", padding=True).to(model.device)
    translated = model.generate(**encoded)
    translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return translated_texts

In [None]:
# Build an augmented copy of the training set in which every anchor/target
# phrase is passed through first_model and then through second_model.
train_trans = train.copy()
# Suffix the ids so the augmented rows can be told apart from the originals.
train_trans['id'] = train_trans['id'].map(lambda x: x+'_trans')

n = 50  # number of phrases translated per batch
# Column positions for positional (iloc) writes, so the assignment does not
# depend on the frame having a default 0..N-1 index.
target_col = train_trans.columns.get_loc('target')
anchor_col = train_trans.columns.get_loc('anchor')

# Stride over the rows in steps of n; unlike range(len(train)//n) this also
# covers the final partial batch when len(train) is not a multiple of n.
for i, start in enumerate(range(0, len(train), n)):
    if i % 10 == 0:
        print(start)
    stop = min(start + n, len(train))
    target_list = train['target'].iloc[start:stop].values.tolist()
    anchor_list = train['anchor'].iloc[start:stop].values.tolist()

    # second_model translates back into English, so pass "en" as its target.
    translated_targets = perform_translation(target_list, first_model, first_model_tkn)
    back_translated_targets = perform_translation(translated_targets, second_model, second_model_tkn, language="en")
    translated_anchors = perform_translation(anchor_list, first_model, first_model_tkn)
    back_translated_anchors = perform_translation(translated_anchors, second_model, second_model_tkn, language="en")

    train_trans.iloc[start:stop, target_col] = back_translated_targets
    train_trans.iloc[start:stop, anchor_col] = back_translated_anchors

    # Release cached GPU memory and collect garbage between batches.
    torch.cuda.empty_cache()
    gc.collect()

# NOTE(review): with pandas' default index=True the row index is also written
# as an unnamed first column; pass index=False if that is not wanted.
train_trans.to_csv(OUTPUT_DIR+'train_trans.csv')