## Data Augmentation

### Backtranslation (es → fr → es) with facebook/nllb-200-distilled-600M and Helsinki-NLP/opus-mt-fr-es

In [1]:
# Imports

import pandas as pd
from tqdm import tqdm
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [3]:
# Make the directory two levels above the working directory importable so the
# project-local `utils` package can be resolved from this notebook.
import sys
sys.path.append('../../')
from utils.config import setConfig

# NOTE(review): setConfig presumably selects the compute device (the recorded
# cell output reports MPS) - confirm in utils/config. `device` is not passed
# to any model or pipeline in the cells visible here.
device = setConfig()

Usando MPS: mps
Tensor de prueba creado en el dispositivo: tensor([1.], device='mps:0') mps


In [None]:
# Backtranslation

# Checkpoints used for the two translation legs.
model_name1 = "facebook/nllb-200-distilled-600M"
model_name2 = "Helsinki-NLP/opus-mt-fr-es"

# Load the tokenizer and then the seq2seq model for the first checkpoint.
tokenizer1, model1 = (
    loader.from_pretrained(model_name1)
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

# Same for the second checkpoint.
tokenizer2, model2 = (
    loader.from_pretrained(model_name2)
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

In [None]:

# NOTE: despite the "ES2EN" / "EN2ES" names (kept because later cells refer to
# them), the pivot language configured here is French, not English.

# Forward leg: Spanish -> French with NLLB. NLLB identifies languages by
# FLORES-200 codes; Spanish is 'spa_Latn' ('es_Latn' is not a FLORES-200 code).
translatorES2EN = pipeline('translation', model=model1, tokenizer=tokenizer1, src_lang='spa_Latn', tgt_lang='fra_Latn', max_length=512)

# Return leg: French -> Spanish with the checkpoint loaded into model2
# (Helsinki-NLP/opus-mt-fr-es where it is loaded in this notebook).
# NOTE(review): neither pipeline receives a `device` argument, so the device
# returned by setConfig() is not applied here - confirm whether that is intended.
translatorEN2ES = pipeline('translation', model=model2, tokenizer=tokenizer2, src_lang='fra', tgt_lang='spa', max_length=512)

In [None]:
def back_translate(text, trans1, trans2):
    """Return *text* after a round trip through two translators.

    Args:
        text: The string to paraphrase.
        trans1: Callable for the forward leg. It is called as ``trans1(text)``
            and must return a sequence whose first item is a mapping with a
            ``'translation_text'`` key.
        trans2: Callable for the return leg, with the same calling convention
            as ``trans1``.

    Returns:
        The output of ``trans2`` applied to the output of ``trans1``. An empty
        or whitespace-only string is returned unchanged without calling either
        translator, since there is nothing to translate.
    """
    # Skip the (expensive) model calls when there is no content to translate.
    if isinstance(text, str) and not text.strip():
        return text

    pivot_text = trans1(text)[0]['translation_text']
    return trans2(pivot_text)[0]['translation_text']

In [4]:
# Load the training set and cast each column in one pass: the five
# descriptive columns become str and the label column becomes int.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan' - confirm the CSV has no empty cells in these columns.
column_types = {
    'Title': str,
    'Review': str,
    'Town': str,
    'Region': str,
    'Type': str,
    'Polarity': int,
}
data = pd.read_csv(r'../../data/train/train.csv').astype(column_types)

In [5]:
# Example

# One sub-frame per polarity value 1, 2 and 3.
Reviews1, Reviews2, Reviews3 = (
    data[data['Polarity'] == polarity] for polarity in (1, 2, 3)
)

# NOTE(review): DataFrame.size is rows x columns (total cell count), not the
# number of reviews; use len(...) if a row count is what is wanted here.
for label, subset in (
    ('Reviews1:', Reviews1),
    ('Reviews2:', Reviews2),
    ('Reviews3:', Reviews3),
):
    print(label, subset.size)

Reviews1: 32646
Reviews2: 32976
Reviews3: 93114


In [None]:
from pathlib import Path

# Create the output folder before the long translation run so that a missing
# directory cannot make the final to_csv call fail after all the work is done.
output_path = Path(r'../../data/augmented/train_backtranslated.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)


def _backtranslate_sample(reviews, fraction=0.5):
    """Back-translate a random sample of *reviews* and return the new rows.

    Args:
        reviews: DataFrame with 'Title', 'Review', 'Town', 'Region', 'Type'
            and 'Polarity' columns.
        fraction: Share of the rows to sample without replacement; the sample
            size is ``int(len(reviews) * fraction)``.

    Returns:
        A list of dicts, one per sampled row. 'Title' and 'Review' are the
        back-translated versions of the first 512 characters of the original
        text; the remaining fields are copied unchanged.
    """
    positions = random.sample(range(0, len(reviews)), int(len(reviews) * fraction))
    rows = []
    for i in tqdm(positions):
        rows.append({
            # The slice limits the input by characters (not tokens) before translation.
            'Title':  back_translate(reviews['Title'].iloc[i][:512], translatorES2EN, translatorEN2ES),
            'Review': back_translate(reviews['Review'].iloc[i][:512], translatorES2EN, translatorEN2ES),
            'Town': reviews['Town'].iloc[i],
            'Region': reviews['Region'].iloc[i],
            'Type': reviews['Type'].iloc[i],
            'Polarity': reviews['Polarity'].iloc[i],
        })
    return rows


# Augment half of each polarity class, in the same order as before (1, 2, 3).
new_rows1 = _backtranslate_sample(Reviews1)
new_rows2 = _backtranslate_sample(Reviews2)
new_rows3 = _backtranslate_sample(Reviews3)

# Build one DataFrame per class from the new rows.
new_data1 = pd.DataFrame(new_rows1)
new_data2 = pd.DataFrame(new_rows2)
new_data3 = pd.DataFrame(new_rows3)
# Concatenate the per-class DataFrames.
new_data = pd.concat([new_data1, new_data2, new_data3], ignore_index=True)
# Save the augmented rows to a CSV file.
new_data.to_csv(output_path, index=False)

# Show a preview of the augmented rows, then the augmented row count followed
# by the original training-set row count.
print(new_data.head())
print('Total rows:', len(new_data))
print('Total rows:', len(data))