## Data Augmentation

### Backtranslation with facebook/nllb-200-distilled-600M

In [1]:
# Imports

import pandas as pd
from tqdm import tqdm

In [2]:
# Backtranslation with facebook/nllb-200-distilled-600M

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Cargar el modelo y el tokenizador
model_name1 = "facebook/nllb-200-distilled-600M"
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
model1 = AutoModelForSeq2SeqLM.from_pretrained(model_name1)

model_name2 = "Helsinki-NLP/opus-mt-fr-es"
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
model2 = AutoModelForSeq2SeqLM.from_pretrained(model_name2)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

translatorES2EN = pipeline('translation', model=model1, tokenizer=tokenizer1, src_lang='es_Latn', tgt_lang='fra_Latn', max_length=512)

translatorEN2ES = pipeline('translation', model=model2, tokenizer=tokenizer2, src_lang='fra', tgt_lang='spa', max_length=512)

Device set to use mps:0
Device set to use mps:0


In [4]:
import nltk
from nltk.corpus import wordnet
import random
nltk.download('wordnet')

def find_synonyms(text):
    words = text.split()
    augmented_text = []
    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            augmented_text.append(synonym)
        else:
            augmented_text.append(word)
    return ' '.join(augmented_text)

def back_translate(text, trans1, trans2):
    translated_text = trans1(text)[0]['translation_text']
    #wordenetd_text = find_synonyms(translated_text)
    #print('Translated:', translated_text)
    #print('Wordnet:', wordenetd_text)
    backtranslated = trans2(translated_text)[0]['translation_text']
    return backtranslated

[nltk_data] Downloading package wordnet to /Users/roicort/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
data = pd.read_csv(r'/Users/roicort/GitHub/REST-MEX25/dataset/train.csv')

data['Title'] = data['Title'].astype(str)
data['Review'] = data['Review'].astype(str)
data['Town'] = data['Town'].astype(str)
data['Region'] = data['Region'].astype(str)
data['Type'] = data['Type'].astype(str)
data['Polarity'] = data['Polarity'].astype(int)

In [6]:
# Example 

Reviews1 = data[data['Polarity'] == 1]
Reviews2 = data[data['Polarity'] == 2]
Reviews3 = data[data['Polarity'] == 3]

print('Reviews1:', Reviews1.size)
print('Reviews2:', Reviews2.size)
print('Reviews3:', Reviews3.size)

Reviews1: 32646
Reviews2: 32976
Reviews3: 93114


In [8]:
new_rows1 = []
for i in tqdm(random.sample(range(0, len(Reviews1)), 1500)):
    new_row = {
        'Title':  back_translate(Reviews1['Title'].iloc[i][:512], translatorES2EN, translatorEN2ES),
        'Review': back_translate(Reviews1['Review'].iloc[i][:512], translatorES2EN, translatorEN2ES),
        'Town': Reviews1['Town'].iloc[i],
        'Region': Reviews1['Region'].iloc[i],
        'Type': Reviews1['Type'].iloc[i],
        'Polarity': Reviews1['Polarity'].iloc[i]
    }

    new_rows1.append(new_row)

new_rows2 = []
for i in tqdm(random.sample(range(0, len(Reviews2)), 1500)):
    new_row = {
        'Title':  back_translate(Reviews2['Title'].iloc[i][:512], translatorES2EN, translatorEN2ES),
        'Review': back_translate(Reviews2['Review'].iloc[i][:512], translatorES2EN, translatorEN2ES),
        'Town': Reviews2['Town'].iloc[i],
        'Region': Reviews2['Region'].iloc[i],
        'Type': Reviews2['Type'].iloc[i],
        'Polarity': Reviews2['Polarity'].iloc[i]
    }

    new_rows2.append(new_row)

new_rows3 = []
for i in tqdm(random.sample(range(0, len(Reviews3)), 1500)):
    new_row = {
        'Title':  back_translate(Reviews3['Title'].iloc[i][:512], translatorES2EN, translatorEN2ES),
        'Review': back_translate(Reviews3['Review'].iloc[i][:512], translatorES2EN, translatorEN2ES),
        'Town': Reviews3['Town'].iloc[i],
        'Region': Reviews3['Region'].iloc[i],
        'Type': Reviews3['Type'].iloc[i],
        'Polarity': Reviews3['Polarity'].iloc[i]
    }

    new_rows3.append(new_row)

# Crear un nuevo DataFrame con las nuevas filas
new_data1 = pd.DataFrame(new_rows1)
new_data2 = pd.DataFrame(new_rows2)
new_data3 = pd.DataFrame(new_rows3)
# Concatenar los DataFrames
new_data = pd.concat([new_data1, new_data2, new_data3], ignore_index=True)
# Guardar el nuevo DataFrame en un archivo CSV
new_data.to_csv(r'/Users/roicort/GitHub/REST-MEX25/dataset/train_backtranslated.csv', index=False)

# Imprimir el nuevo DataFrame
print(new_data.head())
print('Total rows:', len(new_data))
print('Total rows:', len(data))

 95%|█████████▌| 1430/1500 [1:52:40<04:20,  3.73s/it] Your input_length: 499 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
100%|██████████| 1500/1500 [1:58:22<00:00,  4.73s/it]
  1%|▏         | 20/1500 [01:35<1:27:51,  3.56s/it]Your input_length: 501 is bigger than 0.9 * max_length: 512. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
100%|██████████| 1500/1500 [1:53:48<00:00,  4.55s/it] 
100%|██████████| 1500/1500 [1:46:46<00:00,  4.27s/it] 


                                               Title  \
0                                   ¡Es una canción!   
1                             No hay recomendaciones   
2               La comida es un asco, nos hemos ido.   
3  Cuando los COBRAS están a la baja, la comida d...   
4                                 La cena de Navidad   

                                              Review  \
0  El lugar es increíble (como todo Tulúm), pero ...   
1  La comida es bastante mala y tiene poca higien...   
2  La comida es una deshonra, nos fuimos hoy el 2...   
3  Cenamos ayer, la decoración del lugar es hermo...   
4  Había mucho que desear el servicio, las entrad...   

                         Town       Region        Type  Polarity  
0                       Tulum  QuintanaRoo       Hotel         1  
1  San_Cristobal_de_las_Casas      Chiapas  Restaurant         1  
2                     Cholula       Puebla  Restaurant         1  
3                 Tlaquepaque      Jalisco  Restaurant    