In [1]:
import time
import pandas as pd
from src.baseline.baseline import train_df
from src.augument.back_translation_augmenter import BackTranslationAugmenter

✓ All random seeds set to 42
training files: ['train_en.csv', 'train_es.csv', 'train_it.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




# GPT-4.1 Based Data Augmentation

* For each tweet:
    - English → Spanish + Italian (2 new samples)
    - Spanish → English + Italian (2 new samples)
    - Italian → English + Spanish (2 new samples)

In [None]:
# Switch for the back-translation cell that follows: the augmentation loop only
# runs when this is True; with False that cell does nothing.
do_translate = False

In [2]:
# Back-translate the training tweets one source language at a time.
# The whole cell is gated by `do_translate`, so a plain re-run of the notebook
# skips it unless the flag is switched on.
if do_translate:
    source_langs = ["es", "it", "en"]
    for position, lang in enumerate(source_langs):
        # Only the rows written in the current source language.
        df = train_df[train_df.lang == lang]
        augmenter = BackTranslationAugmenter(model="gpt-4.1-mini", temperature=0)
        augmented_df = augmenter.augment_dataframe(
            df,
            output_path=f'../data/augmented_multilingual_tweets_{lang}.csv'
        )
        print(augmented_df.shape)
        # Pause between languages (presumably to stay under API rate limits —
        # TODO confirm); there is nothing to wait for after the last one.
        if position < len(source_langs) - 1:
            time.sleep(60)


DATA AUGMENTATION STATISTICS

Original Dataset:
  Total samples: 876
  Positive samples: 133
  Negative samples: 743

  By language:
    EN: 0
    ES: 876
    IT: 0

Augmented Dataset:
  Total samples: 2628
  Positive samples: 399
  Negative samples: 2229
  Multiplication factor: 3.00x

  By language:
    EN: 876
    ES: 876
    IT: 876

  By augmentation type:
    Back-translation: 1752
    Original: 876

(2628, 8)

DATA AUGMENTATION STATISTICS

Original Dataset:
  Total samples: 1086
  Positive samples: 207
  Negative samples: 879

  By language:
    EN: 0
    ES: 0
    IT: 1086

Augmented Dataset:
  Total samples: 3258
  Positive samples: 621
  Negative samples: 2637
  Multiplication factor: 3.00x

  By language:
    EN: 1086
    ES: 1086
    IT: 1086

  By augmentation type:
    Back-translation: 2172
    Original: 1086

(3258, 8)

DATA AUGMENTATION STATISTICS

Original Dataset:
  Total samples: 1026
  Positive samples: 88
  Negative samples: 938

  By language:
    EN: 1026
    E

# Consolidated Augmented Data

In [1]:
import pandas as pd

In [2]:
# Read the per-language augmentation CSVs and stack them into one frame.
# All frames are collected first and concatenated once: growing a DataFrame
# with pd.concat inside a loop re-copies the accumulated rows on every pass,
# and seeding it with an empty DataFrame triggers pandas' empty-frame concat
# deprecation warning.
per_language_frames = [
    pd.read_csv(f'../data/augmented_multilingual_tweets_{lang}.csv')
    for lang in ["es", "it", "en"]
]
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
augumented_df = pd.concat(per_language_frames, ignore_index=True)

In [3]:
# (rows, columns) of the combined frame built from the per-language CSVs.
augumented_df.shape

(8964, 8)

In [5]:
# Partition the combined frame on whether `original_id` is populated:
# rows with a missing value go to `original_samples`, the rest to
# `augmented_samples`.
lacks_original_id = augumented_df['original_id'].isna()
original_samples = augumented_df[lacks_original_id]
augmented_samples = augumented_df[~lacks_original_id]
print(original_samples.shape, augmented_samples.shape)

(2988, 8) (5976, 8)


In [6]:
# Display the rows whose `original_id` is populated.
augmented_samples

Unnamed: 0,id,text,label,lang,bio,original_id,original_lang,augmentation_type
876,es_1850_bkt_es_to_en,June 28 - International LGTBI Pride Day. Long ...,0,en,,es_1850,es,back_translation
877,es_1850_bkt_es_to_it,28 giugno - Giornata Internazionale dell'Orgog...,0,it,,es_1850,es,back_translation
878,es_773_bkt_es_to_en,"@USER I don't like Montero, because of her sup...",0,en,,es_773,es,back_translation
879,es_773_bkt_es_to_it,"@USER non mi piace Montero, per il suo sostegn...",0,it,,es_773,es,back_translation
880,es_1899_bkt_es_to_en,It’s #GayPride week and I will dedicate it to ...,0,en,,es_1899,es,back_translation
...,...,...,...,...,...,...,...,...
8959,en_208_bkt_en_to_it,Tua mamma è gay\n\nTuo papà lesbica\n\nTua non...,0,it,,en_208,en,back_translation
8960,en_1331_bkt_en_to_es,La única palabra que me molesta es maricón. Un...,0,es,,en_1331,en,back_translation
8961,en_1331_bkt_en_to_it,"L'unica parola che mi dà fastidio è ""frocio"". ...",0,it,,en_1331,en,back_translation
8962,en_1810_bkt_en_to_es,"Hablando como una persona LGBT+, no me importa...",0,es,,en_1810,en,back_translation


In [7]:
# Write the rows that have an `original_id` to a single CSV; index=False keeps
# the DataFrame index out of the file.
consolidated_csv_path = "../data/augmented_multilingual_tweets.csv"
augmented_samples.to_csv(consolidated_csv_path, index=False)