<a href="https://colab.research.google.com/github/steliosg23/PDS-A2/blob/main/Incidents%20Augmentation%20using%20llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from collections import Counter
from transformers import MarianMTModel, MarianTokenizer
from nltk.corpus import wordnet
import torch
import random
import nltk
from google.colab import drive
drive.mount('/content/drive')

# Download NLTK WordNet if not already present
nltk.download('wordnet')

# Seed Python's RNG so the random word-level augmentations are reproducible
# across "Restart & Run All" executions.
SEED = 42
random.seed(SEED)

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the dataset
file_path = '/content/drive/MyDrive/Data/incidents_train.csv'
df = pd.read_csv(file_path)

# Compute class imbalance for 'hazard' and 'product'
hazard_counts = Counter(df['hazard'])
product_counts = Counter(df['product'])


def _mean_class_count(counts):
    """Return the average number of rows per class (0 when there are no classes)."""
    return sum(counts.values()) / len(counts) if counts else 0


# Define a threshold for imbalance.
# The previous threshold was the *minimum* hazard count combined with a strict
# `<` comparison, so no hazard could ever qualify as underrepresented. A class
# is now underrepresented when it has fewer rows than the average class of the
# same label column. `threshold` keeps its name and refers to the hazard column.
threshold = _mean_class_count(hazard_counts)
product_threshold = _mean_class_count(product_counts)

# Identify underrepresented classes for hazard and product
underrepresented_hazards = [h for h, count in hazard_counts.items() if count < threshold]
underrepresented_products = [p for p, count in product_counts.items() if count < product_threshold]

# Load translation models
# Loaded (tokenizer, model) pairs keyed by (source_lang, target_lang), so the
# checkpoints are instantiated once per language pair instead of on every call.
_TRANSLATION_MODEL_CACHE = {}


def load_translation_models(source_lang="en", target_lang="fr"):
    """Load the forward and reverse MarianMT models for a language pair.

    The checkpoints are ``Helsinki-NLP/opus-mt-{source}-{target}`` and
    ``Helsinki-NLP/opus-mt-{target}-{source}``. Both models are moved to the
    module-level ``device``. Results are cached per language pair, so repeated
    calls return the same objects without reloading anything.

    Args:
        source_lang: Language code of the original text.
        target_lang: Language code of the pivot language.

    Returns:
        tuple: ``((tokenizer, model), (reverse_tokenizer, reverse_model))``.
    """
    cache_key = (source_lang, target_lang)
    if cache_key not in _TRANSLATION_MODEL_CACHE:
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name).to(device)

        reverse_model_name = f"Helsinki-NLP/opus-mt-{target_lang}-{source_lang}"
        reverse_tokenizer = MarianTokenizer.from_pretrained(reverse_model_name)
        reverse_model = MarianMTModel.from_pretrained(reverse_model_name).to(device)

        _TRANSLATION_MODEL_CACHE[cache_key] = (
            (tokenizer, model),
            (reverse_tokenizer, reverse_model),
        )

    return _TRANSLATION_MODEL_CACHE[cache_key]

# Back-Translation function with batching
def back_translation_batch(texts, lang_pair=("en", "fr"), batch_size=32):
    """Paraphrase texts by translating them to a pivot language and back.

    Args:
        texts: Sequence of strings to paraphrase.
        lang_pair: ``(source, pivot)`` language codes forwarded to
            ``load_translation_models``.
        batch_size: Number of texts translated per ``generate`` call.

    Returns:
        list[str]: One back-translated string per input, in input order.
        An empty input yields an empty list without loading any model.
    """
    # Nothing to translate: return before paying for two model loads.
    if len(texts) == 0:
        return []

    (src_tokenizer, src_model), (tgt_tokenizer, tgt_model) = load_translation_models(*lang_pair)

    def _translate(tokenizer, model, batch):
        """Translate one batch of strings with the given tokenizer/model pair."""
        # Inputs longer than 512 tokens are truncated before translation.
        encoded = tokenizer(
            batch, return_tensors="pt", max_length=512, truncation=True, padding=True
        ).to(device)
        generated = model.generate(**encoded)
        return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]

    augmented_texts = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        pivot_texts = _translate(src_tokenizer, src_model, batch)
        augmented_texts.extend(_translate(tgt_tokenizer, tgt_model, pivot_texts))
    return augmented_texts

# Synonym Replacement function
def synonym_replacement(text, n=2):
    """Replace up to ``n`` distinct words of ``text`` with a WordNet synonym.

    Every occurrence of a chosen word is replaced by the same synonym. Only
    lemma names that differ from the word itself (case-insensitively) count as
    synonyms, so a replacement always changes the text. Underscores in
    multi-word lemma names are converted to spaces.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace; ``0`` replaces nothing.

    Returns:
        str: The words re-joined with single spaces. Words without a usable
        synonym are left untouched.
    """
    words = text.split()
    new_words = words.copy()

    def _synonyms_of(word):
        """Collect distinct WordNet lemma names for ``word`` other than itself."""
        found = []
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                candidate = lemma.name().replace('_', ' ')
                if candidate.lower() != word.lower() and candidate not in found:
                    found.append(candidate)
        return found

    # Distinct words in random order; synonyms are looked up lazily so WordNet
    # is only queried until `n` replacements have been made.
    candidate_words = list(dict.fromkeys(words))
    random.shuffle(candidate_words)

    num_replaced = 0
    for random_word in candidate_words:
        if num_replaced >= n:
            break
        synonyms = _synonyms_of(random_word)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == random_word else word for word in new_words]
        num_replaced += 1
    return ' '.join(new_words)

# Random Insertion function
def random_insertion(text, n=2):
    """Insert ``n`` WordNet synonyms of words from ``text`` at random positions.

    For each insertion a word is drawn at random from the words of ``text``
    that have at least one synonym, and one of its synonyms is inserted at a
    random index. Only lemma names that differ from the word itself count as
    synonyms; underscores in multi-word lemma names become spaces.

    Args:
        text: Whitespace-separated input text.
        n: Number of synonyms to insert.

    Returns:
        str: The augmented words joined with single spaces. ``text`` is
        returned unchanged when it is empty or none of its words has a
        synonym (previously this case looped forever or raised).
    """
    words = text.split()

    # Resolve the synonyms of every distinct word once, up front.
    synonym_pool = {}
    for word in dict.fromkeys(words):
        names = []
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() != word.lower() and name not in names:
                    names.append(name)
        if names:
            synonym_pool[word] = names

    if not synonym_pool:
        return text

    # Keep duplicates so frequent words are proportionally more likely to be drawn.
    source_words = [word for word in words if word in synonym_pool]
    for _ in range(n):
        random_word = random.choice(source_words)
        synonym = random.choice(synonym_pool[random_word])
        # randint is inclusive, so the synonym may also be appended at the end.
        random_idx = random.randint(0, len(words))
        words.insert(random_idx, synonym)
    return ' '.join(words)

# Random Swap function
def random_swap(text, n=2):
    """Swap two randomly chosen words of ``text``, ``n`` times.

    Args:
        text: Whitespace-separated input text.
        n: Number of swap operations to perform.

    Returns:
        str: The shuffled words joined with single spaces. Texts with fewer
        than two words are returned unchanged, because there is no pair to
        swap (``random.sample`` would raise ``ValueError``).
    """
    words = text.split()
    if len(words) < 2:
        return text
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

# Random Deletion function
def random_deletion(text, p=0.2):
    """Drop each word of ``text`` independently with probability ``p``.

    Args:
        text: Whitespace-separated input text.
        p: Per-word deletion probability.

    Returns:
        str: The surviving words joined with single spaces. If every word was
        deleted, one randomly chosen original word is returned instead. Texts
        with zero or one word are returned unchanged.
    """
    words = text.split()
    # `<= 1` also covers empty/whitespace-only text, which would otherwise
    # reach random.choice([]) below and raise IndexError.
    if len(words) <= 1:
        return text
    remaining_words = [word for word in words if random.uniform(0, 1) > p]
    return ' '.join(remaining_words) if remaining_words else random.choice(words)

# Augment all rows using all techniques
def augment_all_techniques_batch(rows, batch_size=32):
    """Produce five augmented copies of every row in ``rows``.

    Each output record is a dict holding the source row's columns with only
    ``'text'`` replaced. The list starts with one back-translated record per
    row (in row order). It is followed, row by row, by the synonym-replacement,
    random-insertion, random-swap and random-deletion variants, in that order.

    Args:
        rows: DataFrame with a ``'text'`` column.
        batch_size: Batch size forwarded to ``back_translation_batch``.

    Returns:
        list[dict]: ``5 * len(rows)`` augmented records.
    """
    source_texts = [record['text'] for _, record in rows.iterrows()]

    # Back-translation runs once over all texts so it can be batched.
    paraphrases = back_translation_batch(source_texts, lang_pair=("en", "fr"), batch_size=batch_size)
    results = [
        {**rows.iloc[position], 'text': paraphrases[position]}
        for position in range(len(source_texts))
    ]

    # Word-level augmentations, applied per row in this fixed order.
    word_level_transforms = (synonym_replacement, random_insertion, random_swap, random_deletion)
    for _, record in rows.iterrows():
        original_text = record['text']
        for transform in word_level_transforms:
            results.append({**record, 'text': transform(original_text)})

    return results

# Collect augmented rows for all underrepresented classes
augmented_rows = []
target_rows = len(df) * 2
rows_needed = target_rows - len(df)

# (column, class label) pairs: hazards first, then products.
augmentation_targets = (
    [('hazard', hazard) for hazard in underrepresented_hazards]
    + [('product', product) for product in underrepresented_products]
)

while len(augmented_rows) < rows_needed:
    rows_before_pass = len(augmented_rows)

    for column, label in augmentation_targets:
        rows = df[df[column] == label]
        if rows.empty:
            # e.g. a NaN label never compares equal; nothing to augment.
            continue
        augmented_rows.extend(augment_all_techniques_batch(rows))
        # Rows beyond `rows_needed` are discarded below, so stop as soon as the
        # target is reached instead of translating the remaining classes.
        if len(augmented_rows) >= rows_needed:
            break

    # A full pass that adds nothing would repeat forever (e.g. when both lists
    # of underrepresented classes are empty), so give up instead of spinning.
    if len(augmented_rows) == rows_before_pass:
        print(
            f"Augmentation stopped early: {len(augmented_rows)} of {rows_needed} "
            "requested rows generated (no underrepresented rows to augment)."
        )
        break

# Combine original and augmented data
augmented_df = pd.DataFrame(augmented_rows[:rows_needed])
balanced_df = pd.concat([df, augmented_df], ignore_index=True)




Mounted at /content/drive


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [6]:
# Persist the combined (original + augmented) dataset to Drive, without the index column
output_path = '/content/drive/MyDrive/Data/augmented_incidents_train.csv'
balanced_df.to_csv(output_path, index=False)

# Preview the first rows of the combined DataFrame
balanced_df.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
0,0,1994,1,7,us,Recall Notification: FSIS-024-94,Case Number: 024-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,smoked sausage
1,1,1994,3,10,us,Recall Notification: FSIS-033-94,Case Number: 033-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria spp,sausage
2,2,1994,3,28,us,Recall Notification: FSIS-014-94,Case Number: 014-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,ham slices
3,3,1994,4,3,us,Recall Notification: FSIS-009-94,Case Number: 009-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,thermal processed pork meat
4,4,1994,7,1,us,Recall Notification: FSIS-001-94,Case Number: 001-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,chicken breast
