In [None]:
%pip install pandas googletrans==4.0.0-rc1 nltk tqdm


Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")


GPU is available


In [3]:
import pandas as pd
import random
from googletrans import Translator  # For back translation
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm  # For progress bar
import torch
from transformers import MarianMTModel, MarianTokenizer

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('wordnet')

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Function for back translation using GPU-accelerated model
def back_translate(text, src_lang='en', target_lang='fr'):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{target_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    
    try:
        # Translate to target language
        inputs = tokenizer(text, return_tensors="pt").to(device)
        translated = model.generate(**inputs)
        translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
        
        # Translate back to source language
        back_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{src_lang}'
        back_tokenizer = MarianTokenizer.from_pretrained(back_model_name)
        back_model = MarianMTModel.from_pretrained(back_model_name).to(device)
        
        back_inputs = back_tokenizer(translated_text, return_tensors="pt").to(device)
        back_translated = back_model.generate(**back_inputs)
        back_translated_text = back_tokenizer.batch_decode(back_translated, skip_special_tokens=True)[0]
        
        return back_translated_text
    except Exception as e:
        print(f"Error during back translation: {e}")
        return text  # Return original text if translation fails

# Function for synonym replacement
def synonym_replacement(text):
    words = word_tokenize(text)
    new_words = []
    for word in words:
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if synonyms:
            new_word = random.choice(synonyms)
            new_words.append(new_word.replace("_", " "))
        else:
            new_words.append(word)
    return " ".join(new_words)

# Function for random insertion, deletion, and swap (no new text generated)
def random_transform(text):
    words = word_tokenize(text)
    operation = random.choice(['insert', 'delete', 'swap'])
    
    if operation == 'insert' and len(words) > 0:
        # Randomly insert a word from the sentence at a random position
        random_word = random.choice(words)
        insert_pos = random.randint(0, len(words))
        words.insert(insert_pos, random_word)
    elif operation == 'delete' and len(words) > 1:
        # Randomly delete a word from the sentence
        delete_pos = random.randint(0, len(words) - 1)
        words.pop(delete_pos)
    elif operation == 'swap' and len(words) > 1:
        # Randomly swap two words in the sentence
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    
    return " ".join(words)

# Main function for data augmentation
def augment_data(df, augmentation_percentage=0.3):
    augmented_rows = []
    
    # Group by Intent
    grouped = df.groupby('intent')
    
    # Total number of rows to process
    total_rows = len(df)
    processed_rows = 0
    
    # Add general progress bar
    with tqdm(total=total_rows, desc="Augmenting Data", unit="row") as pbar:
        for intent, group in grouped:
            # Calculate number of rows to augment
            num_rows_to_augment = int(len(group) * augmentation_percentage)
            rows_to_augment = group.sample(n=num_rows_to_augment, random_state=42)
            
            # Split rows_to_augment into back translation (70%) and synonym replacement (30%)
            num_back_translation = int(num_rows_to_augment * 0.7)
            num_synonym_replace = num_rows_to_augment - num_back_translation
            
            # Back Translation Rows
            back_translation_rows = rows_to_augment.sample(n=num_back_translation, random_state=42)
            for _, row in back_translation_rows.iterrows():
                augmented_text = back_translate(row['instruction'])
                augmented_row = {
                    'instruction': augmented_text,
                    'intent': row['intent'],
                    'category': row['category'],
                    'response': row['response'],
                    'DA_Tag': 'BackTranslation'
                }
                augmented_rows.append(augmented_row)
                processed_rows += 1
                pbar.update(1)  # Update progress bar
            
            # Synonym Replacement Rows
            synonym_replace_rows = rows_to_augment.drop(back_translation_rows.index).sample(n=num_synonym_replace, random_state=42)
            for _, row in synonym_replace_rows.iterrows():
                augmented_text = synonym_replacement(row['instruction'])
                augmented_row = {
                    'instruction': augmented_text,
                    'intent': row['intent'],
                    'category': row['category'],
                    'response': row['response'],
                    'DA_Tag': 'SynonymReplace'
                }
                augmented_rows.append(augmented_row)
                processed_rows += 1
                pbar.update(1)  # Update progress bar
            
            # Random Transformations (No new text added, just tagging)
            rows_for_random_transform = group.drop(rows_to_augment.index)
            for _, row in rows_for_random_transform.iterrows():
                # Apply random transform but don't save the result
                _ = random_transform(row['instruction'])
                # Tag the row for analysis (optional)
                row['DA_Tag'] = 'RandomTransform'
                processed_rows += 1
                pbar.update(1)  # Update progress bar
    
    # Convert augmented rows to DataFrame
    augmented_df = pd.DataFrame(augmented_rows)
    
    # Combine original and augmented data
    augmented_df['DA_Tag'] = augmented_df['DA_Tag'].fillna('None')  # Add DA_Tag column to original data
    original_df_with_tag = df.copy()
    original_df_with_tag['DA_Tag'] = 'None'
    
    final_df = pd.concat([original_df_with_tag, augmented_df], ignore_index=True)
    return final_df

# Example usage
if __name__ == "__main__":
    # Load your dataset (replace 'your_dataset.csv' with your actual file path)
    df = pd.read_csv('filtered_dataset1.csv')
    
    # Apply data augmentation with general progress bar
    augmented_df = augment_data(df)
    
    # Save the augmented dataset to a new file
    augmented_df.to_csv('augmented_dataset.csv', index=False)
    print("Data augmentation completed and saved to 'augmented_dataset.csv'.")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reaga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\reaga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using device: cuda


Augmenting Data:   0%|          | 5/5000 [01:06<18:21:18, 13.23s/row]


KeyboardInterrupt: 