# PIP


# Import


In [None]:

import nltk
# Fetch the WordNet corpus; synonym_replace() queries it through
# nltk.corpus.wordnet.
nltk.download('wordnet')


In [None]:
import pandas as pd
import random
import torch
from nltk.corpus import wordnet
from easynmt import EasyNMT
from transformers import pipeline
from tqdm import tqdm
from collections import Counter
import re



# Load Dataset


In [None]:
# Load CSV
# The code below reads the "instruction", "intent", "category" and "response"
# columns of this frame.
df = pd.read_csv("dataset.csv")


# Load Models and Define NER Patterns


In [None]:
# Load models
# NOTE(review): `translator` is not referenced by any other visible code in
# this file — confirm it is still needed before keeping its load cost.
translator = EasyNMT('opus-mt')  # Local translation (fast)
# Text-to-text pipeline called by paraphrase(); placed on GPU 0 when CUDA is
# available, otherwise on the CPU (device=-1).
paraphraser = pipeline("text2text-generation", model="t5-small", device=0 if torch.cuda.is_available() else -1)

# Define valid NER patterns (based on your original dataset)
# Each entry is passed to re.findall as a regular expression, so matching is
# case-sensitive ("Order Number" does not match r"order number").
VALID_NER_PATTERNS = [
    r"order number",
    r"invoice number",
    r"person name",
    r"account type",
    r"account category",
    r"delivery city",
    r"delivery country",
    r"currency symbol",
    r"refund amount"
]


# Helper Functions


In [None]:

# Helper function to preserve placeholders
def preserve_placeholders(text, augmented_text):
    """
    Restore the original spelling of placeholders that survived augmentation.

    Every ``{{...}}`` span in ``augmented_text`` whose name equals the name of
    a placeholder in ``text`` once letter case and whitespace are ignored is
    rewritten to the exact original placeholder. Spans that do not correspond
    to any original placeholder, and spans that correspond to more than one
    original spelling (e.g. ``{{a}}`` and ``{{A}}`` both present in ``text``),
    are left untouched.

    Args:
        text: The original, un-augmented text.
        augmented_text: The text produced by an augmentation step.

    Returns:
        ``augmented_text`` with recognisable placeholders normalised back to
        their original form. Placeholders that were removed entirely cannot
        be recovered here.
    """
    placeholder_pattern = r"\{\{.*?\}\}"

    def _key(placeholder):
        # "{{ Order  Number }}" and "{{order number}}" share one lookup key.
        return " ".join(placeholder[2:-2].split()).casefold()

    # Map each normalised name to the set of exact spellings used in `text`.
    originals = {}
    for placeholder in re.findall(placeholder_pattern, text):
        originals.setdefault(_key(placeholder), set()).add(placeholder)

    def _restore(match):
        found = match.group(0)
        candidates = originals.get(_key(found))
        # Rewrite only when the target spelling is unambiguous.
        if candidates is not None and len(candidates) == 1:
            return next(iter(candidates))
        return found

    # A callable replacement is inserted verbatim, so placeholders containing
    # backslashes are not misread as regex replacement escapes.
    return re.sub(placeholder_pattern, _restore, augmented_text)

# Helper function to preserve NER entities
def preserve_ner_entities(text, patterns=None):
    """
    Identify and protect NER entities in the text by marking them.

    Each occurrence of each pattern is wrapped exactly once as
    ``[[entity]]``, including when the same entity appears several times.

    Args:
        text: The text to scan.
        patterns: Optional iterable of regular expressions to protect.
            Defaults to the module-level ``VALID_NER_PATTERNS``.

    Returns:
        A tuple ``(marked_text, ner_entities)`` where ``ner_entities`` lists
        the matched strings, one entry per occurrence, grouped in pattern
        order.
    """
    if patterns is None:
        patterns = VALID_NER_PATTERNS

    ner_entities = []

    def _mark(match):
        entity = match.group(0)
        ner_entities.append(entity)
        return f"[[{entity}]]"

    for pattern in patterns:
        # One substitution pass per pattern wraps every occurrence once;
        # replacing per match would re-wrap already marked occurrences.
        text = re.sub(pattern, _mark, text)
    return text, ner_entities

# Helper function to restore NER entities
def restore_ner_entities(text, ner_entities):
    """
    Strip the ``[[...]]`` protection markers from the given entities.

    Entities are processed in list order; for each one, every ``[[entity]]``
    occurrence in the running text is replaced by the bare entity.

    Args:
        text: Text that may contain ``[[entity]]`` markers.
        ner_entities: Entity strings whose markers should be removed.

    Returns:
        The text with the markers of the listed entities removed.
    """
    restored = text
    for name in ner_entities:
        marker = f"[[{name}]]"
        restored = restored.replace(marker, name)
    return restored

# Augmentation functions

# One whitespace-delimited token; a complete {{placeholder}} or [[entity]]
# span is kept inside a single token even when it contains spaces.
_TOKEN_RE = re.compile(r"(?:\{\{.*?\}\}|\[\[.*?\]\]|\S)+")
_PROTECTED_RE = re.compile(r"\{\{.*?\}\}|\[\[.*?\]\]")

# Short function words are never replaced: WordNet only knows them through
# unrelated senses (for example "I" as iodine, "in" as inch, "am" as
# americium), which would corrupt the sentence.
_FUNCTION_WORDS = frozenset({
    "a", "an", "the", "i", "me", "my", "we", "us", "our", "you", "your",
    "he", "him", "his", "she", "her", "it", "its", "they", "them", "their",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "do", "does", "did", "have", "has", "had",
    "can", "could", "will", "would", "shall", "should", "may", "might", "must",
    "in", "on", "at", "to", "of", "for", "by", "as", "with", "from",
    "or", "and", "but", "if", "so", "no", "not",
    "this", "that", "these", "those", "there", "here",
    "what", "which", "who", "how", "why", "when", "where",
})


def synonym_replace(text):
    """
    Replace words with the first lemma of their first WordNet synset.

    Tokens containing a ``{{placeholder}}`` or ``[[entity]]`` span (including
    multi-word spans) and common function words are copied through unchanged.
    Words WordNet does not know are also kept. Multi-word lemma names, which
    WordNet joins with underscores, are inserted with spaces.

    Args:
        text: The text to augment.

    Returns:
        A tuple ``(augmented_text, "Synonym Replacement")``. Tokens are
        re-joined with single spaces.
    """
    new_words = []
    for word in _TOKEN_RE.findall(text):
        if _PROTECTED_RE.search(word) or word.casefold() in _FUNCTION_WORDS:
            new_words.append(word)
            continue
        synonyms = wordnet.synsets(word)
        if synonyms:
            new_words.append(synonyms[0].lemmas()[0].name().replace("_", " "))
        else:
            new_words.append(word)
    return " ".join(new_words), "Synonym Replacement"

def add_noise(text):
    """
    Perturb the text by inserting a filler word or deleting one word.

    The noise type is chosen at random between "insertion" and "deletion".
    An insertion places one of "uh", "well" or "hmm" at a random position.
    A deletion removes one random word, but only when the text has more than
    one word; otherwise the text is returned with its words unchanged.

    Args:
        text: The text to perturb.

    Returns:
        A tuple ``(noisy_text, label)`` where ``label`` names the chosen
        noise type. Words are re-joined with single spaces.
    """
    tokens = text.split()
    kind = random.choice(["insertion", "deletion"])  # Reduced noise types to avoid disarranging
    if kind == "insertion":
        position = random.randint(0, len(tokens))
        filler = random.choice(["uh", "well", "hmm"])
        tokens.insert(position, filler)
    elif len(tokens) > 1 and kind == "deletion":
        victim = random.randint(0, len(tokens) - 1)
        del tokens[victim]
    label = f"Noise ({kind})"
    return " ".join(tokens), label

def paraphrase(text):
    """
    Rewrite the text with the module-level ``paraphraser`` pipeline.

    The pipeline is called with ``max_length=50`` and ``do_sample=True``, and
    the ``generated_text`` of its first result is used.

    Args:
        text: The text to rewrite.

    Returns:
        A tuple ``(generated_text, "Paraphrasing")``.
    """
    outputs = paraphraser(text, max_length=50, do_sample=True)
    generated = outputs[0]["generated_text"]
    return generated, "Paraphrasing"

# Validation function to ensure NER patterns are preserved
def validate_augmentation(original_text, augmented_text, patterns=None):
    """
    Validate that the augmented text retains the original NER patterns.

    Matches are compared as multisets, so an entity mentioned twice in the
    original must also be mentioned twice in the augmented text; a dropped or
    duplicated mention fails validation.

    Args:
        original_text: The text before augmentation.
        augmented_text: The text after augmentation.
        patterns: Optional iterable of regular expressions to check.
            Defaults to the module-level ``VALID_NER_PATTERNS``.

    Returns:
        True when both texts contain the same entity matches with the same
        counts, False otherwise.
    """
    if patterns is None:
        patterns = VALID_NER_PATTERNS

    original_entities = Counter()
    augmented_entities = Counter()
    for pattern in patterns:
        original_entities.update(re.findall(pattern, original_text))
        augmented_entities.update(re.findall(pattern, augmented_text))
    return original_entities == augmented_entities

# Weighted augmentation selection
def select_augmentation():
    """
    Pick one augmentation function at random.

    Two draws are made: first a uniform pick among ``add_noise``,
    ``paraphrase`` and ``synonym_replace`` fills a "wildcard" slot, then a
    weighted pick (40 / 30 / 30) chooses between ``paraphrase``,
    ``synonym_replace`` and that wildcard. The resulting overall odds are
    50% paraphrase, 40% synonym replacement and 10% noise.

    Returns:
        The selected augmentation function.
    """
    wildcard = random.choice([add_noise, paraphrase, synonym_replace])
    candidates = (paraphrase, synonym_replace, wildcard)
    odds = (40, 30, 30)
    picked = random.choices(candidates, weights=odds, k=1)
    return picked[0]

# Randomized augmentation function with progress tracking
def augment_instruction(row):
    """
    Augment the ``instruction`` of one dataset row with a random technique.

    NER entities are marked before the chosen augmentation runs and restored
    afterwards. The result is accepted only if it keeps the original NER
    patterns, keeps the same ``{{...}}`` placeholders, and contains no
    leftover ``[[`` / ``]]`` protection markers. On any failure the original
    instruction is returned with the technique "None (Fallback)".

    Args:
        row: A mapping (e.g. a DataFrame row) with the keys "instruction",
            "category", "intent" and "response".

    Returns:
        A dict with the keys "category", "intent", "response",
        "instruction_original", "instruction_augmented" and
        "augmentation_technique".
    """
    instruction = row["instruction"]
    placeholder_pattern = r"\{\{.*?\}\}"

    # Deliberate best-effort boundary: any failure (including a non-string
    # instruction or a model error) falls back to the original text.
    try:
        # Preserve placeholders and NER entities
        instruction_with_ner, ner_entities = preserve_ner_entities(instruction)

        chosen_method = select_augmentation()
        augmented_instruction, technique = chosen_method(instruction_with_ner)
        augmented_instruction = restore_ner_entities(augmented_instruction, ner_entities)  # Restore NER entities
        augmented_instruction = preserve_placeholders(instruction, augmented_instruction)  # Preserve placeholders

        # Validate the augmented text
        if not validate_augmentation(instruction, augmented_instruction):
            raise ValueError("Augmented text does not retain original NER patterns.")

        # The augmentation may drop or mangle {{...}} placeholders; reject
        # the result unless exactly the original placeholders are present.
        original_placeholders = sorted(re.findall(placeholder_pattern, instruction))
        augmented_placeholders = sorted(re.findall(placeholder_pattern, augmented_instruction))
        if original_placeholders != augmented_placeholders:
            raise ValueError("Augmented text does not retain original placeholders.")

        # A marker damaged by the augmentation is not removed by
        # restore_ner_entities and would leak stray brackets into the data.
        for marker in ("[[", "]]"):
            if augmented_instruction.count(marker) != instruction.count(marker):
                raise ValueError("Augmented text contains unrestored NER markers.")
    except Exception as e:
        print(f"Error during augmentation: {e}")  # Log the error message
        augmented_instruction, technique = instruction, "None (Fallback)"

    return {
        "category": row["category"],
        "intent": row["intent"],
        "response": row["response"],
        "instruction_original": instruction,
        "instruction_augmented": augmented_instruction,
        "augmentation_technique": technique
    }


# Start Augmentation and Save Data


In [None]:

# Augment only 30% of rows for each intent
AUGMENT_FRACTION = 0.3  # Share of each intent's rows that is augmented
augmented_data = []

# Process all intents
for intent in tqdm(df["intent"].unique(), desc="Processing Intents"):
    intent_df = df[df["intent"] == intent]
    # int() truncates, so an intent with fewer than 4 rows gets no augmented rows.
    num_to_augment = int(AUGMENT_FRACTION * len(intent_df))
    # The fixed seed keeps the selection reproducible between runs.
    rows_to_augment = intent_df.sample(n=num_to_augment, random_state=42)  # Randomly select rows
    rows_to_keep = intent_df.drop(rows_to_augment.index)  # Remaining rows

    # Augment selected rows
    for _, row in tqdm(rows_to_augment.iterrows(), total=num_to_augment, desc=f"Augmenting Intent: {intent}"):
        augmented_data.append(augment_instruction(row))

    # Keep remaining rows unchanged, in the same output schema as augmented rows
    for _, row in rows_to_keep.iterrows():
        augmented_data.append({
            "category": row["category"],
            "intent": row["intent"],
            "response": row["response"],
            "instruction_original": row["instruction"],
            "instruction_augmented": row["instruction"],
            "augmentation_technique": "None (Unchanged)"
        })

# Convert back to DataFrame
augmented_df = pd.DataFrame(augmented_data)


# Save to CSV
augmented_df.to_csv("augmented_dataset.csv", index=False)
print("✅ Data augmentation complete! Saved as 'augmented_dataset.csv'.")
