In [11]:
import pandas as pd

# Load the dataset from a parquet file; the path is relative, so it resolves
# against the working directory the notebook kernel was started in.
data = pd.read_parquet("../data/dataset/reddit+shsyt/data.parquet")


In [12]:
import random

def replace_utterances(iob_tags, tokens, class_dict):
    """
    Replaces each tagged utterance (a 'B-X' tag followed by its 'I-X' tags) with a
    random utterance drawn from class_dict['X'].

    A span ends at the first tag that is not 'I-' of the same class, so two entities
    that directly follow each other (e.g. 'B-PER', 'I-PER', 'B-LOC') are replaced
    independently. When the class has no usable replacement (missing from class_dict,
    empty candidate list, or a replacement that is blank after splitting), the
    original tokens of the whole span are kept unchanged.

    Parameters:
    iob_tags (list of str): A list of IOB tags, expected to be the same length as tokens.
    tokens (list of str): A list of corresponding tokens/words.
    class_dict (dict): A dictionary mapping class names (keys) to lists of possible
        replacement utterances. Each utterance is split on whitespace into tokens.

    Returns:
    list of str: The modified list of tokens with replaced utterances.
    list of str: The modified list of IOB tags corresponding to the new tokens.
    """
    new_tokens = []
    new_iob_tags = []

    total = len(iob_tags)
    i = 0
    while i < total:
        tag = iob_tags[i]

        if not tag.startswith('B-'):
            # 'O' tags (and stray 'I-' tags with no opening 'B-') are copied through as-is.
            new_tokens.append(tokens[i])
            new_iob_tags.append(tag)
            i += 1
            continue

        class_label = tag[2:]  # Class label without the "B-" prefix
        begin_tag = f'B-{class_label}'
        inside_tag = f'I-{class_label}'

        # The span covers the 'B-' tag plus the following 'I-' tags of the SAME class.
        # Stopping at the next 'B-' keeps adjacent entities from being swallowed.
        span_end = i + 1
        while span_end < total and iob_tags[span_end] == inside_tag:
            span_end += 1

        candidates = class_dict.get(class_label)
        replacement_tokens = []
        if candidates is not None and len(candidates) > 0:
            replacement_tokens = random.choice(candidates).split()
        if not replacement_tokens:
            # No usable replacement: keep the full original span, not just its first token.
            replacement_tokens = list(tokens[i:span_end])

        # Emit exactly one tag per emitted token so both lists stay aligned.
        for position, replacement_token in enumerate(replacement_tokens):
            new_tokens.append(replacement_token)
            new_iob_tags.append(begin_tag if position == 0 else inside_tag)

        i = span_end

    return new_tokens, new_iob_tags

# Example usage: a toy sentence containing one PER span and one LOC span, together
# with candidate replacement phrases for each class. The replacement is chosen with
# random.choice and no seed is set here, so the printed result can vary between runs.
iob_tags = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC']
tokens = ['Hello', 'John', 'Doe', 'from', 'New', 'York']
class_dict = {
    'PER': ['Jane Smith', 'Alice Johnson'],
    'LOC': ['San Francisco', 'Los Angeles California']
}

new_tokens, new_iob_tags = replace_utterances(iob_tags, tokens, class_dict)
print("New Tokens:", new_tokens)
print("New IOB Tags:", new_iob_tags)


New Tokens: ['Hello', 'Jane', 'Smith', 'from', 'Los', 'Angeles', 'California']
New IOB Tags: ['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC', 'I-LOC']


In [15]:
import random

def perturb_characters(text, num_chars_to_perturb):
    """
    Apply a number of random character-level edits to the input text.

    Each edit is a substitution, a deletion or an insertion chosen at random, and
    every edit is guaranteed to change the current text: a substitution always picks
    a different character, and deletion is only offered while more than one
    character remains (so the text is never emptied). Note that successive edits
    may still cancel each other out (e.g. an insertion followed by its deletion).

    Args:
        text (str): The original text to perturb.
        num_chars_to_perturb (int): The number of edits to apply. Values larger than
            len(text) are clamped to len(text); zero or negative values leave the
            text unchanged.

    Returns:
        str: The perturbed text.
    """
    # Single alphabet shared by substitution and insertion so both draw from the same pool.
    alphabet = "abcdefghijklmnopqrstuvwxyz-./_="

    characters = list(text)
    num_edits = min(num_chars_to_perturb, len(characters))

    for _ in range(num_edits):
        # The loop only runs when the text is non-empty, and deletion never removes
        # the last character, so `characters` holds at least one element here.
        operations = ["substitution", "insertion"]
        if len(characters) > 1:
            operations.append("deletion")
        perturbation_type = random.choice(operations)

        if perturbation_type == "substitution":
            index = random.randrange(len(characters))
            current = characters[index]
            # Exclude the current character so the substitution is never a no-op.
            characters[index] = random.choice([c for c in alphabet if c != current])
        elif perturbation_type == "deletion":
            characters.pop(random.randrange(len(characters)))
        else:
            # randint is inclusive on both ends: index == len(characters) appends at the end.
            index = random.randint(0, len(characters))
            characters.insert(index, random.choice(alphabet))

    return ''.join(characters)


def perturb_tokens(text, perturbation_strength=0.1, replacement_token="RANDOM"):
    """
    Apply random token-level edits to the input text.

    The text is split on whitespace and int(len(tokens) * perturbation_strength)
    edits are applied. Each edit is one of:
      - substitution: the token is replaced by `replacement_token`;
      - deletion: the token is removed;
      - shuffle: the token is swapped with a token at a different position.
    Deletion and shuffle are only offered while more than one token remains, so the
    text is never emptied and a shuffle never swaps a position with itself.

    Args:
        text (str): The original text to perturb.
        perturbation_strength (float): The proportion of tokens to perturb (0 to 1).
            The resulting count is truncated towards zero, so short texts may
            receive no edits at all.
        replacement_token (str): The token written by a substitution edit.

    Returns:
        str: The perturbed tokens joined by single spaces (the original whitespace
        layout is not preserved).
    """
    tokens = text.split()
    num_perturbations = int(len(tokens) * perturbation_strength)

    for _ in range(num_perturbations):
        # At least one token is always present here: the loop only runs for a
        # non-empty token list and deletion never removes the last token.
        operations = ["substitution"]
        if len(tokens) > 1:
            operations.extend(["deletion", "shuffle"])
        perturbation_type = random.choice(operations)
        index = random.randrange(len(tokens))

        if perturbation_type == "substitution":
            tokens[index] = replacement_token
        elif perturbation_type == "deletion":
            tokens.pop(index)
        else:
            # Draw from the other len(tokens) - 1 positions, then shift past `index`
            # so the partner position is always different from `index`.
            swap_index = random.randrange(len(tokens) - 1)
            if swap_index >= index:
                swap_index += 1
            tokens[index], tokens[swap_index] = tokens[swap_index], tokens[index]

    return ' '.join(tokens)



In [16]:
# Quick check: apply a single random character-level edit to a sample string.
# No seed is set here, so the displayed result can differ between runs.
perturb_characters("The beatles", num_chars_to_perturb=1)


'lThe beatles'

In [17]:
# Example usage: the sample splits into 5 whitespace-separated tokens, so a
# strength of 0.2 yields int(5 * 0.2) = 1 token-level edit. No seed is set here,
# so the printed result can differ between runs.
text = "This is an example text."
perturbed_text = perturb_tokens(text, perturbation_strength=0.2)
print(perturbed_text)



This an example text.
