In [1]:
from perturbation_utils import *
from datasets import Dataset, load_from_disk, DatasetDict
import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier for this run (not referenced by the cells visible in this section).
MODEL = 'bert'
# Dataset name; interpolated into the on-disk path of the clean dataset below.
DATA = 'SST-2'

In [3]:
# On-disk location of the clean dataset selected by DATA.
from_path_dataset = f"./Data/Clean/{DATA}"

# Take the first 50 rows of the 'test' split and rebuild them into a Dataset
# in a single step (the [:50] slice is passed straight to Dataset.from_dict).
dataset = Dataset.from_dict(load_from_disk(from_path_dataset)['test'][:50])


In [4]:
def _top_k_mask(ranking, k):
    """Return a boolean mask that is True at the last ``k`` indices of ``ranking``.

    ``ranking`` is an ascending index array as produced by ``np.argsort``, so
    its tail holds the highest-scoring positions.  ``k == 0`` selects nothing;
    a bare ``ranking[-k:]`` would select *everything* in that case because
    ``-0 == 0`` and ``ranking[0:]`` is the whole array.
    """
    mask = np.zeros(len(ranking), dtype=bool)
    if k > 0:
        mask[ranking[-k:]] = True
    return mask


def create_human_masks(example, anno):
    """Build perturbed versions of one example, guided by human annotations.

    Three perturbations are produced for every mask: replacing masked tokens
    with the literal ``'{TOKEN}'``, with the output of ``random_charswap``, and
    with the output of ``wordswap``.

    Masks:
      * ``human-A_*``: every token whose annotation is non-zero.
      * ``human-R_*_<pct>``: the top ``round(len(anno) * prop)`` tokens by raw
        annotation score (order among equal scores is whatever ``np.argsort``
        returns).
      * ``human-S_*_<pct>``: same, but un-annotated tokens are first given a
        small POS-dependent score so adjectives are picked before adverbs,
        verbs, nouns and everything else.

    Args:
        example: list of tokens of the example.
        anno: NumPy array of per-token annotation scores, aligned with
            ``example`` (the caller passes absolute values).

    Returns:
        dict mapping perturbation name to the perturbed text (tokens joined by
        single spaces).

    Reads the module-level ``pos_tags`` list, which must hold one tag per
    token of ``example``.
    """
    global pos_tags
    noisy = {}
    ranked_anno = np.argsort(anno)

    # Strategic scores: keep human scores, back-fill zeros by POS priority.
    # The fill values are meant to sit below every real annotation (the
    # original author notes the smallest non-zero annotation is 0.333).
    strategic_anno = []
    for i, v in enumerate(anno):
        if v != 0.0:
            strategic_anno.append(v)
        elif pos_tags[i].startswith('J'):  # adjective
            strategic_anno.append(0.3)
        elif pos_tags[i].startswith('R'):  # adverb
            strategic_anno.append(0.25)
        elif pos_tags[i].startswith('V'):  # verb
            strategic_anno.append(0.2)
        elif pos_tags[i].startswith('N'):  # noun
            strategic_anno.append(0.1)
        else:
            strategic_anno.append(0.05)

    ranked_strat_anno = np.argsort(strategic_anno)

    # Mask exactly the tokens the annotators marked.
    mask = (anno != 0)
    noisy['human-A_token'] = ' '.join(np.where(mask, '{TOKEN}', example))
    noisy['human-A_charswap'] = ' '.join(np.where(mask, random_charswap(example), example))
    noisy['human-A_synonym'] = ' '.join(np.where(mask, wordswap(example), example))

    for prop in [0.25, 0.5, 0.7, 0.8, 0.9, 0.95]:
        # Number of tokens to perturb at this proportion; may be 0 for very
        # short examples, in which case nothing is masked.
        k = round(len(anno) * prop)
        suffix = f'{prop*100:2.0f}'

        # "Random" fill: rank by raw annotation only.
        mask = _top_k_mask(ranked_anno, k)
        noisy[f'human-R_token_{suffix}'] = ' '.join(np.where(mask, '{TOKEN}', example))
        noisy[f'human-R_charswap_{suffix}'] = ' '.join(np.where(mask, random_charswap(example), example))
        noisy[f'human-R_synonym_{suffix}'] = ' '.join(np.where(mask, wordswap(example), example))

        # Strategic fill: rank by annotation with POS-based back-fill.
        mask = _top_k_mask(ranked_strat_anno, k)
        noisy[f'human-S_token_{suffix}'] = ' '.join(np.where(mask, '{TOKEN}', example))
        noisy[f'human-S_charswap_{suffix}'] = ' '.join(np.where(mask, random_charswap(example), example))
        noisy[f'human-S_synonym_{suffix}'] = ' '.join(np.where(mask, wordswap(example), example))

    return noisy

In [5]:
def insert_human_noise(example):
    """Add human-annotation-guided perturbations and POS tags to one example.

    Args:
        example: mapping with a ``'text'`` string and an ``'annotations'``
            string of whitespace-separated numbers, one per whitespace token
            of ``'text'``.

    Returns:
        The example merged with the perturbed texts from
        ``create_human_masks`` and a ``'pos_tags'`` list.

    Side effect: sets the module-level ``pos_tags`` that
    ``create_human_masks`` and ``wordswap`` read.
    """
    global pos_tags
    tokens = example['text'].split()

    # Tag the whitespace tokens themselves. Downstream code indexes pos_tags
    # by whitespace-token position, so running nltk.word_tokenize on the raw
    # text (which may split or rewrite tokens differently) could shift the
    # tags relative to the tokens and annotations.
    pos_tags = [tag for _, tag in nltk.pos_tag(tokens)]
    pos = {'pos_tags': pos_tags}

    # Only the magnitude is used; per the original note the sign marks
    # positive vs. negative words in the source annotations.
    anno = np.abs(np.array(example['annotations'].split()).astype(float))

    noise = create_human_masks(tokens, anno)
    return example | noise | pos

In [6]:
def wordswap(iterable):
    """Map every token to the result of ``find_replacement`` for it.

    The POS tag for each token is looked up by position in the module-level
    ``pos_tags`` list.  Three module-level tallies are updated along the way:
    ``word_counter`` (tokens processed), ``no_match_counter`` and ``NO_MATCH``
    (count and set of ``(token, tag)`` pairs for which ``find_replacement``
    returned the token unchanged).

    Args:
        iterable: tokens of the current example.

    Returns:
        list with one (possibly unchanged) token per input token.
    """
    global NO_MATCH, no_match_counter, word_counter

    swapped = []
    for position, token in enumerate(iterable):
        word_counter += 1
        tag = pos_tags[position]
        candidate = find_replacement(token, tag)
        if candidate == token:
            # No substitute was produced; record it for later inspection.
            NO_MATCH.add((token, tag))
            no_match_counter += 1
        swapped.append(candidate)

    return swapped

In [7]:
# Run insert_human_noise on every example; the keys it returns (perturbed
# texts and 'pos_tags') become columns of the resulting dataset.
new_dataset = dataset.map(insert_human_noise)

                                                           

In [9]:
# Spot-check: POS tags stored for the first example.
new_dataset[0]['pos_tags']

['DT',
 'JJ',
 'NN',
 'NN',
 'WDT',
 'VBZ',
 'IN',
 'DT',
 'NNS',
 'IN',
 'DT',
 'JJ',
 'NN']