In [1]:
import nltk
from nltk.corpus import wordnet as wn
import random
from itertools import chain, compress
import numpy as np
import string
from datasets import Dataset, load_from_disk, DatasetDict
import json
from random import randint
import random


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load PPDB synonyms
with open('Resources/ppdb_synonyms.json') as json_file:
    clean_ppdb_synonyms = json.load(json_file)
    
with open('Resources/ppdb_synonyms_xxxl.json') as json_file:
    clean_ppdb_synonyms_XL = json.load(json_file)
    
with open('Resources/ppdb_synonyms_xxxl_nopostag.json') as json_file:
    clean_ppdb_synonyms_XXL = json.load(json_file)
    
    
from_path_dataset = "./Data/Clean/SST-2"


dataset = load_from_disk(from_path_dataset)['test'] #['test']

In [3]:
vocab = set()
twitter_ids = set()
for line in dataset['text']:
    for word in line.split():
        vocab.add(word.lower())
        if word[0] == '@':
            twitter_ids.add(word)

In [4]:
pos_tags = None

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return ''

def insert_char(string, index, chartoadd):
    return string[:index] + chartoadd + string[index:]

def swap_char(string, index, chartoadd):
    return string[:index] + chartoadd + string[index+1:]

def charinsert(iterable):
    return [insert_char(x, random.randint(0,len(x)), random.choice(string.ascii_letters)) for x in iterable]

def random_charswap(iterable):
    return [swap_char(x, random.randint(0,len(x)), random.choice(string.ascii_letters)) for x in iterable]

def realistic_charswap(iterable):
    return [butterfinger(x) for x in iterable]

def butterfinger(text,errors=1,keyboard='querty'):
    ### Adapted from https://github.com/alexyorke/butter-fingers/

    keyApprox = {}

    if keyboard == "querty": ## removed original word
        keyApprox['q'] = "wasedzx"
        keyApprox['w'] = "qesadrfcx"
        keyApprox['e'] = "wrsfdqazxcvgt"
        keyApprox['r'] = "etdgfwsxcvgt"
        keyApprox['t'] = "ryfhgedcvbnju"
        keyApprox['y'] = "tugjhrfvbnji"
        keyApprox['u'] = "yihkjtgbnmlo"
        keyApprox['i'] = "uojlkyhnmlp"
        keyApprox['o'] = "ipklujm"
        keyApprox['p'] = "lo['ik"

        keyApprox['a'] = "qszwxwdce"
        keyApprox['s'] = "wxadrfv"
        keyApprox['d'] = "ecsfaqgbv"
        keyApprox['f'] = "dgrvwsxyhn"
        keyApprox['g'] = "tbfhedcyjn"
        keyApprox['h'] = "yngjfrvkim"
        keyApprox['j'] = "hknugtblom"
        keyApprox['k'] = "jlinyhn"
        keyApprox['l'] = "okmpujn"

        keyApprox['z'] = "axsvde"
        keyApprox['x'] = "zcsdbvfrewq"
        keyApprox['c'] = "xvdfzswergb"
        keyApprox['v'] = "cfbgxdertyn"
        keyApprox['b'] = "vnghcftyun"
        keyApprox['n'] = "bmhjvgtuik"
        keyApprox['m'] = "nkjloik"
        keyApprox[' '] = " "
    else:
        print("Keyboard not supported.")
  
    if errors != 1:
        print("Can only make one error per text")
        return text
    
    error_idx = randint(0,len(text)) ### Choose a random letter in the text
    buttertext = ""
    for i,letter in enumerate(text):
        lcletter = letter.lower()
        if not lcletter in keyApprox.keys():
            newletter = lcletter
        else:
            if i == error_idx:
                newletter = random.choice(keyApprox[lcletter])
            else:
                newletter = lcletter
        # go back to original case
        if not lcletter == letter:
            newletter = newletter.upper()
        buttertext += newletter

    return buttertext

"""
Leet speak letter perturbation based on https://simple.wikipedia.org/wiki/Leet, excluding the space > 0.
"""

leet_letter_mappings = {
    "!": "1",
    "A": "4",
    "B": "8",
    "E": "3",
    "G": "6",
    "I": "1",
    "O": "0",
    "S": "5",
    "T": "7",
    "X": "8",
    "Z": "2",
    "a": "@",
    "b": "6",
    "e": "3",
    "g": "9",
    "h": "4",
    "i": "1",
    "l": "1",
    "o": "0",
    "s": "5",
    "t": "7",
    "z": "2"
}

def convert_to_leet(word):
    global leet_letter_mappings
    out = ""
    for l in word:
        if l in leet_letter_mappings.keys():
            out += leet_letter_mappings[l]
        else:
            out += l
    return out

def insert_leet(iterable):
    return [convert_to_leet(x) for x in iterable]


# def wordswap(iterable):
#     global pos_tags
#     out = []
#     for i,x in enumerate(iterable):
#         try:
#             out.append(random.choice ([ w.replace("_", "-") for w in list(chain.from_iterable([word.lemma_names() for word in wn.synsets(x, pos=get_wordnet_pos(pos_tags[i]))])) if w != x]))  #pos=wn.VERB
#             # First add more synsets here: openhownet? babelnet? article from pietro?
#         except IndexError:
#             # Get list of appropriate twitter aliases?
#             # Get list of punctuation
#             # Replace numbers?
#             # Another determiner?
#             out.append(x)
#     return out

def obscure_less(mask, to_remove):
    old_masked = np.array(list(compress(range(len(mask)), mask)))

    try:
        removed = np.random.choice(old_masked, size=to_remove, replace=False)
    except ValueError:
        return old_masked
        
    if len(removed) > 0:
        total_to_mask = np.setdiff1d(old_masked, removed)
        
        # Create new mask
        new_mask = np.zeros(len(mask), dtype=int)
        new_mask[total_to_mask] = 1
        new_mask.astype(bool)
    
        return new_mask
    else:
        print("OOOPS")
        return old_masked
    



In [5]:
import names

NO_MATCH = set()
no_match_counter = 0
word_counter = 0
weird_cases = set()

import re


ones = {
    0: '', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six',
    7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten', 11: 'eleven', 12: 'twelve',
    13: 'thirteen', 14: 'fourteen', 15: 'fifteen', 16: 'sixteen',
    17: 'seventeen', 18: 'eighteen', 19: 'nineteen'}
tens = {
    2: 'twenty', 3: 'thirty', 4: 'forty', 5: 'fifty', 6: 'sixty',
    7: 'seventy', 8: 'eighty', 9: 'ninety'}
illions = {
    1: 'thousand', 2: 'million', 3: 'billion', 4: 'trillion', 5: 'quadrillion',
    6: 'quintillion', 7: 'sextillion', 8: 'septillion', 9: 'octillion',
    10: 'nonillion', 11: 'decillion'}


def say_number(i):
    """
    Convert an integer in to it's word representation.

    say_number(i: integer) -> string
    """
    if i < 0:
        return _join('negative', _say_number_pos(-i))
    if i == 0:
        return 'zero'
    return _say_number_pos(i)


def _say_number_pos(i):
    if i < 20:
        return ones[i]
    if i < 100:
        return _join(tens[i // 10], ones[i % 10])
    if i < 1000:
        return _divide(i, 100, 'hundred')
    for illions_number, illions_name in illions.items():
        if i < 1000**(illions_number + 1):
            break
    return _divide(i, 1000**illions_number, illions_name)


def _divide(dividend, divisor, magnitude):
    return _join(
        _say_number_pos(dividend // divisor),
        magnitude,
        _say_number_pos(dividend % divisor),
    )


def _join(*args):
    return '-'.join(filter(bool, args))

def convert_to_case(old, new):
    global weird_cases
    if old.isupper():
        return new.upper()
    if old.islower():
        return new.lower()
    if old.istitle():
        return new.title()
    weird_cases.add(old)
    return new

def random_wordswap(iterable):
    return [ find_random(x) for x in iterable]

def find_random(word):
    global vocab
    return convert_to_case(word, random.choice([w for w in vocab if w != word.lower()]))

def find_replacement(word, pos=''):
    global twitter_ids
    
    # Get list of appropriate twitter aliases? and names?
    # Get list of punctuation
    quotes = [ "'", "''", "`", "``", '"']
    brackets = ["(", ")", "{", "}", "[", "]", '/']
    punct = [ '.', '!', '?', ',']
    breaks = ['-', '--', ',', ':', ';']
    
    if word[:12] == "http://t.co/" : ##URL: 
        return  word[:-8] + ''.join(random.choice(string.ascii_letters + string.digits) for i in range(8))
    
    if word[:13] == "https://t.co/" : ##URL:
        return  word[:-8] + ''.join(random.choice(string.ascii_letters + string.digits) for i in range(8))
    
    if word[0] == '#':
        return('#' + find_replacement(word[1:]))
    
    if word[0] == '@': # twitter Id
        return random.choice([t for t in twitter_ids if t != word])
    
    if pos == 'DT':
        dets = ['a', 'an', 'the', 'this', 'that']
        return convert_to_case(word, random.choice([d for d in dets if d != word.lower()]))
    
    if pos == 'WDT':
        wdts = ['that', 'what', 'whatever', 'which', 'whichever']
        return convert_to_case(word, random.choice([d for d in wdts if d != word.lower()]))
    
    # if pos == 'PRP$':
    #     prps = ['her', 'his', 'mine', 'my', 'our', 'ours', 'their', 'your']
    #     return random.choice([d for d in prps if d != word.lower()])
    
    # if pos == 'PRP':
    #     prps = ['hers', 'herself', 'him', 'himself', 'hisself', 'it', 'itself', 'me', 'myself', 'one', 'oneself', 'ours', 'ourselves', 'ownself', 'she', 'theirs', 'them', 'themselves', 'they', 'us']
        
    if pos == 'NNP': # Proper noun
        if word[-2:] == "'s":
            return convert_to_case(word[:-2], random.choice([names.get_first_name(), names.get_last_name()])) + "'s'"
        else:
            return convert_to_case(word, random.choice([names.get_first_name(), names.get_last_name()]))
    
    
    if word in quotes:
        return random.choice([d for d in quotes if d != word.lower()])
    
    if word in brackets:
        return random.choice([d for d in brackets if d != word.lower()])
    
    if word in punct:
        return random.choice([d for d in punct if d != word.lower()])
    
    if word in breaks:
        return random.choice([d for d in breaks if d != word.lower()])
    
    if word.isnumeric():
        return say_number(int(word))
        
    # Collect wordnet synonyms
    options_wn = [ w.replace("_", "-") for w in list(chain.from_iterable([syn.lemma_names() for syn in wn.synsets(word.lower(), pos=get_wordnet_pos(pos))])) if w != word]
    
    if options_wn == []:
        options_wn = [ w.replace("_", "-") for w in list(chain.from_iterable([syn.lemma_names() for syn in wn.synsets(word.lower())])) if w != word]
    
    # Collect synonyms from PPDB
    if pos != '':
        try:
            options_ppdb = clean_ppdb_synonyms[word.lower()][pos]
        except KeyError:
            options_ppdb = []
    else:
        try:
            options_ppdb = clean_ppdb_synonyms_XXL[word.lower()]
        except KeyError:
            options_ppdb = []
            
    # Babelnet?? -- REQ PYTHON 3.8
        
    full_set = options_wn + options_ppdb
    
    try:
        return convert_to_case(word,random.choice(full_set))
        
    except IndexError:
        
        if word[-1] in breaks:
            return find_replacement(word[:-1]) + word[-1]
            
        if word[0] in breaks:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] in punct:
            return find_replacement(word[:-1]) + word[-1]
                    
        if word[0] in punct:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] in quotes:
            return find_replacement(word[:-1]) + word[-1]
            
        if word[0] in quotes:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] in brackets:
            return find_replacement(word[:-1]) + word[-1]
            
        if word[0] in brackets:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] == "%":
            return find_replacement(word[:-1]) + '%'
        
        ### Try to parse by hyphens
        if '-' in word:
            parts = word.split('-')
            for i,p in enumerate(parts):
                n = find_replacement(p)
                if n != p:
                    return '-'.join(parts[:i] + [n] + parts[i+1:])

                
        if '/' in word:
            parts = word.split('/')
            for i,p in enumerate(parts):
                n = find_replacement(p)
                if n != p:
                    return '/'.join(parts[:i] + [n] + parts[i+1:])
                
        if '.' in word:
            parts = word.split('.')
            for i,p in enumerate(parts):
                if p != '':
                    n = find_replacement(p)
                    if n != p:
                        return '.'.join(parts[:i] + [n] + parts[i+1:])
    

        
        # #### Try to parse TextLikeThis
        # if len(word) > 1 and not word.isupper() and any(ele.isupper() for ele in word):
        #     parts = re.findall('[a-zA-Z][^A-Z]*', word)
        #     for i,p in enumerate(parts):
        #         if p != '':
        #             n = find_replacement(p)
        #             if n != p:
        #                 return ''.join(parts[:i] + [n] + parts[i+1:])
        
        ## check less good fits
        try:
            return convert_to_case(word,random.choice(clean_ppdb_synonyms_XL[word.lower()][pos]))
        except (KeyError, IndexError):
            try:
                return convert_to_case(word,random.choice(clean_ppdb_synonyms_XXL[word.lower()]))
            except ( KeyError, IndexError) :
                if word[-3:] == 'ish':
                    return find_replacement(word[:-3]) + 'ish'
                if word[-4:] == 'ness':
                    return find_replacement(word[:-4]) + 'ness'
                if word[-4:] == 'less':
                    return find_replacement(word[:-4]) + 'less'
                # if word.istitle(): # implies proper noun
                #     return random.choice([names.get_first_name(), names.get_last_name()])
                return word
            
        
    

def wordswap(iterable):
    global pos_tags
    global NO_MATCH
    global no_match_counter
    global word_counter
    out = []
    for i,x in enumerate(iterable):
        word_counter += 1
        new = find_replacement(x, pos_tags[i])
        if new == x:
            NO_MATCH.add((x, pos_tags[i]))
            no_match_counter += 1
        out.append(new)
                
    return out
        
        
def create_random_masks(example):
    masks = {}
    noisy = {}

    ### Start with everything
    prop=1
    mask = np.ones(len(example), dtype=int)
    mask.astype(bool)
    updated_lasttime= True

    for additional in [.05]: #,.05,.1,.1,.2,.25]:
        prop -= additional
        if not updated_lasttime:
            amt = old + additional
        else:
            amt = additional
            
        to_remove = round(amt * len(mask))
        
        if to_remove == 0:
            updated_lasttime = False
            old = additional
        else:
            updated_lasttime = True
            mask = obscure_less(mask, to_remove)
            
        # masks[f'random_{prop*100:2.0f}'] = mask
        # noisy[f'random_token_{prop*100:2.0f}'] = ' '.join(np.where(mask, '{TOKEN}' , example))    
        # noisy[f'random_charswap_{prop*100:2.0f}'] = ' '.join(np.where(mask, charswap(example) , example))    
        noisy[f'random_synonym_{prop*100:2.0f}'] = ' '.join(np.where(mask, wordswap(example) , example))    
    
    return noisy


def match_pos_token_to_original(pos_tokens, raw_orig, pos_tags):
    orig = []
    for word in raw_orig:
        if word != "":
            orig.append(word.strip())
    pos_idx = 0
    last_pos_idx = 1
    orig_idx = 0
    
    orig_to_pos_mapping = {}
    orig_idx2token = {}

    while pos_idx < len(pos_tokens) and orig_idx < len(orig):

        current_orig = orig[orig_idx]
        current_pos = pos_tokens[pos_idx]
        orig_to_pos_mapping[orig_idx] = [pos_idx]
        
        pos_idx += 1
        orig_idx2token[orig_idx] = current_orig
        if current_pos != current_orig:			
            combined = current_pos
            last_pos_idx = pos_idx
            while last_pos_idx < len(pos_tokens):
                next_part = pos_tokens[last_pos_idx]				
                combined += next_part
                orig_to_pos_mapping[orig_idx].append(last_pos_idx)
                if combined == current_orig:					
                    pos_idx = last_pos_idx + 1
                    break
                else:
                    last_pos_idx += 1

        orig_idx += 1
        
    pos_to_drop = ["$", '', "(", ")", ",", "#", "POS", "--", ".", ":", "''", '``']

    new_pos_tags = []
    for k in orig_to_pos_mapping.keys():
        if len(orig_to_pos_mapping[k]) == 1:
            new_pos_tags.append(pos_tags[orig_to_pos_mapping[k][0]])
        else:
            to_add = []
            for i in orig_to_pos_mapping[k]:
                if pos_tags[i] not in pos_to_drop:
                    to_add.append(pos_tags[i])
            if len(to_add) == 1:
                new_pos_tags.append(to_add[0])
            else:
                new_pos_tags.append('')
    
    return new_pos_tags


def insert_random_noise(example):
    global pos_tags
    global tokens
    nltk_tokens = nltk.word_tokenize(example['text'])
    pos_tags = [val[1] for val in nltk.pos_tag(nltk_tokens)]
    
    tokens = example['text'].split()
    
    pos_tags = match_pos_token_to_original(nltk_tokens, tokens, pos_tags)
        
    noise = create_random_masks(tokens)
    return noise #example | noise


In [6]:
def create_random_masks(example):
    masks = {}
    noisy = {}

    ### Start with everything
    prop=1
    mask = np.ones(len(example), dtype=int)
    mask.astype(bool)
    updated_lasttime= True

    for additional in [.05,.05,.1,.1,.2,.25]:
        prop -= additional
        if not updated_lasttime:
            amt = old + additional
        else:
            amt = additional
            
        to_remove = round(amt * len(mask))
        
        if to_remove == 0:
            updated_lasttime = False
            old = additional
        else:
            updated_lasttime = True
            mask = obscure_less(mask, to_remove)
            
        masks[f'random_{prop*100:2.0f}'] = mask
        noisy[f'random_token_{prop*100:2.0f}'] = ' '.join(np.where(mask, '{TOKEN}' , example))    
        noisy[f'random_charswap_{prop*100:2.0f}'] = ' '.join(np.where(mask, random_charswap(example) , example))    
        noisy[f'random_synonym_{prop*100:2.0f}'] = ' '.join(np.where(mask, wordswap(example) , example))
        noisy[f'random_butterfingers_{prop*100:2.0f}'] = ' '.join(np.where(mask, realistic_charswap(example) , example))
        noisy[f'random_wordswap_{prop*100:2.0f}'] = ' '.join(np.where(mask, random_wordswap(example) , example))
        noisy[f'random_charinsert_{prop*100:2.0f}'] = ' '.join(np.where(mask, charinsert(example) , example))
        noisy[f'random_l33t_{prop*100:2.0f}'] = ' '.join(np.where(mask, insert_leet(example) , example))    
    
    return noisy


# def insert_random_noise(example):
#     global pos_tags
#     nltk_tokens = nltk.word_tokenize(example['sentence'])
#     pos_tags = [val[1] for val in nltk.pos_tag(nltk_tokens)]
    
#     tokens = example['sentence'].split()
    
#     noise = create_random_masks(tokens)
#     return example | noise

def insert_random_noise(example):
    global pos_tags
    nltk_tokens = nltk.word_tokenize(example['text'])
    pos_tags = [val[1] for val in nltk.pos_tag(nltk_tokens)]
    
    tokens = example['text'].split()
    
    noise = create_random_masks(tokens)
    return example | noise


In [7]:
text = { 'text': 'it is an amazingly good movie. Watch it tonight!!!!'}

insert_random_noise(text)


{'text': 'it is an amazingly good movie. Watch it tonight!!!!',
 'random_token_95': '{TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN}',
 'random_charswap_95': 'kt Ts anG ahazingly geod moFie. Waxch itC toVight!!!!',
 'random_synonym_95': 'information-technology be a astonishingly honorable flick. Determine information-technology wednesdays!!!!',
 'random_butterfingers_95': 'it ms an amazingly gomd movie. Wanch ic tonight!!!!',
 'random_wordswap_95': 'their touches beguiling get whose stretches Community art ski',
 'random_charinsert_95': 'itG Ris van amahzingly golod moevie. Watich iet tonight!!!Q!',
 'random_l33t_95': '17 15 @n @m@21n91y 900d m0v13. W@7c4 17 70n19471111',
 'random_token_90': '{TOKEN} {TOKEN} an {TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN} {TOKEN}',
 'random_charswap_90': 'Qt Ks an amazinHly goMd movin. Wabch iW tonightg!!!',
 'random_synonym_90': 'it make-up an stunningly proficient moving-picture-show. Determine information-technology wednesdays!!!!

In [19]:
test = dataset.map(insert_random_noise)


Map:   0%|          | 0/63 [00:00<?, ? examples/s]


TypeError: unsupported operand type(s) for |: 'LazyRow' and 'dict'

In [83]:
# test

In [84]:
# out_path_dataset = "./Data/Noise/SST-2/"

# dataset.cache_files

# full_dataset = DatasetDict({
#     'test': test,})

# dataset.save_to_disk(out_path_dataset)

[]

### Debugging

In [86]:
b.id

NameError: name 'b' is not defined

In [58]:
import babelnet as bn
from babelnet.language import Language
from babelnet.pos import POS

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return POS.ADJ
    elif treebank_tag.startswith('V'):
        return POS.VERB
    elif treebank_tag.startswith('N'):
        return POS.NOUN
    elif treebank_tag.startswith('R'):
        return POS.ADV
    else:
        return ''

byl = bn.get_synsets('metaphor', from_langs=[Language.EN], poses=[POS.NOUN])
senses = set()
for b in byl:
    by = bn.get_synset(bn.BabelSynsetID(str(b.id)))
    senses.add(by.main_sense(Language.EN).full_lemma)
    for edge in by.outgoing_edges(): #bn.data.relation.BabelPointer.SIMILAR_TO
        # print(edge.pointer)
        if 'similar' in str(edge.pointer):
            senses.add(bn.get_synset(edge.id_target).main_sense(Language.EN).full_lemma)
            
print(senses)

{'Metaphor_(designers)', 'metaphor'}


In [65]:
byl = bn.get_synsets('metaphor', from_langs=[Language.EN], poses=[POS.NOUN])
senses = set()
for b in byl:
    by = bn.get_senses(bn.BabelSynsetID(str(b.id)))
    for s in by:
        print(bn.get_synset(s.id))
    print(by)

ValueError: The arguments have to be homogeneous.

In [62]:
byl = bn.get_synsets('metaphor', from_langs=[Language.EN], poses=[POS.NOUN])
senses = set()
for b in byl:
    by = bn.get_synset(bn.BabelSynsetID(str(b.id)))
    senses.add(by.main_sense(Language.EN).full_lemma)
    if 'hyponym' in str(edge.pointer) or 'hypernym' in str(edge.pointer):
        senses.add(bn.get_synset(edge.id_target).main_sense(Language.EN).full_lemma)
        
senses

{'Metaphor_(designers)', 'metaphor'}

In [26]:
test = bn.get_synsets('poignant', from_langs=[Language.EN], poses=[POS.ADJ])[0]
for edge in test.outgoing_edges():
    print(edge.pointer, bn.get_synset(edge.id_target).main_sense(Language.EN).full_lemma)

similar moving
derivation pathos
similar_to moving
derivationally_related_form pathos
gloss_related_form_(monosemous) affect
gloss_related_form_(disambiguated) affect


In [45]:
by = bn.get_synset(bn.BabelSynsetID('bn:00108734a'))
for edge in by.outgoing_edges(bn.data.relation.BabelPointer.SIMILAR_TO):
    print(str(by.id) + '\t' + by.main_sense(Language.EN).full_lemma,
          edge.pointer, edge.id_target, bn.get_synset(edge.id_target).main_sense(Language.EN).full_lemma, sep=' - ')

bn:00108734a	poignant - similar_to - bn:00108168a - painful


In [14]:
list(chain.from_iterable([syn.lemma_names() for syn in wn.synsets('poignant', pos=wn.ADJ)]))

['affecting', 'poignant', 'touching', 'poignant']

In [73]:
w = 'eccentricity'

for s in bn.get_senses(w, from_langs=[Language.EN], poses=[POS.NOUN]):
    if w not in str(s.lemma).lower():
        print(s.lemma)
        
# bn.get_synsets(w, from_langs=[Language.EN], poses=[POS.NOUN])
    


eccentric
eccentric
eccentric
elongated_orbit
orbital_eccentricities
parabolic_and_hyperbolic_orbits
kookiness
quirkiness
Eccentrism
eccentric_behaviour
eccentrics
quirkiness
Quirky
Quirky.com
Undercurrent
Wackiness
Wacko
Wacky
distance
distance
geodetic_distance
diameter
diameter
geodesic_distance
graph_diameter
graph_metric
Graph_radius
pseudo-peripheral_vertex
radius


# Synonym Replacement

In [97]:
import names

NO_MATCH = set()
no_match_counter = 0
word_counter = 0
weird_cases = set()

import re


ones = {
    0: '', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six',
    7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten', 11: 'eleven', 12: 'twelve',
    13: 'thirteen', 14: 'fourteen', 15: 'fifteen', 16: 'sixteen',
    17: 'seventeen', 18: 'eighteen', 19: 'nineteen'}
tens = {
    2: 'twenty', 3: 'thirty', 4: 'forty', 5: 'fifty', 6: 'sixty',
    7: 'seventy', 8: 'eighty', 9: 'ninety'}
illions = {
    1: 'thousand', 2: 'million', 3: 'billion', 4: 'trillion', 5: 'quadrillion',
    6: 'quintillion', 7: 'sextillion', 8: 'septillion', 9: 'octillion',
    10: 'nonillion', 11: 'decillion'}


def say_number(i):
    """
    Convert an integer in to it's word representation.

    say_number(i: integer) -> string
    """
    if i < 0:
        return _join('negative', _say_number_pos(-i))
    if i == 0:
        return 'zero'
    return _say_number_pos(i)


def _say_number_pos(i):
    if i < 20:
        return ones[i]
    if i < 100:
        return _join(tens[i // 10], ones[i % 10])
    if i < 1000:
        return _divide(i, 100, 'hundred')
    for illions_number, illions_name in illions.items():
        if i < 1000**(illions_number + 1):
            break
    return _divide(i, 1000**illions_number, illions_name)


def _divide(dividend, divisor, magnitude):
    return _join(
        _say_number_pos(dividend // divisor),
        magnitude,
        _say_number_pos(dividend % divisor),
    )


def _join(*args):
    return '-'.join(filter(bool, args))

def convert_to_case(old, new):
    global weird_cases
    if old.isupper():
        return new.upper()
    if old.islower():
        return new.lower()
    if old.istitle():
        return new.title()
    weird_cases.add(old)
    return new

def random_wordswap(iterable):
    return [ find_random(x) for x in iterable]

def find_random(word):
    global vocab
    return convert_to_case(word, random.choice([w for w in vocab if w != word.lower()]))

def find_replacement(word, pos=''):
    global twitter_ids
    
    # Get list of appropriate twitter aliases? and names?
    # Get list of punctuation
    quotes = [ "'", "''", "`", "``", '"']
    brackets = ["(", ")", "{", "}", "[", "]", '/']
    punct = [ '.', '!', '?', ',']
    breaks = ['-', '--', ',', ':', ';']
    
    if word[:12] == "http://t.co/" : ##URL: 
        return  word[:-8] + ''.join(random.choice(string.ascii_letters + string.digits) for i in range(8))
    
    if word[:13] == "https://t.co/" : ##URL:
        return  word[:-8] + ''.join(random.choice(string.ascii_letters + string.digits) for i in range(8))
    
    if word[0] == '#':
        return('#' + find_replacement(word[1:]))
    
    if word[0] == '@': # twitter Id
        return random.choice([t for t in twitter_ids if t != word])
    
    if pos == 'DT':
        dets = ['a', 'an', 'the', 'this', 'that']
        return convert_to_case(word, random.choice([d for d in dets if d != word.lower()]))
    
    if pos == 'WDT':
        wdts = ['that', 'what', 'whatever', 'which', 'whichever']
        return convert_to_case(word, random.choice([d for d in wdts if d != word.lower()]))
    
    # if pos == 'PRP$':
    #     prps = ['her', 'his', 'mine', 'my', 'our', 'ours', 'their', 'your']
    #     return random.choice([d for d in prps if d != word.lower()])
    
    # if pos == 'PRP':
    #     prps = ['hers', 'herself', 'him', 'himself', 'hisself', 'it', 'itself', 'me', 'myself', 'one', 'oneself', 'ours', 'ourselves', 'ownself', 'she', 'theirs', 'them', 'themselves', 'they', 'us']
        
    if pos == 'NNP': # Proper noun
        if word[-2:] == "'s":
            return convert_to_case(word[:-2], random.choice([names.get_first_name(), names.get_last_name()])) + "'s'"
        else:
            return convert_to_case(word, random.choice([names.get_first_name(), names.get_last_name()]))
    
    
    if word in quotes:
        return random.choice([d for d in quotes if d != word.lower()])
    
    if word in brackets:
        return random.choice([d for d in brackets if d != word.lower()])
    
    if word in punct:
        return random.choice([d for d in punct if d != word.lower()])
    
    if word in breaks:
        return random.choice([d for d in breaks if d != word.lower()])
    
    if word.isnumeric():
        return say_number(int(word))
        
    # Collect wordnet synonyms
    options_wn = [ w.replace("_", "-") for w in list(chain.from_iterable([syn.lemma_names() for syn in wn.synsets(word.lower(), pos=get_wordnet_pos(pos))])) if w != word]
    
    if options_wn == []:
        options_wn = [ w.replace("_", "-") for w in list(chain.from_iterable([syn.lemma_names() for syn in wn.synsets(word.lower())])) if w != word]
    
    # Collect synonyms from PPDB
    if pos != '':
        try:
            options_ppdb = clean_ppdb_synonyms[word.lower()][pos]
        except KeyError:
            options_ppdb = []
    else:
        try:
            options_ppdb = clean_ppdb_synonyms_XXL[word.lower()]
        except KeyError:
            options_ppdb = []
            
    # Babelnet?? -- REQ PYTHON 3.8
        
    full_set = options_wn + options_ppdb
    
    try:
        return convert_to_case(word,random.choice(full_set))
        
    except IndexError:
        
        if word[-1] in breaks:
            return find_replacement(word[:-1]) + word[-1]
            
        if word[0] in breaks:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] in punct:
            return find_replacement(word[:-1]) + word[-1]
                    
        if word[0] in punct:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] in quotes:
            return find_replacement(word[:-1]) + word[-1]
            
        if word[0] in quotes:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] in brackets:
            return find_replacement(word[:-1]) + word[-1]
            
        if word[0] in brackets:
            return word[0] + find_replacement(word[1:])
        
        if word[-1] == "%":
            return find_replacement(word[:-1]) + '%'
        
        ### Try to parse by hyphens
        if '-' in word:
            parts = word.split('-')
            for i,p in enumerate(parts):
                n = find_replacement(p)
                if n != p:
                    return '-'.join(parts[:i] + [n] + parts[i+1:])

                
        if '/' in word:
            parts = word.split('/')
            for i,p in enumerate(parts):
                n = find_replacement(p)
                if n != p:
                    return '/'.join(parts[:i] + [n] + parts[i+1:])
                
        if '.' in word:
            parts = word.split('.')
            for i,p in enumerate(parts):
                if p != '':
                    n = find_replacement(p)
                    if n != p:
                        return '.'.join(parts[:i] + [n] + parts[i+1:])
    

        
        # #### Try to parse TextLikeThis
        # if len(word) > 1 and not word.isupper() and any(ele.isupper() for ele in word):
        #     parts = re.findall('[a-zA-Z][^A-Z]*', word)
        #     for i,p in enumerate(parts):
        #         if p != '':
        #             n = find_replacement(p)
        #             if n != p:
        #                 return ''.join(parts[:i] + [n] + parts[i+1:])
        
        ## check less good fits
        try:
            return convert_to_case(word,random.choice(clean_ppdb_synonyms_XL[word.lower()][pos]))
        except (KeyError, IndexError):
            try:
                return convert_to_case(word,random.choice(clean_ppdb_synonyms_XXL[word.lower()]))
            except ( KeyError, IndexError) :
                if word[-3:] == 'ish':
                    return find_replacement(word[:-3]) + 'ish'
                if word[-4:] == 'ness':
                    return find_replacement(word[:-4]) + 'ness'
                if word[-4:] == 'less':
                    return find_replacement(word[:-4]) + 'less'
                # if word.istitle(): # implies proper noun
                #     return random.choice([names.get_first_name(), names.get_last_name()])
                return word
            
        
    

def wordswap(iterable):
    global pos_tags
    global NO_MATCH
    global no_match_counter
    global word_counter
    out = []
    for i,x in enumerate(iterable):
        word_counter += 1
        new = find_replacement(x, pos_tags[i])
        if new == x:
            NO_MATCH.add((x, pos_tags[i]))
            no_match_counter += 1
        out.append(new)
                
    return out
        
        
def create_random_masks(example):
    masks = {}
    noisy = {}

    ### Start with everything
    prop=1
    mask = np.ones(len(example), dtype=int)
    mask.astype(bool)
    updated_lasttime= True

    for additional in [.05]: #,.05,.1,.1,.2,.25]:
        prop -= additional
        if not updated_lasttime:
            amt = old + additional
        else:
            amt = additional
            
        to_remove = round(amt * len(mask))
        
        if to_remove == 0:
            updated_lasttime = False
            old = additional
        else:
            updated_lasttime = True
            mask = obscure_less(mask, to_remove)
            
        # masks[f'random_{prop*100:2.0f}'] = mask
        # noisy[f'random_token_{prop*100:2.0f}'] = ' '.join(np.where(mask, '{TOKEN}' , example))    
        # noisy[f'random_charswap_{prop*100:2.0f}'] = ' '.join(np.where(mask, charswap(example) , example))    
        noisy[f'random_synonym_{prop*100:2.0f}'] = ' '.join(np.where(mask, wordswap(example) , example))    
    
    return noisy


def match_pos_token_to_original(pos_tokens, raw_orig, pos_tags):
    orig = []
    for word in raw_orig:
        if word != "":
            orig.append(word.strip())
    pos_idx = 0
    last_pos_idx = 1
    orig_idx = 0
    
    orig_to_pos_mapping = {}
    orig_idx2token = {}

    while pos_idx < len(pos_tokens) and orig_idx < len(orig):

        current_orig = orig[orig_idx]
        current_pos = pos_tokens[pos_idx]
        orig_to_pos_mapping[orig_idx] = [pos_idx]
        
        pos_idx += 1
        orig_idx2token[orig_idx] = current_orig
        if current_pos != current_orig:			
            combined = current_pos
            last_pos_idx = pos_idx
            while last_pos_idx < len(pos_tokens):
                next_part = pos_tokens[last_pos_idx]				
                combined += next_part
                orig_to_pos_mapping[orig_idx].append(last_pos_idx)
                if combined == current_orig:					
                    pos_idx = last_pos_idx + 1
                    break
                else:
                    last_pos_idx += 1

        orig_idx += 1
        
    pos_to_drop = ["$", '', "(", ")", ",", "#", "POS", "--", ".", ":", "''", '``']

    new_pos_tags = []
    for k in orig_to_pos_mapping.keys():
        if len(orig_to_pos_mapping[k]) == 1:
            new_pos_tags.append(pos_tags[orig_to_pos_mapping[k][0]])
        else:
            to_add = []
            for i in orig_to_pos_mapping[k]:
                if pos_tags[i] not in pos_to_drop:
                    to_add.append(pos_tags[i])
            if len(to_add) == 1:
                new_pos_tags.append(to_add[0])
            else:
                new_pos_tags.append('')
    
    return new_pos_tags


def insert_random_noise(example):
    global pos_tags
    global tokens
    nltk_tokens = nltk.word_tokenize(example['text'])
    pos_tags = [val[1] for val in nltk.pos_tag(nltk_tokens)]
    
    tokens = example['text'].split()
    
    pos_tags = match_pos_token_to_original(nltk_tokens, tokens, pos_tags)
        
    noise = create_random_masks(tokens)
    return noise #example | noise


In [98]:
out = dataset.map(insert_random_noise)

print(no_match_counter / word_counter)


Map: 100%|██████████| 63/63 [00:00<00:00, 257.25 examples/s]

0.16187739463601533





In [1]:
[print(n[0]) for n in NO_MATCH]

NameError: name 'NO_MATCH' is not defined

# Debugging

In [79]:
weird_cases

{"'InternationalDirectionerDay'",
 '+',
 '..',
 '0-0',
 '8:00',
 'BusinessWeek',
 'DiViNetworks',
 'ICONiac',
 'ICONiacz',
 'McBain',
 'PSSAs',
 'PSSAs!',
 'RT/follow',
 'RamCharan',
 'WorldStarHipHop',
 '|'}

In [20]:
out['random_synonym_95']

['ACE scarce trust Edmund Bennett be beginning tabu of that excerpt car this-evening #terrierball #htafc #dontfancyArfieldingoal',
 "Follow this motion-picture day #The70''s' along #OVTV and be sunnily startled to watch Song Brooks (Bathroom, #OLTL) in it.",
 'ATOMIC-NUMBER-53 wish that @michaelaconlin: telecasting http://t.co/c2d35dek ERROL -- quaternary Live Zachary twenty-seven/31 -- We be + Wicks upward a she',
 "THE winning for #LFC five Olson testament see them pose oftheir outside full-point check for that overall of utmost mollify. And we take to gain tomorrow''s' ballgame.",
 '@pariahbeats thanksgiving! Laura the email to info@kayafestival.commanders.uk consider booking. Pick-Up uyou fetch Smith Noe this-evening , flummox striation!',
 'Non an Arsehole hater only pleased to go-steady them add-up backward from Langley pile last-place nighttime v Leray to acquire zero-0. Wilda show hangingwith a Harry Luke as-well.',
 "Thither be even-so just-the-ticket uncommitted for a Nagle n

In [51]:
text = "Watched a movie yesterday #The70's on #OVTV and was 'pleasantly' surprised-- to see Michael Easton (John, #OLTL) in it."
tokens = text.split()
nltk_tokens = nltk.word_tokenize(text)
pos_tags = [val[1] for val in nltk.pos_tag(nltk_tokens)]
orig_to_pos_mapping, _ = match_pos_token_to_original(nltk_tokens, tokens)




In [46]:
pos_dict = dict(enumerate(pos_tags))
orig_to_pos_mapping.map(pos_dict)

AttributeError: 'dict' object has no attribute 'map'

In [58]:
pos_to_drop = ["$", '', "(", ")", ",", "#", "POS", "--", ".", ":", "''"]

new_pos_tags = []
for k in orig_to_pos_mapping.keys():
    if len(orig_to_pos_mapping[k]) == 1:
        new_pos_tags.append(pos_tags[orig_to_pos_mapping[k][0]])
    else:
        to_add = []
        for i in orig_to_pos_mapping[k]:
            if pos_tags[i] not in pos_to_drop:
                to_add.append(pos_tags[i])
        if len(to_add) == 1:
            new_pos_tags.append(to_add[0])
        else:
            print(to_add)
            new_pos_tags.append('')


In [59]:
new_pos_tags

['VBD',
 'DT',
 'NN',
 'NN',
 'NNP',
 'IN',
 'NNP',
 'CC',
 'VBD',
 'RB',
 'JJ',
 'TO',
 'VB',
 'NNP',
 'NNP',
 'NNP',
 'NNP',
 'IN',
 'PRP']

In [None]:
model_name = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(f'./Tokenizers/Pretrained/{model_name}.pt', ## Saved locally
                                            do_lower_case = False ## Check if BERT is uncased
                                            )


collator = ClassificationCollator(tokenizer=tokenizer,
                                    text_label=text_label,
                                    idx_label=idx_label,
                                    test_only=True,
                                    max_seq_len=MAX_LENGTH)

if experiment_time == True:
    g = { 'TOKEN': tokenizer.unk_token }
    test_dataset = load_from_disk(f"./Data/Noise/{test_data}")['test'].format(**g)

In [None]:
def get_model_data(model_name, data_name, device, max_size=None, sequential=False, experiment_time = False):
    
    ####################### Determine Tokenizer rules #############################
    
        
    if data_name == 'SST-2':
        MAX_LENGTH = 64
        is_lower = True
    else:
        MAX_LENGTH = 256
        is_lower = False
        
    idx_label = "index"
    text_label = "text"
    test_data = data_name

        
    if "electra" in model_name: ## Must be lowercase
        is_lower = True
        
        
    tokenizer = AutoTokenizer.from_pretrained(f'./Tokenizers/Pretrained/{model_name}.pt', ## Saved locally
                                                do_lower_case = is_lower ## Check if BERT is uncased
                                                )
    
    if 'gpt2' in model_name:
        # default to left padding
        tokenizer.padding_side = "left"
        # Define PAD Token = EOS Token = 50256
        tokenizer.pad_token = tokenizer.eos_token
    
    collator = ClassificationCollator(tokenizer=tokenizer,
                                      text_label=text_label,
                                      idx_label=idx_label,
                                      test_only=True,
                                      max_seq_len=MAX_LENGTH)
    
    if experiment_time == True:
        g = { 'TOKEN': tokenizer.unk_token }
        test_dataset = load_from_disk(f"./Data/Noise/{test_data}")['test'].format(**g)

    else:

        ##### Get datasets and tokenize them
        if max_size:
            test_dataset = load_from_disk(f"./Data/Clean/{test_data}")['test'][:max_size]
            test_dataset = Dataset.from_dict(test_dataset)
        else:
            test_dataset = load_from_disk(f"./Data/Clean/{test_data}")['test']  

    print("Dataset size: ", test_dataset.num_rows)

    # Create the DataLoaders for our training and validation sets.
    # We'll take training samples in random order. 
    test_dataloader = DataLoader(
                test_dataset,  # The training samples.
                sampler = SequentialSampler(test_dataset) if sequential else RandomSampler(test_dataset), # Select batches randomly
                batch_size = 1, # Test with this batch size.
                collate_fn=collator
            )
    
    print("Loaded data: ", data_name)

    ########################### LOAD MODEL ####################################################
    
    if model_name.split('-')[0] == 'bert':
        
        model = BertForSequenceClassification.from_pretrained(
            f'./Models/Pretrained/{model_name}.pt', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification.
                            # You can increase this for multi-class tasks.   
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )
        model.load_state_dict(torch.load(f'./Models/{data_name}/bert.pt', map_location=device))
    elif model_name.split('-')[0] == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained(
            f'./Models/Pretrained/{model_name}.pt', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification.
                            # You can increase this for multi-class tasks.   
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )
        model.load_state_dict(torch.load(f'./Models/{data_name}/roberta.pt', map_location=device))    
    elif model_name.split('-')[0] == 'electra':
        model = ElectraForSequenceClassification.from_pretrained(
            f'./Models/Pretrained/{model_name}.pt', # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = 2, # The number of output labels--2 for binary classification.
                            # You can increase this for multi-class tasks.   
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )
        model.load_state_dict(torch.load(f'./Models/{data_name}/electra.pt', map_location=device))    
    elif 'gpt2' in model_name.split('-')[0]:
        
        model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=f"gpt2-medium", num_labels=2)
        model = GPT2ForSequenceClassification.from_pretrained(
            f'./Models/Pretrained/{model_name}.pt', # Use the 12-layer BERT model, with an uncased vocab.
            config = model_config
        )
        # resize model embedding to match new tokenizer
        model.resize_token_embeddings(len(tokenizer))

        # fix model padding token id
        model.config.pad_token_id = model.config.eos_token_id
        
    else:
        print("unaccepted model type. Received: ", model_name.split('-')[0])
        return -1
    
    model.to(device)
    
    print("Loaded model: ", model_name)
    return test_dataloader, model, tokenizer

In [93]:
nltk_tokens[5]

AttributeError: 'str' object has no attribute 'lemma'

In [95]:
nltk.pos_tag(nltk_tokens)

[('Sara', 'NNP'),
 ("'s", 'POS'),
 ('boyfriend', 'NN'),
 ('had', 'VBD'),
 ('a', 'DT'),
 ('lavish', 'JJ'),
 ('berry', 'NN'),
 ('birthday', 'NN'),
 ('party', 'NN'),
 ('last', 'JJ'),
 ('evening', 'NN')]

In [94]:
nltk_tokens

['Sara',
 "'s",
 'boyfriend',
 'had',
 'a',
 'lavish',
 'berry',
 'birthday',
 'party',
 'last',
 'evening']

In [91]:
%debug

> [0;32m/tmp/ipykernel_1806869/1313937350.py[0m(88)[0;36mcreate_random_masks[0;34m()[0m
[0;32m     86 [0;31m        [0mnoisy[0m[0;34m[[0m[0;34mf'random_token_{prop*100:2.0f}'[0m[0;34m][0m [0;34m=[0m [0;34m' '[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mwhere[0m[0;34m([0m[0mmask[0m[0;34m,[0m [0;34m'{TOKEN}'[0m [0;34m,[0m [0mexample[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     87 [0;31m        [0mnoisy[0m[0;34m[[0m[0;34mf'random_charswap_{prop*100:2.0f}'[0m[0;34m][0m [0;34m=[0m [0;34m' '[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mwhere[0m[0;34m([0m[0mmask[0m[0;34m,[0m [0mcharswap[0m[0;34m([0m[0mexample[0m[0;34m)[0m [0;34m,[0m [0mexample[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 88 [0;31m        [0mnoisy[0m[0;34m[[0m[0;34mf'random_synonym_{prop*100:2.0f}'[0m[0;34m][0m [0;34m=[0m [0;34m' '[0m[0;34m.[0m[0mjoin[0m[0;34m(

### PPDB Data

In [10]:
from nltk.corpus.reader.wordlist import MWAPPDBCorpusReader

ppdb = MWAPPDBCorpusReader(root='/home/fvd442/project/noise-paper/', fileids='ppdb-2.0-xxxl-lexical')

ppdb_synonyms = ppdb.words()

clean_ppdb_synonyms = {}

redo_lines = []
for line in ppdb_synonyms:
    try:
        l = line.split(' ||| ')
        if l[1].startswith('<'):
            continue
        if l[5] == 'Equivalence':
            l[0] = l[0][1:-1]
            if l[1] in clean_ppdb_synonyms.keys():
                if l[0] in clean_ppdb_synonyms[l[1]].keys():
                    clean_ppdb_synonyms[l[1]][l[0]].append(l[2])
                else:
                    clean_ppdb_synonyms[l[1]][l[0]] = [l[2]]
            else:
                clean_ppdb_synonyms[l[1]] = {l[0] : [l[2]]}
    except IndexError:
        redo_lines.append(line) ### ends up being irrelevant

KeyboardInterrupt: 

In [None]:
import json
with open("ppdb_synonyms.json", "w") as outfile:
    # json_data refers to the above JSON
    json.dump(clean_ppdb_synonyms, outfile)

In [None]:
# from nltk.corpus.reader.wordlist import MWAPPDBCorpusReader

# ppdb = MWAPPDBCorpusReader(root='/home/fvd442/project/noise-paper/', fileids='ppdb-2.0-xxxl-lexical')

# ppdb_synonyms = ppdb.words()

clean_ppdb_synonyms_XL = {}

redo_lines = []
for line in ppdb_synonyms:
    try:
        l = line.split(' ||| ')
        if l[1].startswith('<'):
            continue
        if l[5] in ['Equivalence', 'ForwardEntailment', 'ReverseEntailment']:
            l[0] = l[0][1:-1]
            if l[1] in clean_ppdb_synonyms_XL.keys():
                if l[0] in clean_ppdb_synonyms_XL[l[1]].keys():
                    clean_ppdb_synonyms_XL[l[1]][l[0]].append(l[2])
                else:
                    clean_ppdb_synonyms_XL[l[1]][l[0]] = [l[2]]
            else:
                clean_ppdb_synonyms_XL[l[1]] = {l[0] : [l[2]]}
    except IndexError:
        redo_lines.append(line) ### ends up being irrelevant
        
import json
with open("ppdb_synonyms_xxxl.json", "w") as outfile:
    # json_data refers to the above JSON
    json.dump(clean_ppdb_synonyms_XL, outfile)

In [None]:
clean_ppdb_synonyms_XXL = {}

redo_lines = []
for line in ppdb_synonyms:
    try:
        l = line.split(' ||| ')
        if l[1].startswith('<'):
            continue
        if l[5] in ['Equivalence', 'ForwardEntailment', 'ReverseEntailment']:
            if l[1] in clean_ppdb_synonyms_XXL.keys():
                clean_ppdb_synonyms_XXL[l[1]].add(l[2])
            else:
                clean_ppdb_synonyms_XXL[l[1]] = set([l[2]])
    except IndexError:
        redo_lines.append(line) ### ends up being irrelevant
        
for k in clean_ppdb_synonyms_XXL.keys():
    clean_ppdb_synonyms_XXL[k] = list(clean_ppdb_synonyms_XXL[k])

with open("ppdb_synonyms_xxxl_nopostag.json", "w") as outfile:
    # json_data refers to the above JSON
    json.dump(clean_ppdb_synonyms_XXL, outfile)