In [1]:
import json 

In [2]:
# Dataset root (relative to the working directory); each corpus has its own sub-directory.
data = "./../../data/"
reddit = f"{data}reddit/"
gab = f"{data}gab/"

In [3]:

def importData(path):
    """Load the train, test and validation JSON splits found under ``path``.

    Args:
        path: Prefix to which ``train.json``, ``test.json`` and ``val.json``
            are appended directly, so a directory path must end with a
            path separator.

    Returns:
        Tuple ``(train_data, test_data, val_data)`` of the parsed JSON contents.

    Raises:
        OSError: if one of the three files cannot be opened.
        json.JSONDecodeError: if a file does not contain valid JSON.
    """
    loaded = []
    for split in ("train", "test", "val"):
        # The context manager closes each file as soon as it has been parsed,
        # instead of leaving three handles open until garbage collection.
        with open(path + split + ".json") as handle:
            loaded.append(json.load(handle))
    train_data, test_data, val_data = loaded
    return train_data, test_data, val_data

In [4]:
# Load the three Reddit splits (train, test, val) from the reddit data directory.
reddit_train,reddit_test, reddit_val = importData(reddit)

In [5]:
# Load the three Gab splits (train, test, val) from the gab data directory.
gab_train,gab_test, gab_val = importData(gab)

In [6]:
import nltk
from string import punctuation

# Make sure the WordNet corpus is available: when nltk.data.find cannot locate
# it (LookupError), download it before importing the corpus reader.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
from nltk.corpus import wordnet as wn

# Maps the POS names accepted by the lookup helpers in this notebook to WordNet's POS constants.
WORDNET_POS = {'VERB': wn.VERB, 'NOUN': wn.NOUN, 'ADJ': wn.ADJ, 'ADV': wn.ADV}


def _get_info(lemma, pos, info_type):
    """Collect WordNet definitions or example sentences for the senses of ``lemma``.

    Args:
        lemma: Word to look up (callers join multi-word expressions with
            underscores before passing them in).
        pos: A key of ``WORDNET_POS`` ('VERB', 'NOUN', 'ADJ', 'ADV'), or
            ``None`` to search without a part-of-speech restriction.
        info_type: ``'examples'`` selects each synset's example sentences;
            any other value selects the synset's definition.

    Returns:
        dict mapping a sense key to the synset's examples (list) or definition
        (str). A synset is included only when one of its lemmas matches
        ``lemma`` case-insensitively or, when ``pos`` is given, matches one of
        the forms returned by ``wn._morphy``.

    Raises:
        KeyError: if ``pos`` is not ``None`` and not a key of ``WORDNET_POS``.
    """
    wn_pos = WORDNET_POS[pos] if pos is not None else None
    morphemes = wn._morphy(lemma, pos=wn_pos) if pos is not None else []
    wanted = lemma.lower()

    results = dict()
    # De-duplicate with dict.fromkeys rather than set(): a set iterates in
    # hash order, which made the order of the returned dict vary between runs.
    for synset in dict.fromkeys(wn.synsets(lemma, pos=wn_pos)):
        sense_key = None
        for candidate in synset.lemmas():
            candidate_name = candidate.name().lower()
            if candidate_name == wanted:
                # An exact match wins outright.
                sense_key = candidate.key()
                break
            elif candidate_name in morphemes:
                # Morphological match: remember it but keep scanning for an exact one.
                sense_key = candidate.key()
        if sense_key is not None:
            results[sense_key] = synset.examples() if info_type == 'examples' else synset.definition()

    return results


def get_glosses(lemma, pos):
    """Return ``{sense_key: definition}`` for the WordNet senses of ``lemma``.

    ``pos`` is a ``WORDNET_POS`` key, or ``None`` for no part-of-speech restriction.
    """
    glosses = _get_info(lemma, pos, 'gloss')
    return glosses


def get_example_sentences(lemma, pos):
    """Return ``{sense_key: example_sentences}`` for the WordNet senses of ``lemma``.

    ``pos`` is a ``WORDNET_POS`` key, or ``None`` for no part-of-speech restriction.
    """
    examples = _get_info(lemma, pos, 'examples')
    return examples


def get_all_wordnet_lemma_names():
    """Return a list of ``(pos_name, lemma_names)`` pairs, one per ``WORDNET_POS`` entry.

    ``lemma_names`` is whatever ``wn.all_lemma_names`` returns for that part of speech.
    """
    return [
        (pos_name, wn.all_lemma_names(pos=pos_tag))
        for pos_name, pos_tag in WORDNET_POS.items()
    ]


In [7]:
from nltk.corpus import stopwords

# Same availability guard the notebook uses for WordNet: without it,
# stopwords.words() raises LookupError on a machine that has never
# downloaded the corpus.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stop words as a set, for fast membership tests.
stop_words = set(stopwords.words('english'))  
def checkIfNounChunkInWordNet(nchunk):
    """Report whether a noun chunk, minus stop words, has any WordNet gloss.

    The chunk is lower-cased, split on whitespace, stripped of English stop
    words, re-joined with underscores and looked up with no part-of-speech
    restriction.

    Returns:
        ``True`` when the lookup yields at least one gloss, else ``False``.
    """
    content_tokens = [token for token in nchunk.lower().split() if token not in stop_words]
    lookup_key = "_".join(content_tokens)
    return bool(get_glosses(lookup_key, None))

In [38]:
def getNounChunks(sentence):
    """Return the text of each noun chunk in ``sentence``, in the order the parser yields them."""
    chunk_texts = []
    for chunk in nlp(sentence).noun_chunks:
        chunk_texts.append(str(chunk))
    return chunk_texts
    
def generateAmbigiousSentences(sentence):
    """Build one ``[TGT]``-marked copy of ``sentence`` per candidate target.

    The sentence is lower-cased first. Candidates are produced in this order:

    1. For each noun chunk: the whole chunk if it is found in WordNet,
       otherwise each of its words that is found in WordNet.
    2. Every remaining whitespace-separated token that is not punctuation,
       not a stop word and not part of any noun chunk.

    Args:
        sentence: Raw input sentence.

    Returns:
        list of str, each a copy of the lower-cased sentence with one target
        wrapped in ``[TGT] ... [TGT]``.
    """
    sentence = sentence.lower()
    nounChunks = getNounChunks(sentence)
    sentences = []
    for nounChunk in nounChunks:
        if checkIfNounChunkInWordNet(nounChunk):
            sentences.append(generateAmbigiousSentence(nounChunk, sentence))
        else:
            # The chunk as a whole is unknown: fall back to its individual words.
            for word in nounChunk.split():
                if checkIfNounChunkInWordNet(word):
                    sentences.append(generateAmbigiousSentence(word, sentence))

    # Only membership is tested, so a set replaces the former list.
    nounChunkWords = {word for chunk in nounChunks for word in chunk.split()}
    for word in sentence.split():
        # Skip tokens made up solely of punctuation characters. The previous
        # `word in punctuation` was a substring test against one string, so
        # tokens such as "..." or "?!" were not recognised as punctuation.
        if all(char in punctuation for char in word):
            continue
        if word not in stop_words and word not in nounChunkWords:
            sentences.append(generateAmbigiousSentence(word, sentence))
    return sentences

def generateAmbigiousSentence(word, sentence):
    """Wrap the first occurrence of ``word`` in ``sentence`` with ``[TGT]`` markers.

    An occurrence delimited by whitespace (or the string edges) is preferred,
    so a short word is not matched inside a longer one ("is" inside "this").
    If no such occurrence exists, e.g. a chunk directly followed by
    punctuation, the first plain substring occurrence is used.

    Args:
        word: Target word or multi-word chunk; inner whitespace of a
            multi-word target is replaced by underscores in the output.
        sentence: Sentence containing ``word``.

    Returns:
        The sentence with the target rewritten as ``[TGT] target [TGT]``,
        stripped of leading and trailing whitespace.

    Raises:
        ValueError: if ``word`` is empty or does not occur in ``sentence``.
    """
    import re  # local import keeps this cell independent of the import cells

    if not word:
        raise ValueError("empty separator")

    token_match = re.search(r"(?<!\S)" + re.escape(word) + r"(?!\S)", sentence)
    if token_match is not None:
        start, end = token_match.span()
    else:
        # No whitespace-delimited hit: use the first raw occurrence.
        # str.index raises ValueError when the word is absent.
        start = sentence.index(word)
        end = start + len(word)

    pieces = word.split()
    if len(pieces) > 1:
        word = "_".join(pieces)
    marked = sentence[:start] + "[TGT] " + word + " [TGT]" + sentence[end:]
    return marked.strip()

                

In [15]:
import en_core_web_sm
# Load the en_core_web_sm pipeline into the global `nlp` object that getNounChunks calls.
nlp = en_core_web_sm.load()

In [51]:
from tqdm import tqdm
def generateAmbFromData(data_source):
    """Attach target-marked sentence variants to every record of a dataset.

    Args:
        data_source: Iterable of dict records, each with at least a
            ``'source'`` sentence and a ``'target'`` value.

    Returns:
        list of new dicts, one per input record, holding the record's
        original keys plus ``'namb'``: a list of
        ``{'target': ..., 'source': marked_sentence}`` dicts, one per variant
        produced by ``generateAmbigiousSentences``.
    """
    ambs = []
    for sent in tqdm(data_source):
        nambs = [
            {'target': sent['target'], 'source': marked}
            for marked in generateAmbigiousSentences(sent['source'])
        ]
        # Work on a shallow copy so the caller's loaded dataset is not
        # modified as a side effect of building the augmented records.
        record = dict(sent)
        record['namb'] = nambs
        ambs.append(record)
    return ambs

In [52]:
# Build the target-marked variants for every Reddit training record.
reddit_ambs = generateAmbFromData(reddit_train)

100%|██████████| 15619/15619 [02:30<00:00, 104.09it/s]


In [53]:
# Build the target-marked variants for every Gab training record.
ambs1 = generateAmbFromData(gab_train)

100%|██████████| 23643/23643 [03:10<00:00, 123.92it/s]


In [54]:
import json
# Persist both augmented datasets as JSON files in the working directory.
for out_path, records in (('reddit_amb.json', reddit_ambs), ('gab_amb.json', ambs1)):
    with open(out_path, 'w') as handle:
        json.dump(records, handle)

In [56]:
# Inspect the first augmented Reddit record, including its 'namb' variants.
reddit_ambs[0]

{'set': 'reddit',
 'main_index': 1702,
 'index': 7496,
 'type': 'sentences',
 'source': 'if you give me that salsa i will love you forever . spicy is the shit .',
 'target': 0,
 'namb': [{'target': 0,
   'source': 'if you give me [TGT] that_salsa [TGT] i will love you forever . spicy is the shit .'},
  {'target': 0,
   'source': 'if you give me that salsa i will love you forever . [TGT] spicy [TGT] is the shit .'},
  {'target': 0,
   'source': 'if you give me that salsa i will love you forever . spicy is [TGT] the_shit [TGT] .'},
  {'target': 0,
   'source': 'if you [TGT] give [TGT] me that salsa i will love you forever . spicy is the shit .'},
  {'target': 0,
   'source': 'if you give me that salsa i will [TGT] love [TGT] you forever . spicy is the shit .'},
  {'target': 0,
   'source': 'if you give me that salsa i will love you [TGT] forever [TGT] . spicy is the shit .'}]}

In [57]:
generateAmbigiousSentences("kill yourself you whiny , self-righteous faggot")

['kill yourself you [TGT] whiny [TGT] , self-righteous faggot',
 'kill yourself you whiny , [TGT] self-righteous [TGT] faggot',
 'kill yourself you whiny , self-righteous [TGT] faggot [TGT]',
 '[TGT] kill [TGT] yourself you whiny , self-righteous faggot']

In [58]:
# Spot-check the generated variants on a short sentence.
generateAmbigiousSentences("but why do they make that face")

['but why do they make [TGT] that_face [TGT]',
 'but why do they [TGT] make [TGT] that face']

In [65]:
# Spot-check the generated variants on a longer sentence containing proper names.
generateAmbigiousSentences("you should watch louis le vau ’slatest video . steven oh of tyt is disturbing as hell and makes me hope that jimmy dore wakes the left up ")

['you should watch [TGT] louis [TGT] le vau ’slatest video . steven oh of tyt is disturbing as hell and makes me hope that jimmy dore wakes the left up',
 'you should watch louis [TGT] le [TGT] vau ’slatest video . steven oh of tyt is disturbing as hell and makes me hope that jimmy dore wakes the left up',
 'you should watch louis le vau ’slatest [TGT] video [TGT] . steven oh of tyt is disturbing as hell and makes me hope that jimmy dore wakes the left up',
 'you should watch louis le vau ’slatest video . steven oh of tyt is disturbing as [TGT] hell [TGT] and makes me hope that jimmy dore wakes the left up',
 'you should watch louis le vau ’slatest video . steven oh of tyt is disturbing as hell and makes me hope that [TGT] jimmy [TGT] dore wakes the left up',
 'you should watch louis le vau ’slatest video . steven oh of tyt is disturbing as hell and makes me hope that jimmy dore wakes [TGT] the_left [TGT] up',
 'you should [TGT] watch [TGT] louis le vau ’slatest video . steven oh of ty

In [62]:
# Look up the glosses of a single lemma with no part-of-speech restriction.
gloss = get_glosses('shit', None)