In [1]:
import nltk
import pickle
import numpy as np
from os.path import join
from gensim.models import KeyedVectors
from constants import THESAURUS_DIR, ORIGINAL_VECS_DIR, ORIGINAL_EMBEDDING
from preprocess import GeneralTextProcesser

In [2]:
with open('task_data/lex.mturk.txt','r',encoding='iso-8859-1') as f,\
     open('task_data/lex_mturk_sen.txt','w') as f_sen,\
     open('task_data/targets.pickle','wb') as f_targets,\
     open('task_data/candidates.pickle','wb') as f_candidates:
     # open('task_data/pos_tags.pickle','wb') as f_pos_tags:
    f_str = f.readline()
    targets,candidates,pos_tags = [],[],[]
    for l in f:
        sen,target,*candidate = l.strip().split('\t')
        # sen_pos = nltk.pos_tag(sen.split())
        # id = [w for w,tag in sen_pos].index(target)
        # target_pos = [tag for w,tag in sen_pos][id]
        f_sen.write(sen+'\n')
        # pos_tags.append(target_pos)
        targets.append(target)
        candidates.append(set(candidate))
    pickle.dump(targets,f_targets,protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(candidates,f_candidates,protocol=pickle.HIGHEST_PROTOCOL)
    # pickle.dump(pos_tags,f_pos_tags,pickle.HIGHEST_PROTOCOL)

In [3]:
from pathlib import Path
import stanza

In [4]:
# prepare pos tags of target words
lines = Path('task_data/lex_mturk_sen.txt').read_text()
sens = [line.split() for line in lines.split('\n')[:-1]]
nlp = stanza.Pipeline('en',processors='tokenize,pos',tokenize_pretokenized=True)
doc = nlp(sens)

2020-05-11 17:58:20 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |

2020-05-11 17:58:20 INFO: Use device: gpu
2020-05-11 17:58:20 INFO: Loading: tokenize
2020-05-11 17:58:20 INFO: Loading: pos
2020-05-11 17:58:23 INFO: Done loading processors!


In [5]:
pos_tags = []
for doc_sentence,target,sen in zip(doc.sentences,targets,sens):
    id = sen.index(target)
    word = doc_sentence.words[id]
    pos_tags.append(word.xpos)

In [6]:
with open('pos_tags.pickle','wb') as f:
    pickle.dump(pos_tags,f,pickle.HIGHEST_PROTOCOL)

In [None]:
with open('../data/original_vecs/SIMLEX999_SIMVERB3000-test_SIMVERB500-dev.pickle','rb') as f:
    ori_clf_vecs = pickle.load(f)

In [33]:
# ordered dict
np.linalg.norm(ori_clf_vecs['poach']) 

3.4676987700915958

In [2]:
# prepare vocab
vocab = set()
with open('task_data/lex_mturk_sen.txt','r') as f:
    for l in f:
        vocab |= set(l.split()) 

with open('task_data/candidates.pickle', 'rb') as f:
    candidates = pickle.load(f)

with open('task_data/targets.pickle', 'rb') as f:
    targets = pickle.load(f)

In [3]:
all_cands = set()
for s in candidates:
    all_cands |= s

vocab = vocab | set(targets) | all_cands

In [4]:
len(vocab)

7782

In [5]:
from utils import generate_sub_thesauri
from os.path import join
from constants import THESAURUS_DIR

In [6]:
generate_sub_thesauri(join(THESAURUS_DIR, 'synonyms.txt'),'synonyms.txt',vocab)

In [7]:
generate_sub_thesauri(join(THESAURUS_DIR, 'antonyms.txt'),'antonyms.txt',vocab)

In [8]:
# check constrain vocab
constrain_vocab = set()
with open('synonyms.txt','r') as f:
    for l in f:
        constrain_vocab.update([w[3:] for w in l.split()])
with open('antonyms.txt','r') as f:
    for l in f:
        constrain_vocab.update([w[3:] for w in l.split()])

In [9]:
f'{len(constrain_vocab)},{len(set(targets))},{len(all_cands)}'

'3758,459,4232'

In [10]:
# constrain vocab / target vocab
t_c_vocab = set(targets) & constrain_vocab
len(t_c_vocab)/len(targets)

0.736

In [11]:
# constrain vocab / candidate vocab
c_c_vocab = all_cands & constrain_vocab
len(c_c_vocab)/len(all_cands)

0.554820415879017

In [12]:
l_vocab = list(vocab)
text_preprocesser = GeneralTextProcesser()
emb_dict,_ = text_preprocesser.vocab2vec(l_vocab, ORIGINAL_VECS_DIR, 'lexical_simplification', ORIGINAL_EMBEDDING,
                                       ['pickle','npy'], 'word2vec', normalize=False, oov_handle='random')

preparing vocab vectors
7782 words in vocab, 1564 words not found in word embedding file, init them with random numbers
saving vocab vector file


In [13]:
l_c_vocab = list(constrain_vocab)
l_c_vocab_emb_dict = {w: emb_dict[w] for w in l_c_vocab}
with open('lexical_simplification_constrain.pickle','wb') as f:
    pickle.dump(l_c_vocab_emb_dict,f,pickle.HIGHEST_PROTOCOL)

In [14]:
np.save('lexical_simplification.npy',l_vocab)
np.save('lexical_simplification_constrain.npy',l_c_vocab)

In [42]:
word2vec = KeyedVectors.load_word2vec_format('../data/original_vecs/GoogleNews-vectors-negative300.bin',binary=True)

In [47]:
extended_targets = set()
for w in set(targets):
    if w in word2vec:
        extended_targets |= {w_[0] for w_ in word2vec.similar_by_word(w,50)}

In [48]:
len(extended_targets) 

17943