# lexsub: default program

In [1]:
from lexsub import *
import os
import re

from copy import deepcopy
from pymagnitude import *
from nltk.corpus import stopwords as nltk_stopwords

from lexsub_check import precision

DATA = "../data"

## Run the default solution on dev

In [2]:
lexsub = LexSub(os.path.join(DATA,'glove.6B.100d.magnitude'))
output = []
keys = []
with open(os.path.join(DATA,'input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        keyIndex = int(fields[0].strip())
        sentence = fields[1].strip().split()
        keys.append(sentence[keyIndex])
        output.append(" ".join(lexsub.substitutes(int(fields[0].strip()), sentence)))

for i, syn in enumerate(output[:20]):
    print("target:%s" % keys[i], "substitutes:%s" % output[i])
#     print("\n".join(output[:20]))

target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:side substitutes:sides edge bottom front club line both back place corner
target:told substitutes:reporters asked said interview saying afp quoted insisted telling spoke
target:tell substitutes:know why ask you me do let sure telling ca
target:tell substitutes:kno

## Evaluate the default output

In [3]:
from lexsub_check import precision
with open(os.path.join(DATA,'reference','dev.out'), 'rt') as refh:
    ref_data = [str(x).strip() for x in refh.read().splitlines()]
print("Score={:.2f}".format(100*precision(ref_data, output)))

Score=27.89


## Documentation

Write some beautiful documentation of your program here.

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?

In [4]:
wv = Magnitude(DATA + "/glove.6B.100d.magnitude")
len(wv) # how many words in this word vector file
wv.dim # the dimensionality of each word vector
wv.most_similar("cat", topn=5)

[('dog', 0.87980753),
 ('rabbit', 0.7424427),
 ('cats', 0.7323004),
 ('monkey', 0.72887105),
 ('pet', 0.719014)]

In [5]:
for key, vector in wv:
    if key == "cat" or key == "dog":
        print(key, vector[:5])

dog [ 0.0546582  0.0548728  0.0936535 -0.1641379 -0.1306658]
cat [ 0.0458157  0.0561247  0.1253741 -0.1178949 -0.1162836]


Dog and cat vectors are similar and the reason why we saw that result above.

Create $ \hat{Q} $ and it's corresponding vocab

In [6]:
wv = Magnitude(DATA + "/glove.6B.100d.magnitude")
vocabQHat = set([k for k, v in wv])

Helper function to load lexicons and copy pymagnitude vectors

In [7]:
isNumber = re.compile(r'\d+.*')

def norm_word(word):
    if isNumber.search(word.lower()):
        return '---num---'
    elif re.sub(r'\W+', '', word) == '':
        return '---punc---'
    else:
        return word.lower()


def build_lexicon(filename):
    lexicon = {}
    for line in open(filename, 'r'):
        words = line.lower().strip().split()
        lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
    return lexicon


def copyPymagnitude(wv):
    return deepcopy({k: v for k, v in wv})

In [8]:
def get_substitutability(t, s):
        score = wv.similarity(t, s)
        return score

In [15]:
T = 10

qHat = copyPymagnitude(wv)
q = copyPymagnitude(wv)
lexicon = build_lexicon(os.path.join(DATA, "lexicons", "wordnet-synonyms+.txt"))
loopOn = vocabQHat.intersection(set(lexicon.keys()))

for t in range(T):
    for word in loopOn:
        wordNeighbours = set(lexicon[word]).intersection(vocabQHat)
        cand_scores = []
        for x in wordNeighbours:
            cand_scores.append(get_substitutability(x, word))
            
        sorted_candidates = sorted(zip(list(wordNeighbours), cand_scores), key = lambda x : x[1], reverse=True )            
        wordNeighbours = [sub for sub, score in sorted_candidates][:10]
        
        #if len(wordNeighbours)<10:
            #numNeighbour = len(wordNeighbours)
        
        numNeighbours = len(wordNeighbours)
        
        if numNeighbours == 0: 
            continue
            
        newVec = numNeighbours * qHat[word]
        
        for ppWord in wordNeighbours:
            newVec += q[ppWord]
            
        q[word] = newVec / (2*numNeighbours)
        
# Q is ready for writing at this point

Write Q to file

In [16]:
with open(DATA + "/glove.6B.100d.retrofit.large.context.10.txt", "w") as f:
    for i, (k, v) in enumerate(q.items()):
        line = " ".join([k] + list(map(str, v)) + ["\n"])
        f.write(line)

Evaluate again using retrofit

In [17]:
lexsub = LexSub(os.path.join(DATA,'glove.6B.100d.retrofit.fin.10.magnitude'))
output = []

# Run on the retrofit file
with open(os.path.join(DATA,'input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        output.append(" ".join(lexsub.substitutes(int(fields[0].strip()), fields[1].strip().split())))
print("\n".join(output[:10]))

# Check score
with open(os.path.join(DATA,'reference','dev.out'), 'rt') as refh:
    ref_data = [str(x).strip() for x in refh.read().splitlines()]
print("Score={:.2f}".format(100*precision(ref_data, output)))

front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
front bottom back edge line corner left place right way
Score=43.57


## Lexical substitution

In [18]:
stopwords = list(set(nltk_stopwords.words('english')))

def get_words(s, index, window=5):
    """ Extract a list of words from a sentence string with punctuation, spaces etc 
    s = sentence 
    """
    # strip punctuation 
    s = re.sub(r'[^\w\s]','',s)
    # replace newline 
    s = s.replace('\n', ' ')
    # get rid of spaces
    s = " ".join(s.split())
    list_sentence = s.split(' ')

    mid_window = window // 2

    if index > mid_window:
        l = index - mid_window
    else:
        l = 0

    r = index + mid_window

    return list_sentence[l:r+1]

def unique(iter):
    "removes duplicates from iterable preserving order"
    result = list()
    seen = set()
    for x in iter:
        if x not in seen:
            seen.add(x)
            result.append(x)
    return result

def process_candidates(candidates, target):
    """ words to lower case, replace underscores, remove duplicated words, 
        filter out target word and stop words """
    filterwords = stopwords + [target]
    return unique(filter(lambda x : x not in filterwords, 
                  map(lambda s : s.lower().replace('_', ' '), candidates)))

In [19]:
class LexSub:

    def __init__(self, wvec_file, topn=100):
        self.wvecs = pymagnitude.Magnitude(wvec_file)
        self.n_candidates = 100
        self.n_substitutes = 10

    def substitutes(self, index, sentence):
        "Return ten guesses that are appropriate lexical substitutions for the word at sentence[index]."
        sentence_list = sentence.split(" ")
        #return(list(map(lambda k: k[0], self.wvecs.most_similar(sentence_list[index], topn=self.topn))))
        word = sentence_list[index] 
        words_scores = self.wvecs.most_similar(positive=[word])
        result = [word for word, score in words_scores]
        result = process_candidates(result, word)[:self.n_candidates]
        return result
        
    def get_substitutability(self, t, s, C):
        """ get substitutability of substitution s for target t in context C
        t = target word 
        s = candidate substitution 
        C = list of context words 
        """
        # 1. target score: how similar is it to the target word?
        tscore = self.wvecs.similarity(t, s)
        # 2. context score: how similar is it to the context words?
        cscores = [self.wvecs.similarity(s, c) for c in C ]
        cscore = sum(cscores)
        return (len(C)*tscore + cscore)/(2*len(C) + 1)  # Add +1 in denom to avoid div by zero error

    def lex_sub(self, index, sentence):
        """ Get appropriate substitution for a word given context words 
        
        word_POS = word with part of speech in form word.POS e.g. dog.n
        context_words = list of words in context 
        """
        list_ = sentence.split(" ")
        w = list_[index]
        #w,_,POS = word_POS.partition('.')
        # generate candidate substitutions
        candidates = self.substitutes(index, sentence)
        if sentence is None:
            return candidates[:self.n_substitutes]
        else:
            context_words = get_words(sentence, index=index, window=5)
            # filter context words: exist in the word2vec vocab, not stop words  
            #context_words = list(filter(lambda c : c in vocabQHat 
                                              # and c not in stopwords, 
                                              # context_words))
                    
            context_words = list(filter(lambda c : c in self.wvecs 
                                              and c not in stopwords and c != w, 
                                              context_words))
            
            cand_scores = [self.get_substitutability(w, s, context_words) if s in self.wvecs else 0 for s in candidates ]
            assert(len(cand_scores) == len(candidates))            
            sorted_candidates = sorted(zip(candidates, cand_scores), key = lambda x : x[1], reverse=True )
            return [sub for sub, score in sorted_candidates][:self.n_substitutes]
  

In [20]:
lexsub = LexSub(os.path.join(DATA,'glove.6B.100d.retrofit.fin.10.magnitude'))
output = []
keys = []

# Run on the retrofit file
with open(os.path.join(DATA,'input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        index = int(fields[0].strip())
        list_sentence = fields[1].strip().split(" ")
        str_sentence = fields[1].strip()
        
        keys.append(list_sentence[index])
        
        
        output.append(" ".join(lexsub.lex_sub(int(fields[0]), str_sentence)))
        
for i, syn in enumerate(output[:20]):
    print("target:%s" % keys[i], "substitutes:%s" % output[i])

# Check score
with open(os.path.join(DATA,'reference','dev.out'), 'rt') as refh:
    ref_data = [str(x).strip() for x in refh.read().splitlines()]
print("Score={:.2f}".format(100*precision(ref_data, output)))

target:side substitutes:bottom corner back place left edge front line right way
target:side substitutes:way back place right front line bottom edge left corner
target:side substitutes:back way right place front line left edge bottom corner
target:side substitutes:front bottom back edge line corner left place right way
target:side substitutes:way front right back place left corner line bottom edge
target:side substitutes:front bottom back edge line corner left place right way
target:side substitutes:place right way back front bottom left line corner edge
target:side substitutes:front bottom back edge line corner left place right way
target:side substitutes:way front line back right place bottom edge left corner
target:side substitutes:line corner front edge place back way left bottom right
target:told substitutes:reporters asked interview afp quoted insisted spokesman press statement spoke
target:tell substitutes:say know answer ask let think remember hear
target:tell substitutes:know a

In [49]:
test = "I am a guy who is learning nlp".split()
print(test)
index = 4 # who
window = 5

mid_window = window // 2

if index > mid_window:
    l = index - mid_window
else:
    l = 0

r = index + mid_window

print(test[l:r+1])

['I', 'am', 'a', 'guy', 'who', 'is', 'learning', 'nlp']
['a', 'guy', 'who', 'is', 'learning']
