In [1]:
DATA = "../data"
from pymagnitude import *

ImportError: No module named 'pymagnitude'

In [2]:
import re 

stopwords = ["it's", "she's", 'were', 'because', 'this', 'couldn', 'then', 'how'
, 'd', 'doesn', 'down', 's', 'they', 'she', "needn't", 'wasn', 'haven', 
'between', "wouldn't", 'the', 'ma', "wasn't", 'until', 'my', 'himself', 
"that'll", 'by', 'about', 'in', "aren't", "should've", 'why', 'nor', 
'before', 'when', 'we', 'here', 'only', "couldn't", 'ain', 'no', 'your', 
'will', 'own', 'his', "you'll", 'are', 'and', 'most', 'do', 'now', "isn't", 
'having', 'on', 'her', 'theirs', 'under', 'with', 'to', "mightn't", 'while', 
'its', 'be', 'll', 'don', 'over', 'again', 'their', 'won', 'too', 'during', 
'shan', 'herself', 'has', 'or', 'from', 'ours', 'into', 'our', 'above', 
'wouldn', 'you', 'of', 'so', 't', 'he', 'doing', 'as', 'i', 'can', 'shouldn', 
'have', 'at', 'other', 'hasn', 'more', 'yourselves', 'y', 'yours', 'very', 
'themselves', 'which', 'these', 'being', 'both', 'aren', 'did', 'than', 'needn',
 'for', 'itself', "haven't", 'through', 'weren', 'but', 'once', 'isn', 
 'ourselves', 'didn', 'not', 'yourself', 'mightn', 'after', 've', 'him', 
 'whom', "hasn't", 'a', 'hadn', "shouldn't", "mustn't", 'those', 'off', 
 'each', 'was', "didn't", "you'd", 'where', 'o', 'further', 'below', "shan't", 
 'myself', 'mustn', 'is', 'been', 'just', 'any', 'out', 'that', 'm', 'such', 
 'me', 'same', 'hers', 'some', 'had', 'does', 'against', 'should', "you've", 
 "doesn't", "you're", 'them', 'am', 'if', 'who', 'few', 'what', 'there', 
 "don't", "weren't", "won't", 'an', 'all', 're', 'it', 'up', "hadn't", "'ll"]

def get_words(s):
    """ Extract a list of words from a sentence string with punctuation, spaces etc 
    s = sentence 
    """
    # strip punctuation 
    s = re.sub(r'[^\w\s]','',s)
    # replace newline 
    s = s.replace('\n', ' ')
    # get rid of spaces
    s = " ".join(s.split())
    return s.split(' ')

def unique(iter):
    "removes duplicates from iterable preserving order"
    result = list()
    seen = set()
    for x in iter:
        if x not in seen:
            seen.add(x)
            result.append(x)
    return result

def process_candidates(candidates, target):
    """ words to lower case, replace underscores, remove duplicated words, 
        filter out target word and stop words """
    filterwords = stopwords + [target]
    return unique(filter(lambda x : x not in filterwords, 
                  map(lambda s : s.lower().replace('_', ' '), candidates)))


In [3]:
import os, sys, optparse
import tqdm
import pymagnitude
from functools import reduce 
import operator


class LexSub:

    def __init__(self, wvec_file, topn=100):
        self.wvecs = pymagnitude.Magnitude(wvec_file)
        self.n_candidates = 100
        self.n_substitutes = 10

    def substitutes(self, index, sentence):
        "Return ten guesses that are appropriate lexical substitutions for the word at sentence[index]."
        sentence_list = sentence.split(" ")
        #return(list(map(lambda k: k[0], self.wvecs.most_similar(sentence_list[index], topn=self.topn))))
        word = sentence_list[index] 
        words_scores = self.wvecs.most_similar(positive=[word])
        result = [word for word, score in words_scores]
        result = process_candidates(result, word)[:self.n_candidates]
        return result
    
    def prod(self, factors):
        return reduce(operator.mul, factors, 1)
    
    
    def mult(self, t, s, C):
        '''
        Mult
        '''            
        tscore = self.wvecs.similarity(t, s)
        ptscore = (tscore + 1)/2
        pcscores = [(self.wvecs.similarity(s, c)+1)/2 for c in C ]
        pcscore = self.prod(pcscores)
        return (pcscore*ptscore)**(1.0/(len(C)+1))

    def bal_mult(self, t, s, C):
        '''
        BalMult
        '''
        tscore = self.wvecs.similarity(t,s)
        ptscore = (tscore + 1)/2
        pcscores = [(self.wvecs.similarity(s, c)+1)/2 for c in C ]
        pcscore = self.prod(pcscores)
        return (((ptscore)**len(C))*pcscore)**(1.0/(2*len(C)))
        
    def get_substitutability(self, t, s, C):
        """ get substitutability of substitution s for target t in context C
        t = target word 
        s = candidate substitution 
        C = list of context words 
        """
        # 1. target score: how similar is it to the target word?
        tscore = self.wvecs.similarity(t, s)
        # 2. context score: how similar is it to the context words?
        cscores = [self.wvecs.similarity(s, c) for c in C ]
        cscore = sum(cscores)
        return (len(C)*tscore + cscore)/(2*len(C))

    def lex_sub(self, index, sentence):
        """ Get appropriate substitution for a word given context words 
        
        word_POS = word with part of speech in form word.POS e.g. dog.n
        context_words = list of words in context 
        """
        list_ = sentence.split(" ")
        w = list_[index]
        #w,_,POS = word_POS.partition('.')
        # generate candidate substitutions
        candidates = self.substitutes(index, sentence)
        if sentence is None:
            return candidates[:self.n_substitutes]
        else:
            context_words = get_words(sentence)
            # filter context words: exist in the word2vec vocab, not stop words  
            #context_words = list(filter(lambda c : c in vocabQHat 
                                              # and c not in stopwords, 
                                              # context_words))
            context_words = list(filter(lambda c : c in self.wvecs 
                                              and c not in stopwords, 
                                              context_words))       
            cand_scores = [self.get_substitutability(w, s, context_words) if s in self.wvecs else 0 for s in candidates ]
            assert(len(cand_scores) == len(candidates))            
            sorted_candidates = sorted(zip(candidates, cand_scores), key = lambda x : x[1], reverse=True )
            return [sub for sub, score in sorted_candidates][:self.n_substitutes]
  

In [4]:
import os
import tqdm

lexsub = LexSub(os.path.join('../','data','glove.6B.100d.retrofit.magnitude'))

In [5]:
from lexsub_check import precision
output = []

# Run on the retrofit file
with open(os.path.join(DATA,'input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        index = int(fields[0])
        list_sentence = fields[1].split(" ")
        
        output.append(" ".join(lexsub.lex_sub(int(fields[0]), fields[1])))
print("\n".join(output[:10]))

# Check score
with open(os.path.join(DATA,'reference','dev.out'), 'rt') as refh:
    ref_data = [str(x).strip() for x in refh.read().splitlines()]
print("Score={:.2f}".format(100*precision(ref_data, output)))

english place back way point edge line position front
way place position english back line point front edge
way english place back point position line edge front
way place english back line point position edge front
english place way line back point edge position front
way english place line back position point edge front
place way english back point line edge front position
way english place back position line point edge front
way english line place position point edge back front
edge english place line way point back position front
Score=44.16
