This notebook implements the function to score the document's semantic "progressiveness" with respect to a given linguistic feature (usually a word) that has been identified before to have semantically shifted in meaning. The scoring function assumes that two skipgram models have been trained. Please refer to the detailed derivation [in this note](https://github.gatech.edu/CompLingLab/semantic-lang-change/blob/master/notes/scoring.pdf).

In [3]:
import numpy as np
import scipy

In [14]:
import scipy.misc

In [108]:
class Scorer (object):
    def __init__ (self, early_model, later_model, index, word):
        """ constructor for scoring with respect to the given models and the word
        
        Args:
            early_model (:obj: model): embeddings from early documents.
            later_model (:obj: model): embeddings from later documents.
            index (:obj: tuple): an index to map words to position in the matrix and vice versa.
            word (:obj: str): the word
        """
        self.m_early = early_model
        self.m_later = later_model
        self.w2i, self.i2w = index
        self.word = word
        
        self.sim_early = np.dot (self.m_early.C, self.m_early.W[self.w2i[word]])
        self.sim_later = np.dot (self.m_later.C, self.m_later.W[self.w2i[word]])
        z_early = scipy.special.logsumexp (self.sim_early)
        z_later = scipy.special.logsumexp (self.sim_later)
        
        self.zdiff = z_later - z_early
        
    def score (self, doc, window_size=10):
        """score a given document with respect to the word
        
        Args:
            doc (:obj: list): document as a list of tokens.
            window_size (int, optional): the size of the context window (default=10)
        
        Returns:
            float: document score wrt `self.word`, `self.early_model` and `self.later_model`.
            
        Todo:
            1. test if the scoring function is correct using several test cases.
        """
        def make_contexts_bow (doc, word, w2i, k=10):
            """makes a bow vector of contexts around the target word
            
            Args:
                doc (:obj: list): document as a list of tokens.
                word(:obj: str): the target word
                w2i(:obj: dict): maps any word to a position.
                k(int, optional): the window size around `word` (default=10)
            """
            last_token_index = len(doc) - 1
            word_positions = [i for i, token in enumerate (doc) if token == word]
            spans = [(max(0, pos-k), min(last_token_index, pos+k)) for pos in word_positions]
            bow = np.zeros (len(w2i))
            
            for i, span in enumerate (spans):
                start, end = span
                for pos in range (start, end+1):
                    if not pos == word_positions[i]:
                        bow[w2i[doc[pos]]] += 1
                        
            return bow
        
        contexts_bow = make_contexts_bow (doc, self.word, self.w2i, k=window_size)
        print (contexts_bow)
        cooccurrence_factor = np.dot (contexts_bow, self.sim_later - self.sim_early)
        normalization_factor = sum(contexts_bow) * self.zdiff
        score = cooccurrence_factor - normalization_factor
        return score

In [36]:
class Model (object):
    def __init__ (self, C, W):
        self.C = C
        self.W = W

Following are some examples to sanity check the implementation.

Case 1: Assume all embeddings -- input and output -- are equal. Let this vector be $\mathbf{p}$. If $\mathbf{p}=q\mathbf{1}$, then the answer should be 0.

In [64]:
q=10
vocab = ["a", "b", "c", "d", "e", "f", "i", "j", "k", "l"]
dims = 10
C_early = C_later = W_early = W_later = q*np.ones((len(vocab),dims))
w2i, i2w = {w:i for i,w in enumerate (vocab)},{i:w for i,w in enumerate (vocab)}
doc = list ("fadekelfi")
k=2

In [65]:
m_early = Model (C_early, W_early)
m_later = Model (C_later, W_later)
scorer = Scorer (m_early, m_later, (w2i, i2w), "k")
print (scorer.score(doc, window_size=k))

[1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.] [1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000. 1000.]
1002.302585092994 1002.302585092994
0.0
0.0


Case 2: Assume that the output embeddings are same and the input embeddings are scales of each other the score is still 0.

In [93]:
q=1
vocab = ["a", "b", "c", "d", "e", "f", "i", "j", "k", "l"]
dims = 10
C_early = C_later = q*np.ones((len(vocab),dims))
W_early = q*np.random.randn(len(vocab), dims)
W_later = 10*W_early
w2i, i2w = {w:i for i,w in enumerate (vocab)},{i:w for i,w in enumerate (vocab)}
doc = list ("fadekelfi")
k=2

In [94]:
m_early = Model (C_early, W_early)
m_later = Model (C_later, W_later)
scorer = Scorer (m_early, m_later, (w2i, i2w), "k")
print (scorer.score(doc, window_size=k))

0.0


Case 3: Assume randomly initialized but equal output embeddings and input embeddings as scales of each other.

In [102]:
vocab = ["a", "b", "c", "d", "e", "f", "i", "j", "k", "l"]
dims = 10
C_early = C_later = np.random.randn(len(vocab), dims)
W_early = np.random.randn(len(vocab), dims)
W_later = 2*W_early
w2i, i2w = {w:i for i,w in enumerate (vocab)},{i:w for i,w in enumerate (vocab)}
doc = list ("fadekelfi")
k=2

In [109]:
m_early = Model (C_early, W_early)
m_later = Model (C_later, W_later)
scorer = Scorer (m_early, m_later, (w2i, i2w), "e")
print (scorer.score(doc, window_size=k))

[1. 0. 0. 1. 2. 1. 0. 0. 2. 1.]
-29.977487567077777


In [107]:
W_later = 3*W_early
m_early = Model (C_early, W_early)
m_later = Model (C_later, W_later)
scorer = Scorer (m_early, m_later, (w2i, i2w), "e")
print (scorer.score(doc, window_size=k))

-29.977487567077777
