The Rosenfeld and Erk (R&E) model is an extension of skip-gram with negative sampling (SGNS) that learns diachronic embeddings, i.e., instead of learning a single embedding per word as SGNS does, it learns an embedding for each word at any given time.

The basic idea of the R&E model is to replace the embeddings used in the SGNS objective with the output from a non-linear function. The non-linear function is implemented as a neural network which outputs the embedding for an input tuple of word and time.

The advantages of the R&E model are:
- the diachronic models are neural
- time is treated as a continuous variable and used as an input (instead of relying on an initial preprocessing step that divides the corpus into time chunks)
- embeddings at any time can be obtained

This notebook is divided into two parts. The first part implements the basic SGNS model. In the second part, we extend the SGNS model to the R&E model.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

In [2]:
import sys

# Register the sibling "../modules" directory on the import path exactly once,
# so re-running this cell does not append duplicate entries.
# NOTE(review): presumably this is where the ``pytorchsgns`` package imported
# in the next cell lives -- confirm against the repository layout.
if not any(entry == "../modules" for entry in sys.path):
    sys.path.append("../modules")

In [3]:
from pytorchsgns import datastructures
from pytorchsgns import dataloader
from pytorchsgns import skipgram_models, difftime_models

# I. SGNS

**Model Architecture**

In [4]:
def sgns_loss(pos_score, neg_score):
    """Return the negated sum of the positive and negative scores.

    Args:
        pos_score: score of the observed (input, output) pair.
        neg_score: accumulated score of the sampled negative pairs.

    Returns:
        ``-(pos_score + neg_score)``, the quantity the training loop
        back-propagates. Works for anything supporting ``+`` and unary ``-``.
    """
    combined = pos_score + neg_score
    return -combined

class W2V(object):
    """Trainer for a skip-gram model with negative sampling (SGNS).

    The vocabulary object ``self.V`` is expected to expose ``i2w`` (index to
    word), ``w2i`` (word to index) and ``counts`` (word to sampling weight);
    those are the only attributes this class reads from it.
    ``self.input_output_pairs`` is iterated as ``(input_word, output_word)``
    pairs and must support ``len()``.
    """

    def __init__(self, textfile,
                 min_count=5,
                 embed_dims=32,
                 negative=5,
                 ns_exponent=0.75,
                 window=5,
                 learning_rate=0.001,
                 subsampling=0.00001):
        """Read the corpus, build the model and its SGD optimizer.

        Args:
            textfile: corpus path handed to ``dataloader.RawReader.readDocs``.
            min_count: forwarded to ``readDocs`` as ``min_count``.
            embed_dims: second argument of ``skipgram_models.SkipGram``.
            negative: number of negative candidates drawn per training pair.
            ns_exponent: forwarded to ``V.subsample`` as ``ns_exponent``.
            window: forwarded to ``readDocs`` as ``ws``.
            learning_rate: learning rate of the SGD optimizer.
            subsampling: forwarded to ``V.subsample`` as ``sampling``.
        """
        self.V, self.input_output_pairs = dataloader.RawReader.readDocs(
            textfile, min_count=min_count, ws=window)
        self.V.subsample(sampling=subsampling, ns_exponent=ns_exponent)
        self.negative = negative
        self.model = skipgram_models.SkipGram(len(self.V.counts), embed_dims)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)
        # Lazily built (vocab, probabilities) pair used by negative_sampler.
        self._ns_table = None

    def _sampling_table(self):
        """Return the cached ``(vocab, probabilities)`` pair, building it if needed.

        Raises:
            ValueError: if the weights in ``V.counts`` do not have a positive sum.
        """
        table = getattr(self, "_ns_table", None)
        if table is None:
            vocab = [self.V.i2w[i] for i in range(len(self.V.i2w))]
            weights = np.asarray([self.V.counts[w] for w in vocab], dtype=float)
            total = weights.sum()
            if not total > 0:
                raise ValueError("negative-sampling weights must have a positive sum")
            # np.random.choice requires probabilities that sum to 1; dividing by
            # the total is a no-op when V.counts is already a distribution.
            table = (vocab, weights / total)
            self._ns_table = table
        return table

    def negative_sampler(self, context_word):
        """Yield negative words for one training pair.

        ``self.negative`` words are drawn with replacement, with probability
        proportional to ``V.counts``; draws equal to ``context_word`` are
        dropped, so fewer than ``self.negative`` words may be yielded.
        """
        vocab, probs = self._sampling_table()
        picks = np.random.choice(len(vocab), size=self.negative, replace=True, p=probs)
        for idx in picks:
            word = vocab[idx]
            if word != context_word:
                yield word

    def lookup_tensor(self, w):
        """Return the vocabulary index of ``w`` as a 1-element long tensor."""
        return torch.tensor([self.V.w2i[w]], dtype=torch.long)

    def train(self, nEpochs, verbose=False):
        """Run ``nEpochs`` passes of per-pair SGD over the training pairs.

        Args:
            nEpochs: number of passes over ``self.input_output_pairs``.
            verbose: when true, print the mean per-pair loss after each epoch
                (``nan`` if there are no pairs).
        """
        pairs = self.input_output_pairs
        n_pairs = len(pairs)
        # Loop-invariant sign flags passed to the model for positive/negative pairs.
        pos_sign = torch.ones(1)
        neg_sign = -torch.ones(1)
        # Rebuild the sampling table once per call so changes to V are picked up.
        self._ns_table = None
        for epoch in range(nEpochs):
            total_loss = 0.0
            for input_word, output_word in pairs:
                self.model.zero_grad()
                center = self.lookup_tensor(input_word)
                pos_score = self.model(center, self.lookup_tensor(output_word), sign=pos_sign)
                # Plain zero accumulator; autograd tracks the additions below.
                neg_score = torch.zeros_like(pos_score)
                for w in self.negative_sampler(output_word):
                    neg_score = neg_score + self.model(center, self.lookup_tensor(w), sign=neg_sign)
                loss = sgns_loss(pos_score, neg_score)
                loss.backward()
                total_loss += loss.item()
                self.optimizer.step()
            if verbose:
                print(total_loss / n_pairs if n_pairs else float("nan"))

In [5]:
# Fit the plain SGNS model on mary_poppins.txt for one epoch; verbose=True
# prints the mean per-pair loss once the epoch finishes.
w2v = W2V("mary_poppins.txt")
w2v.train(nEpochs=1, verbose=True)

15.77816900509995


# II. DiffTime

**Model Architecture**

In [8]:
def sgns_loss(pos_score, neg_score):
    """Combine positive and negative scores into the value to minimise.

    Same formula as the loss in the SGNS section, repeated here so that
    this cell can be run on its own.

    Args:
        pos_score: score of the observed (input, output) pair.
        neg_score: accumulated score of the sampled negative pairs.

    Returns:
        The negated sum ``-(pos_score + neg_score)``.
    """
    summed = pos_score + neg_score
    return -summed

class W2VTime(object):
    """Trainer for the time-aware (DiffTime) variant of SGNS.

    Mirrors the plain SGNS trainer, but every model call also receives a time
    tensor. The vocabulary object ``self.V`` is expected to expose ``i2w``,
    ``w2i`` and ``counts``; ``self.input_output_pairs`` is iterated as
    ``(input_word, output_word)`` pairs and must support ``len()``.
    """

    def __init__(self, textfile,
                 min_count=5,
                 embed_dims=32,
                 negative=5,
                 ns_exponent=0.75,
                 window=5,
                 learning_rate=0.001,
                 subsampling=0.00001):
        """Read the corpus, build the DiffTime model and its SGD optimizer.

        Args:
            textfile: corpus path handed to ``dataloader.RawReader.readDocs``.
            min_count: forwarded to ``readDocs`` as ``min_count``.
            embed_dims: last argument of ``difftime_models.DiffTime``.
            negative: number of negative candidates drawn per training pair.
            ns_exponent: forwarded to ``V.subsample`` as ``ns_exponent``.
            window: forwarded to ``readDocs`` as ``ws``.
            learning_rate: learning rate of the SGD optimizer.
            subsampling: forwarded to ``V.subsample`` as ``sampling``.
        """
        self.V, self.input_output_pairs = dataloader.RawReader.readDocs(
            textfile, min_count=min_count, ws=window)
        self.V.subsample(sampling=subsampling, ns_exponent=ns_exponent)
        self.negative = negative
        # NOTE(review): the three hard-coded (10, 10) tuples are presumably
        # layer sizes of DiffTime's sub-networks -- confirm in difftime_models.
        self.model = difftime_models.DiffTime(
            (10, 10), (10, 10), (10, 10), len(self.V.counts), embed_dims)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)
        # Lazily built (vocab, probabilities) pair used by negative_sampler.
        self._ns_table = None

    def _sampling_table(self):
        """Return the cached ``(vocab, probabilities)`` pair, building it if needed.

        Raises:
            ValueError: if the weights in ``V.counts`` do not have a positive sum.
        """
        table = getattr(self, "_ns_table", None)
        if table is None:
            vocab = [self.V.i2w[i] for i in range(len(self.V.i2w))]
            weights = np.asarray([self.V.counts[w] for w in vocab], dtype=float)
            total = weights.sum()
            if not total > 0:
                raise ValueError("negative-sampling weights must have a positive sum")
            # np.random.choice requires probabilities that sum to 1; dividing by
            # the total is a no-op when V.counts is already a distribution.
            table = (vocab, weights / total)
            self._ns_table = table
        return table

    def negative_sampler(self, context_word):
        """Yield negative words for one training pair.

        ``self.negative`` words are drawn with replacement, with probability
        proportional to ``V.counts``; draws equal to ``context_word`` are
        dropped, so fewer than ``self.negative`` words may be yielded.
        """
        vocab, probs = self._sampling_table()
        picks = np.random.choice(len(vocab), size=self.negative, replace=True, p=probs)
        for idx in picks:
            word = vocab[idx]
            if word != context_word:
                yield word

    def lookup_tensor(self, w):
        """Return the vocabulary index of ``w`` as a 1-element long tensor."""
        return torch.tensor([self.V.w2i[w]], dtype=torch.long)

    def train(self, nEpochs, verbose=False, time=1.0):
        """Run ``nEpochs`` passes of per-pair SGD over the training pairs.

        Args:
            nEpochs: number of passes over ``self.input_output_pairs``.
            verbose: when true, print the mean per-pair loss after each epoch
                (``nan`` if there are no pairs).
            time: scalar time stamp fed to the model for every pair. The
                default ``1.0`` reproduces the previously hard-coded value.
        """
        pairs = self.input_output_pairs
        n_pairs = len(pairs)
        # Loop-invariant inputs: one shared time tensor and the two sign flags.
        time_tensor = torch.Tensor([[float(time)]])
        pos_sign = torch.ones(1)
        neg_sign = -torch.ones(1)
        # Rebuild the sampling table once per call so changes to V are picked up.
        self._ns_table = None
        for epoch in range(nEpochs):
            total_loss = 0.0
            for input_word, output_word in pairs:
                self.model.zero_grad()
                center = self.lookup_tensor(input_word)
                pos_score = self.model(center, self.lookup_tensor(output_word),
                                       time_tensor, sign=pos_sign)
                # Plain zero accumulator; autograd tracks the additions below.
                neg_score = torch.zeros_like(pos_score)
                for w in self.negative_sampler(output_word):
                    neg_score = neg_score + self.model(center, self.lookup_tensor(w),
                                                       time_tensor, sign=neg_sign)
                loss = sgns_loss(pos_score, neg_score)
                loss.backward()
                total_loss += loss.item()
                self.optimizer.step()
            if verbose:
                print(total_loss / n_pairs if n_pairs else float("nan"))

In [10]:
# Fit the time-aware model on mary_poppins.txt for three epochs; verbose=True
# prints the mean per-pair loss after each epoch.
w2v = W2VTime("mary_poppins.txt")
w2v.train(nEpochs=3, verbose=True)

4.020568932844624
3.86865743110169
3.8406590070898674
