In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from torch.utils.data import DataLoader

import torch.optim as optim

import spacy

# NVSM model

In [2]:
class NVSM(nn.Module):
    def __init__(self, n_doc, n_tok, dim_doc_emb, dim_tok_emb, neg_sampling_rate, 
                 pad_token_id):
        super(NVSM, self).__init__()
        self.doc_emb           = nn.Embedding(n_doc, embedding_dim = dim_doc_emb)
        self.tok_emb           = nn.Embedding(n_tok, embedding_dim = dim_tok_emb)
        self.tok_to_doc        = nn.Linear(dim_tok_emb, dim_doc_emb)
        self.bias              = nn.Parameter(torch.Tensor(dim_doc_emb))
        self.neg_sampling_rate = neg_sampling_rate
        self.pad_token_id      = pad_token_id
        
    def query_to_tensor(self, query):
        '''
        Computes the average of the word embeddings of the query. This method 
        corresponds to the function 'g' in the article.
        '''
        # Create a mask to ignore padding embeddings
        query_mask    = (query != self.pad_token_id).float()
        # Compute the number of tokens in each query to properly compute the 
        # average
        tok_by_input  = query_mask.sum(dim = 1)
        query_tok_emb = self.tok_emb(query)
        query_tok_emb = query_tok_emb * query_mask.unsqueeze(-1)
        # Compute the average of the embeddings
        query_emb     = query_tok_emb.sum(dim = 1) / tok_by_input.unsqueeze(-1)
        
        return query_emb
    
    def normalize_query_tensor(self, query_tensor):
        '''
        Divides each query tensor by its L2 norm. This method corresponds to 
        the function 'norm' in the article.
        '''
        norm = torch.norm(query_tensor, dim = 1) # we might have to detach this value 
                                                 # from the computation graph.
        return query_tensor / norm.unsqueeze(-1)
        
    def query_to_doc_space(self, query):
        '''
        Projects a query vector into the document vector space. This method corresponds 
        to the function 'f' in the article.
        '''
        return self.tok_to_doc(query)
    
    def score(self, query, document):
        '''
        Computes the cosine similarity between a query and a document embedding.
        This method corresponds to the function 'score' in the article.
        '''
        # batch dot product using batch matrix multiplication
        num   = torch.bmm(query.unsqueeze(1), document.unsqueeze(-1))
        denum = torch.norm(query, dim = 1) * torch.norm(document, dim = 1)
        
        return num / denum
        
    def non_stand_projection(self, n_gram):
        '''
        Computes the non-standard projection of a n-gram into the document vector 
        space. This method corresponds to the function 'T^~' in the article.
        '''
        n_gram_tensor      = self.query_to_tensor(n_gram)
        norm_n_gram_tensor = self.normalize_query_tensor(n_gram_tensor)
        projection         = self.query_to_doc_space(norm_n_gram_tensor)
        
        return projection
    
    def _custom_batchnorm(self, batch):
        '''
        Computes the variant of the batch normalization formula used in this article. 
        It only uses a bias and no weights.
        '''
        batch_feat_norm = (batch - batch.mean(dim = 0)) / batch.std(dim = 0)
        batch_feat_norm = batch_feat_norm + self.bias
        
        return batch_feat_norm
    
    def stand_projection(self, batch):
        '''
        Computes the standard projection of a n-gram into document vector space with
        a hardtanh activation. This method corresponds to the function 'T' in the 
        article.
        '''
        non_stand_proj = self.non_stand_projection(batch) 
        bn             = self._custom_batchnorm(non_stand_proj)
        activation     = F.hardtanh(bn)

        return activation
    
    def representation_similarity(self, query, document):
        '''
        Computes the similarity between a query and a document. This method corresponds 
        to the function 'P' in the article.
        '''
#         print('query.is_cuda', query.is_cuda)
#         print('document.is_cuda', query.is_cuda)
        document_emb  = self.doc_emb(document)
        query_proj    = self.stand_projection(query)
        # If we have a single document to match against each query, we have
        # to reshape the tensor to compute a simple dot product.
        # Otherwise, we compute a simple matrix multiplication to match the 
        # query against each document.
        if len(document_emb.shape) == 2:
            document_emb = document_emb.unsqueeze(1)
        if len(query_proj.shape) == 2:
            query_proj = query_proj.unsqueeze(-1)
        dot_product   = torch.bmm(document_emb, query_proj)
#        dot_product   = torch.bmm(document_emb, query_proj.unsqueeze(-1))
        similarity    = torch.sigmoid(dot_product)
        
        return similarity.squeeze()
    
    def forward(self, query, document):
        '''
        Approximates the probability of document given query by uniformly sampling 
        constrastive examples. This method corresponds to the 'P^~' function in the 
        article.
        '''
        # Positive term, this should be maximized as it indicates how similar the
        # correct document is to the query
        pos_repr = self.representation_similarity(query, document)
        
        # Sampling uniformly 'self.neg_sampling_rate' documents to compute the 
        # negative term. We first randomly draw the indices of the documents and 
        # then we compute the similarity with the query.
        device          = document.device
        z               = self.neg_sampling_rate # corresponds to the z variable in 
                                                 # the article
        n_docs          = self.doc_emb.num_embeddings
        neg_sample_size = (query.size(0), z)
        neg_sample      = torch.randint(low = 0, high = n_docs, size = neg_sample_size)
        neg_sample      = neg_sample.to(device)
        neg_repr        = self.representation_similarity(query, neg_sample)
        
        # Probability computation
        positive_term = torch.log(pos_repr)
        negative_term = torch.log(1 - neg_repr).sum(dim = 1)
        proba         = ((z + 1) / (2 * z)) * (z * positive_term + negative_term)
        
        return proba

In [3]:
def loss_function(nvsm, pred, lamb):
    output_term = pred.mean()
    sum_square  = lambda m: (m.weight * m.weight).sum()
    reg_term    = sum_square(nvsm.tok_emb) + \
                  sum_square(nvsm.doc_emb) + \
                  sum_square(nvsm.tok_to_doc)
    loss        = -output_term + (lamb / (2 * pred.shape[0])) * reg_term

    return loss

# Dataset creation

In [4]:
spacy_en = spacy.load('en')

def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
def load_docs(filepaths):
    documents = []
    for filepath in filepaths:
        with open(filepath) as file:
            documents.append(file.read().strip().lower())

    return documents

In [6]:
def tokenize_docs(documents):
    tokenized_documents = [tokenize(doc) for doc in documents]
#     print([len(doc_tok) for doc_tok in tokenized_documents])
    
    return tokenized_documents

In [7]:
def create_vocabulary(tokenized_documents):
    vocabulary    = {token for doc in tokenized_documents for token in doc}
    stoi          = {token : i + 2 for i, token in enumerate(vocabulary)}
    stoi['<PAD>'] = 0
    stoi['<UNK>'] = 1
    itos          = {i : token for token, i in stoi.items()}
    
    return vocabulary, stoi, itos

In [8]:
def create_dataset(tok_docs, stoi, n):
    n_grams      = []
    document_ids = []
    for i, doc in enumerate(tok_docs):
        doc_tok_ids = [stoi[tok] for tok in doc]
        for n_gram in [doc_tok_ids[i : i + n] for i in range(len(doc) - n)]:
            n_grams.append(n_gram)
            document_ids.append(i)
            
    return n_grams, document_ids

In [9]:
def create_pytorch_datasets(n_grams, doc_ids, val_prop = 0.2):
    n_grams_tensor = torch.tensor(n_grams)
    doc_ids_tensor = torch.tensor(doc_ids)
    full_dataset   = TensorDataset(n_grams_tensor, doc_ids_tensor)
    total_size     = len(full_dataset)
    val_size       = round(total_size * val_prop)
    train, val     = random_split(full_dataset, [total_size - val_size, val_size])
    
    return train, val

In [10]:
def train(nvsm, device, optimizer, epochs, train_loader, lamb, print_every):
    for epoch in range(epochs):
        for i, (n_grams, doc_ids) in enumerate(train_loader):
            n_grams    = n_grams.to(device)
            doc_ids    = doc_ids.to(device)
            optimizer.zero_grad()
            pred_proba = nvsm(n_grams, doc_ids)
            loss       = loss_function(nvsm, pred_proba, lamb)
            loss.backward()
            optimizer.step()
            if i % print_every == 0:
                print(f'[{epoch},{i}]: {loss}')

In [11]:
def main():
    filepaths = [
        '../data/raw/language/Word_formation',
        '../data/raw/language/Terminology',    
        '../data/raw/history/Jacobin',
        '../data/raw/history/French_Revolution',
        '../data/raw/math/Game_theory',
        '../data/raw/math/Laplacian_matrix'
    ]
    documents             = load_docs(filepaths)
    tokenized_documents   = tokenize_docs(documents)
    voc, stoi, itos       = create_vocabulary(tokenized_documents)
    n_grams, document_ids = create_dataset(tokenized_documents, stoi, 10)
    train_data, val_data  = create_pytorch_datasets(n_grams, document_ids)
    train_loader          = DataLoader(train_data, batch_size = 10000, shuffle = True)
    device                = torch.device('cuda')
    lamb                  = 1e-3 # regularization weight in the loss
    nvsm                  = NVSM(
        n_doc             = len(tokenized_documents), 
        n_tok             = len(stoi), 
        dim_doc_emb       = 20, 
        dim_tok_emb       = 30,
        neg_sampling_rate = 4,
        pad_token_id      = stoi['<PAD>']
    ).to(device)
    optimizer             = optim.Adam(nvsm.parameters(), lr = 1e-3)
    train(nvsm, device, optimizer, 10, train_loader, lamb, 2)
    
    return filepaths, stoi, nvsm, device

In [12]:
paths, stoi, nvsm, device = main()

[0,0]: 7.684036731719971
[0,2]: 7.464227199554443
[0,4]: 7.105589866638184
[1,0]: 6.881085395812988
[1,2]: 6.722508907318115
[1,4]: 6.523367404937744
[2,0]: 6.288670063018799
[2,2]: 6.131076812744141
[2,4]: 5.858460903167725
[3,0]: 5.680097579956055
[3,2]: 5.619449138641357
[3,4]: 5.463902950286865
[4,0]: 5.313665390014648
[4,2]: 5.237552165985107
[4,4]: 5.092764377593994
[5,0]: 4.98193359375
[5,2]: 4.930790424346924
[5,4]: 4.743294715881348
[6,0]: 4.694338798522949
[6,2]: 4.613807201385498
[6,4]: 4.5076727867126465
[7,0]: 4.454799175262451
[7,2]: 4.385538578033447
[7,4]: 4.313291072845459
[8,0]: 4.229124546051025
[8,2]: 4.153205871582031
[8,4]: 4.130579471588135
[9,0]: 4.05439567565918
[9,2]: 4.035975933074951
[9,4]: 3.9590396881103516


In [13]:
doc_names = [path.split('/')[-1] for path in paths]
doc_names

['Word_formation',
 'Terminology',
 'Jacobin',
 'French_Revolution',
 'Game_theory',
 'Laplacian_matrix']

In [14]:
def create_query_dataset(queries, stoi):
    pad_token         = stoi['<PAD>']
    tokenized_queries = [tokenize(query) for query in queries]
    queries_tok_idx   = [[stoi.get(tok, stoi['<UNK>']) for tok in query] for query in tokenized_queries]
    max_len           = max(len(query) for query in queries_tok_idx)
    padded_queries    = [query + [pad_token] * (max_len - len(query)) for query in queries_tok_idx]
    queries_tensor    = torch.tensor(padded_queries)
    dataset           = TensorDataset(queries_tensor)
    
    return dataset

In [15]:
queries = [
    'violence king louis decapitated',
    'domain language translate'
]

In [16]:
batch_size = 32
query_dataset = create_query_dataset(queries, stoi)
test_loader = DataLoader(query_dataset, batch_size = 32)

In [17]:
document_indices = torch.stack([torch.arange(len(doc_names))] * batch_size)
document_indices = document_indices.to(device)
for (queries,) in test_loader:
    queries = queries.to(device)
    result = nvsm.representation_similarity(queries, document_indices[:queries.shape[0]])
    print(queries)
    print(result)

tensor([[ 7979,  7558,  3498,  5000],
        [ 3051, 10563,  2345,     0]], device='cuda:0')
tensor([[0.0328, 0.3955, 0.1654, 0.9765, 0.8775, 0.4411],
        [0.9298, 0.4050, 0.7378, 0.0940, 0.0907, 0.3873]], device='cuda:0',
       grad_fn=<SqueezeBackward0>)
