In [27]:
import pandas as pd
import csv
import torch

# The first line of the embeddings file is a "<vocab size> <dimension>" header.
with open('gensim_w2v.txt') as f:
    first_line = f.readline().strip()
vsize, dim = [int(field) for field in first_line.split(' ')]
    
# vsize=1779969 # for w2v.txt
# vsize=829609 # for gensim_w2v.txt


def load_embeddings(path, n_words=None):
    """Load a word2vec-style text file into a matrix plus vocabulary maps.

    The first line of the file is read as a ``<count> <dimension>`` header and
    is skipped when the rows are parsed. Every following line is a token
    followed by space-separated values.

    Args:
        path: Path of the embeddings text file.
        n_words: Number of leading rows whose tokens make up the vocabulary.
            Defaults to the count given in the file header.

    Returns:
        A ``(matrix, word_to_index, index_to_word)`` tuple, where ``matrix``
        holds the values of *all* parsed rows, ``index_to_word`` lists the
        tokens of the first ``n_words`` rows, and ``word_to_index`` maps each
        of those tokens to its position in ``index_to_word``.
    """
    print('Loading word embeddings...')
    # Take the vocabulary size and dimension from the header instead of
    # relying on globals or a hard-coded 300-dimensional layout.
    with open(path, encoding='iso-8859-1') as header_file:
        header_fields = header_file.readline().split()
    header_count, dim = int(header_fields[0]), int(header_fields[1])
    if n_words is None:
        n_words = header_count

    # keep_default_na=False keeps tokens such as "null" or "nan" as strings;
    # QUOTE_NONE keeps quote characters inside tokens literal.
    words = pd.read_csv(path, sep=" ", index_col=0, skiprows=[0],
                        na_values=None, keep_default_na=False, header=None,
                        quoting=csv.QUOTE_NONE,
                        encoding='iso-8859-1')
    print('Loaded.')
    # A trailing space on each line yields one extra, empty column
    # (labelled dim + 1 because column 0 became the index); drop it.
    if len(words.columns) == dim + 1:
        words = words.drop(columns=[dim + 1])
    matrix = words.values
    index_to_word = list(words[:n_words].index)
    word_to_index = {
        word: ind for ind, word in enumerate(index_to_word)
    }
    return matrix, word_to_index, index_to_word

In [4]:
# Parse gensim_w2v.txt into the full value matrix and the vocabulary maps.
matrix, word_to_index, index_to_word = load_embeddings("gensim_w2v.txt")

Loading word embeddings...
Loaded.


In [17]:
# Reserve index `vsize` (one past the last vocabulary row) for out-of-vocabulary
# words. NOTE: this mutates word_to_index in place.
word_to_index["<unknown>"] = vsize

In [23]:
# The first `vsize` rows are treated as input embeddings, the remaining rows as
# output embeddings. Each block gets its column-wise mean appended as one extra
# row (index `vsize`), which serves as the vector for unknown words.
in_matrix = torch.tensor(matrix[:vsize])
mean_in = in_matrix.mean(dim=0, keepdim=True)
in_matrix = torch.cat((in_matrix, mean_in), dim=0)

out_matrix = torch.tensor(matrix[vsize:])
mean_out = out_matrix.mean(dim=0, keepdim=True)
out_matrix = torch.cat((out_matrix, mean_out), dim=0)

In [24]:
# One embedding table per role, each with vsize + 1 rows (vocabulary plus the
# extra mean row), then overwrite the random initial weights with the loaded
# values.
in_embedding = torch.nn.Embedding(num_embeddings=vsize + 1, embedding_dim=dim)
out_embedding = torch.nn.Embedding(num_embeddings=vsize + 1, embedding_dim=dim)
in_embedding.weight.data.copy_(in_matrix)
out_embedding.weight.data.copy_(out_matrix)


tensor([[-0.0032, -0.2789, -0.1183,  ..., -0.1143, -0.0386, -0.0409],
        [-0.4922, -0.0562,  0.0411,  ..., -0.2282,  0.1516,  0.0028],
        [-0.0645, -0.2697,  0.1118,  ...,  0.0509, -0.2716, -0.0535],
        ...,
        [ 0.0850, -0.0629, -0.2115,  ..., -0.2131,  0.0382, -0.0792],
        [ 0.1816, -0.0007, -0.1396,  ..., -0.0863,  0.0404, -0.0868],
        [ 0.1386,  0.0686, -0.1444,  ..., -0.1766,  0.0639, -0.0842]])

In [28]:
import csv
import torch
import pandas as pd

class Word2Vec:
    """
    Class for using global word embeddings.
    PMI estimated as inner product of
    input embedding of w1 with output embedding of w2.

    The embeddings file must start with a ``<vocab size> <dimension>`` header
    line. The first ``vocab size`` data rows are used as input embeddings and
    all remaining rows as output embeddings. One extra row, the mean of each
    block, is appended at index ``vocabsize`` and is used for words that are
    not in the vocabulary.
    """
    def __init__(self, device, model_spec, path):
        """Read the embeddings at ``path`` and build both embedding tables.

        Args:
            device: Torch device (or device string) the tables are placed on.
            model_spec: Label for the model; only used in the log message.
            path: Path of the embeddings text file.
        """
        self.device = device
        # Same encoding as the pandas read below, so the header is decoded
        # the same way regardless of the platform's default encoding.
        with open(path, encoding='iso-8859-1') as f:
            first_line = f.readline().strip()
            vsize, dim = map(int, first_line.split(' '))
        self.vocabsize = vsize
        self.embedding_dim = dim
        print(f'Loading word embeddings from {path} ...')
        matrix, self.word_to_index = self._load_embeddings(path)
        self.in_embedding, self.out_embedding = self._prepare_embeddings(matrix)
        print('Loaded.')
        print(f"Embedding model '{model_spec}' initialized on {device}.")

    def _load_embeddings(self, path):
        """Parse the data rows of the file.

        Returns:
            ``(matrix, word_to_index)``: the values of all rows, and a map
            from each token of the first ``self.vocabsize`` rows to its row
            index.
        """
        # keep_default_na=False keeps tokens such as "null" or "nan" as
        # strings; QUOTE_NONE keeps quote characters inside tokens literal.
        words = pd.read_csv(path, sep=" ", index_col=0, skiprows=[0],
                            na_values=None, keep_default_na=False, header=None,
                            quoting=csv.QUOTE_NONE,
                            encoding='iso-8859-1')
        # A trailing space on each line produces one extra, empty column
        # (labelled embedding_dim + 1 since column 0 is the index); drop it.
        if len(words.columns) == self.embedding_dim+1:
            words = words.drop(columns=[self.embedding_dim+1])
        matrix = words.values
        index_to_word = list(words[:self.vocabsize].index)
        word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }
        return matrix, word_to_index

    def _prepare_embeddings(self, matrix):
        """Build the input and output ``torch.nn.Embedding`` tables.

        Args:
            matrix: Array-like of rows; the first ``self.vocabsize`` rows are
                the input embeddings and the rest the output embeddings.

        Returns:
            ``(in_embedding, out_embedding)``, each with
            ``self.vocabsize + 1`` rows on ``self.device``.
        """
        # Add on a mean vector embedding for unks
        in_matrix = torch.tensor(matrix[:self.vocabsize])
        mean_in = torch.mean(in_matrix, 0).unsqueeze(0)
        in_matrix = torch.cat((in_matrix, mean_in))
        out_matrix = torch.tensor(matrix[self.vocabsize:])
        mean_out = torch.mean(out_matrix, 0).unsqueeze(0)
        out_matrix = torch.cat((out_matrix, mean_out))

        in_embedding = torch.nn.Embedding(
            self.vocabsize+1, self.embedding_dim).to(self.device)
        out_embedding = torch.nn.Embedding(
            self.vocabsize+1, self.embedding_dim).to(self.device)
        # copy_ casts the loaded values to the tables' own dtype.
        in_embedding.weight.data.copy_(in_matrix.to(self.device))
        out_embedding.weight.data.copy_(out_matrix.to(self.device))
        return in_embedding, out_embedding

    def _encode(self, ptb_tokenlist):
        """Map tokens to row indices; unknown tokens map to ``self.vocabsize``."""
        sentence_as_ids = [
            self.word_to_index.get(word, self.vocabsize)
            for word in ptb_tokenlist]
        return sentence_as_ids

    def ptb_tokenlist_to_pmi_matrix(
            self, ptb_tokenlist, add_special_tokens=True,
            pad_left=None, pad_right=None, verbose=True):
        """Maps tokenlist to PMI matrix.

        Entry ``[i, j]`` of the result is the inner product of the input
        embedding of token ``i`` with the output embedding of token ``j``.

        TODO: ``add_special_tokens``, ``pad_left``, ``pad_right`` and
        ``verbose`` are accepted but ignored, so that main.py needs no custom
        call for this model.

        Returns:
            ``(pmi_matrix, pseudo_loglik)``; ``pseudo_loglik`` is always 0
            (a placeholder). An empty token list yields a 0x0 matrix.
        """

        sentence_as_ids = self._encode(ptb_tokenlist)
        # Force an integer dtype: torch.tensor([]) would otherwise be float32,
        # which nn.Embedding rejects as an index tensor.
        sentence_as_ids = torch.tensor(
            sentence_as_ids, dtype=torch.long).to(self.device)
        with torch.no_grad():
            in_sentence = self.in_embedding(sentence_as_ids)
            out_sentence = self.out_embedding(sentence_as_ids)
            pmi_matrix = torch.matmul(in_sentence, out_sentence.T)
        pseudo_loglik = 0  # meaningless for now
        return pmi_matrix, pseudo_loglik

In [29]:
# Build the model on CPU; 'w2v' is only a label echoed in the log message.
MODEL=Word2Vec('cpu', 'w2v', 'gensim_w2v.txt')

Loading word embeddings from gensim_w2v.txt ...
Loaded.
Embedding model 'w2v' initialized on cpu.


In [30]:
# Small sanity-check sentence.
ptb_tokenlist = ["queen","king","zoo","zoos"]

# add_special_tokens, verbose, pad_left and pad_right are accepted by
# Word2Vec.ptb_tokenlist_to_pmi_matrix but currently ignored.
pmi_matrix, pseudo_loglik = MODEL.ptb_tokenlist_to_pmi_matrix(
    ptb_tokenlist, add_special_tokens=True, verbose=True,
    pad_left=None, pad_right=None)

In [31]:
# Entry [i, j] is the inner product of the input embedding of token i with the
# output embedding of token j.
pmi_matrix

tensor([[ 1.6373,  0.7513, -0.6049, -0.9455],
        [ 0.5922,  0.9947, -0.8691, -1.0358],
        [-0.5677, -0.6622,  1.4823,  1.2766],
        [-0.8843, -0.7736,  1.0375,  1.9962]])