In [11]:
import torch
from typing import List
from collections import Counter
import pickle5 as pickle 

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import average_precision_score

from transformers import (DistilBertTokenizerFast, DistilBertModel)

### CLASSES

In [12]:
class Vocab:

    def __init__(self):
        self._tok_counts = Counter()
        self._id_to_tok = {}

    def fit(self, data):
        for sequence in data:
            self._tok_counts.update(sequence)

        self._toks = (["</s>", "<unk>"] +
                      [tok for tok, _ in self._tok_counts.most_common()])
        self._tok_to_id = {tok: i for i, tok in enumerate(self._toks)}
        self._id_to_tok = {i: tok for i, tok in enumerate(self._toks)}

    def __len__(self):
        return len(self._toks)

class EmbedAverages(torch.nn.Module):

    def __init__(self, n_words, dim):

        super().__init__()
        # matrix of wordvector sums
        self.register_buffer("_sum", torch.zeros(n_words, dim))
        self.register_buffer("_ssq", torch.zeros(n_words, dim))
        self.register_buffer("_sum_normed", torch.zeros(n_words, dim))
        self.register_buffer("_counts", torch.zeros(n_words, dtype=torch.long))

    def add(self, ix, vec):
        self._counts[ix] += 1
        self._sum[ix] += vec
        self._ssq[ix] += vec ** 2

class Tokenizer:

    def __init__(self):
        self._t = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    def words(self, sequences: List[str]):
        return [s.split() for s in sequences]

    def __call__(self, sequences: List[str]):
        words = self.words(sequences)
        subw = self._t.batch_encode_plus(words,
                                         is_split_into_words=True,
                                         padding=True)
        return words, subw

In [13]:
def calculate_covariance(context_dict, baroni_set, ft, window):
    covariance = {}

    for word in tqdm(baroni_set):
        total = np.zeros((100,100))

        for c_word in context_dict[word]:
            total += np.outer((ft.get_word_vector(c_word) - ft.get_word_vector(word)), (ft.get_word_vector(c_word) - ft.get_word_vector(word)))

        covariance[word] = (total / (len(context_dict[word]) * window))

    return covariance


In [14]:

def create_word_covariance_matrix(embed_averages, i, word):
    # Extract the mean and variance tensors for the word at index i
    mean_tensor = embed_averages._sum[i] / embed_averages._counts[i]
    var_tensor = (embed_averages._ssq[i] - mean_tensor ** 2) / embed_averages._counts[i]

    # Stack the mean and variance tensors along the second dimension
    mean_var_tensor = torch.stack([mean_tensor, var_tensor], dim=1)

    # Calculate the covariance matrix for the word by taking the outer product of the mean_var_tensor
    # with itself
    cov_matrix = mean_var_tensor.t() @ mean_var_tensor

    return cov_matrix


In [15]:

def calculate_kl(covariance, wordpair):
    # Get the mean vectors and covariance matrices for the two words in the word pair
    mean1 = covariance.get(wordpair[0])[1]
    covariance_matrix1 = covariance.get(wordpair[0])[0]
    mean2 = covariance.get(wordpair[1])[1]
    covariance_matrix2 = covariance.get(wordpair[1])[0]

    # Create PyTorch multivariate normal distributions using the mean vectors and covariance matrices
    p = torch.distributions.multivariate_normal.MultivariateNormal(mean1, covariance_matrix=covariance_matrix1)
    q = torch.distributions.multivariate_normal.MultivariateNormal(mean2, covariance_matrix=covariance_matrix2)

    # Calculate the KL divergence between the two distributions
    kl = torch.distributions.kl.kl_divergence(p, q)

    return kl.item()

In [7]:

def import_baroni(neg_file, pos_file):
    filenames = ["neg_file", "pos_file"]

    for i, file in enumerate([neg_file, pos_file]):
        globals()['results_{}'.format(filenames[i])] = []
        
        with open(file) as f:
            line = f.readline()
            while line:
                globals()['results_{}'.format(filenames[i])].append(line.replace("-n", "").replace("\n", "").strip("").split("\t"))
                line = f.readline()
        f.close()

    baroni = sum(results_neg_file, []) + sum(results_pos_file, [])
    baroni_set = set(baroni)

    return results_neg_file, results_pos_file, baroni, baroni_set


## pipeline


In [34]:
neg_file = "../Data_Shared/eacl2012-data/negative-examples.txtinput"
pos_file = "../Data_Shared/eacl2012-data/positive-examples.txtinput"
results_neg_file, results_pos_file, baroni, baroni_set = import_baroni(neg_file, pos_file)

with open('../distrembed2/covariance_BERT.pickle', 'rb') as f:
        covariance_BERT = pickle.load(f)


first5pairs = {k: covariance_BERT[k] for k in list(covariance_BERT)[:5]}
print(first5pairs)

embavg = torch.load('../data_distrembed/roen.avgs.pt')

seqs = baroni
vocab = Vocab()
tok = Tokenizer()
vocab.fit(tok.words(seqs))


{'</s>': tensor([ 9.5763e-01,  1.4476e-01, -3.2078e-01,  3.7167e-01, -4.2366e-01,
        -7.2217e-01,  3.4849e-01, -6.2491e-01,  5.8231e-01,  9.3976e-02,
         1.1667e-01, -3.9195e-02,  2.1243e-01, -3.4680e-01, -7.7891e-01,
        -3.0265e-01, -9.0638e-02, -1.7158e-01,  2.8353e-01, -8.1263e-02,
         5.7646e-01, -2.1062e-01,  6.4107e-01,  2.1331e-01,  4.7829e-02,
         2.9211e-01, -4.9298e-01, -3.3402e-02, -4.8847e-01, -5.4379e-01,
        -4.1428e-01, -3.6745e-01, -3.2704e-01,  5.1564e-01,  2.1462e-01,
        -1.8343e-01,  4.0449e-01, -1.2654e-01, -4.2281e-01, -4.9294e-01,
        -5.0861e-01,  1.7674e-01, -3.7992e-01,  3.8343e-01,  1.7326e-02,
        -5.0651e-01,  3.9390e-01,  3.7145e-01, -7.1173e-02,  6.7646e-01,
         7.5316e-02,  3.7349e-01, -6.6496e-02,  1.7559e-01,  4.0772e-01,
         2.6465e-01,  3.7479e-01, -5.5124e-01,  7.6674e-02,  8.6606e-02,
         6.2406e-02,  6.1883e-01, -2.4968e-01, -6.2481e-02,  5.4148e-01,
         1.8917e-01, -1.1281e-01, -1.4996e

In [83]:
# after you made the lookup tabel you have to take the row vector from sum matrix 
# corrsponding to the taget word and devide it by the count (this gets you the mean?)

mean_word_vectors = {}

# Iterate over the words in the vocabulary
for key, ix in vocab._tok_to_id.items():
    # Get the average vector for the current word
    vec = embavg._sum[ix]
    count = embavg._counts[ix]
    # Add an entry to the dictionary that maps the current word to its average vector
    mean_word_vectors[key] = vec / count

first5pairs = {k: mean_word_vectors[k] for k in list(mean_word_vectors)[:5]}

print(first5pairs)
# with open('covariance_BERT.pickle', 'wb') as handle:
#     pickle.dump(word_vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

tensor(4556902)
tensor(212182)
tensor(358)
tensor(8)
tensor(259)
tensor(126)
tensor(1552)
tensor(20)
tensor(96)
tensor(74)
tensor(78)
tensor(749)
tensor(1554)
tensor(184)
tensor(608)
tensor(30)
tensor(249)
tensor(21)
tensor(28)
tensor(108)
tensor(126)
tensor(158)
tensor(189)
tensor(368)
tensor(28)
tensor(43)
tensor(73)
tensor(21)
tensor(5)
tensor(57)
tensor(12)
tensor(623)
tensor(296)
tensor(249)
tensor(22)
tensor(13)
tensor(221)
tensor(336)
tensor(118)
tensor(356)
tensor(109)
tensor(291)
tensor(27)
tensor(1179)
tensor(31)
tensor(131)
tensor(93)
tensor(2448)
tensor(2401)
tensor(1051)
tensor(13)
tensor(206)
tensor(1375)
tensor(178)
tensor(10)
tensor(303)
tensor(140)
tensor(44)
tensor(124)
tensor(23)
tensor(71)
tensor(77)
tensor(2956)
tensor(579)
tensor(21)
tensor(112)
tensor(715)
tensor(655)
tensor(50)
tensor(2)
tensor(177)
tensor(68)
tensor(704)
tensor(282)
tensor(23)
tensor(1724)
tensor(1)
tensor(16)
tensor(198)
tensor(49)
tensor(3)
tensor(304)
tensor(0)
tensor(1362)
tensor(39)
tensor

tensor(34)
tensor(16)
tensor(11)
tensor(2)
tensor(11)
tensor(5)
tensor(48)
tensor(10)
tensor(75)
tensor(6)
tensor(16)
tensor(31)
tensor(63)
tensor(1)
tensor(1131)
tensor(15)
tensor(8)
tensor(20)
tensor(4)
tensor(9)
tensor(8)
tensor(120)
tensor(45)
tensor(131)
tensor(78)
tensor(47)
tensor(35)
tensor(11)
tensor(212)
tensor(19)
tensor(258)
tensor(29)
tensor(10)
tensor(4)
tensor(19)
tensor(62)
tensor(29)
tensor(41)
tensor(10)
tensor(36)
tensor(10)
tensor(1)
tensor(13)
tensor(2)
tensor(6)
tensor(15)
tensor(16)
tensor(11)
tensor(88)
tensor(49)
tensor(111)
tensor(156)
tensor(2)
tensor(34)
tensor(40)
tensor(12)
tensor(19)
tensor(47)
tensor(31)
tensor(176)
tensor(30)
tensor(36)
tensor(14)
tensor(263)
tensor(35)
tensor(13)
tensor(3)
tensor(882)
tensor(22)
tensor(36)
tensor(576)
tensor(89)
tensor(34)
tensor(0)
tensor(36)
tensor(2)
tensor(108)
tensor(65)
tensor(60)
tensor(17)
tensor(395)
tensor(18)
tensor(5)
tensor(11)
tensor(397)
tensor(42)
tensor(176)
tensor(26)
tensor(7)
tensor(46)
tensor(109)


In [82]:
wordpair = (results_pos_file + results_neg_file)[0]
print(wordpair)
print(covariance_BERT.get(wordpair[0]))
print(mean_word_vectors.get(wordpair[0]))
# Get the mean vectors and covariance matrices for the two words in the word pair

# covariance_BERT
# mean_word_vectors


mean1 = torch.tensor([covariance_BERT.get(wordpair[0])[1]])
covariance_matrix1 = mean_word_vectors.get(wordpair[0])[0]

mean2 = torch.tensor([covariance_BERT.get(wordpair[1])[1]])
covariance_matrix2 = mean_word_vectors.get(wordpair[1])[0]

# Create PyTorch multivariate normal distributions using the mean vectors and covariance matrices
p = torch.distributions.multivariate_normal.MultivariateNormal(mean1, covariance_matrix=covariance_matrix1)
q = torch.distributions.multivariate_normal.MultivariateNormal(mean2, covariance_matrix=covariance_matrix2)

# Calculate the KL divergence between the two distributions
kl = torch.distributions.kl.kl_divergence(p, q)

['abstraction', 'concept']
tensor([ 1.8056e-01, -3.1582e-02, -4.5247e-01,  6.6344e-02,  1.9978e-01,
        -1.6275e-01, -2.1587e-01,  2.0214e-01,  2.3823e-01, -6.1837e-01,
        -2.5986e-02,  4.1118e-01, -8.4869e-02, -1.0973e-01,  1.9991e-01,
        -1.3315e-01,  1.0551e-01, -3.7477e-02,  9.3630e-02,  3.8005e-02,
         1.3680e-01,  4.4131e-02, -9.8319e-02, -6.1561e-01,  2.2126e-02,
         1.7181e-01, -9.5097e-03, -3.2977e-02, -3.4707e-01,  5.0283e-01,
         6.3303e-02,  8.3258e-02,  1.8985e-01,  8.0960e-02, -2.7074e-01,
         4.3650e-03, -7.4133e-02, -1.7204e-01, -3.5969e-01, -2.3040e-01,
        -1.5746e-01, -1.9614e-01,  2.3147e-01,  3.9109e-02,  2.9004e-01,
        -2.3031e-01, -1.9902e-02, -3.8808e-01,  1.6343e-01,  1.2661e-01,
        -8.6249e-01,  1.2736e-01, -3.3573e-01,  1.3363e-01,  1.4708e-01,
         3.0512e-01,  3.3593e-01, -2.9607e-01,  1.1258e-01, -2.8836e-01,
         5.3712e-01, -5.4290e-02,  1.3202e-01,  2.5307e-01, -1.3117e-01,
         2.2205e-01,  2.

AttributeError: 'numpy.float32' object has no attribute 'dim'

In [79]:

# Calculate the 4x4 covariance matrix for each word


word_cov_matrices = {}

for i in range(len(embavg._counts)):
    word = vocab._id_to_tok[i]
    word_cov_matrix = create_word_covariance_matrix(embavg, i, word)
    mean = torch.mean(embavg._sum[i], dim=0, keepdim=True )
    word_cov_matrices[word] = [word_cov_matrix, mean]

In [57]:
wordpair = (results_pos_file + results_neg_file)[0]
print(wordpair)
print(word_cov_matrices.get(wordpair[0]))
# Get the mean vectors and covariance matrices for the two words in the word pair

mean1 = torch.tensor([word_cov_matrices.get(wordpair[0])[1]])
covariance_matrix1 = word_cov_matrices.get(wordpair[0])[0]

mean2 = torch.tensor([word_cov_matrices.get(wordpair[1])[1]])
covariance_matrix2 = word_cov_matrices.get(wordpair[1])[0]

# Create PyTorch multivariate normal distributions using the mean vectors and covariance matrices
p = torch.distributions.multivariate_normal.MultivariateNormal(mean1, covariance_matrix=covariance_matrix1)
q = torch.distributions.multivariate_normal.MultivariateNormal(mean2, covariance_matrix=covariance_matrix2)

# Calculate the KL divergence between the two distributions
kl = torch.distributions.kl.kl_divergence(p, q)

['abstraction', 'concept']
[tensor([[  88.5581, -111.9965],
        [-111.9965,  490.1674]]), tensor(-0.0877)]


ValueError: covariance_matrix must be at least two-dimensional, with optional leading batch dimensions

### dataframe

In [None]:

# baroni_pos_subset, baroni_neg_subset = create_combined_subset(word_cov_matrices, results_neg_file, results_pos_file, combined_set)

baroni_subset_label = []

for i in results_pos_file:
    baroni_subset_label.append([i, 1])

for i in results_neg_file:
    baroni_subset_label.append([i, 0])

# MAKE DATAFRAME
df1 = pd.DataFrame(baroni_subset_label, columns =['Wordpair', 'True label'])

### calculate KL and COS

In [None]:

# CALCULATE KL and COS
baroni_subset_kl = []
baroni_subset_cos = []

for wordpair in tqdm((results_pos_file + results_neg_file)):
    baroni_subset_kl.append(calculate_kl(word_cov_matrices, wordpair))
    # baroni_subset_cos.append(cosine_similarity(ft.get_word_vector(wordpair[0]).numpy(), ft.get_word_vector(wordpair[1]).numpy()))

df1['KL score'] = baroni_subset_kl
df1['COS score'] = baroni_subset_cos

with open('df1.pickle', 'wb') as handle:
    pickle.dump(df1, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(df1)
# print("COS AP: ", average_precision_score(df1["True label"], df1["COS score"]))
# print("KL AP: ", average_precision_score(df1["True label"], df1["KL score"]))