In [None]:
import torch
from typing import List
from collections import Counter
import pickle5 as pickle 
from tqdm import trange

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import average_precision_score

from transformers import (DistilBertTokenizerFast, DistilBertModel)

### CLASSES

In [None]:

class Vocab:

    def __init__(self):
        self._tok_counts = Counter()
        self._id_to_tok = {}

    def fit(self, data, word_list):
        for sequence in data:
            self._tok_counts.update([tok for tok in sequence if tok in word_list])

        self._toks = (["</s>", "<unk>"] +
                      [tok for tok, _ in self._tok_counts.most_common()])
        self._tok_to_id = {tok: i for i, tok in enumerate(self._toks)}

    def __len__(self):
        return len(self._toks)
    
class Tokenizer:

    def __init__(self):
        self._t = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    def words(self, sequences: List[str]):
        return [s.split() for s in sequences]

    def __call__(self, sequences: List[str]):
        words = self.words(sequences)
        subw = self._t.batch_encode_plus(words,
                                         is_split_into_words=True,
                                         padding=True)
        return words, subw

class EmbedAverages(torch.nn.Module):
    def __init__(self, n_words, dim):
        super().__init__()
        # matrix of wordvector sums
        self.register_buffer("_sum", torch.zeros(n_words, dim))
        self.register_buffer("_counts", torch.zeros(n_words, dtype=torch.long))
        self.register_buffer("_cov", torch.zeros(n_words, dim, dim))
    
    def add(self, ix, vec):
        self._counts[ix] += 1
        self._sum[ix] += vec
        self._cov[ix] += vec.reshape([len(vec), 1]) @ vec.reshape([1, len(vec)])
    
    def get_mean_covariance(self, ix):
#         print("self._counts[ix]", self._counts[ix])
#         print("self._sum[ix]", self._sum[ix])
        
        mean = self._sum[ix] / self._counts[ix]
        d = len(mean)
        cov = self._cov[ix] / self._counts[ix] - mean.reshape([d, 1])  @ mean.reshape([1, d])
        cov = .001 * torch.eye(d) + cov
        return mean, cov

In [3]:

def calculate_kl(wordpair):
    # Get the mean vectors and covariance matrices for the two words in the word pair
    mean1, covariance_matrix1 = embavg.get_mean_covariance(vocab._tok_to_id.get(wordpair[0])) 
    mean2, covariance_matrix2 = embavg.get_mean_covariance(vocab._tok_to_id.get(wordpair[1])) 
    
    # Create PyTorch multivariate normal distributions using the mean vectors and covariance matrices
    p = torch.distributions.multivariate_normal.MultivariateNormal(mean1, covariance_matrix=covariance_matrix1)
    q = torch.distributions.multivariate_normal.MultivariateNormal(mean2, covariance_matrix=covariance_matrix2)

    # Calculate the KL divergence between the two distributions
    kl = torch.distributions.kl.kl_divergence(p, q)

    return kl.item()

In [4]:

def import_baroni(neg_file, pos_file):
    filenames = ["neg_file", "pos_file"]

    for i, file in enumerate([neg_file, pos_file]):
        globals()['results_{}'.format(filenames[i])] = []
        
        with open(file) as f:
            line = f.readline()
            while line:
                globals()['results_{}'.format(filenames[i])].append(line.replace("-n", "").replace("\n", "").strip("").split("\t"))
                line = f.readline()
        f.close()

    baroni = sum(results_neg_file, []) + sum(results_pos_file, [])
    baroni_set = set(baroni)

    return results_neg_file, results_pos_file, baroni, baroni_set


In [5]:
# Open the file in read mode
with open("../data_distrembed/roen.vocab", "r") as f:
    # Read the contents of the file
    contents = f.read()

print(len(contents))  # prints the contents of the file

5163


## pipeline


In [56]:
neg_file = "../Data_Shared/eacl2012-data/negative-examples.txtinput"
pos_file = "../Data_Shared/eacl2012-data/positive-examples.txtinput"
results_neg_file, results_pos_file, baroni, baroni_set = import_baroni(neg_file, pos_file)

with open('../Data_Shared/wiki_subtext_preprocess.pickle', 'rb') as handle:
        seqs = pickle.load(handle)

import ast
  
# reading the data from the file
with open('../Data_shared/wiki_subset.txt') as f:
    data = f.read()
      
# reconstructing the data as a dictionary
# wikidata = ast.literal_eval(data)
seqs = seqs[:100000]

tok = Tokenizer()
vocab = Vocab()
vocab.fit(tok.words(seqs), baroni)


In [54]:
vocab._tok_to_id

{'</s>': 0,
 '<unk>': 1,
 'people': 2,
 'name': 3,
 'work': 4,
 'football': 5,
 'end': 6,
 'set': 7,
 'final': 8,
 'member': 9,
 'area': 10,
 'player': 11,
 'game': 12,
 'military': 13,
 'school': 14,
 'company': 15,
 'song': 16,
 'system': 17,
 'son': 18,
 'title': 19,
 'music': 20,
 'building': 21,
 'official': 22,
 'gas': 23,
 'man': 24,
 'term': 25,
 'water': 26,
 'male': 27,
 'record': 28,
 'professional': 29,
 'father': 30,
 'book': 31,
 'club': 32,
 'district': 33,
 'day': 34,
 'information': 35,
 'show': 36,
 'point': 37,
 'right': 38,
 'army': 39,
 'site': 40,
 'church': 41,
 'business': 42,
 'politician': 43,
 'playing': 44,
 'country': 45,
 'human': 46,
 'coach': 47,
 'wife': 48,
 'race': 49,
 'basketball': 50,
 'community': 51,
 'science': 52,
 'daughter': 53,
 'light': 54,
 'house': 55,
 'range': 56,
 'ship': 57,
 'performance': 58,
 'election': 59,
 'list': 60,
 'female': 61,
 'president': 62,
 'animal': 63,
 'attack': 64,
 'black': 65,
 'opening': 66,
 'ten': 67,
 'offic

In [26]:
embavg = torch.load('../data_distrembed/first100000.avgs.pt')

In [44]:
print(len(vocab._tok_counts))
print(vocab._tok_counts)
for key, item in vocab._tok_counts.items():
    if key not in baroni:
        print(key, item)

908
Counter({'people': 434, 'name': 248, 'work': 218, 'football': 206, 'end': 196, 'set': 191, 'final': 184, 'member': 181, 'area': 180, 'player': 180, 'game': 168, 'military': 167, 'school': 159, 'company': 152, 'song': 146, 'system': 146, 'son': 145, 'title': 140, 'music': 132, 'building': 131, 'official': 122, 'gas': 120, 'man': 119, 'term': 118, 'water': 117, 'male': 115, 'record': 109, 'professional': 108, 'father': 101, 'book': 100, 'club': 99, 'district': 96, 'day': 93, 'information': 90, 'show': 89, 'point': 89, 'right': 83, 'army': 82, 'site': 82, 'church': 81, 'business': 79, 'politician': 76, 'playing': 75, 'country': 73, 'human': 72, 'coach': 71, 'wife': 71, 'race': 71, 'basketball': 70, 'community': 69, 'science': 69, 'daughter': 69, 'light': 69, 'house': 68, 'range': 66, 'ship': 66, 'performance': 65, 'election': 64, 'list': 64, 'female': 64, 'president': 64, 'animal': 62, 'attack': 60, 'black': 60, 'opening': 60, 'ten': 59, 'officer': 58, 'collection': 57, 'road': 57, 'm

### dataframe

In [45]:


baroni_pos_subset = [x for x in results_pos_file if x[0] in vocab._tok_counts and x[1] in vocab._tok_counts]
baroni_neg_subset = [x for x in results_neg_file if x[0] in vocab._tok_counts and x[1] in vocab._tok_counts]



In [50]:

# baroni_pos_subset, baroni_neg_subset = create_combined_subset(word_cov_matrices, results_neg_file, results_pos_file, combined_set)

baroni_subset_label = []

for i in baroni_pos_subset:
    baroni_subset_label.append([i, 1])

for i in baroni_neg_subset:
    baroni_subset_label.append([i, 0])

# MAKE DATAFRAME
df1 = pd.DataFrame(baroni_subset_label, columns =['Wordpair', 'True label'])
print(df1)

                    Wordpair  True label
0     [abstraction, concept]           1
1           [acid, chemical]           1
2          [adjective, word]           1
3      [aesthetic, doctrine]           1
4       [affection, feeling]           1
...                      ...         ...
1298         [worker, pilot]           0
1299     [workplace, studio]           0
1300     [writer, dramatist]           0
1301          [writer, poet]           0
1302       [yellow, protein]           0

[1303 rows x 2 columns]


In [58]:
baroni_subset_cos = []

for wordpair in (baroni_pos_subset + baroni_neg_subset):
    A = embavg._sum(vocab._tok_to_id.get(wordpair[0]))
    B = embavg._sum(vocab._tok_to_id.get(wordpair[1]))
    baroni_subset_cos.append(cosine_similarity(A, B))
    
    

TypeError: 'Tensor' object is not callable

### calculate KL and COS

In [55]:

# CALCULATE KL and COS
baroni_subset_kl = []
baroni_subset_cos = []

for wordpair in tqdm((baroni_pos_subset + baroni_neg_subset)):
    baroni_subset_kl.append(calculate_kl(wordpair))
    # baroni_subset_cos.append(cosine_similarity(ft.get_word_vector(wordpair[0]).numpy(), ft.get_word_vector(wordpair[1]).numpy()))

df1['KL score'] = baroni_subset_kl
# df1['COS score'] = baroni_subset_cos

# with open('df1.pickle', 'wb') as handle:
#     pickle.dump(df1, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(df1)
# print("COS AP: ", average_precision_score(df1["True label"], df1["COS score"]))
print("KL AP: ", average_precision_score(df1["True label"], -df1["KL score"]))

  0%|          | 0/1303 [00:00<?, ?it/s]


IndexError: index 786 is out of bounds for dimension 0 with size 672