In [74]:
import torch
from typing import List
from collections import Counter
import pickle5 as pickle 
from tqdm import trange

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import average_precision_score

from transformers import (DistilBertTokenizerFast, DistilBertModel)

### CLASSES

In [75]:

class Vocab:

    def __init__(self):
        self._tok_counts = Counter()
        self._id_to_tok = {}

    def fit(self, data, word_list):
        for sequence in data:
            self._tok_counts.update([tok for tok in sequence if tok in word_list])

        self._toks = (["</s>", "<unk>"] +
                      [tok for tok, _ in self._tok_counts.most_common()])
        self._tok_to_id = {tok: i for i, tok in enumerate(self._toks)}

    def __len__(self):
        return len(self._toks)
    
class Tokenizer:

    def __init__(self):
        self._t = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    def words(self, sequences: List[str]):
        return [s.split() for s in sequences]

    def __call__(self, sequences: List[str]):
        words = self.words(sequences)
        subw = self._t.batch_encode_plus(words,
                                         is_split_into_words=True,
                                         padding=True)
        return words, subw

class EmbedAverages(torch.nn.Module):
    def __init__(self, n_words, dim):
        super().__init__()
        # matrix of wordvector sums
        self.register_buffer("_sum", torch.zeros(n_words, dim))
        self.register_buffer("_counts", torch.zeros(n_words, dtype=torch.long))
        self.register_buffer("_cov", torch.zeros(n_words, dim, dim))
    
    def add(self, ix, vec):
        self._counts[ix] += 1
        self._sum[ix] += vec
        self._cov[ix] += vec.reshape([len(vec), 1]) @ vec.reshape([1, len(vec)])
    
    def get_mean_covariance(self, ix):
#         print("self._counts[ix]", self._counts[ix])
#         print("self._sum[ix]", self._sum[ix])
        
        mean = self._sum[ix] / self._counts[ix]
        d = len(mean)
        cov = self._cov[ix] / self._counts[ix] - mean.reshape([d, 1])  @ mean.reshape([1, d])
        cov = .001 * torch.eye(d) + cov
        return mean, cov

In [76]:

def calculate_kl(wordpair):
    # Get the mean vectors and covariance matrices for the two words in the word pair
    mean1, covariance_matrix1 = embavg.get_mean_covariance(vocab._tok_to_id.get(wordpair[0])) 
    mean2, covariance_matrix2 = embavg.get_mean_covariance(vocab._tok_to_id.get(wordpair[1])) 
    
    # Create PyTorch multivariate normal distributions using the mean vectors and covariance matrices
    p = torch.distributions.multivariate_normal.MultivariateNormal(mean1, covariance_matrix=covariance_matrix1)
    q = torch.distributions.multivariate_normal.MultivariateNormal(mean2, covariance_matrix=covariance_matrix2)

    # Calculate the KL divergence between the two distributions
    kl = torch.distributions.kl.kl_divergence(p, q)

    return kl.item()

In [77]:
def cosine_similarity(a, b):
    nominator = torch.dot(a, b)
    
    a_norm = torch.sqrt(torch.sum(a**2))
    b_norm = torch.sqrt(torch.sum(b**2))
    
    denominator = a_norm * b_norm
    
    cosine_similarity = nominator / denominator
    
    return cosine_similarity

In [78]:

def import_baroni(neg_file, pos_file):
    filenames = ["neg_file", "pos_file"]

    for i, file in enumerate([neg_file, pos_file]):
        globals()['results_{}'.format(filenames[i])] = []
        
        with open(file) as f:
            line = f.readline()
            while line:
                globals()['results_{}'.format(filenames[i])].append(line.replace("-n", "").replace("\n", "").strip("").split("\t"))
                line = f.readline()
        f.close()

    baroni = sum(results_neg_file, []) + sum(results_pos_file, [])
    baroni_set = set(baroni)

    return results_neg_file, results_pos_file, baroni, baroni_set


In [103]:
# Open the file in read mode
with open("../data_distrembed/roen.vocab", "r") as f:
    # Read the contents of the file
    contents = f.read()

print(len(contents))  # prints the contents of the file

5163


## pipeline


In [183]:
neg_file = "../Data_Shared/eacl2012-data/negative-examples.txtinput"
pos_file = "../Data_Shared/eacl2012-data/positive-examples.txtinput"
results_neg_file, results_pos_file, baroni, baroni_set = import_baroni(neg_file, pos_file)

with open('../Data_Shared/wiki_subtext_preprocess.pickle', 'rb') as handle:
        seqs = pickle.load(handle)

import ast
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')

  
# reading the data from the file
with open('../Data_shared/wiki_subset.txt') as f:
    data = f.read()
      
# reconstructing the data as a dictionary
wikidata = ast.literal_eval(data)

seqs = [sentence.strip() for seq in wikidata for sentence in seq.split(".")]
tok = Tokenizer()
vocab = Vocab()
vocab.fit(tok.words(seqs), baroni)


In [186]:
import ast
with open('../Data_shared/wiki_subset.txt') as f:
    data = f.read()

wikidata = ast.literal_eval(data)

wikidata = wikidata["text"][:5000]   

max_length = 200

wikidata = [sentence[:max_length].strip() if len(sentence.split()) > max_length else sentence.strip()
        for seq in tqdm(wikidata)
        for sentence in seq.split(".")]


100%|██████████| 5000/5000 [00:00<00:00, 14664.77it/s]


In [203]:
for i in wikidata:
    if len(i.split()) > 198:
        print(len(i.split()))

199
199


In [82]:
embavg = torch.load('../data_distrembed/first100000.avgs.pt')

In [85]:
print(len(vocab._tok_counts))
# print(vocab._tok_counts)
for key, item in vocab._tok_counts.items():
    if key not in baroni:
        print(key, item)

670


### dataframe

In [86]:


baroni_pos_subset = [x for x in results_pos_file if x[0] in vocab._tok_counts and x[1] in vocab._tok_counts]
baroni_neg_subset = [x for x in results_neg_file if x[0] in vocab._tok_counts and x[1] in vocab._tok_counts]



In [87]:

# baroni_pos_subset, baroni_neg_subset = create_combined_subset(word_cov_matrices, results_neg_file, results_pos_file, combined_set)

baroni_subset_label = []

for i in baroni_pos_subset:
    baroni_subset_label.append([i, 1])

for i in baroni_neg_subset:
    baroni_subset_label.append([i, 0])

# MAKE DATAFRAME
df1 = pd.DataFrame(baroni_subset_label, columns =['Wordpair', 'True label'])
print(df1)

                 Wordpair  True label
0        [acid, chemical]           1
1    [affection, feeling]           1
2     [aircraft, vehicle]           1
3         [alpha, symbol]           1
4        [antiquity, era]           1
..                    ...         ...
732     [woman, mistress]           0
733         [wood, maple]           0
734          [work, bird]           0
735   [writer, dramatist]           0
736        [writer, poet]           0

[737 rows x 2 columns]


In [89]:
baroni_subset_cos = []

for wordpair in (baroni_pos_subset + baroni_neg_subset):
    A = embavg._sum[vocab._tok_to_id.get(wordpair[0])]
    B = embavg._sum[vocab._tok_to_id.get(wordpair[1])]
    baroni_subset_cos.append(torch.cosine_similarity(A, B))
    
    

### calculate KL and COS

In [94]:

# CALCULATE KL and COS
baroni_subset_kl = []
baroni_subset_cos = []

for wordpair in tqdm((baroni_pos_subset + baroni_neg_subset)):
    baroni_subset_kl.append(calculate_kl(wordpair))
    baroni_subset_cos.append(cosine_similarity(embavg._sum[vocab._tok_to_id.get(wordpair[0])], 
                                               embavg._sum[vocab._tok_to_id.get(wordpair[1])]))

df1['KL score'] = baroni_subset_kl
df1['COS score'] = baroni_subset_cos

# with open('df1.pickle', 'wb') as handle:
#     pickle.dump(df1, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(df1)
print("COS AP: ", average_precision_score(df1["True label"], df1["COS score"]))
print("KL AP: ", average_precision_score(df1["True label"], -df1["KL score"]))

100%|██████████| 737/737 [01:02<00:00, 11.71it/s]

                 Wordpair  True label      KL score       COS score
0        [acid, chemical]           1  22588.404297  tensor(0.7882)
1    [affection, feeling]           1  24089.642578  tensor(0.7085)
2     [aircraft, vehicle]           1  22337.914062  tensor(0.7861)
3         [alpha, symbol]           1  47081.132812  tensor(0.6093)
4        [antiquity, era]           1  34338.492188  tensor(0.6267)
..                    ...         ...           ...             ...
732     [woman, mistress]           0  33531.156250  tensor(0.6877)
733         [wood, maple]           0  25387.968750  tensor(0.7393)
734          [work, bird]           0  37608.054688  tensor(0.6506)
735   [writer, dramatist]           0  50024.500000  tensor(0.6341)
736        [writer, poet]           0  19814.851562  tensor(0.8291)

[737 rows x 4 columns]
COS AP:  0.6932439050975777
KL AP:  0.6869634675826695



