In [1]:
import os
from itertools import product
from os.path import join

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
class WordEmbeddings():
    """
    provides uniform access to word vectors (word2vec, GloVe, ConceptNet Numberbatch)

    The vectors are loaded once on construction through read_embeddings and
    stored in self.dict as a plain mapping word -> vector.
    """

    # embedding types whose lookup keys are lowercased in __getitem__
    _LOWERCASE_TYPES = ('word2vec', 'glove', 'conceptnet', 'conceptnetnb')

    def __init__(self, embedding_type, base_dir='./word_vectors'):
        """
        load the vectors for the given embedding type
        input:
            - embedding_type: name passed on to read_embeddings
            - base_dir: directory that contains the word vector files
        """
        self.embedding_type = embedding_type
        self.word_vectors, self.vector_labels = read_embeddings(embedding_type, base_dir)
        self.dict = make_embedding_map(self.word_vectors, self.vector_labels)

    def __getitem__(self, key):
        """
        dict-like access to word vectors

        Returns the vector stored for key, or None when the word is unknown
        (no KeyError is raised).
        """
        if self.embedding_type.lower() in self._LOWERCASE_TYPES:
            # lowercase key: no capitals in word2vec, GloVe and ConceptNet Numberbatch
            key = key.lower()
        return self.dict.get(key, None)

    def get_cosine_similarity(self, w1, w2):
        """
        cosine similarity between the vectors of two words

        Returns a Python float, or None (after printing a message) as soon as
        one of the two words has no vector. w1 is checked before w2.
        """
        row_vectors = []
        for word in (w1, w2):
            vector = self[word]
            if vector is None:
                print(f'no vectors for word {word}')
                return None
            # cosine_similarity expects 2D input: one row per sample
            row_vectors.append(vector.reshape(1, -1))

        return cosine_similarity(*row_vectors).item()

    def __len__(self):
        """
        number of words for which a vector is available
        """
        # self.dict is the word -> vector map; self.__dict__ would only count
        # the instance attributes
        return len(self.dict)


def embeddings_reader(file_name, encoding=None):
    """
    generator function for reading in word vector files

    Every non-blank line is split on whitespace: the first token is the word,
    the remaining tokens are parsed as float32 coefficients. Lines are not
    interpreted any further, so a header line (if the file has one) is yielded
    like any other line.

    input:
        - file_name: path of the text file to read
        - encoding: text encoding passed to open (None = platform default)
    yields:
        - (word, coefs) with coefs as a 1D float32 numpy array
    """
    # the context manager closes the file once the generator is exhausted or closed
    with open(file_name, "r", encoding=encoding) as vector_file:
        for line in tqdm(vector_file):
            values = line.split()
            if not values:
                # blank line: there is no word token to read
                continue
            yield values[0], np.asarray(values[1:], dtype='float32')


def read_embeddings(embedding_type, base_dir='./word_vectors'):
    """
    read in word embedding files
    input:
        - type of word embeddings (word2vec, GloVe, ConceptNet Numberbatch),
          matched case-insensitively
        - directory where files are stored (defaults to ./word_vectors)
    output:
        - word vectors and a list with the corresponding labels; the GloVe
          vectors are stacked into one numpy.ndarray, for word2vec and
          ConceptNet the vectors are returned as a list of 1D arrays
    raises:
        - ValueError for an unknown embedding type
    """

    kind = embedding_type.casefold()

    word_vectors = list()
    vector_labels = list()

    if kind == 'word2vec':

        w2w_path = join(base_dir, 'ger_word2vec_vectors.txt')

        for word, coefs in embeddings_reader(w2w_path, encoding='utf-8'):
            word_vectors.append(coefs)
            # drop the first two and the last character of the label
            # NOTE(review): presumably the file stores labels as b'...' -- confirm
            vector_labels.append(word[2:-1])

    elif kind == 'glove':

        glove_path = join(base_dir, 'ger_glove_vectors.txt')

        for word, coefs in embeddings_reader(glove_path, encoding='utf-8'):
            word_vectors.append(coefs)
            vector_labels.append(word)

        # convert to np array
        word_vectors = np.vstack(word_vectors)

    elif kind in ['conceptnetnb', 'conceptnet']:

        cnnb_path = join(base_dir, 'conceptnet_vectors.txt')

        prefix = '/c/de/'

        for word, coefs in embeddings_reader(cnnb_path, encoding='utf-8'):
            # keep only entries carrying the prefix, all other lines are skipped
            if word.startswith(prefix):
                word_vectors.append(coefs)
                # strip the leading prefix only; str.replace would also remove
                # later occurrences inside the label
                vector_labels.append(word[len(prefix):])

    else:
        raise ValueError(
            f'invalid embedding type {embedding_type}: Choose either word2vec, GloVe or ConceptNetNB!')

    return word_vectors, vector_labels


def make_embedding_map(word_vectors, vector_labels):
    """
    build a lookup table word -> vector from parallel sequences

    Labels and vectors are paired by position. If a label occurs more than
    once, the vector of its last occurrence is kept.
    """
    return {label: vector for label, vector in zip(vector_labels, word_vectors)}


In [3]:
# German word lists (15 words each) for which pairwise similarities are computed
List_A = [
    "Trommel", "Vorhang", "Glocke", "Kaffee", "Schule",
    "Eltern", "Mond", "Garten", "Hut", "Bauer",
    "Nase", "Truthahn", "Farbe", "Haus", "Fluss",
]
List_C = [
    "Geige", "Fenster", "Lampe", "Museum", "Tee",
    "Reise", "Sonne", "Wiese", "Treppe", "Maurer",
    "Zunge", "Tiger", "Musik", "Stadt", "See",
]
List_D = [
    "Horn", "Tür", "Seil", "Kakao", "Gericht",
    "Wagen", "Sterne", "Baum", "Mantel", "Pfarrer",
    "Mund", "Gans", "Form", "Land", "Regen",
]

# labels and lists are kept in the same order and paired up as (label, words)
list_labels = ['List_A', 'List_C', 'List_D']
lists = [List_A, List_C, List_D]
named_lists = [(label, words) for label, words in zip(list_labels, lists)]

In [4]:
out_dir = './out'
# create the output directory up front, otherwise to_csv fails on a fresh
# checkout only after the (slow) embedding load has finished
os.makedirs(out_dir, exist_ok=True)

# for every embedding type: load the vectors once, then write one CSV with the
# pairwise cosine similarities per word list
for embedding_type in ['word2vec', 'GloVe', 'ConceptNet']:

    print(f'Load {embedding_type} embeddings...')
    embeddings = WordEmbeddings(embedding_type)

    for list_name, list_words in tqdm(named_lists):

        key = f'{list_name}_{embedding_type}'

        print(f'compute similarities for {list_name} with {embedding_type} embeddings')

        # square table: rows and columns are both labelled with the list's words
        df = pd.DataFrame(columns=list_words, index=list_words)

        for w1, w2 in product(list_words, repeat=2):
            # .loc[row, column] writes into df itself; the chained form
            # df[w1][w2] = ... can end up writing to a temporary object
            # (cell stays empty when a word has no vector: the value is None)
            df.loc[w2, w1] = embeddings.get_cosine_similarity(w1, w2)

        fname = join(out_dir, f'{key}.csv')
        print(f'save to file {fname}')
        df.to_csv(fname)

Load word2vec embeddings...


854776it [00:34, 24672.75it/s]
100%|██████████| 3/3 [00:00<00:00, 13.37it/s]


compute similarities for List_A with word2vec embeddings
save to file ./out/List_A_word2vec.csv
compute similarities for List_C with word2vec embeddings
save to file ./out/List_C_word2vec.csv
compute similarities for List_D with word2vec embeddings
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
no vectors for word Tür
save to file ./out/List_D_word2vec.csv
Load GloVe embeddings...

1309281it [00:56, 23221.41it/s]
 33%|███▎      | 1/3 [00:00<00:00,  6.52it/s]

compute similarities for List_A with GloVe embeddings
save to file ./out/List_A_GloVe.csv
compute similarities for List_C with GloVe embeddings


100%|██████████| 3/3 [00:00<00:00, 10.33it/s]


save to file ./out/List_C_GloVe.csv
compute similarities for List_D with GloVe embeddings
save to file ./out/List_D_GloVe.csv
Load ConceptNet embeddings...


9161913it [06:11, 24673.20it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 13.08it/s]

compute similarities for List_A with ConceptNet embeddings
save to file ./out/List_A_ConceptNet.csv
compute similarities for List_C with ConceptNet embeddings
save to file ./out/List_C_ConceptNet.csv
compute similarities for List_D with ConceptNet embeddings


100%|██████████| 3/3 [00:00<00:00, 14.05it/s]

save to file ./out/List_D_ConceptNet.csv



