# Lets get some basic understanding in word embeddings
Implement an analogy solver for the eg. 'king - man ≈ queen - woman' analogy

In [None]:
#already installed on my local machine
#%pip install flair


Obtaining the embeddings for words A, B and C and computing the embedding D

In [10]:
from flair.data import Sentence

def compute_embedding_for_D(A, B, C, embedding):
    wordsABC_sentence = Sentence(' '.join([A, B, C]))
    embedding.embed(wordsABC_sentence)
    
    A_embedded = wordsABC_sentence[0].embedding
    B_embedded = wordsABC_sentence[1].embedding
    C_embedded = wordsABC_sentence[2].embedding
    
    D_embedding = B_embedded + C_embedded - A_embedded

    return D_embedding.tolist()

In [11]:
from flair.embeddings import WordEmbeddings
fasttext = WordEmbeddings('crawl')
D = compute_embedding_for_D('king', 'man', 'queen', fasttext)

print(D)

[0.13569998741149902, -0.3856000006198883, -0.1022999957203865, -0.13539999723434448, -0.01410001516342163, -0.04699999839067459, -0.10929998755455017, 0.01759999990463257, 0.01919996738433838, -0.04389999806880951, -0.09710001945495605, 0.3399999737739563, 0.18649999797344208, 0.22010000050067902, -0.32850000262260437, -0.031500011682510376, -0.15630000829696655, 0.2533000111579895, -0.1753000020980835, -0.22199997305870056, -0.09129999577999115, 0.03869999945163727, -0.2296999990940094, -0.382099986076355, 0.2353999763727188, -0.22460001707077026, -0.4235000014305115, -0.19739998877048492, -0.03290000557899475, -0.062199998646974564, 0.11219999194145203, -0.1899999976158142, -0.0601000040769577, -0.4790000319480896, -0.08500000834465027, -0.22340001165866852, 0.09390000998973846, -0.0032000020146369934, 0.22579999268054962, 0.04739999771118164, 0.14300000667572021, -0.042600005865097046, -0.10939997434616089, 0.1727999895811081, 0.5296000242233276, 0.029300004243850708, 0.10900001227

This long list contains embeddings, that should be similar to the embedding for the word 'woman'

Obtaining embeddings for all English words in Flair

In [12]:
from flair import datasets
from flair.data import Sentence

def get_embedded_english_vocab(embedding):
    dataset = datasets.UD_ENGLISH()
    vocab_list = dataset.make_vocab_dictionary().get_items()
    vocab = Sentence(' '.join(vocab_list))
    embedding.embed(vocab)
    return vocab

In [13]:
print(get_embedded_english_vocab(fasttext)[6].embedding)

2022-12-03 16:23:27,930 Reading data from /home/stefan/.flair/datasets/ud_english
2022-12-03 16:23:27,930 Train: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-12-03 16:23:27,931 Dev: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-12-03 16:23:27,931 Test: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-test.conllu
tensor([-1.7500e-02, -2.1890e-01,  3.5300e-02,  3.4500e-02,  9.1300e-02,
         2.6900e-02, -1.6700e-01, -2.7590e-01,  4.2830e-01,  3.4900e-02,
         1.3200e-02,  1.0780e-01, -5.5800e-02, -2.1500e-02, -3.8900e-02,
         3.7900e-02, -1.1800e-01,  1.6400e-02,  6.3100e-02,  4.5900e-02,
         3.3100e-02, -1.7600e-02,  1.0000e-03,  1.7600e-02,  3.5100e-02,
        -5.4500e-02,  4.3000e-02, -3.0000e-02,  3.5800e-02,  2.5880e-01,
         5.7600e-02, -8.5400e-02,  6.3200e-02,  2.5500e-02,  8.0000e-04,
         1.5200e-02, -8.4300e-02, -2.4900e-02,  6.5100e-02, -9.6100e-02,
        -1.0700e-02,  1.6490e-01,  3.0000e-04, -2.8750e-01

In [14]:
from sklearn.metrics.pairwise import cosine_similarity as sim

def find_closest_matching_word(D, vocab, ABC):
    max_match = -1
    for word in vocab:
        match = sim([D], [word.embedding.tolist()])[0][0]
        if match > max_match and word.text not in ABC:
            max_match = match
            closest_matching_word = word.text
    return closest_matching_word

In [15]:
def A_is_to_B_as_C_is_to(A, B, C):
    fasttext = WordEmbeddings('crawl')
    result = compute_embedding_for_D(A, B, C, fasttext)
    vocab = get_embedded_english_vocab(fasttext)
    D = find_closest_matching_word(result, vocab, {A, B, C})

    print(f'{A} is to {B} as {C} is to {D}')

Lets do some tests

In [16]:
A_is_to_B_as_C_is_to("king", "man", "queen")

2022-12-03 16:26:02,910 Reading data from /home/stefan/.flair/datasets/ud_english
2022-12-03 16:26:02,910 Train: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-12-03 16:26:02,911 Dev: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-12-03 16:26:02,912 Test: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-test.conllu
king is to man as queen is to woman


In [17]:
A_is_to_B_as_C_is_to("do", "did", "go")

2022-12-03 16:27:15,314 Reading data from /home/stefan/.flair/datasets/ud_english
2022-12-03 16:27:15,315 Train: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-12-03 16:27:15,316 Dev: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-12-03 16:27:15,316 Test: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-test.conllu
do is to did as go is to went


In [18]:
A_is_to_B_as_C_is_to("bread","baker","meat")

2022-12-03 16:27:53,104 Reading data from /home/stefan/.flair/datasets/ud_english
2022-12-03 16:27:53,105 Train: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-12-03 16:27:53,106 Dev: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-12-03 16:27:53,106 Test: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-test.conllu
bread is to baker as meat is to butcher


In [19]:
A_is_to_B_as_C_is_to("big", "bigger", "small")

2022-12-03 16:28:38,935 Reading data from /home/stefan/.flair/datasets/ud_english
2022-12-03 16:28:38,936 Train: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-12-03 16:28:38,936 Dev: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-12-03 16:28:38,937 Test: /home/stefan/.flair/datasets/ud_english/en_ewt-ud-test.conllu
big is to bigger as small is to smaller
