In [1]:
## creating a word2vec model to find similar words to a given word from a corpus from scratch

In [2]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

In [3]:
import re

def tokenize(text):
    text = text.lower()
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)

In [4]:
tokenized = tokenize(text)

In [6]:
def mapping(tokenized):
    word2id = {}
    id2word = {}
    for i, word in enumerate(tokenized):
        word2id[word] = i
        id2word[i] = word
    return word2id, id2word

In [8]:
word2id, id2word = mapping(tokenized=tokenized)

In [14]:

import numpy as np

def one_hot(word2id, id2word):
    vocab_size = len(word2id)
    one_hot = np.zeros((vocab_size, vocab_size))
    for i in range(vocab_size):
        one_hot[i, i] = 1
    return one_hot

one_hot = one_hot(word2id=word2id, id2word=id2word)

In [15]:

def window(tokenized, window_size):
    for i, word in enumerate(tokenized):
        for j in range(1, window_size+1):
            if i-j >= 0:
                yield word, tokenized[i-j]
            if i+j < len(tokenized):
                yield word, tokenized[i+j]

window = window(tokenized=tokenized, window_size=2)


In [24]:

def word2vec(window, word2id, id2word, one_hot, learning_rate=0.01, epochs=1000):

    vocab_size = len(word2id)
    weights_0_1 = np.random.rand(vocab_size, vocab_size) - 0.5
    weights_1_2 = np.random.rand(vocab_size, vocab_size) - 0.5

    for i in range(epochs):
        for center_word, context_word in window:
            center_word_id = word2id[center_word]
            context_word_id = word2id[context_word]
            center_word_vector = one_hot[center_word_id]
            context_word_vector = one_hot[context_word_id]
            context_word_prediction = np.dot(weights_0_1, center_word_vector)
            context_word_prediction = np.dot(weights_1_2, context_word_prediction)
            error = context_word_vector - context_word_prediction
            weights_0_1 += learning_rate * np.outer(error, center_word_vector)
            weights_1_2 += learning_rate * np.outer(error, context_word_prediction)

    return weights_0_1, weights_1_2

In [25]:

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))


w1, w2 = word2vec(window=window, word2id=word2id, id2word=id2word, one_hot=one_hot)

In [28]:

def similar_words(word, word2id, id2word, w1, w2):
    word_id = word2id[word]
    word_vector = w1[word_id]
    scores = np.dot(w2, word_vector)
    scores = softmax(scores)
    scores = list(zip(scores, id2word.values()))
    scores.sort(reverse=True)
    return scores

In [29]:

similar_words(word='machine', word2id=word2id, id2word=id2word, w1=w1, w2=w2)

[(0.06192880426898899, 'model'),
 (0.048472293523117695, 'or'),
 (0.04578084882078851, 'data'),
 (0.04392987555000341, 'so'),
 (0.03191532715774098, 'do'),
 (0.03127402324165843, 'known'),
 (0.03015574450460308, 'to'),
 (0.03012783594071001, 'algorithms'),
 (0.0300010530192087, 'computer'),
 (0.02534138143630273, 'in'),
 (0.0237606665724067, 'learning'),
 (0.02349957436157881, 'mathematical'),
 (0.02283146449290461, 'of'),
 (0.02158901328803288, 'sample'),
 (0.021193616597964, 'training'),
 (0.020429782897815794, 'learning'),
 (0.02027805971379298, 'wide'),
 (0.01918669346373154, 'build'),
 (0.017672554685059755, 'experience'),
 (0.01713739769174345, 'order'),
 (0.015907122983488713, 'in'),
 (0.015545745403052338, 'explicitly'),
 (0.015218817959237293, 'artificial'),
 (0.014531269886488592, 'improve'),
 (0.01451535755939659, 'without'),
 (0.014374461462038426, 'a'),
 (0.014204905267399122, 'make'),
 (0.014182781581766663, 'on'),
 (0.013940244092935983, 'learning'),
 (0.0139090376622492