In [1]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

In [2]:
import re

def tokenize(text):
    text = text.lower()
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)



In [3]:
tokenized = tokenize(text)

In [4]:
def mapping(tokenized):
    word2id = {}
    id2word = {}
    for i, word in enumerate(tokenized):
        word2id[word] = i
        id2word[i] = word
    return word2id, id2word

In [5]:
word2id, id2word = mapping(tokenized=tokenized)

In [6]:

import numpy as np

def one_hot(word2id, id2word):
    vocab_size = len(word2id)
    one_hot = np.zeros((vocab_size, vocab_size))
    for i in range(vocab_size):
        one_hot[i, i] = 1
    return one_hot

one_hot = one_hot(word2id=word2id, id2word=id2word)

In [7]:

def window(tokenized, window_size):
    for i, word in enumerate(tokenized):
        for j in range(1, window_size+1):
            if i-j >= 0:
                yield word, tokenized[i-j]
            if i+j < len(tokenized):
                yield word, tokenized[i+j]

window = window(tokenized=tokenized, window_size=2)


In [14]:
def word2vec(window, word2id, id2word, one_hot, learning_rate=0.01, epochs=1000):
    vocab_size = len(word2id)
    weights_0_1 = np.random.rand(vocab_size, vocab_size) - 0.5
    weights_1_2 = np.random.rand(vocab_size, vocab_size) - 0.5
    for epoch in range(epochs):
        for word, context in window:
            x = one_hot[word2id[word]]
            y = one_hot[word2id[context]]
            layer_1 = np.dot(weights_0_1, x)
            layer_2 = np.dot(weights_1_2, layer_1)
            error = layer_2 - y
            layer_2_delta = error
            layer_1_delta = np.dot(weights_1_2.T, layer_2_delta)
            weights_1_2 -= learning_rate * np.outer(layer_2_delta, layer_1)
            weights_0_1 -= learning_rate * np.outer(layer_1_delta, x)
    return weights_0_1, weights_1_2


In [15]:

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))



w1, w2 = word2vec(window=window, word2id=word2id, id2word=id2word, one_hot=one_hot)

In [16]:

def similar_words(word, word2id, id2word, w1, w2):
    word_id = word2id[word]
    word_vector = w1[word_id]
    scores = np.dot(w2, word_vector)
    scores = softmax(scores)
    scores = list(zip(scores, id2word.values()))
    scores.sort(reverse=True)
    return scores

In [17]:

similar_words(word='machine', word2id=word2id, id2word=id2word, w1=w1, w2=w2)[:5]

[(0.06285710819784264, 'intelligence'),
 (0.04786534684039764, 'build'),
 (0.04573188767027622, 'do'),
 (0.03996979111798652, 'are'),
 (0.038588823005043155, 'a')]