In [1]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

In [2]:
import re

def tokenize(text):
    text = text.lower()
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text)



In [3]:
tokenized = tokenize(text)

In [4]:
def mapping(tokenized):
    word2id = {}
    id2word = {}
    for i, word in enumerate(tokenized):
        word2id[word] = i
        id2word[i] = word
    return word2id, id2word

In [5]:
word2id, id2word = mapping(tokenized=tokenized)

In [6]:

import numpy as np

def one_hot(word2id, id2word):
    vocab_size = len(word2id)
    one_hot = np.zeros((vocab_size, vocab_size))
    for i in range(vocab_size):
        one_hot[i, i] = 1
    return one_hot

one_hot = one_hot(word2id=word2id, id2word=id2word)

In [7]:

def window(tokenized, window_size):
    for i, word in enumerate(tokenized):
        for j in range(1, window_size+1):
            if i-j >= 0:
                yield word, tokenized[i-j]
            if i+j < len(tokenized):
                yield word, tokenized[i+j]

window = window(tokenized=tokenized, window_size=2)


In [14]:
def word2vec(window, word2id, id2word, one_hot, learning_rate=0.01, epochs=1000):
    vocab_size = len(word2id)
    weights_0_1 = np.random.rand(vocab_size, vocab_size) - 0.5
    weights_1_2 = np.random.rand(vocab_size, vocab_size) - 0.5
    for epoch in range(epochs):
        for word, context in window:
            x = one_hot[word2id[word]]
            y = one_hot[word2id[context]]
            layer_1 = np.dot(weights_0_1, x)
            layer_2 = np.dot(weights_1_2, layer_1)
            error = layer_2 - y
            layer_2_delta = error
            layer_1_delta = np.dot(weights_1_2.T, layer_2_delta)
            weights_1_2 -= learning_rate * np.outer(layer_2_delta, layer_1)
            weights_0_1 -= learning_rate * np.outer(layer_1_delta, x)
    return weights_0_1, weights_1_2


In [15]:

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x))

w1, w2 = word2vec(window=window, word2id=word2id, id2word=id2word, one_hot=one_hot)

In [16]:

def similar_words(word, word2id, id2word, w1, w2):
    word_id = word2id[word]
    word_vector = w1[word_id]
    scores = np.dot(w2, word_vector)
    scores = softmax(scores)
    scores = list(zip(scores, id2word.values()))
    scores.sort(reverse=True)
    return scores

In [17]:

similar_words(word='machine', word2id=word2id, id2word=id2word, w1=w1, w2=w2)[:5]

[(0.06285710819784264, 'intelligence'),
 (0.04786534684039764, 'build'),
 (0.04573188767027622, 'do'),
 (0.03996979111798652, 'are'),
 (0.038588823005043155, 'a')]

In [20]:
w1[word2id['machine']]

array([ 0.13468326, -0.19684243,  0.43857472, -0.28476414,  0.43530193,
        0.20447942,  0.25002282,  0.30948924,  0.0208748 ,  0.10558158,
       -0.13203783, -0.02213319,  0.11203683,  0.40720176, -0.21561734,
        0.29040601, -0.26544885, -0.33283   ,  0.14124175,  0.08274958,
        0.29810613, -0.34943502,  0.46019817,  0.40337427,  0.05323945,
        0.1453813 ,  0.36344768,  0.32756699, -0.37171059,  0.29042301,
       -0.25503605,  0.13323524,  0.20662686,  0.15085281, -0.21277151,
        0.48674378,  0.37128361,  0.05037762, -0.46280021, -0.25772193,
        0.17332507,  0.16785123,  0.36351297, -0.14005013, -0.48910117,
       -0.49207818,  0.28811948, -0.06668405, -0.05826786, -0.07328144,
       -0.09096476, -0.39326109,  0.23098222, -0.39833556,  0.26990071,
        0.25530312, -0.10826231, -0.32549625, -0.18167538,  0.48704778])

In [21]:
w1[word2id['learning']]

array([-0.29216858,  0.47645538, -0.19693512, -0.01367317,  0.24567181,
        0.42751174,  0.09396516, -0.25403183, -0.38774624, -0.17775924,
        0.32292971,  0.20740634,  0.38552175, -0.04574454, -0.40140449,
        0.18973747,  0.43054018,  0.28205653,  0.24641479, -0.39899754,
        0.29038271,  0.22560332,  0.1574376 ,  0.22181724, -0.4854422 ,
       -0.25787095, -0.01659284, -0.02262975,  0.0963914 , -0.39273031,
        0.14829205, -0.17104214, -0.49802357,  0.34730546,  0.24221292,
        0.14039679,  0.1809301 , -0.02383346, -0.00268695, -0.02623146,
        0.35729833,  0.16347112,  0.18387644, -0.29997197, -0.33602763,
        0.17678612,  0.26208991,  0.04608929, -0.03542888,  0.21480819,
        0.22672015,  0.11188807, -0.43929241, -0.48027932, -0.07656317,
        0.35428917, -0.38303344,  0.4661639 , -0.32523375, -0.25215594])

In [25]:
w1[word2id['data']]

array([ 0.25810638, -0.13163294,  0.16932508, -0.49139265,  0.38721403,
        0.35591421, -0.06708646, -0.11569176, -0.09385104,  0.26142946,
       -0.36942467,  0.01446076,  0.22392839,  0.40771571,  0.4036527 ,
        0.10397787, -0.20667134, -0.22792709,  0.37990906,  0.11773941,
       -0.17916121,  0.40423394,  0.33439644,  0.0058892 , -0.03760155,
       -0.35430086,  0.11343007,  0.33909895,  0.42559444,  0.3420938 ,
        0.36898447, -0.13713464, -0.15831006,  0.18651214,  0.2235192 ,
        0.47448655, -0.49198456,  0.33806272,  0.30222539, -0.18782532,
        0.47874395,  0.19001745,  0.07843145, -0.17508503,  0.25764148,
        0.30170378,  0.08040033,  0.34119958,  0.06547649,  0.385133  ,
        0.23335953,  0.27634001,  0.49539856,  0.21438253, -0.14357298,
        0.29809227,  0.22003126,  0.17160774, -0.43936182, -0.33146934])