In [62]:
import numpy as np

text_data = """Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning.
Learning can be supervised, semi-supervised or unsupervised.
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
Artificial neural networks  were inspired by information processing and distributed communication nodes in biological systems. 
ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, 
while the biological brain of most living organisms is dynamic (plastic) and analog.
The adjective "deep" in deep learning refers to the use of multiple layers in the network. 
Early work showed that a linear perceptron cannot be a universal classifier, but that a network with a nonpolynomial activation function with one hidden layer of unbounded width can.
Deep learning is a modern variation which is concerned with an unbounded number of layers of bounded size, which permits practical application and optimized implementation, while retaining theoretical universality under mild conditions.
In deep learning the layers are also permitted to be heterogeneous and to deviate widely from biologically informed connectionist models, for the sake of efficiency, trainability and understandability, hence the "structured" part.
Deep neural networks are generally interpreted in terms of the universal approximation theorem or probabilistic inference.
The classic universal approximation theorem concerns the capacity of feedforward neural networks with a single hidden layer of finite size to approximate continuous functions.
In 1989, the first proof was published by George Cybenko for sigmoid activation functions and was generalised to feed-forward multi-layer architectures in 1991 by Kurt Hornik.
Recent work also showed that universal approximation also holds for non-bounded activation functions such as the rectified linear unit.
The universal approximation theorem for deep neural networks concerns the capacity of networks with bounded width but the depth is allowed to grow. 
Lu et al. proved that if the width of a deep neural network with ReLU activation is strictly larger than the input dimension,
then the network can approximate any Lebesgue integrable function; If the width is smaller or equal to the input dimension, then a deep neural network is not a universal approximator.
The probabilistic interpretation derives from the field of machine learning. 
It features inference, as well as the optimization concepts of training and testing, related to fitting and generalization, respectively. 
More specifically, the probabilistic interpretation considers the activation nonlinearity as a cumulative distribution function.
The probabilistic interpretation led to the introduction of dropout as regularizer in neural networks. 
The probabilistic interpretation was introduced by researchers including Hopfield, Widrow and Narendra and popularized in surveys such as the one by Bishop."""

In [64]:

# implementing word2vec from scratch

# preprocessing the data
text_data = text_data.lower()
text_data = text_data.replace('.', ' .')

words = text_data.split(' ')
words = [word for word in words if word != '']

word_to_id = {}
id_to_word = {}
for i, word in enumerate(set(words)):
    word_to_id[word] = i
    id_to_word[i] = word

vocab_size = len(word_to_id)
corpus = [word_to_id[w] for w in words]

In [65]:
# creating the co-occurrence matrix
window_size = 4
vocab_size = len(word_to_id)
C = np.zeros((vocab_size, vocab_size))

for idx, word_id in enumerate(corpus):
    for i in range(1, window_size+1):
        left_idx = idx - i
        right_idx = idx + i

        if left_idx >= 0:
            left_word_id = corpus[left_idx]
            C[word_id, left_word_id] += 1

        if right_idx < len(corpus):
            right_word_id = corpus[right_idx]
            C[word_id, right_word_id] += 1

In [68]:
# implementing the skip-gram model

# center word
def get_target(words, idx, window_size=1):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])

    return list(target_words)

# context word
def get_context(words, idx, window_size=1):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])

    return list(target_words)

# softmax function
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [69]:
# training the model using gradient descent

# center word
def get_target(words, idx, window_size=1):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])

    return list(target_words)

# context word
def get_context(words, idx, window_size=1):
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])

    return list(target_words)

# softmax function
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

# training the model using gradient descent
def train(C, vocab_size, lr=0.05, epochs=10000):
    W = np.random.uniform(-1, 1, (vocab_size, vocab_size))
    for t in range(1, epochs+1):
        i = np.random.randint(vocab_size)
        j = np.random.randint(vocab_size)
        x = C[i]
        y = softmax(np.dot(W, x))
        dW = np.outer(y, x)
        W[i, j] -= lr * dW[i, j]

    return W

In [70]:
W = train(C, vocab_size)

In [71]:
# visualizing the word vectors
def word_vecs(W, word_to_id, id_to_word):
    for word, word_id in word_to_id.items():
        print(word, W[word_id])

word_vecs(W, word_to_id, id_to_word)


climate [-0.24542238  0.48631267  0.95626402  0.94502325 -0.02202529 -0.14728107
 -0.24754488 -0.27165495 -0.34692404 -0.92710892 -0.09221876  0.35724847
 -0.11925441  0.15403696 -0.90168468  0.70600164  0.97561418 -0.92892179
  0.64105316 -0.94068883 -0.42325149  0.90197998 -0.56124494  0.29875257
  0.54037308  0.99922241 -0.31321827 -0.25641169  0.24172858  0.58616028
 -0.36028966 -0.35320262 -0.84632743  0.53875878 -0.46267454  0.69869602
 -0.37413814  0.14626629  0.93572152 -0.74157528  0.03168167 -0.70551554
  0.49330074  0.96647562  0.787504   -0.73614777 -0.19527487 -0.44323333
  0.89889937 -0.11637919  0.66857684  0.77722325  0.71172118  0.53604303
 -0.72881471  0.41320334 -0.47197674 -0.51636948  0.33848022  0.46196467
  0.6128486   0.66702675  0.98127438 -0.05213833  0.36353101  0.31318249
  0.4631677   0.06742083  0.79437073 -0.23920843  0.9075677  -0.70412696
 -0.68041891 -0.10098466 -0.12388542  0.25534746  0.47280816  0.09859996
 -0.82411482  0.12093023 -0.68089941  0.794

In [75]:
##inding similar words

# cosine similarity
def cos_similarity(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x**2)) + eps)
    ny = y / (np.sqrt(np.sum(y**2)) + eps)
    return np.dot(nx, ny)

def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    # convert word to word_id
    if query not in word_to_id:
        print('%s is not found' % query)
        return
    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # calculate similarity
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # sort by similarity
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        
        count += 1
        if count >= top:
            return 

In [76]:
most_similar('deep', word_to_id, id_to_word, W, top=5)


[query] deep
 hence: 0.14395037982554826
 been: 0.12909935911598178
 approximation: 0.12221454224657015
 board: 0.12179243860315811
 function: 0.11902708149211588


In [79]:
most_similar('learning', word_to_id, id_to_word, W, top=5)


[query] learning
 modern: 0.14456700909795173
 
it: 0.1404285874348921
 sigmoid: 0.11616978279283942
 symbolic,: 0.11570995690911347
 an: 0.11088999323967183


In [80]:
most_similar('neural', word_to_id, id_to_word, W, top=5)


[query] neural
 application: 0.15259353453774752
 bishop: 0.1462072999207243
 linear: 0.14320519413196348
 (plastic): 0.13318996276798264
 unbounded: 0.13274088135074552


In [81]:
## co-occurrence matrix

# creating the co-occurrence matrix

window_size = 4
vocab_size = len(word_to_id)
C = np.zeros((vocab_size, vocab_size))

for idx, word_id in enumerate(corpus):
    for i in range(1, window_size+1):
        left_idx = idx - i
        right_idx = idx + i

        if left_idx >= 0:
            left_word_id = corpus[left_idx]
            C[word_id, left_word_id] += 1

        if right_idx < len(corpus):
            right_word_id = corpus[right_idx]
            C[word_id, right_word_id] += 1

In [84]:
C

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [96]:
corpus = """I Like Deep Learning. I Like NLP. I enjoy flying."""

corpus = corpus.lower()
words = corpus.split()
word_to_id = {}
id_to_word = {}
for i, word in enumerate(set(words)):
    word_to_id[word] = i
    id_to_word[i] = word

corpus = [word_to_id[w] for w in words]
corpus

window_size = 4
vocab_size = len(word_to_id)
C = np.zeros((vocab_size, vocab_size))

for idx, word_id in enumerate(corpus):
    for i in range(1, window_size+1):
        left_idx = idx - i
        right_idx = idx + i

        if left_idx >= 0:
            left_word_id = corpus[left_idx]
            C[word_id, left_word_id] += 1

        if right_idx < len(corpus):
            right_word_id = corpus[right_idx]
            C[word_id, right_word_id] += 1

In [97]:
S, sigma, S_t = np.linalg.svd(C, full_matrices=False)

In [98]:
S, sigma, S_t

(array([[-3.23513147e-01,  6.25563306e-01, -1.98113886e-01,
         -3.25996120e-01, -5.00000000e-01,  1.80480708e-01,
         -2.75524820e-01],
        [-2.67526643e-01,  3.38561106e-01,  3.64074864e-01,
         -3.24911046e-01,  5.00000000e-01,  2.92773257e-01,
          4.89869369e-01],
        [-2.55219219e-01, -2.93769754e-01, -4.05102073e-01,
          2.84577988e-01,  5.24360860e-16,  7.71306570e-01,
          9.25004741e-02],
        [-5.01272367e-01, -1.84064452e-01, -5.15647675e-01,
         -1.70266417e-01, -2.83545910e-16, -4.94244711e-01,
          4.19148279e-01],
        [-2.05490637e-01,  4.57313247e-01, -2.10661012e-01,
          5.81770876e-01,  5.00000000e-01, -1.84841474e-01,
         -2.85720857e-01],
        [-1.49504133e-01,  1.70311047e-01,  3.51527738e-01,
          5.82855950e-01, -5.00000000e-01, -7.25489247e-02,
          4.79673332e-01],
        [-6.65417174e-01, -3.68409119e-01,  4.79844094e-01,
         -2.37651160e-03,  1.99181586e-16, -5.55801945e-02

In [90]:
corpus

[6, 3, 1, 0, 6, 3, 2, 6, 4, 5]

## using singular value decomposition on co-occurence matrix

In [94]:
import numpy as np
S,sigma,vt = np.linalg.svd(C, full_matrices=False)

In [95]:
S, sigma, vt

(array([[-3.23513147e-01,  6.25563306e-01, -1.98113886e-01,
         -3.25996120e-01, -5.00000000e-01,  1.80480708e-01,
         -2.75524820e-01],
        [-2.67526643e-01,  3.38561106e-01,  3.64074864e-01,
         -3.24911046e-01,  5.00000000e-01,  2.92773257e-01,
          4.89869369e-01],
        [-2.55219219e-01, -2.93769754e-01, -4.05102073e-01,
          2.84577988e-01,  5.24360860e-16,  7.71306570e-01,
          9.25004741e-02],
        [-5.01272367e-01, -1.84064452e-01, -5.15647675e-01,
         -1.70266417e-01, -2.83545910e-16, -4.94244711e-01,
          4.19148279e-01],
        [-2.05490637e-01,  4.57313247e-01, -2.10661012e-01,
          5.81770876e-01,  5.00000000e-01, -1.84841474e-01,
         -2.85720857e-01],
        [-1.49504133e-01,  1.70311047e-01,  3.51527738e-01,
          5.82855950e-01, -5.00000000e-01, -7.25489247e-02,
          4.79673332e-01],
        [-6.65417174e-01, -3.68409119e-01,  4.79844094e-01,
         -2.37651160e-03,  1.99181586e-16, -5.55801945e-02