## Word2Vec Skip-gram Implementation

### Preprocessing

In [2]:
import utils

In [3]:
# Helper object from the local `utils` module; its `get_words` method is
# used by `pre_processing` below to split the corpus into words.
fu = utils.FileUtil()

In [4]:
# Toy one-sentence corpus used for the rest of the notebook.
corpus = 'Natural language processing is a subfield of computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human languages.'

In [5]:
def pre_processing(doc):
    """Split ``doc`` into tokens.

    Delegates to ``fu.get_words`` with ``lower_case=True`` and
    ``remove_stop_words=False`` and returns its result unchanged.
    NOTE(review): presumably the result is an ordered sequence of
    lower-cased words with stop words kept -- confirm against
    ``utils.FileUtil.get_words``.
    """
    return fu.get_words(doc, lower_case=True, remove_stop_words=False)

In [6]:
# Tokenise the corpus; `words` is the token sequence used by every later cell.
words = pre_processing(corpus)

### Settings

In [7]:
# Hyper-parameters shared by the cells below.
settings = dict(
    n=10,  # dimensions
    window_size=2,  # positions looked at on each side of the target word
    learning_rate=0.01,
)

In [8]:
# Vocabulary: one entry per distinct token in `words`.
# NOTE: the ordering comes from iterating a set, so the index given to each
# word is arbitrary and may differ between interpreter sessions.
word_list = list(set(words))
word_n = len(word_list)
# Lookup tables in both directions: word -> index and index -> word.
word_index = {token: idx for idx, token in enumerate(word_list)}
index_word = dict(enumerate(word_list))

In [21]:
# Display the index -> word table built above.
index_word

{0: 'natural',
 1: 'subfield',
 2: 'computer',
 3: 'of',
 4: 'science',
 5: 'interactions',
 6: 'artificial',
 7: 'is',
 8: 'concerned',
 9: 'the',
 10: 'human',
 11: 'computers',
 12: 'intelligence',
 13: 'processing',
 14: 'information',
 15: 'a',
 16: 'engineering',
 17: 'with',
 18: 'language',
 19: 'and',
 20: 'languages',
 21: 'between'}

### One-hot Encoding

In [10]:
import numpy as np

In [11]:
def one_hot_encoding(word):
    """Return the one-hot vector for ``word``.

    The result is a NumPy array of ``word_n`` zeros with a single 1 at
    position ``word_index[word]``. A ``KeyError`` is raised when ``word``
    is not a key of ``word_index``.
    """
    encoded = np.zeros(word_n)
    # The vocabulary index of the word selects the single "hot" slot.
    encoded[word_index[word]] = 1
    return encoded

In [12]:
# Quick check: show the one-hot vector produced for a vocabulary word.
one_hot_encoding('natural')

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [14]:
# Show the configured context window size.
settings['window_size']

2

In [20]:
# For every position in the token sequence, one-hot encode the target word
# and the neighbouring words that fall inside the context window, printing
# the word followed by its list of context vectors.
sent_len = len(words)
window = settings['window_size']
for i, word in enumerate(words):
    print(word)
    w_target = one_hot_encoding(word)
    w_context = []

    for j in range(i - window, i + window + 1):
        # Skip the target itself and any position outside the sequence.
        if j == i or j < 0 or j >= sent_len:
            continue
        w_context.append(one_hot_encoding(words[j]))

    print(w_context)

natural
[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0.])]
language
[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])]
processing
[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0.])]
is
[array([0., 0., 0., 0., 0., 0.