<a href="https://colab.research.google.com/github/oserikov/data-science-nlp/blob/master/5_w2v_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Skip-gram Word2Vec with numpy from scratch
**actually char2vec to make everything small and neat :)**

## Source code
Imports, math helpers, and the model itself.

In [0]:
from typing import List
import numpy as np

In [0]:
def softmax(x):
    """Return the softmax of *x* taken along axis 0.

    For a column vector of shape (n, 1) or a 1-D array this is the usual
    softmax over all entries. For a 2-D array every column is normalized
    independently.

    The maximum is subtracted per column (not globally) before
    exponentiating: the shift does not change the result mathematically,
    but it keeps ``np.exp`` from overflowing and guarantees that each
    column has at least one ``exp(0) == 1`` term, so the denominator can
    never underflow to zero.
    """
    z = x - np.max(x, axis=0, keepdims=True)
    exp_z = np.exp(z)  # computed once, reused for numerator and denominator
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)

In [0]:
class IntermediateComputationLog:
    """Plain record of the values produced by one forward pass.

    Attributes set by the constructor:
      input_value     -- the ``input_ohe_value`` argument (note the rename)
      hidden_l_value  -- the ``hidden_l_value`` argument
      output_in_value -- the ``output_in_value`` argument
      prediction      -- the ``prediction`` argument
    """

    def __init__(self, input_ohe_value, hidden_l_value, output_in_value, prediction):
        # The values are stored as given; nothing is copied or validated.
        self.input_value, self.hidden_l_value = input_ohe_value, hidden_l_value
        self.output_in_value, self.prediction = output_in_value, prediction


class W2vSkipgramModel:
    """Skip-gram word2vec with a full softmax output layer, in plain NumPy.

    The model is a two-layer linear network: a one-hot input word is
    projected to a hidden vector by ``W_ih`` and then to one score per
    vocabulary entry by ``W_ho``; the scores are turned into a probability
    distribution with softmax. Training is plain SGD, one centre word at a
    time, against the words found in its context window.

    ``vocab_size`` passed to the constructor must equal the number of
    distinct words in the sentences later given to ``train``; otherwise the
    one-hot vectors do not match the weight matrices.
    """

    def __init__(self, vocab_size, hidden_size, learning_rate):
        # Weight matrices have "inverted" shapes so that the input, hidden
        # and output layers can all be treated as single column vectors,
        # i.e. input->hidden is W_ih @ input_v, which yields a column
        # vector hidden_v of shape (hidden_size, 1).
        self.W_ih = np.random.randn(hidden_size, vocab_size)
        self.W_ho = np.random.randn(vocab_size, hidden_size)
        self.learning_rate = learning_rate

        # word -> index and index -> word maps; filled in by train().
        self.w2ix = dict()
        self.ix2w = dict()

    def one_hot_encode_word(self, word: str):
        """Return the one-hot column vector, shape (len(w2ix), 1), for *word*.

        Raises KeyError if *word* is not in the vocabulary.
        """
        word_ohe = np.zeros((len(self.w2ix), 1))
        word_ohe[self.w2ix[word]] = 1
        return word_ohe

    def one_hot_decode_word(self, word_ohe):
        """Return the word encoded by the one-hot vector *word_ohe*."""
        # np.where() returns a tuple of index arrays, which cannot be used as
        # a dict key; argmax gives the position of the single 1 as a plain
        # integer instead.
        word_idx = int(np.argmax(word_ohe))
        return self.ix2w[word_idx]

    def forward(self, input_word_ohe):
        """Run the network on one one-hot encoded input word.

        Returns a tuple ``(log, prediction)``: *prediction* is the softmax
        output and *log* is an IntermediateComputationLog holding the input,
        the hidden vector, the pre-softmax scores and the prediction, for
        use by ``backward``.
        """
        hidden_value = self.W_ih @ input_word_ohe
        output_in_value = self.W_ho @ hidden_value

        prediction = softmax(output_in_value)

        intermediate_computations_log = IntermediateComputationLog(
            input_ohe_value=input_word_ohe,
            hidden_l_value=hidden_value,
            output_in_value=output_in_value,
            prediction=prediction
        )

        return intermediate_computations_log, prediction

    def backward(self,
                 intermediate_computations_log: "IntermediateComputationLog",
                 context_words_ohes):
        """Apply one SGD step for a single forward pass.

        intermediate_computations_log -- the log returned by ``forward``;
            its ``input_value``, ``hidden_l_value`` and ``prediction``
            fields are read.
        context_words_ohes -- non-empty sequence of one-hot column vectors
            of the true context words.

        Both weight matrices are updated in place. Raises ValueError when
        *context_words_ohes* is empty, because there is no target to learn
        from.
        """
        if len(context_words_ohes) == 0:
            raise ValueError("backward() needs at least one context word")

        # just pull out the used log fields to narrow code lines down
        input_value = intermediate_computations_log.input_value
        hidden_value = intermediate_computations_log.hidden_l_value
        prediction = intermediate_computations_log.prediction

        # Gradient of softmax + cross-entropy w.r.t. the pre-softmax scores,
        # averaged over the context words (matching the averaged loss).
        dL_dOutputIn = np.mean([prediction - cw_ohe for cw_ohe in context_words_ohes],
                               axis=0)

        # Reuse the hidden activation recorded during the forward pass
        # rather than recomputing W_ih @ input.
        dL_dWho = dL_dOutputIn @ hidden_value.T

        dL_dWih = np.outer(self.W_ho.T @ dL_dOutputIn, input_value)

        # Both gradients are computed before either matrix is changed.
        self.W_ih -= self.learning_rate * dL_dWih
        self.W_ho -= self.learning_rate * dL_dWho

    def train(self, sentences: List[List[str]], context_window_size, epochs_num):
        """Train on *sentences*, yielding the mean loss after every epoch.

        sentences -- list of tokenized sentences (lists of words); it is
            traversed twice, so it must be re-iterable.
        context_window_size -- number of words taken on EACH side of the
            centre word as its context.
        epochs_num -- number of passes over the training set.

        This is a generator: nothing happens until it is iterated, and one
        epoch is run per value requested. The vocabulary maps ``w2ix`` and
        ``ix2w`` are rebuilt from *sentences* on the first iteration.
        """
        # Sorting makes the word -> index assignment reproducible; the
        # iteration order of a set of strings can differ between runs.
        vocab = sorted({word for sentence in sentences for word in sentence})
        # Drop entries left over from a previous train() call so the one-hot
        # size always matches the current vocabulary.
        self.w2ix.clear()
        self.ix2w.clear()
        for idx, word in enumerate(vocab):
            self.w2ix[word] = idx
            self.ix2w[idx] = word

        # training set is a list of tuples
        #   formed of one-hot encoded words w,
        #   and their context words cw1, .., cwn
        # like this:
        # training_set = [(OHE(w1), [OHE(cw11), OHE(cw12), ..]),
        #                 ..,
        #                 (OHE(wn), [OHE(cwn1), OHE(cwn2), ..])]
        training_set = []
        for sentence in sentences:
            for word_num, word in enumerate(sentence):
                left_bound = max(0, word_num - context_window_size)
                left_context = sentence[left_bound:word_num]
                # The slice end is exclusive, hence the "+ 1": the right side
                # gets the same number of words as the left side. Slicing
                # clips at the end of the sentence by itself.
                right_context = sentence[word_num + 1:
                                         word_num + 1 + context_window_size]

                context_words = list(left_context) + list(right_context)
                if not context_words:
                    # e.g. a one-word sentence: there is nothing to predict,
                    # and an empty context would produce NaN loss/gradients.
                    continue

                word_ohe = self.one_hot_encode_word(word)
                context_words_ohes = [self.one_hot_encode_word(context_word)
                                      for context_word in context_words]

                training_set.append((word_ohe, context_words_ohes))

        for epoch_num in range(epochs_num):
            epoch_mean_losses = []
            for input_w_ohe, context_ws_ohes in training_set:
                fw_computations_log, prediction = self.forward(input_w_ohe)

                # The loss is measured before the weights are updated.
                losses = [self.calc_loss(prediction, context_w_ohe)
                          for context_w_ohe in context_ws_ohes]
                epoch_mean_losses.append(np.mean(losses))

                self.backward(fw_computations_log, context_ws_ohes)

            yield np.mean(epoch_mean_losses)

    @staticmethod
    def calc_loss(prediction, context_w_ohe):
        """Return the cross-entropy loss of *prediction* for one context word.

        This is ``-log`` of the predicted probability at the position where
        *context_w_ohe* equals 1, returned as a one-element array.
        """
        correct_word_mask = context_w_ohe == 1
        loss = -np.log(prediction[correct_word_mask])
        return loss

## Usage

In [0]:
!wget https://raw.githubusercontent.com/oserikov/nn_harmony_np/master/data/tur_apertium_words.txt

sentences = [[c for c in line.strip()]
             for line in open("tur_apertium_words.txt", encoding="utf-8")]

vocab = {c for sentence in sentences for c in sentence}
w2vmodel = W2vSkipgramModel(len(vocab), 10, 0.001)

In [0]:
# train() is a generator: each iteration runs one epoch and hands back that
# epoch's mean loss (context window of 2, 10 epochs).
epoch_losses = w2vmodel.train(sentences, 2, 10)
for loss_v in epoch_losses:
    print(loss_v)