# word2vec from half-scratch

In [11]:
import tensorflow as tf
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

In [17]:
# Toy corpus: four short English passages that the following cells encode.
texts = [
    "I’m real cautious. I didn’t have any expensive tastes. I lived well within my means in a one-bedroom apartment and decided not to sacrifice quality of life.",
    "Self-doubt is popularly considered a monster, something I should be working to outrun. But the further I wade into adulthood, and the more I notice platitudes of self-belief becoming prerequisites for doing anything, the more I think it might be self-doubt that keeps me going.",
    "I finished that book and tried to get agents. This was really before the internet, so you’d go through the agents in the Writer’s Guide and highlight the ones that did science fiction.",
    "The story of the relationship between self-doubt and self-esteem is hardly simple. Markway says that it’s important not to let negative thoughts master you; it can be helpful to ask yourself, 'Is this thought true?'",
]

In [None]:
# NOTE(review): bare reference to the imported Tokenizer class. Nothing is
# assigned here and the cell has no execution count, so this looks like a
# leftover inspection cell -- confirm whether it can be removed.
Tokenizer

In [31]:
# Encode each text as a list of integer word indices via Keras' hashing-based
# `one_hot`, with `vocab_size` as the size of the hashing space.
# Distinct words can land on the same index: in the printed result the value 14
# appears for two different words of the first text.
vocab_size = 100
word2idx = [
    one_hot(text=sentence, n=vocab_size)
    for sentence in texts
]
print(word2idx)

[[46, 86, 49, 17, 64, 14, 12, 76, 77, 17, 96, 66, 14, 45, 69, 35, 71, 10, 58, 54, 60, 95, 86, 16, 24, 5, 10, 91], [27, 90, 19, 41, 14, 71, 53, 90, 17, 99, 30, 71, 16, 84, 5, 60, 76, 17, 80, 44, 19, 60, 60, 20, 17, 74, 81, 10, 27, 78, 63, 10, 30, 57, 8, 60, 20, 17, 84, 34, 32, 30, 27, 90, 56, 53, 31, 58], [17, 38, 56, 64, 60, 41, 16, 81, 79, 60, 13, 63, 67, 60, 69, 53, 8, 71, 72, 60, 79, 35, 60, 18, 74, 60, 46, 60, 81, 56, 13, 37, 20], [60, 9, 10, 60, 43, 87, 27, 90, 60, 27, 18, 19, 79, 64, 46, 21, 56, 69, 27, 86, 16, 91, 23, 62, 66, 66, 34, 57, 30, 80, 16, 73, 10, 92, 60, 50, 53, 55]]


In [34]:
# Bring every encoded text to exactly `pad_len` entries.
# Shorter sequences get zeros appended (padding="post"); longer ones lose their
# leading entries (truncating="pre", the Keras default spelled out here).
# In the printed result the 48-entry second sequence is cut to 45 this way.
pad_len = 45
pad4idx = pad_sequences(
    sequences=word2idx,
    maxlen=pad_len,
    padding="post",
    truncating="pre",
)
print(pad4idx)

[[46 86 49 17 64 14 12 76 77 17 96 66 14 45 69 35 71 10 58 54 60 95 86 16
  24  5 10 91  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [41 14 71 53 90 17 99 30 71 16 84  5 60 76 17 80 44 19 60 60 20 17 74 81
  10 27 78 63 10 30 57  8 60 20 17 84 34 32 30 27 90 56 53 31 58]
 [17 38 56 64 60 41 16 81 79 60 13 63 67 60 69 53  8 71 72 60 79 35 60 18
  74 60 46 60 81 56 13 37 20  0  0  0  0  0  0  0  0  0  0  0  0]
 [60  9 10 60 43 87 27 90 60 27 18 19 79 64 46 21 56 69 27 86 16 91 23 62
  66 66 34 57 30 80 16 73 10 92 60 50 53 55  0  0  0  0  0  0  0]]


In [40]:
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.initializers import glorot_uniform, uniform

In [41]:
def word2vec(vocab_size, embeded_dim, pad_len):
    """Assemble and compile an Embedding -> Flatten -> Dense -> softmax model.

    Args:
        vocab_size: Used as the Embedding layer's input dimension and as the
            number of units of the Dense output layer.
        embeded_dim: Used as the Embedding layer's output dimension.
        pad_len: Passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        RMSprop optimizer, categorical accuracy metric).
    """
    # Both weight initializers are seeded with the same fixed value.
    seed = 20190219

    layers = [
        Embedding(
            vocab_size,
            embeded_dim,
            input_length=pad_len,
            embeddings_initializer=uniform(seed=seed),
        ),
        Flatten(),
        Dense(
            units=vocab_size,
            use_bias=True,
            kernel_initializer=glorot_uniform(seed=seed),
        ),
        Activation("softmax"),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(
        loss="categorical_crossentropy",
        optimizer="RMSprop",
        metrics=["categorical_accuracy"],
    )
    return network

In [45]:
# Build the network with 200-dimensional embeddings, reusing `vocab_size` and
# `pad_len` from the earlier cells.
embeded_dim = 200
model = word2vec(
    vocab_size=vocab_size,
    embeded_dim=embeded_dim,
    pad_len=pad_len,
)

In [46]:
# Print the per-layer table of output shapes and parameter counts for `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 45, 200)           20000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 9000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               900100    
_________________________________________________________________
activation_2 (Activation)    (None, 100)               0         
Total params: 920,100
Trainable params: 920,100
Non-trainable params: 0
_________________________________________________________________
