https://www.tensorflow.org/tutorials/text/word_embeddings

This tutorial contains an introduction to word embeddings. You will train your own word embeddings using a simple Keras model for a sentiment classification task, and then visualize them in the Embedding Projector (shown in the image below).

Two simple ways to turn a word into a vector are:

- dense (encode each word with a unique integer index) — the assignment of indices *is arbitrary*
- sparse (one-hot encoding)


Word embeddings give us an efficient representation that captures features of every word. Words with similar meanings are located near each other in the n-dimensional embedding space.

The dimensionality **N** of the embedding space is a hyperparameter.

In [6]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

In [7]:
from datetime import datetime
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [8]:
# Source archive of the IMDB sentiment dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive into the current directory and unpack it there.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The archive unpacks into an 'aclImdb' folder next to the download; list it.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)


['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [10]:
# Path to the 'train' folder inside the extracted dataset; list its contents.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [13]:
# Inspect the download location: print the containing directory, then show
# the path returned by get_file.
print(os.path.dirname(dataset))
dataset

.


'.\\aclImdb_v1.tar.gz'

In [14]:
# Delete the 'unsup' folder so that only the 'neg' and 'pos' class folders
# remain under train/. The existence check keeps this cell idempotent:
# re-running it after the folder is gone would otherwise raise
# FileNotFoundError.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [15]:
batch_size = 1024
seed = 123

# Both subsets must use the same seed and validation_split so that the
# training and validation files do not overlap.
split_kwargs = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# Read from train_dir (the same directory the 'unsup' folder was removed from)
# instead of repeating the path as a hard-coded string.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, subset='training', **split_kwargs)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, subset='validation', **split_kwargs)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [18]:
# Peek at the first five (label, review) pairs of the first training batch.
for text_batch, label_batch in train_ds.take(1):
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for idx in range(5):
        print(labels[idx], texts[idx])

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (without

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache both datasets after the first pass and prefetch upcoming batches.
train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

In [20]:
# Show the raw value behind tf.data.AUTOTUNE.
AUTOTUNE

-1

In [21]:
# Toy embedding layer: 1000 possible integer ids, each mapped to a 5-dimensional vector.
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [22]:
# Look up the vectors for three ids; the result has one row per id.
sample_ids = tf.constant([1, 2, 3])
result = embedding_layer(sample_ids)
result.numpy()

array([[-0.00937921,  0.04978042, -0.02434739,  0.00344201, -0.03033596],
       [ 0.04537511, -0.00696659, -0.03137805,  0.00928062,  0.0054173 ],
       [ 0.03652516,  0.02918113,  0.04473901, -0.04474849,  0.03087263]],
      dtype=float32)

In [24]:
# Look up a batch of two sequences of three ids each. The output gains a
# trailing axis holding the embedding vector for every id.
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))

# A bare `result.shape` in the middle of a cell is never displayed (only the
# last expression is), so print the shape explicitly before showing the tensor.
print(result.shape)
result


<tf.Tensor: shape=(2, 3, 5), dtype=float32, numpy=
array([[[ 0.02979917, -0.04279706,  0.0329759 ,  0.01592753,
         -0.01588236],
        [-0.00937921,  0.04978042, -0.02434739,  0.00344201,
         -0.03033596],
        [ 0.04537511, -0.00696659, -0.03137805,  0.00928062,
          0.0054173 ]],

       [[ 0.03652516,  0.02918113,  0.04473901, -0.04474849,
          0.03087263],
        [-0.03659192,  0.00933876,  0.02018538,  0.03482069,
         -0.01848713],
        [-0.04189425,  0.00962301,  0.03287759, -0.04864116,
         -0.00630798]]], dtype=float32)>

# Text preprocessing

In [25]:
# Custom standardization callable for the TextVectorization layer.
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies, in order: lowercasing, replacement of every '<br />' HTML break
    tag with a single space, and removal of every character listed in
    ``string.punctuation``.

    Args:
        input_data: string tensor of raw text.

    Returns:
        The string tensor produced by the three steps above.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

# Upper bound on the vocabulary and the fixed number of tokens kept per review.
vocab_size = 10000
sequence_length = 100

# The vectorization layer standardizes text with custom_standardization,
# splits it into tokens and maps each token to an integer id. Reviews differ
# in length, so output_sequence_length fixes the length of every output row.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only (labels are dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [27]:
# Size of each learned word vector (the embedding hyperparameter).
embedding_dim = 16

In [28]:
# Bag-of-embeddings classifier: raw strings in, one logit out.
model = Sequential([
    # Raw text -> fixed-length sequence of integer token ids.
    vectorize_layer,
    # Token ids -> trainable vectors; named so the weights can be fetched later.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"),
    # Average over the sequence axis to get one vector per review.
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    # Single output unit with no activation.
    Dense(units=1),
])

In [29]:
# TensorBoard callback configured to write its logs to the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [30]:
# The loss is configured with from_logits=True, i.e. the model output is a raw
# logit rather than a probability. The plain 'accuracy' string would compare
# that logit against the default 0.5 threshold; for logits the decision
# boundary is 0.0 (logit 0 == probability 0.5), so the threshold is set
# explicitly. The metric keeps the name 'accuracy' so history and TensorBoard
# keys stay the same.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [31]:
# Train for 15 epochs, validating after each one and logging to TensorBoard.
model.fit(x=train_ds, validation_data=val_ds, epochs=15, callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x199b75ed610>

In [32]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Load the TensorBoard notebook extension and open it on the "logs" directory,
# which is the log_dir passed to the TensorBoard callback.
%load_ext tensorboard
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 5596), started 0:09:39 ago. (Use '!kill 5596' to kill it.)

In [36]:
# Fetch the trained embedding matrix (first weight array of the 'embedding'
# layer) and the vocabulary list of the vectorization layer.
trained_embedding = model.get_layer('embedding')
weights = trained_embedding.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()




In [42]:
# Display the repr of the TextVectorization layer object.
vectorize_layer

<tensorflow.python.keras.layers.preprocessing.text_vectorization.TextVectorization at 0x199b7686820>

In [38]:
# Display the vocabulary list (index 0 is the padding token '', index 1 is '[UNK]').
vocab

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'out',
 'if',
 'some',
 'there',
 'what',
 'good',
 'more',
 'when',
 'very',
 'even',
 'she',
 'up',
 'my',
 'no',
 'would',
 'time',
 'only',
 'which',
 'really',
 'story',
 'their',
 'see',
 'had',
 'were',
 'can',
 'me',
 'we',
 'than',
 'much',
 'well',
 'been',
 'will',
 'get',
 'people',
 'bad',
 'also',
 'other',
 'do',
 'into',
 'great',
 'first',
 'because',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'movies',
 'then',
 'them',
 'way',
 'make',
 'films',
 'could',
 'any',
 'after',
 'too',
 'characters',
 'think',
 'watch',
 'being',
 'two',
 'many',
 'seen',
 'character',
 'plot',
 'never',
 'little',
 'acting',
 'where',
 'best',
 '

In [39]:
# Display the embedding matrix read from the 'embedding' layer.
weights

array([[-0.00484474, -0.00164267, -0.06144508, ..., -0.07915676,
         0.03368951, -0.04211714],
       [ 0.0757916 ,  0.03239109, -0.01235005, ..., -0.04264027,
         0.01871334,  0.07386676],
       [ 0.01321536, -0.02348537, -0.17622595, ..., -0.06710466,
         0.14880931,  0.02832861],
       ...,
       [ 0.24915454,  0.20344096,  0.17055969, ...,  0.21186633,
        -0.15832615,  0.17838088],
       [ 0.09537298,  0.11918845,  0.11387329, ...,  0.08769809,
        -0.07916372,  0.07616784],
       [ 0.06334344,  0.09275149,  0.08400302, ...,  0.12024333,
        -0.08502059,  0.10590328]], dtype=float32)

In [40]:
# Create (or truncate) the two export files. The handles are closed right
# away: the following cell re-opens both files itself, so leaving these
# handles open would leak them once the names are rebound.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    pass

In [41]:
# Export the embeddings as two tab-separated files:
#   vectors.tsv  - one line per word, embedding components separated by tabs
#   metadata.tsv - the matching word on the same line number
# The `with` block guarantees both files are flushed and closed even if the
# loop raises part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")