### Unsupervised dimensionality reduction using a 1 Hidden-layer perceptron where label == ground truth
### For NLP, we can say somewhat say that word2vec and autoencoders are similiar.

> Dimensionality reduction works only if the inputs are correlated (like images from the same domain). It fails if we pass in completely random inputs each time we train an autoencoder. So in the end, an autoencoder can produce lower dimensional output (at the encoder) given an input much like Principal Component Analysis (PCA). And since we don’t have to use any labels during training, it’s an unsupervised model as well.

In [27]:
import os
from random import randint
from collections import Counter
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
import tensorflow as tf

In [28]:
corpus = "the quick brown fox jumped over the lazy dog from the quick tall fox".split()
test_corpus = "the quick brown fox jumped over the lazy dog from the quick tall fox".split()
corpus[:10]

['the',
 'quick',
 'brown',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'dog',
 'from']

In [29]:
def build_vocab(words, vocab_size):
    """ Build vocabulary of VOCAB_SIZE most frequent words """
    dictionary = dict()
    count = [('UNK', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    for word, _ in count:
        dictionary[word] = index
        index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

In [30]:
vocabulary, reverse_vocabulary = build_vocab(corpus, 100)

In [31]:
vocabulary

{'UNK': 0,
 'brown': 4,
 'dog': 8,
 'fox': 3,
 'from': 9,
 'jumped': 5,
 'lazy': 7,
 'over': 6,
 'quick': 2,
 'tall': 10,
 'the': 1}

In [32]:
def index_words_in_corpus(corpus):
    return [vocabulary[token] if token in vocabulary else 0 for token in corpus]

In [33]:
corpus = index_words_in_corpus(corpus)
test_corpus = index_words_in_corpus(test_corpus)

In [34]:
test_corpus

[1, 2, 4, 3, 5, 6, 1, 7, 8, 9, 1, 2, 10, 3]

In [35]:
vocabulary_size = len(vocabulary)
vocabulary_size

11

In [36]:
def one_hot_encode(index):
    row = np.zeros(vocabulary_size, dtype=np.int32)
    row[index] = 1
    return row

In [37]:
data = np.array([one_hot_encode(i) for i in corpus])
test_data = np.array([one_hot_encode(i) for i in test_corpus])

In [38]:
print("(TRAIN: Total number of words, Vocabulary size):", data.shape)
print("(TEST:  Total number of words, Vocabulary size):", test_data.shape)

(TRAIN: Total number of words, Vocabulary size): (14, 11)
(TEST:  Total number of words, Vocabulary size): (14, 11)


In [41]:
data[randint(1, data.shape[0]-1)]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int32)

In [42]:
X = tf.placeholder(tf.float32, shape=(None, vocabulary_size))
Y = tf.placeholder(tf.float32, shape=(None, vocabulary_size))

In [43]:
w1 = tf.Variable(tf.random_normal(shape=(vocabulary_size, 1000), stddev=0.01), name='weights1')
b1 = tf.Variable(tf.zeros([1, 1000]), name="bias1")
layer1 = tf.nn.relu(tf.add(tf.matmul(X, w1), b1))

w2 = tf.Variable(tf.random_normal(shape=(1000, 250), stddev=0.01), name='weights2')
b2 = tf.Variable(tf.zeros([1, 250]), name="bias2")
layer2 = tf.nn.relu(tf.add(tf.matmul(layer1, w2), b2))

w = tf.Variable(tf.random_normal(shape=(250, 50), stddev=0.01), name='weights')
b = tf.Variable(tf.zeros([1, 50]), name="bias")
code = tf.nn.relu(tf.add(tf.matmul(layer2, w), b))

w3 = tf.Variable(tf.random_normal(shape=(50, 250), stddev=0.01), name='weights3')
b3 = tf.Variable(tf.zeros([1, 250]), name="bias3")
layer3 = tf.nn.relu(tf.add(tf.matmul(code, w3), b3))

w4 = tf.Variable(tf.random_normal(shape=(250, 1000), stddev=0.01), name='weights4')
b4 = tf.Variable(tf.zeros([1, 1000]), name="bias4")
layer4 = tf.nn.relu(tf.add(tf.matmul(layer3, w4), b4))

w5 = tf.Variable(tf.random_normal(shape=(1000, vocabulary_size), stddev=0.01), name='weights5')
b5 = tf.Variable(tf.zeros([1, vocabulary_size]), name="bias5")
decoder = tf.nn.sigmoid(tf.add(tf.matmul(layer4, w5), b5))

In [44]:
# entropy = tf.nn.softmax_cross_entropy_with_logits(logits=decoder, labels=Y)
loss = tf.reduce_mean(tf.pow(X - decoder, 2))

In [45]:
LEARNING_RATE = 0.01
optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(loss)
init = tf.global_variables_initializer()

In [46]:

NUM_TRAIN_STEPS = 1000
SKIP_STEP = 10 # how many steps to skip before reporting the loss

In [47]:
with tf.Session() as sess:
    sess.run(init)
    
    for i in range(NUM_TRAIN_STEPS):
        _, loss_val = sess.run([optimizer, loss], feed_dict={X: data})
            
        if i % SKIP_STEP == 0:
                print("EPOCH {}/{}, LOSS {}".format(i , NUM_TRAIN_STEPS, loss_val))
                
    test_data_compressed = sess.run(decoder, feed_dict={X: test_data})
#     np.save(outfile, test_data_compressed)
        

EPOCH 0/1000, LOSS 0.25
EPOCH 10/1000, LOSS 0.2494572252035141
EPOCH 20/1000, LOSS 0.24854058027267456
EPOCH 30/1000, LOSS 0.24700184166431427
EPOCH 40/1000, LOSS 0.24444027245044708
EPOCH 50/1000, LOSS 0.24024827778339386
EPOCH 60/1000, LOSS 0.23362566530704498
EPOCH 70/1000, LOSS 0.22383129596710205
EPOCH 80/1000, LOSS 0.2106136977672577
EPOCH 90/1000, LOSS 0.1934209018945694
EPOCH 100/1000, LOSS 0.16473940014839172
EPOCH 110/1000, LOSS 0.10684426873922348
EPOCH 120/1000, LOSS 0.08202390372753143
EPOCH 130/1000, LOSS 0.08009893447160721
EPOCH 140/1000, LOSS 0.07989192754030228
EPOCH 150/1000, LOSS 0.07984042167663574
EPOCH 160/1000, LOSS 0.07981313019990921
EPOCH 170/1000, LOSS 0.07979805767536163
EPOCH 180/1000, LOSS 0.08045154809951782
EPOCH 190/1000, LOSS 0.08049511909484863
EPOCH 200/1000, LOSS 0.08030319958925247
EPOCH 210/1000, LOSS 0.08018342405557632
EPOCH 220/1000, LOSS 0.08013363927602768
EPOCH 230/1000, LOSS 0.0801125019788742
EPOCH 240/1000, LOSS 0.08009006083011627
EPOCH

In [48]:
test_data_compressed.shape

(14, 11)

In [49]:
test_data_compressed

array([[ 0.00196448,  0.20408842,  0.13413024,  0.13414782,  0.06589694,
         0.06589296,  0.06589328,  0.06589003,  0.06589994,  0.06592028,
         0.0659008 ],
       [ 0.00196448,  0.20408842,  0.13413024,  0.13414782,  0.06589694,
         0.06589296,  0.06589328,  0.06589003,  0.06589994,  0.06592028,
         0.0659008 ],
       [ 0.00196448,  0.20408842,  0.13413024,  0.13414782,  0.06589694,
         0.06589296,  0.06589328,  0.06589003,  0.06589994,  0.06592028,
         0.0659008 ],
       [ 0.00196448,  0.20408842,  0.13413024,  0.13414782,  0.06589694,
         0.06589296,  0.06589328,  0.06589003,  0.06589994,  0.06592028,
         0.0659008 ],
       [ 0.00196448,  0.20408842,  0.13413024,  0.13414782,  0.06589694,
         0.06589296,  0.06589328,  0.06589003,  0.06589994,  0.06592028,
         0.0659008 ],
       [ 0.00196448,  0.20408842,  0.13413024,  0.13414782,  0.06589694,
         0.06589296,  0.06589328,  0.06589003,  0.06589994,  0.06592028,
         0.065

### Since our compressed data is in probabilities, we'll convert  to whole nums to look up words

In [50]:
test_data_compressed[test_data_compressed>0] = 1

In [51]:
test_data_compressed

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]], dtype=float32)

In [52]:
test_data

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

### Tadaa!!! And here's our prediction
This show's how well our compression is able to recover data
> Remember that Autoencoders are lossy compression which means you will never be able to full reconstruct that data

In [53]:
sent = np.ndarray.tolist(test_data_compressed)[0]
print(' '.join([reverse_vocabulary[i] if sent[i] == 1. else "" for i in range(len(sent))]))

UNK the quick fox brown jumped over lazy dog from tall
