##  Word Embeddings

In [None]:
import io
import matplotlib.pyplot as plt
import tensorflow as tf 
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow_datasets as tfds


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
def get_batch_data(batch_size=10, shuffle_buffer=1000):
  """Load the IMDB subword dataset and return shuffled, padded batches.

  Args:
    batch_size: Number of examples per padded batch (default 10).
    shuffle_buffer: Buffer size handed to ``Dataset.shuffle`` (default 1000).

  Returns:
    A ``(train_batches, test_batches, encoder)`` tuple, where ``encoder`` is
    the text encoder taken from the dataset's ``info.features['text']``.
  """
  (train_data, test_data), info = tfds.load(
      'imdb_reviews/subwords8k',
      split=(tfds.Split.TRAIN, tfds.Split.TEST),
      with_info=True,
      as_supervised=True)
  encoder = info.features['text'].encoder

  # First component is padded along its single (unknown-length) axis; the
  # second component is a scalar and needs no padding.
  padded_shapes = ([None], ())

  def _shuffle_and_batch(dataset):
    # Same pipeline for both splits: shuffle, then pad each batch.
    return dataset.shuffle(shuffle_buffer).padded_batch(
        batch_size, padded_shapes=padded_shapes)

  train_batches = _shuffle_and_batch(train_data)
  test_batches = _shuffle_and_batch(test_data)
  return train_batches, test_batches, encoder

In [None]:
def get_model(encoder, embedding_dim=16):
  """Build and compile a small binary classifier on top of an embedding.

  The network is Embedding -> GlobalAveragePooling1D -> Dense(1, sigmoid),
  compiled with the Adam optimizer, binary cross-entropy loss and an
  accuracy metric.

  Args:
    encoder: Object exposing ``vocab_size``; used as the embedding's
      input dimension.
    embedding_dim: Size of each embedding vector (default 16). The value
      passed by the caller is honoured rather than being overwritten.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  model = keras.Sequential([
      layers.Embedding(encoder.vocab_size, embedding_dim),
      layers.GlobalAveragePooling1D(),
      layers.Dense(1, activation='sigmoid'),
  ])

  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  return model



In [None]:
def plot_history(history):
  """Print and plot training vs. validation accuracy per epoch.

  Args:
    history: Object whose ``history`` attribute is a mapping containing
      the ``'accuracy'`` and ``'val_accuracy'`` series.
  """
  metrics = history.history
  train_acc = metrics['accuracy']
  valid_acc = metrics['val_accuracy']
  epoch_axis = range(1, len(train_acc) + 1)  # epochs are numbered from 1

  # Echo the x-axis and the raw training accuracies to stdout.
  print(type(epoch_axis), epoch_axis)
  print(train_acc)

  plt.figure(figsize=(12, 9))
  curves = (
      (train_acc, 'bo', 'Training acc'),
      (valid_acc, 'b', 'Validation acc'),
  )
  for series, fmt, label in curves:
    plt.plot(epoch_axis, series, fmt, label=label)

  plt.title('Training and validation accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.legend(loc='lower right')
  plt.ylim((0.5, 1))
  plt.show()

In [None]:

def retrieve_embeddings(model, encoder):
  """Write the embedding vectors and their subwords to TSV files.

  Two files are created (or overwritten) in the current directory:
  ``vecs.tsv`` gets one tab-separated vector per line, and ``meta.tsv``
  gets the matching subword on the same line number.

  Args:
    model: Model whose first layer (``model.layers[0]``) holds the
      embedding matrix as the first entry of ``get_weights()``.
    encoder: Object exposing ``subwords``, the ordered list of subwords.
  """
  weights = model.layers[0].get_weights()[0]  # layer 0 -> embedding layer

  # Context managers guarantee both files are closed even if a write fails.
  with io.open('vecs.tsv', 'w', encoding='utf-8') as out_vectors, \
       io.open('meta.tsv', 'w', encoding='utf-8') as out_metadata:
    for num, word in enumerate(encoder.subwords):
      # Row 0 is skipped: subword i is stored at row i + 1 (presumably
      # because the encoder reserves id 0 for padding).
      vec = weights[num + 1]
      out_metadata.write(word + '\n')
      # Vectors belong in vecs.tsv, not in the metadata file.
      out_vectors.write('\t'.join([str(x) for x in vec]) + '\n')

In [None]:

# End-to-end driver: load data, build the model, train, plot, export.
# Load the batched train/test splits and the text encoder.
train_batches, test_batches, encoder = get_batch_data()
# Build and compile the classifier for this encoder's vocabulary.
model = get_model(encoder)
# Train for 10 epochs; validation uses test_batches, with validation_steps=20
# passed to Keras to bound the validation work done each epoch.
history = model.fit(train_batches, epochs=10, validation_data=test_batches,
                    validation_steps=20)
# Show the training/validation accuracy curves recorded during fit.
plot_history(history)

# Write the embedding export files (vecs.tsv, meta.tsv) to the current directory.
retrieve_embeddings(model, encoder)