# Using the Embedding Projector in TensorBoard

Based on the TensorBoard tutorial at this [link](https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin).

Data can be found [here](https://www.kaggle.com/akudnaver/amazon-reviews-dataset).


## Setup

For this tutorial, we will be using TensorBoard to visualize an embedding layer generated for classifying Amazon review data.

In [30]:
'''try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass'''

%load_ext tensorboard

In [1]:
import os
import tensorflow as tf
from tensorboard.plugins import projector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.utils import to_categorical
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils

from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Build the set of English stop words used when cleaning the reviews.
# The NLTK corpus reader is imported under a private alias so that binding the
# name `stopwords` to a set no longer shadows the reader: the previous
# `stopwords = set(stopwords.words('english'))` raised AttributeError as soon
# as this cell was executed a second time.
from nltk.corpus import stopwords as _nltk_stopwords

stopwords = set(_nltk_stopwords.words('english'))

In [3]:
# Report the versions of the main libraries, one per line
# (TensorFlow, pandas, NLTK - in that order).
for library in (tf, pd, nltk):
    print(library.__version__)

2.8.0
1.3.4
3.6.7


In [4]:
# Never truncate cell contents when displaying frames, so full review texts are
# readable. The fully-qualified option name is used: the abbreviated
# 'max_colwidth' only works through pattern matching and stops working if
# another option name ever matches the same pattern.
pd.set_option('display.max_colwidth', None)

In [5]:
# Load the reviews from the Excel file, reading only the three columns used in
# this notebook (title, text and rating).
df = pd.read_excel('review-details.xlsx', engine = 'openpyxl', usecols= ['review_title', 'review_text', 'review_rating'])

In [6]:
# Display one randomly chosen row as a quick sanity check of the loaded data.
df.sample()

Unnamed: 0,review_rating,review_title,review_text
2198,5,Excellent price.,This is a great offer works out £5 a bag of 38. Will buy again.


In [7]:
# Count the missing values in each column.
df.isnull().sum()

review_rating     0
review_title     98
review_text       0
dtype: int64

In [8]:
# Feature frame: every column except the rating target.
X = df.drop(columns='review_rating')
# Independent copy with a fresh 0..n-1 index; the previous index is kept as an
# ordinary 'index' column by reset_index().
messages = X.copy().reset_index()

In [11]:
# Build `corpus`: one cleaned string per review, in the row order of `messages`.
ps = PorterStemmer()
corpus = []
for raw_text in messages['review_text']:
    # Replace every character that is not an ASCII letter with a space, then
    # lower-case and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Discard stop words and reduce the remaining words to their Porter stem.
    stems = [ps.stem(token) for token in tokens if token not in stopwords]
    corpus.append(' '.join(stems))

In [14]:
# Inspect one cleaned review to check the result of the preprocessing loop.
corpus[110]

'exactli say kitchen sink alway get block sinc put block last month'

In [15]:
# Encode the review ratings as integer class ids and store them on the frame.
LE = LabelEncoder()
df['relevance_enc'] = LE.fit_transform(df['review_rating'])

# Features: the review text. Target: the encoded rating we learn to predict.
X = df['review_text']
y = df['relevance_enc']

In [16]:
# Sorted list of the distinct encoded class ids present in the data.
text_tags = sorted(df['relevance_enc'].unique())

In [17]:
# Fixed seed shared by both splits so every run produces the same partition.
_split_seed = 2022

# First split: hold out 10% of all rows as the test set, keeping the class
# proportions of `y` in both parts.
X_tr, X_test, y_tr, y_test = train_test_split(
    X, y, train_size=0.9, test_size=0.1, random_state=_split_seed, stratify=y
)

# Second split: carve 20% of the remaining rows out as the validation set,
# again stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_tr, y_tr, train_size=0.8, test_size=0.2, random_state=_split_seed, stratify=y_tr
)

In [18]:
# Text/label aliases for the three partitions produced by the splits.
train_texts, train_labels = X_train, y_train
val_texts, val_labels = X_val, y_val
test_texts, test_labels = X_test, y_test

In [19]:
# Vocabulary cap: passed as `num_words` to the Tokenizer, as the input size of
# the Embedding layer and as `max_features` to the CountVectorizer.
MAX_NB_WORDS = 1000
# Length every token-id sequence is brought to by `pad_sequences` (`maxlen`).
MAX_SEQUENCE_LENGTH = 20
# Size of each embedding vector; also the unit count of the first Dense layer.
EMBEDDING_DIM = 30

In [20]:
# The tokenizer vocabulary is learned from `corpus`, i.e. the cleaned reviews
# (letters only, lower-cased, stop words removed, stemmed). The sequences fed
# to the model must be built from that same cleaned text: converting the raw
# reviews instead (as this cell used to do) silently drops every word whose raw
# form differs from its stem, because it is not in the fitted vocabulary.
# The filter string is a raw literal so that `\]` is not an invalid escape
# sequence; its value is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(corpus)


def _cleaned_texts(texts):
    """Return the cleaned `corpus` strings for the rows of `df` selected by `texts.index`."""
    # `corpus` is ordered by row position in `df`, so translate the index
    # labels of the split into positions before looking the strings up.
    return [corpus[position] for position in df.index.get_indexer(texts.index)]


# Convert each cleaned review to a list of word ids.
train_sequences = tokenizer.texts_to_sequences(_cleaned_texts(train_texts))
test_sequences = tokenizer.texts_to_sequences(_cleaned_texts(test_texts))
val_sequences = tokenizer.texts_to_sequences(_cleaned_texts(val_texts))
word_index = tokenizer.word_index

In [21]:
# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries.
trainvalid_data, test_data, val_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences, val_sequences)
)

# One-hot encode the integer class ids of each partition. The tuple of inputs
# is built before any of the names on the left is rebound.
trainvalid_labels, test_labels, val_labels = (
    to_categorical(np.asarray(labels))
    for labels in (train_labels, test_labels, val_labels)
)

# Names used by the training cell.
x_train, y_train = trainvalid_data, trainvalid_labels
x_val, y_val = val_data, val_labels

#w = 0

# Keras Embedding Layer

A [Keras Embedding Layer](https://keras.io/layers/embeddings/) can be used to train an embedding for each word in your vocabulary. Each word will be associated with a 30-dimensional vector (or embedding, sized by `EMBEDDING_DIM`) that will be trained by the model.

See [this tutorial](https://www.tensorflow.org/tutorials/text/word_embeddings?hl=en) to learn more about word embeddings.

In [17]:
# Run only to clean model info

#tf.keras.backend.clear_session()

In [22]:
# Embedding layer: one EMBEDDING_DIM-sized vector for each of the MAX_NB_WORDS
# possible token ids.
embedding = tf.keras.layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM)

# One output unit per class, taken from the width of the one-hot label matrix
# instead of a hard-coded 5.
num_classes = y_train.shape[1]

# Configure the embedding layer as part of a Keras model.
model = tf.keras.Sequential(
    [
        embedding,  # The embedding layer should be the first layer in a model.
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(EMBEDDING_DIM, activation="relu"),
        tf.keras.layers.Dense(10, activation="relu"),
        # Softmax output: the 'categorical_crossentropy' loss below expects
        # class probabilities. The previous activation-free layer produced raw
        # logits, which that loss does not interpret correctly.
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ]
)

# Compile model.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for 10 epochs, validating on the whole validation set after each one.
# `validation_steps` is not passed: it is meant for tf.data inputs and, with
# in-memory arrays, only capped validation at the first 20 batches.
history = model.fit(
    x_train, y_train, epochs=10, validation_data=(x_val, y_val)
)

Epoch 1/10


2022-03-08 22:56:24.472164: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Saving Data for TensorBoard

TensorBoard reads tensors and metadata from the logs of your TensorFlow projects. The path to the log directory is specified with `log_dir` below. For this tutorial, we will be using `./logs/amazon-example/` (relative to the notebook's working directory).

In order to load the data into TensorBoard, we need to save a training checkpoint to that directory, along with metadata that allows for visualization of a specific layer of interest in the model.

In [23]:
# Set up a logs directory, so TensorBoard knows where to look for files.
log_dir='./logs/amazon-example/'
# exist_ok=True makes the call a no-op when the directory is already there,
# replacing the separate existence check.
os.makedirs(log_dir, exist_ok=True)

In [24]:
# Fit a CountVectorizer on the training reviews, keeping at most MAX_NB_WORDS
# terms; its `vocabulary_` mapping is read in the following cells.
vect = CountVectorizer(max_features=MAX_NB_WORDS) #instantiate a vectorizer

X_train_dtm = vect.fit_transform(X_train)

In [None]:
# Illustrative note (not executable code): a text is represented as a vector of
# integers, for example
#   'she did a great job' -> [213, 59, 96, 128, 0, 0, ..., 0]

In [None]:
# Illustrative note (not executable code): a longer text is represented the
# same way, for example
#   'In order to load the data into Tensorboard, we need to save a training
#    checkpoint to that directory, along with metadata that allows for
#    visualization of a specific layer of interest in the model. '
#   -> [..., 123, ..., ]

In [25]:
# Preview the first five entries of the vectorizer vocabulary: the terms, then
# the integer each term is mapped to.
vocab_terms = list(vect.vocabulary_.keys())
vocab_ids = list(vect.vocabulary_.values())
print(vocab_terms[:5])
print(vocab_ids[:5])

['ok', 'absolutely', 'delicious', 'the', 'kids']
[600, 12, 215, 869, 451]


In [26]:
#vect.vocabulary_.keys()
def getList(dict):
    """Return the keys of a mapping as a new list.

    Args:
        dict: Mapping whose keys are collected. (The parameter name shadows
            the builtin; it is kept for compatibility with existing callers.)

    Returns:
        A new list holding the keys in the mapping's iteration order; empty
        when the mapping is empty.
    """
    # A comprehension replaces the manual append loop and avoids the local
    # variable that used to shadow the builtin `list`.
    return [key for key in dict.keys()]

# Terms of the fitted CountVectorizer vocabulary, in the dictionary's iteration order.
encoder = getList(vect.vocabulary_)

In [27]:
# Save the labels, one per line, in the same order as the embedding rows.
# Row i of the embedding matrix belongs to Tokenizer id i, so the label for
# that row must be the Tokenizer's word for id i. (The labels were previously
# taken from the CountVectorizer vocabulary, whose ordering is unrelated to the
# Tokenizer ids, and roughly twice as many lines as embedding rows were
# written.) Id 0 is skipped because it is not assigned to any word; ids beyond
# the fitted vocabulary get an "unknown" placeholder so that exactly
# MAX_NB_WORDS - 1 lines are written.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    for token_id in range(1, MAX_NB_WORDS):
        label = tokenizer.index_word.get(token_id, "unknown #{}".format(token_id))
        f.write("{}\n".format(label))

In [28]:
# Pull the trained embedding matrix out of the first layer of the model and
# drop its first row (index 0), which has no entry in the metadata file.
embedding_matrix = model.layers[0].get_weights()[0]
weights = tf.Variable(embedding_matrix[1:])

# Save the matrix as a checkpoint; the keyword used here ("embedding") becomes
# the name of the tensor in the checkpoint.
checkpoint_prefix = os.path.join(log_dir, "embedding.ckpt")
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(checkpoint_prefix)

# Describe the saved tensor to the projector plugin.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The tensor name is the checkpoint key suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Path of the label file (written into log_dir by an earlier cell).
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [32]:
# Now run tensorboard against on log data we just saved.
%tensorboard --logdir /Users/anieto/Downloads/logs/amazon-example/

Reusing TensorBoard on port 6006 (pid 19003), started 0:00:09 ago. (Use '!kill 19003' to kill it.)

<!-- <img class="tfo-display-only-on-site" src="images/embedding_projector.png?raw=1"/> -->