# Using the Embedding Projector in TensorBoard

Based on the [TensorBoard Embedding Projector tutorial](https://www.tensorflow.org/tensorboard/tensorboard_projector_plugin).

Data can be found [here](https://www.kaggle.com/akudnaver/amazon-reviews-dataset).


## Setup

For this tutorial, we will use TensorBoard to visualize an embedding layer trained while classifying Amazon review data.

In [1]:
# Colab-only TensorFlow version selection, disabled by wrapping it in a string literal.
'''try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass'''

# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [2]:
import os

# NN libraries and embeddings
import tensorflow as tf
from tensorboard.plugins import projector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.utils import to_categorical
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils

from sklearn.feature_extraction.text import CountVectorizer

# Data processing
import pandas as pd
import numpy as np
import nltk
import re

# Text preprocessing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim import utils
import gensim.parsing.preprocessing as gsp

In [3]:
# Full English stop-word list as a set for fast membership tests.
# Read via `nltk.corpus.stopwords` rather than the bare name `stopwords`, because
# this assignment rebinds `stopwords` to the set: re-running the cell would
# otherwise fail with "'set' object has no attribute 'words'".
stopwords = set(nltk.corpus.stopwords.words('english'))

In [4]:
# Print the library versions of the current environment (TensorFlow, pandas, NLTK).
print(tf.__version__)
print(pd.__version__)
print(nltk.__version__)
# Versions noted by the author (presumably from another environment - TODO confirm):
#   2.8.0 / 1.3.4 / 3.6.7
# Kept as a comment: a bare string literal at the end of a cell is echoed as output.

2.5.0
1.3.4
3.7


'2.8.0\n1.3.4\n3.6.7\n'

In [5]:
# Show full cell contents instead of truncating long text columns.
pd.set_option('max_colwidth', None)

In [7]:
# Load only the three review columns used in this notebook from the Excel file.
df = pd.read_excel('review-details.xlsx', engine = 'openpyxl', usecols= ['review_title', 'review_text', 'review_rating'])

In [13]:
# Peek at three randomly chosen rows.
df.sample(3)

Unnamed: 0,review_rating,review_title,review_text
2008,5,Great savings,Love this smell
315,5,Does what its made for,Brilliant product! Left the drains clear and clean
170,5,,I will be buying again thank


In [16]:
# Compiled once at definition time instead of on every call. Raw strings avoid
# the invalid-escape warnings that '\[' and '\|' trigger in ordinary literals.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Negations carry sentiment, so they are kept even though NLTK lists them as stop words.
_NEGATIONS_TO_KEEP = {'not', 'no'}
_clean_text_stopwords_cache = None


def _clean_text_stopwords():
    """Return the English stop-word set minus the kept negations, building it once."""
    global _clean_text_stopwords_cache
    if _clean_text_stopwords_cache is None:
        # Imported locally under an alias because the notebook rebinds the
        # module-level name `stopwords` to a plain set.
        from nltk.corpus import stopwords as _nltk_stopwords
        _clean_text_stopwords_cache = set(_nltk_stopwords.words('english')) - _NEGATIONS_TO_KEEP
    return _clean_text_stopwords_cache


def clean_text(text):
    """Lowercase `text`, strip symbols and drop English stop words.

    Steps, in order:
      1. lowercase the text;
      2. replace the separators ``/(){}[]|@,;`` with a space;
      3. replace every character outside ``0-9a-z #+_`` with a space;
      4. drop NLTK English stop words, except 'not' and 'no'.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub(' ', text)
    stop_words = _clean_text_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Contraction -> expansion lookup table.
# Every key uses the typographic apostrophe (’): the cleaning pipeline converts
# all apostrophe variants to ’ before looking contractions up, so a key that
# contains an ASCII ' could never match.
contraction_dict = {"ain’t": "is not", "aren’t": "are not","can’t": "cannot", "’cause": "because",
                    "could’ve": "could have", "couldn’t": "could not", "didn’t": "did not",
                    "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not", "hasn’t": "has not",
                    "haven’t": "have not", "he’d": "he would","he’ll": "he will", "he’s": "he is",
                    "how’d": "how did", "how’d’y": "how do you", "how’ll": "how will", "how’s": "how is",
                    "I’d": "I would", "I’d’ve": "I would have", "I’ll": "I will", "I’ll’ve": "I will have",
                    "I’m": "I am", "I’ve": "I have", "i’d": "i would", "i’d’ve": "i would have",
                    "i’ll": "i will",  "i’ll’ve": "i will have","i’m": "i am", "i’ve": "i have",
                    "isn’t": "is not", "it’d": "it would", "it’d’ve": "it would have", "it’ll": "it will",
                    "it’ll’ve": "it will have","it’s": "it is", "let’s": "let us", "ma’am": "madam",
                    "mayn’t": "may not", "might’ve": "might have","mightn’t": "might not",
                    "mightn’t’ve": "might not have", "must’ve": "must have", "mustn’t": "must not",
                    "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have",
                    "o’clock": "of the clock", "oughtn’t": "ought not", "oughtn’t’ve": "ought not have",
                    "shan’t": "shall not", "sha’n’t": "shall not", "shan’t’ve": "shall not have",
                    "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will",
                    "she’ll’ve": "she will have", "she’s": "she is", "should’ve": "should have",
                    "shouldn’t": "should not", "shouldn’t’ve": "should not have", "so’ve": "so have",
                    "so’s": "so as", "this’s": "this is","that’d": "that would", "that’d’ve": "that would have",
                    "that’s": "that is", "there’d": "there would", "there’d’ve": "there would have",
                    "there’s": "there is", "here’s": "here is","they’d": "they would",
                    "they’d’ve": "they would have", "they’ll": "they will", "they’ll’ve": "they will have",
                    "they’re": "they are", "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not",
                    "we’d": "we would", "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have",
                    "we’re": "we are", "we’ve": "we have", "weren’t": "were not", "what’ll": "what will",
                    "what’ll’ve": "what will have", "what’re": "what are",  "what’s": "what is",
                    "what’ve": "what have", "when’s": "when is", "when’ve": "when have", "where’d": "where did",
                    "where’s": "where is", "where’ve": "where have", "who’ll": "who will",
                    "who’ll’ve": "who will have", "who’s": "who is", "who’ve": "who have", "why’s": "why is",
                    "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t’ve": "will not have",
                    "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have",
                    "y’all": "you all", "y’all’d": "you all would","y’all’d’ve": "you all would have",
                    "y’all’re": "you all are","y’all’ve": "you all have","you’d": "you would",
                    "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have",
                    "you’re": "you are", "you’ve": "you have"}

def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

# Module-level lookup table and compiled pattern, both read by replace_contractions.
contractions, contractions_re = _get_contractions(contraction_dict)

def replace_contractions(text):
    """Expand every contraction found in `text`.

    Each match of the module-level `contractions_re` is replaced by its entry
    in the module-level `contractions` table; the rest of the text is untouched.
    """
    return contractions_re.sub(lambda found: contractions[found.group(0)], text)

def remove_URL(text):
    """Delete every run of non-whitespace characters that starts with 'http'."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

# Gensim preprocessing steps applied, in this order, by clean_gsm_key.
# remove_stopwords, strip_short and stem_text are not part of this pipeline.
filters_key = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
]


def clean_gsm_key(s):
    """Convert `s` to unicode and run it through each step in `filters_key`.

    The input is not lowercased. Returns the filtered string.
    """
    cleaned = utils.to_unicode(s)
    for step in filters_key:
        cleaned = step(cleaned)
    return cleaned

# Gensim preprocessing steps applied, in this order, by clean_gsm.
# Unlike filters_key this list ends with strip_short; remove_stopwords and
# stem_text are not part of this pipeline.
filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.strip_short,
]


def clean_gsm(s):
    """Convert `s` to unicode and run it through each step in `filters`.

    The input is not lowercased. Returns the filtered string.
    """
    cleaned = utils.to_unicode(s)
    for step in filters:
        cleaned = step(cleaned)
    return cleaned

def contrac_lemm(df, comments):
    '''
    Lemmatize the text column `comments` of `df`, treating every token as a verb.

    The text is split on whitespace, each token is passed through NLTK's
    WordNetLemmatizer with part of speech "v", and the tokens are joined back
    with single spaces. Assumes the text is already lower case.

    `df` is modified in place: the token lists are stored in a
    'list_lemmatized' column and `comments` is overwritten with the joined
    text. The same `df` object is returned.
    '''
    import nltk
    # nltk.download('wordnet') may be required once before first use.
    whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    verb_lemmatizer = nltk.stem.WordNetLemmatizer()

    df['list_lemmatized'] = df[comments].apply(
        lambda text: [
            verb_lemmatizer.lemmatize(token, "v")
            for token in whitespace_tokenizer.tokenize(text)
        ]
    )
    df[comments] = [' '.join(map(str, tokens)) for tokens in df['list_lemmatized']]
    return df

In [18]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,review_rating,review_title,review_text,clean_comments
0,5,Dove Men’s + Deodorant,"As you get older, you know what you like and what is suitable for your body. I like all Dove products. Gives you that fresh all over, wide awake feeling and no dandruff or flakey skin. No smelly a/pits!",get older know like suitable body like dove products gives fresh wide awake feeling no dandruff flakey skin no smelly pits
1,5,Great for a marmite lover!,"Three gigantic marmite jars that will last probably a whole life! What else would you possibly wish for? Order came in time, when mentioned, safely packed. Very happy with it.",three gigantic marmite jars last probably whole life else would possibly wish order came time mentioned safely packed happy


In [17]:
# Apostrophe look-alikes (including mis-encoded forms) that are normalised to
# the typographic apostrophe used by the contraction table keys. Both
# '‚Äô' and '‚äô' are listed because the text has been lowercased first.
apostrophe_variants = ['\'', '´', '‚Äô', '‚äô', 'â€™']

# Always start from the raw review text so the cell can be re-run safely.
# NOTE(review): str(x) turns a missing value into the literal text 'nan'.
clean_comments = df["review_text"].apply(lambda x: str(x).lower())
for variant in apostrophe_variants:
    # regex=False: these are literal substrings, not patterns.
    clean_comments = clean_comments.str.replace(variant, '’', regex=False)
clean_comments = clean_comments.apply(remove_URL)
clean_comments = clean_comments.apply(replace_contractions)
clean_comments = clean_comments.apply(clean_text)
# regex=True must be explicit: pandas no longer treats the pattern as a regex
# by default, which would leave the digits in place.
df['clean_comments'] = clean_comments.str.replace(r'\d+', ' ', regex=True)

  # Remove the CWD from sys.path while we load stuff.


In [19]:
# Count missing values per column.
df.isnull().sum()

review_rating      0
review_title      98
review_text        0
clean_comments     0
dtype: int64

In [20]:
# Feature frame: every column except the rating.
X = df.drop(columns='review_rating')
# Independent copy with a fresh 0..n-1 index; the old index is kept as an 'index' column.
messages = X.copy().reset_index()

In [21]:
# Build a stemmed corpus: keep letters only, lowercase, drop stop words and
# reduce every remaining word to its Porter stem.
ps = PorterStemmer()
corpus = []
for comment in messages['clean_comments']:
    letters_only = re.sub('[^a-zA-Z]', ' ', comment).lower()
    stems = [ps.stem(token) for token in letters_only.split() if token not in stopwords]
    corpus.append(' '.join(stems))

In [22]:
# Inspect one stemmed review as a sanity check.
corpus[110]

'exactli say kitchen sink alway get block sinc put block last month'

In [23]:
# Encode the review ratings as consecutive integer class ids.
LE = LabelEncoder()
df['relevance_enc'] = LE.fit_transform(df['review_rating'])

# Model input: the cleaned review text. Target: the encoded rating.
X = df['clean_comments']
y = df['relevance_enc']

In [24]:
# Sorted list of the distinct encoded class ids.
text_tags = list(np.sort(df['relevance_enc'].unique()))

In [52]:
# Hold out 10% of the data as the test set. Stratifying on the target keeps the
# class proportions equal in both parts; the fixed seed makes the split repeatable.
X_tr, X_test, y_tr, y_test = train_test_split(
    X, y, test_size=0.1, train_size=0.9, stratify=y, random_state=2022
)

# Split the remaining data the same way into training (80%) and validation (20%).
X_train, X_val, y_train, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, train_size=0.8, stratify=y_tr, random_state=2022
)

In [53]:
# Text / label aliases for each split, used by the tokenizing and padding cells.
train_texts, train_labels = X_train, y_train
val_texts, val_labels = X_val, y_val
test_texts, test_labels = X_test, y_test

In [54]:
# Vocabulary size limit: passed to the Tokenizer (num_words), the Embedding
# layer (input size) and the CountVectorizer (max_features).
MAX_NB_WORDS = 10000
# Every review is padded / truncated to this many tokens (pad_sequences maxlen).
MAX_SEQUENCE_LENGTH = 30
# Size of each word vector; also the width of the first Dense layer.
EMBEDDING_DIM = 100

In [55]:
# Word-level tokenizer capped by MAX_NB_WORDS. `filters` lists the characters
# stripped before splitting (the backslash is written as '\\' so the literal
# contains no invalid escape sequence; the runtime string is unchanged).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
# Fit on the training texts: the vocabulary must be built from the same
# (unstemmed) representation that is encoded below, otherwise every word whose
# stem differs from its surface form is silently dropped by texts_to_sequences.
# Using only the training split also keeps validation/test data out of the vocabulary.
tokenizer.fit_on_texts(train_texts)

# Convert each text to a list of word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
word_index = tokenizer.word_index

In [56]:
# Bring every index sequence to exactly MAX_SEQUENCE_LENGTH entries.
trainvalid_data, test_data, val_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences, val_sequences)
)

# One-hot encode the integer class ids of each split.
# NOTE(review): test_labels and val_labels are overwritten here, so this cell
# must not be re-run without first re-running the cell that defines them.
trainvalid_labels, test_labels, val_labels = (
    to_categorical(np.asarray(labels))
    for labels in (train_labels, test_labels, val_labels)
)

# Names used by the training cell.
x_train, y_train = trainvalid_data, trainvalid_labels
x_val, y_val = val_data, val_labels

# Keras Embedding Layer

A [Keras Embedding Layer](https://keras.io/layers/embeddings/) can be used to train an embedding for each word in your vocabulary. Each word will be associated with an `EMBEDDING_DIM`-dimensional vector (100 dimensions in this notebook), also called an embedding, that will be trained by the model.

See [this tutorial](https://www.tensorflow.org/tutorials/text/word_embeddings?hl=en) to learn more about word embeddings.

In [57]:
# Reset the global Keras state. Run this before (re)building the model so
# that models left over from earlier runs of the cells below are discarded.

tf.keras.backend.clear_session()

In [58]:
# Embedding layer: one trainable EMBEDDING_DIM-sized vector for each of the
# MAX_NB_WORDS word indices.
embedding = tf.keras.layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM)

# Assemble the classifier layer by layer; the embedding has to come first.
model = tf.keras.Sequential()
model.add(embedding)
# Average the word vectors of a review into a single vector.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(EMBEDDING_DIM, activation="relu"))
model.add(tf.keras.layers.Dense(10, activation="relu"))
# Five output units with no activation, i.e. the model emits raw scores (logits).
model.add(tf.keras.layers.Dense(5))

In [59]:
# Compile model.
# The last Dense layer has no activation, so the model outputs raw logits.
# The loss must be told so (from_logits=True); the plain string
# 'categorical_crossentropy' would treat those logits as probabilities.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Train for 10 epochs, evaluating on the whole validation split after each one.
# validation_steps is not passed: it is meant for dataset inputs and would cap
# (or exhaust) the in-memory validation arrays.
history = model.fit(
    x_train, y_train, epochs=10, validation_data=(x_val, y_val)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Saving Data for TensorBoard

TensorBoard reads tensors and metadata from the logs of your TensorFlow projects. The path to the log directory is specified with `log_dir` below. For this tutorial, we will be using `./logs/amazon-example/`.

In order to load the data into TensorBoard, we need to save a training checkpoint to that directory, along with metadata that allows a specific layer of interest in the model to be visualized.

In [60]:
# Set up a logs directory, so TensorBoard knows where to look for files.
log_dir='./logs/amazon-example/'
# exist_ok=True makes the call safe to repeat and avoids the check-then-create race.
os.makedirs(log_dir, exist_ok=True)

In [61]:
# Bag-of-words vectorizer limited to MAX_NB_WORDS features.
vect = CountVectorizer(max_features=MAX_NB_WORDS) #instantiate a vectorizer

# Learn the vocabulary from the training texts and build their document-term
# matrix; vect.vocabulary_ is read further down to list the vocabulary words.
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# Illustration only (not executable code): a text becomes a fixed-length
# sequence of word indices, with 0 used as filler, e.g.
#   'she did a great job' -> [213, 59, 96, 128, 0, 0, ..., 0]

In [None]:
# Illustration only (not executable code): a longer text is encoded the same way, e.g.
#   'In order to load the data into Tensorboard, we need to save a training
#    checkpoint to that directory, along with metadata that allows for
#    visualization of a specific layer of interest in the model. '
#   -> [..., 123, ..., ]

In [33]:
# Show the first five vocabulary words and the feature indices
# CountVectorizer assigned to them.
print(list(vect.vocabulary_.keys())[0:5])
print(list(vect.vocabulary_.values())[0:5])

['ok', 'absolutely', 'delicious', 'kids', 'not']
[594, 2, 210, 438, 582]


In [62]:
#vect.vocabulary_.keys()
def getList(dict):
    """Return the keys of the mapping `dict` as a list, in iteration order.

    The parameter name is kept for backward compatibility even though it
    shadows the builtin; the body no longer shadows the builtin `list`.
    """
    return [key for key in dict.keys()]

# List of the words in the CountVectorizer vocabulary.
encoder = getList(vect.vocabulary_)

In [63]:
# Save the labels one per line. Line k of the file must describe row k of the
# exported embedding matrix. That matrix holds embedding rows
# 1..MAX_NB_WORDS-1 (row 0 is dropped), and row i belongs to the word the
# Keras tokenizer mapped to index i, so the labels are read from the tokenizer
# in index order rather than from an unrelated vocabulary.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    for word_id in range(1, MAX_NB_WORDS):
        # Indices the tokenizer never assigned get a placeholder label so the
        # number of lines always equals the number of exported rows.
        label = tokenizer.index_word.get(word_id, "unknown #{}".format(word_id))
        f.write("{}\n".format(label))

In [64]:
# Save the weights we want to analyze as a variable. Row 0 of the embedding
# matrix belongs to index 0, which Keras uses for padding (word indices start
# at 1); it has no label in the metadata, so it is removed here.
weights = tf.Variable(model.layers[0].get_weights()[0][1:])
# Create a checkpoint from the embedding; the keyword used here ("embedding")
# becomes the name of the tensor inside the checkpoint.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up the projector config. A separate name is used so that `embedding`
# keeps referring to the Keras layer instead of being rebound to a config entry.
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding_config.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Path is relative to log_dir.
embedding_config.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [65]:
# Now run TensorBoard against the log data we just saved.
%tensorboard --logdir ./logs/amazon-example/

Reusing TensorBoard on port 6006 (pid 26470), started 0:12:01 ago. (Use '!kill 26470' to kill it.)

<!-- <img class="tfo-display-only-on-site" src="images/embedding_projector.png?raw=1"/> -->