# Visualizing Word Embeddings with TensorBoard

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import zipfile
from tensorboard.plugins import projector
import csv

## Read the GloVe file

First, download the GloVe word embeddings (`glove.6B.zip`) from this [website](https://nlp.stanford.edu/projects/glove/). **Download it and place it in the `Appendix` folder.** We then read the first 50000 words from the GloVe file, using the 50-dimensional word vectors.

In [5]:
# Number of words (lines) to read from the top of the GloVe file
vocabulary_size = 50000
# Dimensionality of the vectors to load; it selects the matching file inside
# the archive and is the length every parsed vector is checked against
embedding_dim = 50

vectors = []  # one list of floats per word, in file order
index = []    # the words themselves; becomes the DataFrame index
# Open the zip file
with zipfile.ZipFile('glove.6B.zip') as glovezip:
    # Read the file holding the embeddings of the chosen dimensionality
    with glovezip.open(f'glove.6B.{embedding_dim}d.txt') as glovefile:
        # The archive member is opened as a binary stream, so each line is bytes
        for li, line in enumerate(glovefile):
            # Print progress: one dot for every 10000 lines read
            if (li+1)%10000==0: print('.',end='')

            # A line is "<word> <v1> <v2> ..."; strip the line ending first so
            # that it never ends up as (part of) a token
            line_tokens = line.decode('utf-8').rstrip().split(' ')
            word = line_tokens[0]
            vector = [float(v) for v in line_tokens[1:]]

            # Fail loudly, and say where, if a line does not have the expected length
            assert len(vector)==embedding_dim, (
                f"line {li+1} ({word!r}): expected {embedding_dim} values, got {len(vector)}"
            )
            index.append(word)
            vectors.append(vector)

            # Stop once the first `vocabulary_size` words have been read
            if li >= vocabulary_size-1:
                break

# Rows are words, columns are the embedding dimensions 0..embedding_dim-1
embedding_df = pd.DataFrame(vectors, index=index)
embedding_df.head(n=10)

.....

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
",",0.013441,0.23682,-0.16899,0.40951,0.63812,0.47709,-0.42852,-0.55641,-0.364,-0.23938,...,-0.080262,0.63003,0.32111,-0.46765,0.22786,0.36034,-0.37818,-0.56657,0.044691,0.30392
.,0.15164,0.30177,-0.16763,0.17684,0.31719,0.33973,-0.43478,-0.31086,-0.44999,-0.29486,...,-6.4e-05,0.068987,0.087939,-0.10285,-0.13931,0.22314,-0.080803,-0.35652,0.016413,0.10216
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,...,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
to,0.68047,-0.039263,0.30186,-0.17792,0.42962,0.032246,-0.41376,0.13228,-0.29847,-0.085253,...,-0.094375,0.018324,0.21048,-0.03088,-0.19722,0.082279,-0.09434,-0.073297,-0.064699,-0.26044
and,0.26818,0.14346,-0.27877,0.016257,0.11384,0.69923,-0.51332,-0.47368,-0.33075,-0.13834,...,-0.069043,0.36885,0.25168,-0.24517,0.25381,0.1367,-0.31178,-0.6321,-0.25028,-0.38097
in,0.33042,0.24995,-0.60874,0.10923,0.036372,0.151,-0.55083,-0.074239,-0.092307,-0.32821,...,-0.48609,-0.008027,0.031184,-0.36576,-0.42699,0.42164,-0.11666,-0.50703,-0.027273,-0.53285
a,0.21705,0.46515,-0.46757,0.10082,1.0135,0.74845,-0.53104,-0.26256,0.16812,0.13182,...,0.13813,0.36973,-0.64289,0.024142,-0.039315,-0.26037,0.12017,-0.043782,0.41013,0.1796
"""",0.25769,0.45629,-0.76974,-0.37679,0.59272,-0.063527,0.20545,-0.57385,-0.29009,-0.13662,...,0.030498,-0.39543,-0.38515,-1.0002,0.087599,-0.31009,-0.34677,-0.31438,0.75004,0.97065
's,0.23727,0.40478,-0.20547,0.58805,0.65533,0.32867,-0.81964,-0.23236,0.27428,0.24265,...,-0.12342,0.65961,-0.51802,-0.82995,-0.082739,0.28155,-0.423,-0.27378,-0.007901,-0.030231


## Create TensorFlow Variable and config

Here we create a TensorFlow variable to store the embeddings we read above and save it to disk as a checkpoint. This is necessary for the visualization. Along with that, we save a metadata file, which contains the label (the word) for each embedding.

In [6]:
# Directory that receives the checkpoint and the metadata file
log_dir = 'embeddings'
os.makedirs(log_dir, exist_ok=True)

# Wrap the embedding matrix in a variable so that it can be checkpointed
embeddings = tf.Variable(embedding_df.values)
print(f"weights.shape: {embeddings.shape}")

# Store the variable in a checkpoint under the attribute name `embedding`
checkpoint_prefix = os.path.join(log_dir, "embedding.ckpt")
checkpoint = tf.train.Checkpoint(embedding=embeddings)
checkpoint.save(checkpoint_prefix)

# Write the labels: one word per line, in the same order as the rows of the matrix
metadata_file = os.path.join(log_dir, 'metadata.tsv')
with open(metadata_file, 'w', encoding='utf-8') as f:
    f.writelines(w + '\n' for w in embedding_df.index)

weights.shape: (50000, 50)


## Running the TensorBoard

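* Run all the cells in this notebook first, including the one under *Visualizing the embeddings* below: it writes the projector configuration that TensorBoard reads when it starts
* cd into the `packt_nlp_tf2/Appendix` directory
* Run `tensorboard --logdir embeddings`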

## Visualizing the embeddings

In [4]:
# Projector configuration; it can hold several embeddings, here it holds one
config = projector.ProjectorConfig()

# Register the embedding in a single step:
#  - tensor_name: the `embedding` attribute given to the checkpoint, followed by
#    the suffix that TensorFlow's object-based checkpoints use for variable values
#  - metadata_path: the labels file that was written next to the checkpoint
embedding_config = config.embeddings.add(
    tensor_name="embedding/.ATTRIBUTES/VARIABLE_VALUE",
    metadata_path='metadata.tsv',
)

# Hand the configuration to the projector plugin for `log_dir`;
# TensorBoard will read it during startup
projector.visualize_embeddings(log_dir, config)