This notebook was constructed from the python code samples found in the Chapter_4 folder of the repo

https://github.com/PacktPublishing/Deep-Learning-with-TensorFlow-and-Keras-3rd-edition

Numerous tweaks were needed to get the code to run without errors. 


In [None]:
# All runs without errors. 
# Run time is quick because all of the needed data and models have already been downloaded.

# Run Date: Tuesday, February 14, 2023
# Run Time: 00:01:26

In [1]:
import time
from datetime import date

startTime = time.time()
todaysDate = date.today()

## Create your own embeddings using Gensim.

In [2]:
#!pip install gensim

(Code source: Chapter_4/create_embedding_with_text8.py)

In [3]:
import gensim.downloader as api
from gensim.models import Word2Vec

In [4]:
import gensim
gensim.__version__

'4.3.0'

In [5]:
info = api.info("text8")
assert(len(info) > 0)

In [6]:
%%time
dataset = api.load("text8")

CPU times: user 1.86 ms, sys: 1.2 ms, total: 3.06 ms
Wall time: 78.4 ms


In [7]:
%%time
model = Word2Vec(dataset)

CPU times: user 1min 42s, sys: 356 ms, total: 1min 42s
Wall time: 34.5 s


In [8]:
model.save("data/text8-word2vec.bin")

## Exploring the embedding space with Gensim.

(Code source: Chapter_4/explore_text8_embedding.py)

In [9]:
from gensim.models import KeyedVectors

In [10]:
model = KeyedVectors.load("data/text8-word2vec.bin")
word_vectors = model.wv

We can take a look at the first few words in the vocabulary and check to see if specific words are
available:

In [11]:
# get words in the vocabulary ... this next line throws this error ... 
# words = word_vectors.vocab.keys()

# AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
# Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
# See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
import random
random_word = random.choice(model.wv.index_to_key)
random_word

# these next 2 lines will also not work 
# print([x for i, x in enumerate(words) if i < 10])
# assert("king" in words)


'mmi'

In [12]:
rock_idx = model.wv.key_to_index["rock"]  
rock_cnt = model.wv.get_vecattr("rock", "count")  
vocab_len = len(model.wv) 
print(rock_idx)
print(rock_cnt)
print(vocab_len)

579
2819
71290


In [13]:
# print the first 10 words of the model
print([x for i, x in enumerate(model.wv.index_to_key) if i < 10])

['the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero', 'nine', 'two']


In [14]:
def print_most_similar(word_conf_pairs, k):
    for i, (word, conf) in enumerate(word_conf_pairs):
        print("{:.3f} {:s}".format(conf, word))
        if i >= k-1:
            break
    if k < len(word_conf_pairs):
        print("...")

In [15]:
word_vectors.most_similar("king")

[('prince', 0.7539743781089783),
 ('queen', 0.730122447013855),
 ('throne', 0.7136245369911194),
 ('emperor', 0.7093201875686646),
 ('kings', 0.6822531223297119),
 ('pharaoh', 0.6782265901565552),
 ('regent', 0.6712974309921265),
 ('elector', 0.6605132818222046),
 ('vii', 0.6580088138580322),
 ('herod', 0.6558883190155029)]

In [16]:
print("# words similar to king")
print_most_similar(word_vectors.most_similar("king"), 5)

# words similar to king
0.754 prince
0.730 queen
0.714 throne
0.709 emperor
0.682 kings
...


In [17]:
print("# vector arithmetic with words (cosine similarity)")
print("# france + berlin - paris = ?")
print_most_similar(word_vectors.most_similar(
    positive=["france", "berlin"], negative=["paris"]), 1
)

# vector arithmetic with words (cosine similarity)
# france + berlin - paris = ?
0.773 germany
...


In [18]:
print("# vector arithmetic with words (Levy and Goldberg)")
print("# france + berlin - paris = ?")
print_most_similar(word_vectors.most_similar_cosmul(
    positive=["france", "berlin"], negative=["paris"]), 1
)

# vector arithmetic with words (Levy and Goldberg)
# france + berlin - paris = ?
0.944 germany
...


In [19]:
print("# find odd one out")
print("# [hindus, parsis, singapore, christians]")
print(word_vectors.doesnt_match(["hindus", "parsis", 
    "singapore", "christians"]))

# find odd one out
# [hindus, parsis, singapore, christians]
singapore


In [20]:
print("# similarity between words")
for word in ["woman", "dog", "whale", "tree"]:
    print("similarity({:s}, {:s}) = {:.3f}".format(
        "man", word,
        word_vectors.similarity("man", word)
    ))

# similarity between words
similarity(man, woman) = 0.739
similarity(man, dog) = 0.441
similarity(man, whale) = 0.280
similarity(man, tree) = 0.262


In [21]:
print("# similar by word")
print(print_most_similar(
    word_vectors.similar_by_word("singapore"), 5)
)

# similar by word
0.867 malaysia
0.846 indonesia
0.809 thailand
0.806 zambia
0.804 philippines
...
None


In [22]:
print("# distance between vectors")
print("distance(singapore, malaysia) = {:.3f}".format(
    word_vectors.distance("singapore", "malaysia")
))

# distance between vectors
distance(singapore, malaysia) = 0.133


In [23]:
vec_song = word_vectors["song"]
print("\n# output vector obtained directly, shape:", vec_song.shape)


# output vector obtained directly, shape: (100,)


In [24]:
# this next line throws an error ....
#vec_song_2 = word_vectors.word_vec("song", use_norm=True)
# TypeError: get_vector() got an unexpected keyword argument 'use_norm'

# Removing 'use_norm=True)' generates another warning ...
# vec_song_2 = word_vectors.word_vec("song")
# DeprecationWarning: Call to deprecated `word_vec` (Use get_vector instead).
# vec_song_2 = word_vectors.word_vec("song")

# This is the correct implementation of this ...
vec_song_2 = word_vectors.get_vector("song")
print("# output vector obtained using word_vec, shape:", vec_song_2.shape)

# output vector obtained using word_vec, shape: (100,)


## Using word embeddings for spam detection

(Code source: Chapter_4/spam_classifier.py)

(The code in the above file differs from the code shown in the book)

In [25]:
import argparse
import gensim.downloader as api
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix

2023-02-14 17:33:52.129218: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-14 17:33:52.240194: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Getting the data

In [26]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    p = tf.keras.utils.get_file(local_file, url, 
        extract=True, cache_dir=".")
    labels, texts = [], []
    local_file = os.path.join("datasets", "SMSSpamCollection")
    with open(local_file, "r") as fin:
        for line in fin:
            label, text = line.strip().split('\t')
            labels.append(1 if label == "spam" else 0)
            texts.append(text)
    return texts, labels

In [27]:
DATA_DIR = "data"
EMBEDDING_NUMPY_FILE = os.path.join(DATA_DIR, "E.npy")
DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
EMBEDDING_MODEL = "glove-wiki-gigaword-300"
EMBEDDING_DIM = 300
NUM_CLASSES = 2
BATCH_SIZE = 128
NUM_EPOCHS = 3

In [28]:
# data distribution is 4827 ham and 747 spam (total 5574), which 
# works out to approx 87% ham and 13% spam, so we take reciprocals
# and this works out to being each spam (1) item as being approximately
# 8 times as important as each ham (0) message.
CLASS_WEIGHTS = { 0: 1, 1: 8 }

tf.random.set_seed(42)

# parser = argparse.ArgumentParser()
# parser.add_argument("--mode", help="run mode",
#     choices=[
#         "scratch",
#         "vectorizer",
#         "finetuning"
#     ])
# args = parser.parse_args()  # This line blows up in a notebook!
# run_mode = args.mode

In [29]:
parser = argparse.ArgumentParser()

In [30]:
parser.add_argument("--mode", help="run mode",
    choices=[
        "scratch",
        "vectorizer",
        "finetuning"
    ])

_StoreAction(option_strings=['--mode'], dest='mode', nargs=None, const=None, default=None, type=None, choices=['scratch', 'vectorizer', 'finetuning'], help='run mode', metavar=None)

In [31]:
#  https://stackoverflow.com/questions/46477770/jupyternotebook-with-args-parser
args = parser.parse_args(args=[])

In [32]:
# https://stackoverflow.com/questions/51039271/how-to-use-argument-parser-in-jupyter-notebook#51043537
args = parser.parse_args('')

In [33]:
run_mode = args.mode

In [34]:
# read data
texts, labels = download_and_read(DATASET_URL)

### Making the data ready for use

In [35]:
# tokenize and pad text
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
text_sequences = tokenizer.texts_to_sequences(texts)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences)
num_records = len(text_sequences)
max_seqlen = len(text_sequences[0])
print("{:d} sentences, max length: {:d}".format(num_records, max_seqlen))

5574 sentences, max length: 189


In [36]:
# labels
cat_labels = tf.keras.utils.to_categorical(labels, num_classes=NUM_CLASSES)

In [37]:
# vocabulary
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
word2idx["PAD"] = 0
idx2word[0] = "PAD"
vocab_size = len(word2idx)
print("vocab size: {:d}".format(vocab_size))

vocab size: 9010


In [38]:
# dataset
dataset = tf.data.Dataset.from_tensor_slices((text_sequences, cat_labels))
dataset = dataset.shuffle(10000)
test_size = num_records // 4
val_size = (num_records - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

2023-02-14 17:33:53.661235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 17:33:53.661433: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 17:33:53.663750: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 17:33:53.663965: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 17:33:53.664109: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

In [39]:
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

### Building the embedding matrix

In [40]:
def build_embedding_matrix(sequences, word2idx, embedding_dim, 
        embedding_file):
    if os.path.exists(embedding_file):
        E = np.load(embedding_file)
    else:
        vocab_size = len(word2idx)
        E = np.zeros((vocab_size, embedding_dim))
        word_vectors = api.load(EMBEDDING_MODEL)
        for word, idx in word2idx.items():
            try:
                E[idx] = word_vectors.word_vec(word)
            except KeyError:   # word not in embedding
                pass
            # except IndexError: # UNKs are mapped to seq over VOCAB_SIZE as well as 1
            #     pass
        np.save(embedding_file, E)
    return E

In [41]:
# embedding
E = build_embedding_matrix(text_sequences, word2idx, EMBEDDING_DIM,
    EMBEDDING_NUMPY_FILE)
print("Embedding matrix:", E.shape)

Embedding matrix: (9010, 300)


### Defining the spam classifier

In [42]:
class SpamClassifierModel(tf.keras.Model):
    
    def __init__(self, vocab_sz, embed_sz, input_length,
            num_filters, kernel_sz, output_sz, 
            run_mode, embedding_weights, 
            **kwargs):
        super(SpamClassifierModel, self).__init__(**kwargs)
        if run_mode == "scratch":
            self.embedding = tf.keras.layers.Embedding(vocab_sz, 
                embed_sz,
                input_length=input_length,
                trainable=True)
        elif run_mode == "vectorizer":
            self.embedding = tf.keras.layers.Embedding(vocab_sz, 
                embed_sz,
                input_length=input_length,
                weights=[embedding_weights],
                trainable=False)
        else:
            self.embedding = tf.keras.layers.Embedding(vocab_sz, 
                embed_sz,
                input_length=input_length,
                weights=[embedding_weights],
                trainable=True)
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.conv = tf.keras.layers.Conv1D(filters=num_filters,
            kernel_size=kernel_sz,
            activation="relu")
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dense = tf.keras.layers.Dense(output_sz, 
            activation="softmax"
        )
        
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.dropout(x)
        x = self.conv(x)
        x = self.pool(x)
        return self.dense(x)
        

In [43]:
# model definition
conv_num_filters = 256
conv_kernel_size = 3
model = SpamClassifierModel(
    vocab_size, EMBEDDING_DIM, max_seqlen, 
    conv_num_filters, conv_kernel_size, NUM_CLASSES,
    run_mode, E)
# model.build(input_shape=(None, max_seqlen))
# model.summary()

In [44]:
# https://www.tensorflow.org/guide/keras/custom_layers_and_models

#     class ResNet(tf.keras.Model):

#         def __init__(self, num_classes=1000):
#             super(ResNet, self).__init__()
#             self.block_1 = ResNetBlock()
#             self.block_2 = ResNetBlock()
#             self.global_pool = layers.GlobalAveragePooling2D()
#             self.classifier = Dense(num_classes)

#         def call(self, inputs):
#             x = self.block_1(inputs)
#             x = self.block_2(x)
#             x = self.global_pool(x)
#             return self.classifier(x)


#     resnet = ResNet()

In [45]:
# This next lines throws an error ....
# model.build(input_shape=(None, max_seqlen))
#    NotImplementedError: Unimplemented `tf.keras.Model.call()`: if you intend to create a `Model` with the Functional API, 
#    please provide `inputs` and `outputs` arguments. Otherwise, subclass `Model` with an overridden `call()` method.
model.build(input_shape=(None, max_seqlen))

In [46]:
model.summary()

Model: "spam_classifier_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  2703000   
                                                                 
 spatial_dropout1d (SpatialD  multiple                 0         
 ropout1D)                                                       
                                                                 
 conv1d (Conv1D)             multiple                  230656    
                                                                 
 global_max_pooling1d (Globa  multiple                 0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               multiple                  514       
                                                                 
Total params: 2,934,170
Trainable params: 2,9

Finally, we compile the model using the categorical cross entropy loss function and the Adam optimizer:

In [47]:
# compile and train
model.compile(optimizer="adam", loss="categorical_crossentropy",
    metrics=["accuracy"])

### Training and evalutating the model

In [48]:
# train model
model.fit(train_dataset, epochs=NUM_EPOCHS, 
    validation_data=val_dataset,
    class_weight=CLASS_WEIGHTS)

Epoch 1/3


2023-02-14 17:33:55.057450: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5f60168df0>

In [49]:
# evaluate against test set
labels, predictions = [], []
for Xtest, Ytest in test_dataset:
    Ytest_ = model.predict_on_batch(Xtest)
    ytest = np.argmax(Ytest, axis=1)
    ytest_ = np.argmax(Ytest_, axis=1)
    labels.extend(ytest.tolist())
    predictions.extend(ytest.tolist())


In [50]:
print("test accuracy: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

test accuracy: 1.000
confusion matrix
[[1091    0]
 [   0  189]]


### Running the spam detector

In [51]:
# spam_classifier --mode [scratch|vectorizer|finetune]

## Neural embeddings - not just for words

### node2vec

(code source: Chapter_4/neurips_papers_node2vec.py)

In [52]:
import gensim
import logging
import numpy as np
import os
import shutil
import tensorflow as tf

from scipy.sparse import csr_matrix
# from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [54]:
DATA_DIR = "./data"
UCI_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00371/NIPS_1987-2015.csv"

NUM_WALKS_PER_VERTEX = 32
MAX_PATH_LENGTH = 40
RESTART_PROB = 0.15

RANDOM_WALKS_FILE = os.path.join(DATA_DIR, "random-walks.txt")
W2V_MODEL_FILE = os.path.join(DATA_DIR, "w2v-neurips-papers.model")

In [55]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    p = tf.keras.utils.get_file(local_file, url, cache_dir=".")
    row_ids, col_ids, data = [], [], []
    rid = 0
    f = open(p, "r")
    for line in f:
        line = line.strip()
        if line.startswith("\"\","):
            # header
            continue
        if rid % 100 == 0:
            print("{:d} rows read".format(rid))
        # compute non-zero elements for current row
        counts = np.array([int(x) for x in line.split(',')[1:]])
        nz_col_ids = np.nonzero(counts)[0]
        nz_data = counts[nz_col_ids]
        nz_row_ids = np.repeat(rid, len(nz_col_ids))
        rid += 1
        # add data to big lists
        row_ids.extend(nz_row_ids.tolist())
        col_ids.extend(nz_col_ids.tolist())
        data.extend(nz_data.tolist())
    print("{:d} rows read, COMPLETE".format(rid))
    f.close()
    TD = csr_matrix((
        np.array(data), (
            np.array(row_ids), np.array(col_ids)
            )
        ),
        shape=(rid, counts.shape[0]))
    return TD

In [56]:
%%time
# read data and convert to Term-Document matrix
TD = download_and_read(UCI_DATA_URL)
# compute undirected, unweighted edge matrix
E = TD.T * TD
# binarize
E[E > 0] = 1
print(E.shape)

0 rows read
100 rows read
200 rows read
300 rows read
400 rows read
500 rows read
600 rows read
700 rows read
800 rows read
900 rows read
1000 rows read
1100 rows read
1200 rows read
1300 rows read
1400 rows read
1500 rows read
1600 rows read
1700 rows read
1800 rows read
1900 rows read
2000 rows read
2100 rows read
2200 rows read
2300 rows read
2400 rows read
2500 rows read
2600 rows read
2700 rows read
2800 rows read
2900 rows read
3000 rows read
3100 rows read
3200 rows read
3300 rows read
3400 rows read
3500 rows read
3600 rows read
3700 rows read
3800 rows read
3900 rows read
4000 rows read
4100 rows read
4200 rows read
4300 rows read
4400 rows read
4500 rows read
4600 rows read
4700 rows read
4800 rows read
4900 rows read
5000 rows read
5100 rows read
5200 rows read
5300 rows read
5400 rows read
5500 rows read
5600 rows read
5700 rows read
5800 rows read
5900 rows read
6000 rows read
6100 rows read
6200 rows read
6300 rows read
6400 rows read
6500 rows read
6600 rows read
6700 ro

In [57]:
def construct_random_walks(E, n, alpha, l, ofile):
    """ NOTE: takes a long time to do, consider using some parallelization
        for larger problems.
    """
    if os.path.exists(ofile):
        print("random walks generated already, skipping")
        return
    f = open(ofile, "w")
    for i in range(E.shape[0]):  # for each vertex
        if i % 100 == 0:
            print("{:d} random walks generated from {:d} starting vertices"
                .format(n * i, i))
        if i <= 3273:
            continue
        for j in range(n):       # construct n random walks
            curr = i
            walk = [curr]
            target_nodes = np.nonzero(E[curr])[1]
            for k in range(l):   # each of max length l, restart prob alpha
                # should we restart?
                if np.random.random() < alpha and len(walk) > 5:
                    break
                # choose one outgoing edge and append to walk
                try:
                    curr = np.random.choice(target_nodes)
                    walk.append(curr)
                    target_nodes = np.nonzero(E[curr])[1]
                except ValueError:
                    continue
            f.write("{:s}\n".format(" ".join([str(x) for x in walk])))

    print("{:d} random walks generated from {:d} starting vertices, COMPLETE"
        .format(n * i, i))
    f.close()

This next cell takes a real long time to run, and the source repo already has a copy of the output file, so simply replicate that 'random-walks.txt' from the source repo into the data subfolder, BEFORE running this next cell. (You can thank me later ;)

In [58]:
%%time
# construct random walks (caution: long process!)
construct_random_walks(E, NUM_WALKS_PER_VERTEX, RESTART_PROB, 
    MAX_PATH_LENGTH, RANDOM_WALKS_FILE)

random walks generated already, skipping
CPU times: user 236 µs, sys: 4 µs, total: 240 µs
Wall time: 155 µs


In [59]:
class Documents(object):
    def __init__(self, input_file):
        self.input_file = input_file

    def __iter__(self):
        with open(self.input_file, "r") as f:
            for i, line in enumerate(f):
                if i % 1000 == 0:
                    if i % 1000 == 0:
                        logging.info("{:d} random walks extracted".format(i))
                yield line.strip().split()

In [60]:
def train_word2vec_model(random_walks_file, model_file):
    if os.path.exists(model_file):
        print("Model file {:s} already present, skipping training"
            .format(model_file))
        return
    docs = Documents(random_walks_file)
    model = gensim.models.Word2Vec(
        docs,
        vector_size=128,    # size of embedding vector
        window=10,   # window size
        sg=1,        # skip-gram model
        min_count=2,
        workers=4
    )
    model.train(
        docs, 
        total_examples=model.corpus_count,
        epochs=50)
    model.save(model_file)

The next cell uses TensorFlow but it does NOT run on the GPU ... 

It generates the ./data/w2v-neurips-papers.model file.

In [61]:
%%time
# train model
train_word2vec_model(RANDOM_WALKS_FILE, W2V_MODEL_FILE)

Model file ./data/w2v-neurips-papers.model already present, skipping training
CPU times: user 253 µs, sys: 5 µs, total: 258 µs
Wall time: 222 µs


In [62]:
# This is the code that came with the repo. It has a lot of problems ... 
def evaluate_model(td_matrix, model_file, source_id):
    model = gensim.models.Word2Vec.load(model_file).wv
    most_similar = model.most_similar(str(source_id))
    scores = [x[1] for x in most_similar]
    target_ids = [x[0] for x in most_similar]
    # compare top 10 scores with cosine similarity between source and each target
    X = np.repeat(td_matrix[source_id].todense(), 10, axis=0)
    Y = td_matrix[target_ids].todense()
    cosims = [cosine_similarity(X[i], Y[i])[0, 0] for i in range(10)]
    for i in range(10):
        print("{:d} {:s} {:.3f} {:.3f}".format(
            source_id, target_ids[i], cosims[i], scores[i]))

In [63]:
# And this is the fixed version of the above pos code ... yes, it runs just fine. 
def evaluate_model(td_matrix, model_file, source_id):
    model = gensim.models.Word2Vec.load(model_file).wv
    most_similar = model.most_similar(str(source_id))
    scores = [x[1] for x in most_similar]
    # target_ids = [x[0] for x in most_similar]
    target_ids = [int(x[0]) for x in most_similar]
    # compare top 10 scores with cosine similarity between source and each target
    X = np.repeat(td_matrix[source_id].todense(), 10, axis=0)
    Y = td_matrix[target_ids].todense()
    cosims = [cosine_similarity(np.asarray(X[i]), np.asarray(Y[i]))[0, 0] for i in range(10)]
    for i in range(10):
        print("{:d} {:d} {:.3f} {:.3f}".format(source_id, target_ids[i], cosims[i], scores[i]))

In [64]:
# evaluate
source_id = np.random.choice(E.shape[0])
evaluate_model(TD, W2V_MODEL_FILE, source_id)

2023-02-14 17:34:26,122 : INFO : loading Word2Vec object from ./data/w2v-neurips-papers.model
2023-02-14 17:34:26,125 : INFO : loading wv recursively from ./data/w2v-neurips-papers.model.wv.* with mmap=None
2023-02-14 17:34:26,126 : INFO : setting ignored attribute cum_table to None
2023-02-14 17:34:26,155 : INFO : Word2Vec lifecycle event {'fname': './data/w2v-neurips-papers.model', 'datetime': '2023-02-14T17:34:26.155721', 'gensim': '4.3.0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.15.0-60-generic-x86_64-with-glibc2.29', 'event': 'loaded'}


5239 3956 0.004 0.342
5239 17 0.027 0.324
5239 3285 0.005 0.319
5239 4552 0.008 0.314
5239 806 0.012 0.305
5239 2408 0.046 0.303
5239 5509 0.072 0.300
5239 4682 0.031 0.300
5239 5498 0.003 0.299
5239 4164 0.006 0.296


## Dynamic embeddings

(code source: Chapter_4/elmo_from_tfhub.py)

In [65]:
import tensorflow as tf
import tensorflow_hub as hub

elmo = hub.load("https://tfhub.dev/google/elmo/3")

2023-02-14 17:34:26,196 : INFO : Using /tmp/tfhub_modules to cache modules.


In [66]:
embeddings = elmo.signatures["default"](
    tf.constant([
      "i like green eggs and ham",
      "would you eat them in a box"
    ]))["elmo"]

print(embeddings.shape)


(2, 7, 1024)


(code source: Chapter_4/elmo_keraslayer.py)

In [67]:
embed = hub.KerasLayer(
    "https://tfhub.dev/google/elmo/3",
    input_shape=[],     # Expects a tensor of shape [batch_size] as input.
    dtype=tf.string)    # Expects a tf.string input tensor.





In [68]:
model = tf.keras.Sequential([embed])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 1024)              93600852  
                                                                 
Total params: 93,600,852
Trainable params: 0
Non-trainable params: 93,600,852
_________________________________________________________________


In [69]:
embeddings = model.predict([
   "i i like green eggs and ham",
   "would you eat them in a box"
])
print(embeddings.shape)

(2, 1024)


## Sentence and paragraph embeddings

(code source: Chapter_4/google_sent_enc_from_tfub.py)

In [70]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/4")

In [71]:
embeddings = embed([
   "i like green eggs and ham",
   "would you eat them in a box"
])["outputs"]
print(embeddings.shape)

(2, 512)


In [72]:
endTime = time.time()

elapsedTime = time.strftime("%H:%M:%S", time.gmtime(endTime - startTime))

print(todaysDate.strftime('# Run Date: %A, %B %d, %Y'))
print(f"# Run Time: {elapsedTime}")

# Run Date: Tuesday, February 14, 2023
# Run Time: 00:01:26
