In [1]:
import pandas as pd
df = pd.read_csv('./DBpedia_train_terms_augmented.csv')
document_list = df['text'].to_numpy().astype('str')
document_list.dtype

print('Finished reading df')

Finished reading df


In [2]:
len(df)

240942

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,l1,l2,l3,terms
0,0,"William Alexander Massey (October 7, 1856 – Ma...",Agent,Politician,Senator,massey
1,1,Lions is the sixth studio album by American ro...,Work,MusicalWork,Album,album
2,2,"Pirqa (Aymara and Quechua for wall, hispaniciz...",Place,NaturalPlace,Mountain,pirqa
3,3,Cancer Prevention Research is a biweekly peer-...,Work,PeriodicalLiterature,AcademicJournal,oncology
4,4,The Princeton University Chapel is located on ...,Place,Building,HistoricBuilding,chapel


In [4]:
# encoding topics to create the adjacency matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np

VIRTUAL_NODE_NAME = 'ZZ_VIRTUAL'

labelEncoder = LabelEncoder()
topics = np.concatenate((df['l1'].unique(), df['l2'].unique(), df['l3'].unique()))
# For virtual node when expanding topic taxonomy
topics = np.append(topics, VIRTUAL_NODE_NAME)
labelEncoder.fit(topics)

def encode_topic(topic):
    print(type(topic))
    return labelEncoder.transform(topic)

df['l1_encoded'] = labelEncoder.transform(df['l1'])
df['l2_encoded'] = labelEncoder.transform(df['l2'])
df['l3_encoded'] = labelEncoder.transform(df['l3'])


In [5]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2023-05-30 17:29:32.483716: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-30 17:29:33.060611: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-30 17:29:36.341097: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /software/spackages_prod/apps/linux-ubuntu20.04-zen2/gcc-9.4.0/cudnn-8.2.4.15-11.4-aa4j

Num GPUs Available:  1


In [6]:
from transformers import BertTokenizer
import numpy as np
import re

max_len = 512

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_topic_to_tokenized_dict = {}
for topic in topics:
    # dbpedia categories are in PascalCase, so this makes them spaced
    spaced_words = re.sub( r"([A-Z])", r" \1", topic)[1:]
    tokenized_sequence = tokenizer.encode_plus(spaced_words, add_special_tokens=True, max_length=max_len, padding='max_length')['input_ids']

    encoded_topic_to_tokenized_dict[labelEncoder.transform([topic])[0]] = tokenized_sequence

In [7]:
# documents = df['text'].apply(lambda doc: np.array(tokenizer.encode_plus(doc, add_special_tokens=True, max_length=max_len, padding='max_length', truncation=True)['input_ids'])).to_numpy()
# documents_labels = labelEncoder.transform(df['l3'].to_numpy())
# documents_fixed = np.empty(shape=(len(documents), max_len))
# for i, doc in enumerate(documents):
#     documents_fixed[i] = doc
# terms = df['terms'].apply(lambda doc: np.array(tokenizer.encode_plus(doc, add_special_tokens=True, max_length=max_len, padding='max_length', truncation=True)['input_ids'])).to_numpy()

In [8]:
# import pickle
# with open("tokenized_dbpedia.pkl", "wb") as f:
#     pickle.dump([documents_fixed, documents_labels], f)

# import pickle
# with open("tokenized_dbpedia_terms.pkl", "wb") as f:
#     pickle.dump(terms, f)

In [9]:
import pickle

with (open("./tokenized_dbpedia.pkl", "rb")) as f:
    documents, documents_labels = pickle.load(f)

with (open("./tokenized_dbpedia_terms.pkl", "rb")) as f:
    terms = pickle.load(f)
    
print('Finished getting tokenized files')

Finished getting tokenized files


In [10]:
graph_dict = {}

for i, row in df.iterrows():
    l1 = row['l1_encoded']
    l2 = row['l2_encoded']
    l3 = row['l3_encoded']

    if l1 not in graph_dict:
        graph_dict[l1] = {}
    if l2 not in graph_dict[l1]:
        graph_dict[l1][l2] = {} 

    graph_dict[l1][l2][l3] = 1

In [11]:
# creating node features
x = np.arange(298)
x = labelEncoder.inverse_transform(x)
feature_array = x.reshape(298, -1)

In [12]:
# loading GloVe model to get topic word embeddings
# from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
import torchtext

glove = torchtext.vocab.GloVe(name="6B", dim=50)
print('Finished getting GlOVE embedder')

Finished getting GlOVE embedder


In [13]:
from spektral.data import Graph
import numpy as np
import re

def create_ego_graph(l1_topic, l2_topic, l3_topic, graph_dict, feature_array, is_virtual=False):
    # Creating "ego-graphs" (each node is seperated into a graph with itself, parent, and siblings)
    # The base node (so the node itself) will be masked, aka. have a [MASK] embedding
    # The sibling nodes need to have a negative relationship with the base node (so negative value in adjacency matrix?)
    if l3_topic != None:
        siblings_list = list(graph_dict[l1_topic][l2_topic].keys())
        siblings_list.remove(l3_topic)
        base = l3_topic
        parent = l2_topic
        grandparent = l1_topic

        all_nodes_list = siblings_list.copy()
        all_nodes_list.append(base)
        all_nodes_list.append(parent)
        all_nodes_list.append(grandparent)
        
        n_nodes = len(all_nodes_list)
        adj_matrix = np.zeros((n_nodes, n_nodes))

        node_label_encoder = LabelEncoder()
        node_label_encoder.fit(all_nodes_list)
        
        encoded_base = node_label_encoder.transform([base])[0]
        encoded_parent = node_label_encoder.transform([parent])[0]
        encoded_grandparent = node_label_encoder.transform([grandparent])[0]

        adj_matrix[encoded_base][encoded_parent] = 1
        adj_matrix[encoded_parent][encoded_base] = 1

        adj_matrix[encoded_grandparent][encoded_parent] = 1
        adj_matrix[encoded_parent][encoded_grandparent] = 1

        for sibling in siblings_list: 
            encoded_sibling = node_label_encoder.transform([sibling])[0]
            adj_matrix[encoded_sibling][encoded_base] = -1
            adj_matrix[encoded_base][encoded_sibling] = -1
        
    elif l2_topic != None:
        siblings_list = list(graph_dict[l1_topic].keys())
        siblings_list.remove(l2_topic)
        base = l2_topic
        parent = l1_topic

        all_nodes_list = siblings_list.copy()
        all_nodes_list.append(base)
        all_nodes_list.append(parent)
        
        n_nodes = len(all_nodes_list)
        adj_matrix = np.zeros((n_nodes, n_nodes))

        node_label_encoder = LabelEncoder()
        node_label_encoder.fit(all_nodes_list)
        
        encoded_base = node_label_encoder.transform([base])[0]
        encoded_parent = node_label_encoder.transform([parent])[0]

        adj_matrix[encoded_base][encoded_parent] = 1
        adj_matrix[encoded_parent][encoded_base] = 1

        for sibling in siblings_list: 
            encoded_sibling = node_label_encoder.transform([sibling])[0]
            adj_matrix[encoded_sibling][encoded_base] = -1
            adj_matrix[encoded_base][encoded_sibling] = -1
    
    elif l1_topic != None:
        siblings_list = list(graph_dict.keys())
        siblings_list.remove(l1_topic)
        base = l1_topic

        all_nodes_list = siblings_list.copy()
        all_nodes_list.append(base)
        
        n_nodes = len(all_nodes_list)
        adj_matrix = np.zeros((n_nodes, n_nodes))

        node_label_encoder = LabelEncoder()
        node_label_encoder.fit(all_nodes_list)
        
        encoded_base = node_label_encoder.transform([base])[0]

        for sibling in siblings_list: 
            encoded_sibling = node_label_encoder.transform([sibling])[0]
            adj_matrix[encoded_sibling][encoded_base] = -1
            adj_matrix[encoded_base][encoded_sibling] = -1

    ego_features = np.zeros((n_nodes, 50))
    encoded_nodes_list = node_label_encoder.transform(all_nodes_list)

    for i, node in enumerate(all_nodes_list):
        # Masking base node, setting the embedding to all 0's
        if (node == base):
            embedding_avg = glove['MASK']
        else:
            feature = feature_array[node]
            split_words_list = re.sub( r"([A-Z])", r" \1", feature[0]).split()
            n_words = len(split_words_list)
            embedding_avg = np.array([glove[word.lower()].numpy() for word in split_words_list]).sum(axis=0)/n_words

        ego_features[encoded_nodes_list[i]] = embedding_avg

    return Graph(a=adj_matrix, x=ego_features, y=(l1_topic, l2_topic, l3_topic))

graph_list = []

for l1_topic in graph_dict:
    for l2_topic in graph_dict[l1_topic]:
        for l3_topic in graph_dict[l1_topic][l2_topic]:
            graph_list.append(create_ego_graph(l1_topic, l2_topic, l3_topic, graph_dict, feature_array))
        graph_list.append(create_ego_graph(l1_topic, l2_topic, None, graph_dict, feature_array))
    graph_list.append(create_ego_graph(l1_topic, None, None, graph_dict, feature_array))

graph_list = np.array(graph_list)

In [14]:
from spektral.data import Dataset

class MyDataset(Dataset):
    """
    """
    def __init__(self, graph_list: list[Graph], **kwargs):
        self.graph_list = graph_list
        super().__init__(**kwargs)

    def read(self):
        # We must return a list of Graph objects
        num_l =  np.random.permutation(len(self.graph_list))
        return [self.graph_list[i] for i in num_l]
    
dataset = MyDataset(graph_list)

In [15]:
from layers.Bilinear import Bilinear
from layers.ContextEmbedding import ContextEmbedding
from layers.TopicAttentiveEmbedding import TopicAttentiveEmbedding

from utils.TopicExpanTrainGen import TopicExpanTrainGen

def sequence_to_document_embedding(sequence_embedding: tf.Tensor):
    # gets the document representation/embedding from a BERT sequence embedding
    # by getting the mean-pooling of the sequence 
    return tf.math.reduce_mean(sequence_embedding, axis=1)

In [16]:
# replacing half the leaf nodes with parent node for evaluating the expansion, used as "ground-truth"
leaf_nodes = df['l3_encoded'].unique()
percent_to_remove = 0.3333
leaves_to_remove = set(np.random.choice(leaf_nodes, int(len(leaf_nodes)*percent_to_remove), replace=False))
replaced_document_labels = np.copy(documents_labels)

for i, l3_topic in enumerate(replaced_document_labels):
    if l3_topic not in leaves_to_remove:
        continue
    parent_l2_topic = df[df['l3_encoded'] == l3_topic].iloc[0]['l2_encoded']
    replaced_document_labels[i] = parent_l2_topic

In [17]:
133 in leaves_to_remove

False

In [18]:
replaced_document_labels[:10]

array([204,   6, 167,   0, 129, 223, 160, 144,  59, 188])

In [None]:
from tensorflow import int64
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Lambda
from spektral.layers import GCNConv, GlobalAvgPool
import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
import tensorflow as tf
from transformers import TFBertModel
import keras_nlp

learning_rate = 5e-5  # Learning rate
epochs = 2  # Number of training epochs
batch_size = 32  # Batch size
weight_decay = 5e-6
mini_batch_size = 2 # a mini-batch will always have 1 positive triple, and (n-1) negative triples
                    # i.e. with a mini_batch_size of 4, we have 1 pos. doc. and 3 neg. docs.
batch_ratio = int(batch_size / mini_batch_size)

# empty phrase for training with negative documents
# negative_document_phrase = tokenizer.encode_plus('', add_special_tokens=True, max_length=max_len, padding='max_length')['input_ids']

max_len = 512
vocab_size = tokenizer.vocab_size
infoNCE_temprature = 0.1

optimizer = Adam(learning_rate, weight_decay=weight_decay)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
loss_fn_binary = tf.keras.losses.BinaryCrossentropy(from_logits=True)
loss_fn_crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def infoNCE(labels, logits, infoNCE_temprature, mini_batch_size: int):
    with_temprature = tf.reshape(logits / infoNCE_temprature, shape=(mini_batch_size, -1))
    reshaped_labels = tf.reshape(labels, shape=(mini_batch_size, -1))
    
    softmaxed = tf.nn.softmax(with_temprature)
    with_log = -tf.math.log(softmaxed)
    multiplied = tf.math.multiply(reshaped_labels, with_log)
    
    return tf.reduce_sum(multiplied)
    
n_out = dataset.n_labels
topic_embedding_dimension = 300

In [None]:
labels_sample = tf.convert_to_tensor(np.array([1, 0, 0 ,0, 0, 0, 1, 0, 0, 0, 0, 1]), dtype=tf.float64)
logits_sample = tf.convert_to_tensor(np.array([1.5, 0.4, 1 ,0.3, 0.4, 0.7, 0.6, 0.8, 0.2, 0.1, 0.9, 0.5]))
infoNCE(labels_sample, logits_sample, 0.1, mini_batch_size)

In [None]:
import sys
# Custom training loop
class ModelWithNCE(Model):
    @tf.function(experimental_relax_shapes=True)
    def train_step(self, data):
        inputs, target = data
        with tf.GradientTape() as tape:
            similarity_prediction, phrase_prediction = self(inputs, training=True)
            # similarity_prediction_infonce = tf.reshape(similarity_prediction / infoNCE_temprature, shape=(mini_batch_size, -1))

            # infoNCE_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(target[0], shape=(mini_batch_size, -1)), logits=similarity_prediction_infonce))
            # infoNCE_loss = infoNCE(tf.cast(target[0], tf.float32), similarity_prediction, infoNCE_temprature, mini_batch_size)
            # infoNCE_loss = loss_fn_crossentropy(tf.reshape(target[0], shape=(mini_batch_size, -1)), similarity_prediction_infonce)
            bce_loss = loss_fn_binary(target[0], similarity_prediction)
            
            # TODO: calculate phrase loss only for POSITIVE documents (i.e. ignore negative documents)
            # removing negative documents from phrase loss
            phrase_mask = tf.cast(tf.reshape(target[0], shape=(-1, )), dtype=tf.bool)
            phrase_loss = loss_fn(tf.boolean_mask(target[1], phrase_mask), tf.boolean_mask(phrase_prediction, phrase_mask))

            # tf.print(phrase_mask[:4], bce_loss, tf.boolean_mask(phrase_prediction, phrase_mask).shape, tf.boolean_mask(target[1], phrase_mask).shape, output_stream=sys.stderr)
        gradients = tape.gradient([bce_loss, phrase_loss], self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state((target[0], tf.boolean_mask(target[1], phrase_mask)), (similarity_prediction, tf.boolean_mask(phrase_prediction, phrase_mask)))
        return {m.name: m.result() for m in self.metrics}

In [None]:
from transformers import TFEncoderDecoderModel as EncoderDecoderModel

EMBEDDING_SIZE=384

shared_bilinear = Bilinear(topic_embedding_dimension, EMBEDDING_SIZE, 1)

# GNNs (Topic Encoder)
X_in = Input(shape=(dataset.n_node_features))
A_in = Input(shape=(None,), sparse=True)
I_in = Input(shape=(), dtype=int64)

topic_embedding = GCNConv(topic_embedding_dimension, activation='relu')([X_in, A_in])
topic_embedding = GCNConv(topic_embedding_dimension, activation='relu')([topic_embedding, A_in])
topic_embedding = GlobalAvgPool(name='topic_embedding')([topic_embedding, I_in])

# BERT Embedding (Document Encoder)
max_seq_length = max_len
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/all-MiniLM-L6-v2")

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
embedding = bert2bert.encoder(input_ids)[0]

# Transformer Decoders (Phrase Generator)
# try a pre-trained decoder
decoder_tokens_input = Input(shape=(max_len,), dtype=tf.int32, name="decoder_phrase_input")
# decoder_embedding = keras_nlp.layers.TokenAndPositionEmbedding(vocabulary_size=vocab_size, sequence_length=max_len, embedding_dim=EMBEDDING_SIZE, mask_zero=True)(decoder_tokens_input)

# Getting context embedding for decoder
topic_attentive_embedding = TopicAttentiveEmbedding()(topic_embedding, embedding, shared_bilinear, training=False)
topic_attentive_embedding = tf.keras.layers.Reshape((max_len, 1))(topic_attentive_embedding)
context_embedding = ContextEmbedding()([topic_attentive_embedding, embedding])

# transformer_decoder = keras_nlp.layers.TransformerDecoder(
#     num_heads=16, 
#     intermediate_dim=max_len,
#     dropout=0.2
# )(decoder_embedding, context_embedding)

out2 = bert2bert.decoder(decoder_tokens_input, encoder_hidden_states=context_embedding)[0]

# Transformer Output
# out2 = Dense(vocab_size)(transformer_decoder)

# Output Bilinear layer (Similarity Predictor)
document_embedding = Lambda(sequence_to_document_embedding, name='document_embedding')(embedding)
out = shared_bilinear([topic_embedding, document_embedding])

# Outputs
model = ModelWithNCE(inputs=[X_in, A_in, I_in, input_ids, decoder_tokens_input], outputs=[out, out2])

# compiling model and adding metrics
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy', perplexity], run_eagerly=True)

# TODO: Remove half of the leaf nodes, and replace the elements in documents_labels that have a removed leaf node with the parent node BEFORE passing to the Generator
topic_expan_generator = TopicExpanTrainGen(graph_list, documents[:-20000], replaced_document_labels[:-20000], terms[:-20000], batch_size, mini_batch_size, encoded_topic_to_tokenized_dict)

In [None]:
tokenizer.decode(topic_expan_generator.__getitem__(0)[1][1][28], skip_special_tokens=True)

In [None]:
model.summary()

In [None]:
from keras.utils.vis_utils import plot_model

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
from tqdm.keras import TqdmCallback

# NOTE: Ignore warning about gradients not existing for BERT's dense layer since 
# the dense layers are not used and are thus unconnected and do not need training

checkpoint_filepath = './checkpoint.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='decoder_perplexity',
    save_weights_only=True,
    save_freq=1000,
    mode='min',
    save_best_only=True)

model.fit(topic_expan_generator, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[TqdmCallback(verbose=1), model_checkpoint_callback])

In [None]:
mask = np.array([0.0, 1.0, 1.0, 0], dtype=bool)
pred = tf.convert_to_tensor(np.ones(shape=(4,512,3)))
target = tf.convert_to_tensor(np.ones(shape=(4,512)))

tf.boolean_mask(pred, mask)

In [28]:
test_gen = TopicExpanTrainGen(graph_list, documents[-1000:], replaced_document_labels[-1000:], batch_size, 1, encoded_topic_to_tokenized_dict)

In [29]:
model.evaluate(test_gen)



[nan,
 nan,
 0.6924110651016235,
 0.984375,
 nan,
 0.8218246698379517,
 1.9985281229019165]

In [26]:
labelEncoder.inverse_transform([298])

array(['ZZ_VIRTUAL'], dtype=object)

In [50]:
df.iloc[238230]

text          The World Table Tennis Championships have been...
l1                                                        Event
l2                                                SocietalEvent
l3                                                   Convention
l1_encoded                                                  100
l2_encoded                                                  251
l3_encoded                                                   75
Name: 238230, dtype: object

In [None]:
tokenizer.decode(documents[77784])

In [None]:
df.iloc[77784]

In [148]:
# test_gen = TopicExpanTrainGen(graph_list, documents[-1000:], documents_labels[-1000:], 1, 1, encoded_topic_to_tokenized_dict)
test_gen = TopicExpanVirtualPhraseGen(virtual_ego_graph, filtered_docs)
x1, x2, x3, x4, x5 = test_gen.__getitem__(0)
# sequence = x5.numpy()
# sequence[:, 1:] = 0
# sequence[:, 1] = 2447

In [154]:
padded_sequence = tf.pad(x5, [[0, 0], [0, max_len-tf.shape(x5)[1]]])
pred = model((x1, x2, x3, x4, padded_sequence))[1]
# pred

In [156]:
# pred

In [93]:
x4

<tf.Tensor: shape=(32, 512), dtype=float64, numpy=
array([[  101., 14641.,  2080., ...,     0.,     0.,     0.],
       [  101.,  1996.,  4936., ...,     0.,     0.,     0.],
       [  101.,  5754., 19190., ...,     0.,     0.,     0.],
       ...,
       [  101.,  1996.,  7326., ...,     0.,     0.,     0.],
       [  101.,  4894.,  2041., ...,     0.,     0.,     0.],
       [  101.,  1996.,  2541., ...,     0.,     0.,     0.]])>

In [164]:
tf.argmax(pred[:, 0, :], axis=1)

<tf.Tensor: shape=(32,), dtype=int64, numpy=
array([3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586,
       3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586,
       3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586, 3586])>

In [159]:
pred[3]

<tf.Tensor: shape=(512, 30522), dtype=float32, numpy=
array([[ 2.6153495e+00, -9.5019937e-02,  1.4651502e-02, ...,
        -3.5517168e-01, -3.5281119e-01, -5.6236595e-01],
       [ 3.6694511e+01, -4.5734015e+00, -4.6839118e+00, ...,
        -4.4302564e+00, -4.5606627e+00, -4.7432141e+00],
       [ 3.1848719e+01, -4.4285512e+00, -4.5817451e+00, ...,
        -4.3376241e+00, -4.3683510e+00, -4.4911666e+00],
       ...,
       [ 3.5318607e+01, -5.3323512e+00, -5.3753605e+00, ...,
        -5.1716352e+00, -5.2236719e+00, -5.3112555e+00],
       [ 3.5602047e+01, -5.2717361e+00, -5.3498573e+00, ...,
        -5.1118798e+00, -5.2239351e+00, -5.2790284e+00],
       [ 3.4962620e+01, -4.9379449e+00, -4.9965191e+00, ...,
        -4.8068523e+00, -4.8717856e+00, -5.0343022e+00]], dtype=float32)>

In [128]:
tokenizer.decode(x4[1])

'[CLS] the 1915 grand national was the 77th renewal of the world - famous grand national horse race that took place at aintree near liverpool, england, on 26 march 1915. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [24]:
from utils.TopicExpanExpansionGen import TopicExpanExpansionGen
x = np.arange(299)
x = labelEncoder.inverse_transform(x)
feature_array = x.reshape(299, -1)

model.load_weights('checkpoint.h5')

graph_dict[100][212][298] = 1 # adding virtual node
l1_topic = 100
l2_topic = 212
virtual_node = 298

virtual_ego_graph = create_ego_graph(l1_topic, l2_topic, virtual_node, graph_dict, feature_array, is_virtual=True)

In [36]:
import gc
import math

test_batch_size = 32
expan_gen = TopicExpanExpansionGen(virtual_ego_graph, documents, test_batch_size)

# outputs = model.predict_generator(expan_gen)
outputs = {}
for model_expansion_input, doc_ids in expan_gen:
    batch_output = model(model_expansion_input)

    for i, sim_pred in enumerate(batch_output[0]):
        outputs[doc_ids[i]] = (math.e**sim_pred)
            
    # resetting model to clear memory, model runs OOM without resetting
    tf.keras.backend.clear_session()
    _ = gc.collect()

ValueError: Exception encountered when calling layer 'topic_attentive_embedding_2' (type TopicAttentiveEmbedding).

Dimensions 32 and 14 are not compatible

Call arguments received by layer 'topic_attentive_embedding_2' (type TopicAttentiveEmbedding):
  • topic_embedding=tf.Tensor(shape=(32, 300), dtype=float32)
  • sequence_embedding=tf.Tensor(shape=(14, 512, 384), dtype=float32)
  • shared_bilinear_layer=<layers.Bilinear.Bilinear object at 0x7f92c6aa0b80>

In [37]:
import pickle

with open('expansion_out.pickle', 'wb') as handle:
    pickle.dump(outputs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
import pickle

with open('expansion_out.pickle', 'rb') as handle:
    outputs = pickle.load(handle)

In [26]:
import math

def sigmoid(x):
    return 1/(1+(math.e**-x))

def normalize(x):
    return (x-min(x))/(max(x)-min(x))

norm_outputs = normalize(np.concatenate([outputs[i].numpy() for i in outputs]))

In [27]:
import math
from spektral.data.utils import to_disjoint
from spektral.data import Graph
import tensorflow as tf
import numpy as np

class TopicExpanVirtualPhraseGen(tf.keras.utils.Sequence):
    # Generates batches for phrase generation predictions for a specific virtual node
    # Used for expansion
    def __init__(self, virtual_graph: Graph, documents: np.array):
        self._virtual_graph = virtual_graph # ego-graph of the new virtual node
        self.documents = documents
        self.batch_size = len(documents)
        
        # generating initial prompt with the start ID, this is the same prompt
        # for all batches since the initial prompt is always [101]
        self.initial_prompt = tf.fill((self.batch_size, 1), 101)
        # self.initial_prompt = tf.pad(unpadded_prompt, [[0, 0], [0, max_len-tf.shape(unpadded_prompt)[1]]])
        
        # the same virtual node ego-graph is used for the whole batch,
        # so generating it in the init to speed up computation
        self.x_in, self.a_in, self.i_in = to_disjoint(
            x_list=[self._virtual_graph.x for _ in range(self.batch_size)],
            a_list=[self._virtual_graph.a for _ in range(self.batch_size)]
        )
        
        self.x_in = tf.convert_to_tensor(self.x_in)
        self.a_in = tf.sparse.SparseTensor(
            indices=np.array([self.a_in.row, self.a_in.col]).T,
            values=self.a_in.data,
            dense_shape=self.a_in.shape
        )
        self.i_in = tf.convert_to_tensor(self.i_in)
        
    def __len__(self):
        return 1
    
    def __getitem__(self, non_batch_index: int):
        idx = non_batch_index * self.batch_size
        idx_limiter = min(idx + self.batch_size, len(self.documents)) # for limiting batch size at end of list
        fixed_batch_size = idx_limiter - idx # gets the fixed batch size in case this is the last batch
        
        model_expansion_inputs = (
            self.x_in,
            self.a_in,
            self.i_in,
            tf.convert_to_tensor(self.documents[idx:idx_limiter]),
            self.initial_prompt
        )
        
        doc_ids = range(idx, idx_limiter)
            
        return model_expansion_inputs

In [28]:
documents[np.argwhere(norm_outputs < 0.7)].reshape(-1, max_len)[0:64]

array([[  101.,  2520.,  3656., ...,     0.,     0.,     0.],
       [  101.,  7212.,  2003., ...,     0.,     0.,     0.],
       [  101., 14255.,  2099., ...,     0.,     0.,     0.],
       ...,
       [  101.,  1996.,  3731., ...,     0.,     0.,     0.],
       [  101.,  1996., 22440., ...,     0.,     0.,     0.],
       [  101., 28722., 12403., ...,     0.,     0.,     0.]])

In [36]:
from keras_nlp.utils import beam_search
from random import randint

filtered_docs = documents[np.argwhere(norm_outputs > 0.8)].reshape(-1, max_len)[0:32]
test_batch_size = len(filtered_docs)
test_gen = TopicExpanVirtualPhraseGen(virtual_ego_graph, filtered_docs)
x1, x2, x3, x4, x5 = test_gen[0]
# print(x5)
START_ID = 101
END_ID = 102

def token_probability_fn(inputs):
    padded_inputs = tf.pad(inputs, [[0, 0], [0, max_len-tf.shape(inputs)[1]]])
    
    # print(inputs)
    repeats = int(padded_inputs.shape[0] / test_batch_size)

    preds = [
        model((
            x1,
            x2,
            x3,
            x4,
            padded_inputs[repeat_idx*test_batch_size:(repeat_idx+1)*test_batch_size]
        )) for repeat_idx in range(repeats)]
    
    # print([pred[0] for pred in preds])
    
    concatenated_preds = tf.concat([pred[1] for pred in preds], axis=0)
    # the first zero index is the position in the sequence we're trying to find to add to the sequence
    first_zero_index = (padded_inputs.numpy()[0]==0).argmax(axis=0)
    # print(first_zero_index)
    # print(tf.argmax(concatenated_preds[:, first_zero_index, :], axis=1))
    
    return concatenated_preds[:, first_zero_index-1, :]

# prompt = tf.fill((test_batch_size, 1), START_ID)

predicted_phrases = keras_nlp.utils.beam_search(
    token_probability_fn,
    x5,
    max_length=5,
    num_beams=2,
    # p=0.92,
    end_token_id=END_ID,
    from_logits=True
)
[tokenizer.decode(phrase) for phrase in predicted_phrases]

['[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]',
 '[CLS] thoroughbred [SEP] [PAD] [PAD]',
 '[CLS] racehorse [SEP] [PAD] [PAD]']

In [37]:
print([tokenizer.decode(phrase, skip_special_tokens=True) for phrase in filtered_docs])


["it's anybodys race as they run into the final furlong, and it's rubstic on the nearside with the advantage over zongalero and rough and tumble as they race up towards the line, it's gonna be a victory for scotland, it's rubstic from zongalero in the national, and as they come to the line, rubstic is the winner! “ ” commentator peter o'sullevan describes the climax of the 1979 national the 1979 grand national was the 133rd renewal of the world - famous grand national horse race that took place at aintree near liverpool, england, on 31 march 1979. the race was won by rubstic who was the first ever scottish - trained winner. the favourite alverton died in the race, a month after winning the cheltenham gold cup. another fatality was kintai, who had to be put down later on.", "the 1848 grand national steeplechase was the tenth official annual running of a handicap steeplechase horse race at aintree racecourse near liverpool on wednesday, 1 march. it attracted a then record, field of 29 co

In [67]:
df[df['l3'] == 'RaceHorse']

Unnamed: 0,text,l1,l2,l3,l1_encoded,l2_encoded,l3_encoded
232,Longboat (24 March 1981 – ca. 1997) was a Brit...,Species,Horse,RaceHorse,255,132,213
336,"Superstar Leo is an Irish-bred, British-traine...",Species,Horse,RaceHorse,255,132,213
343,Donau (1907–1913) was an American Thoroughbred...,Species,Horse,RaceHorse,255,132,213
377,Came Home (foaled in 1999) is an American Thor...,Species,Horse,RaceHorse,255,132,213
394,"Kodiak Kowboy (foaled April 16, 2005 in Kentuc...",Species,Horse,RaceHorse,255,132,213
...,...,...,...,...,...,...,...
240056,So Casual (foaled 25 October 1995) is a Thorou...,Species,Horse,RaceHorse,255,132,213
240248,Humorist (1918–1921) was a British Thoroughbre...,Species,Horse,RaceHorse,255,132,213
240779,"Goldencents (foaled March 7, 2010) is an Ameri...",Species,Horse,RaceHorse,255,132,213
240804,"Lakeway (foaled February 19, 1991 in Kentucky)...",Species,Horse,RaceHorse,255,132,213


In [44]:
prompt = tf.fill((batch_size, 1), START_ID)
padded_prompt = tf.pad(prompt, [[0, 0], [0, max_len-tf.shape(prompt)[1]]])
padded_prompt

<tf.Tensor: shape=(32, 512), dtype=int32, numpy=
array([[101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0],
       ...,
       [101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0]], dtype=int32)>

In [68]:
leaves_to_remove

{4,
 7,
 8,
 19,
 23,
 24,
 27,
 28,
 30,
 36,
 37,
 40,
 48,
 49,
 50,
 51,
 53,
 60,
 67,
 79,
 80,
 83,
 101,
 102,
 113,
 116,
 118,
 121,
 126,
 127,
 130,
 131,
 134,
 141,
 142,
 147,
 150,
 154,
 158,
 162,
 163,
 172,
 173,
 177,
 178,
 182,
 190,
 194,
 198,
 200,
 201,
 203,
 205,
 218,
 219,
 222,
 226,
 227,
 228,
 230,
 233,
 239,
 242,
 244,
 245,
 246,
 253,
 266,
 273,
 274,
 275,
 292}

In [None]:
print(model((x1, x2, x3, x4, prompt)), tokenizer.decode(x4[2], skip_special_tokens=True))

In [None]:
test_gen[0][1]

In [106]:
df[df['l3'].str.contains('Festival')]

Unnamed: 0,text,l1,l2,l3,l1_encoded,l2_encoded,l3_encoded
214,The Ardee Baroque Festival is a celebration of...,Event,SocietalEvent,MusicFestival,100,251,171
653,The Slamdance Film Festival is an annual film ...,Event,SocietalEvent,FilmFestival,100,251,105
774,"The Stan Rogers Folk Festival, informally know...",Event,SocietalEvent,MusicFestival,100,251,171
1350,The 5th Toronto International Film Festival (T...,Event,SocietalEvent,FilmFestival,100,251,105
1886,The 2010 Slamdance Film Festival took place in...,Event,SocietalEvent,FilmFestival,100,251,105
...,...,...,...,...,...,...,...
240049,The São Paulo International Film Festival (Por...,Event,SocietalEvent,FilmFestival,100,251,105
240163,The 40th annual Toronto International Film Fes...,Event,SocietalEvent,FilmFestival,100,251,105
240210,During the 19th century Trinidadians and other...,Event,SocietalEvent,MusicFestival,100,251,171
240226,New York Polish Film Festival (abbreviated to ...,Event,SocietalEvent,FilmFestival,100,251,105


In [184]:
topics_with_virtual = np.append(topics, 'ZZ_VIRTUAL')
le = LabelEncoder()
le.fit(topics_with_virtual)
le.inverse_transform([298])

array(['ZZ_VIRTUAL'], dtype=object)

In [160]:
################################################################################
# Build model
################################################################################
from tensorflow import int64
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
import tensorflow_ranking

from spektral.layers import GCNConv, GlobalAvgPool, GraphMasking

# n_out = dataset.n_labels

X_in = Input(shape=(50))
A_in = Input(shape=(None,), sparse=True)
I_in = Input(shape=(), dtype=int64)

X = GCNConv(32, activation='relu')([X_in, A_in])
X = GCNConv(32, activation='relu')([X, A_in])
X = GlobalAvgPool()([X, I_in])

shared_bilinear = tensorflow_ranking.keras.layers.Bilinear(32, 32)
X_1 = shared_bilinear([X, X])
X = shared_bilinear([X, X], training=False)

out = Dense(2, activation='softmax')(X)

model = Model(inputs=[X_in, A_in, I_in], outputs=out)


ModuleNotFoundError: No module named 'tensorflow_ranking'

In [None]:
from keras.utils.vis_utils import plot_model

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
import tensorflow as tf
import numpy as np

preds = tf.constant([[[-11.7803297],
 [-9.34260654],
 [-14.0992193],
 [-9.90242]],[[-11.7803297],
 [-9.34260654],
 [-14.0992193],
 [-9.90242]]], dtype=float)

target = tf.constant([[[0],
 [0],
 [0],
 [1]],[[0],
 [0],
 [0],
 [1]]], dtype=float)

In [None]:
sigmoided = tf.keras.activations.sigmoid(preds)

In [None]:
tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=tf.reshape(target, shape=(2, -1)), logits=tf.reshape(sigmoided, shape=(2, -1))))

In [None]:
tf.keras.activations.softmax(tf.reshape(preds, shape=(1, -1)))

In [None]:
tf.reshape(target, shape=(2, -1))