In [1]:
import pandas as pd
df = pd.read_csv('../dataset/DBPEDIA_train.csv')
document_list = df['text'].to_numpy().astype('str')
document_list.dtype

print('Finished reading df')

Finished reading df


In [2]:
# encoding topics to create the adjacency matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np

labelEncoder = LabelEncoder()
topics = np.concatenate((df['l1'].unique(), df['l2'].unique(), df['l3'].unique()))
# For virtual node when expanding topic taxonomy
topics = np.append(topics, 'ZZ_VIRTUAL')
labelEncoder.fit(topics)

def encode_topic(topic):
    print(type(topic))
    return labelEncoder.transform(topic)

df['l1_encoded'] = labelEncoder.transform(df['l1'])
df['l2_encoded'] = labelEncoder.transform(df['l2'])
df['l3_encoded'] = labelEncoder.transform(df['l3'])


In [3]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2023-04-04 03:48:13.792694: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-04 03:48:13.964319: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-04 03:48:16.407422: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /software/spackages_prod/apps/linux-ubuntu20.04-zen2/gcc-9.4.0/cudnn-8.2.4.15-11.4-aa4j

Num GPUs Available:  1


In [4]:
from transformers import BertTokenizer
import numpy as np
import re

max_len = 512

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_topic_to_tokenized_dict = {}
for topic in topics:
    # dbpedia categories are in PascalCase, so this makes them spaced
    spaced_words = re.sub( r"([A-Z])", r" \1", topic)[1:]
    tokenized_sequence = tokenizer.encode_plus(spaced_words, add_special_tokens=True, max_length=max_len, padding='max_length')['input_ids']

    encoded_topic_to_tokenized_dict[labelEncoder.transform([topic])[0]] = tokenized_sequence

In [5]:
# documents = df['text'].apply(lambda doc: np.array(tokenizer.encode_plus(doc, add_special_tokens=True, max_length=max_len, padding='max_length', truncation=True)['input_ids'])).to_numpy()
# documents_labels = labelEncoder.transform(df['l3'].to_numpy())
# documents_fixed = np.empty(shape=(len(documents), max_len))
# for i, doc in enumerate(documents):
#     documents_fixed[i] = doc

In [6]:
# import pickle
# with open("tokenized_dbpedia.pkl", "wb") as f:
#     pickle.dump([documents_fixed, documents_labels], f)

In [7]:
import pickle

with (open("./tokenized_dbpedia.pkl", "rb")) as f:
    documents, documents_labels = pickle.load(f)
print('Finished getting tokenized file')

Finished getting tokenized file


In [8]:
graph_dict = {}

for i, row in df.iterrows():
    l1 = row['l1_encoded']
    l2 = row['l2_encoded']
    l3 = row['l3_encoded']

    if l1 not in graph_dict:
        graph_dict[l1] = {}
    if l2 not in graph_dict[l1]:
        graph_dict[l1][l2] = {} 

    graph_dict[l1][l2][l3] = 1

In [9]:
# creating node features
x = np.arange(298)
x = labelEncoder.inverse_transform(x)
feature_array = x.reshape(298, -1)

In [10]:
# loading GloVe model to get topic word embeddings
# from https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
import torchtext

glove = torchtext.vocab.GloVe(name="6B", dim=50)
print('Finished getting GlOVE embedder')

Finished getting GlOVE embedder


In [11]:
glove['MASK'].shape

torch.Size([50])

In [12]:
from spektral.data import Graph
import numpy as np
import re

def create_ego_graph(l1_topic, l2_topic, l3_topic, graph_dict, feature_array, is_virtual=False):
    # Creating "ego-graphs" (each node is seperated into a graph with itself, parent, and siblings)
    # The base node (so the node itself) will be masked, aka. have a [MASK] embedding
    # The sibling nodes need to have a negative relationship with the base node (so negative value in adjacency matrix?)
    if l3_topic != None:
        siblings_list = list(graph_dict[l1_topic][l2_topic].keys())
        siblings_list.remove(l3_topic)
        base = l3_topic
        parent = l2_topic
        grandparent = l1_topic

        all_nodes_list = siblings_list.copy()
        all_nodes_list.append(base)
        all_nodes_list.append(parent)
        all_nodes_list.append(grandparent)
        
        n_nodes = len(all_nodes_list)
        adj_matrix = np.zeros((n_nodes, n_nodes))

        node_label_encoder = LabelEncoder()
        node_label_encoder.fit(all_nodes_list)
        
        encoded_base = node_label_encoder.transform([base])[0]
        encoded_parent = node_label_encoder.transform([parent])[0]
        encoded_grandparent = node_label_encoder.transform([grandparent])[0]

        adj_matrix[encoded_base][encoded_parent] = 1
        adj_matrix[encoded_parent][encoded_base] = 1

        adj_matrix[encoded_grandparent][encoded_parent] = 1
        adj_matrix[encoded_parent][encoded_grandparent] = 1

        for sibling in siblings_list: 
            encoded_sibling = node_label_encoder.transform([sibling])[0]
            adj_matrix[encoded_sibling][encoded_base] = -1
            adj_matrix[encoded_base][encoded_sibling] = -1
        
    elif l2_topic != None:
        siblings_list = list(graph_dict[l1_topic].keys())
        siblings_list.remove(l2_topic)
        base = l2_topic
        parent = l1_topic

        all_nodes_list = siblings_list.copy()
        all_nodes_list.append(base)
        all_nodes_list.append(parent)
        
        n_nodes = len(all_nodes_list)
        adj_matrix = np.zeros((n_nodes, n_nodes))

        node_label_encoder = LabelEncoder()
        node_label_encoder.fit(all_nodes_list)
        
        encoded_base = node_label_encoder.transform([base])[0]
        encoded_parent = node_label_encoder.transform([parent])[0]

        adj_matrix[encoded_base][encoded_parent] = 1
        adj_matrix[encoded_parent][encoded_base] = 1

        for sibling in siblings_list: 
            encoded_sibling = node_label_encoder.transform([sibling])[0]
            adj_matrix[encoded_sibling][encoded_base] = -1
            adj_matrix[encoded_base][encoded_sibling] = -1
    
    elif l1_topic != None:
        siblings_list = list(graph_dict.keys())
        siblings_list.remove(l1_topic)
        base = l1_topic

        all_nodes_list = siblings_list.copy()
        all_nodes_list.append(base)
        
        n_nodes = len(all_nodes_list)
        adj_matrix = np.zeros((n_nodes, n_nodes))

        node_label_encoder = LabelEncoder()
        node_label_encoder.fit(all_nodes_list)
        
        encoded_base = node_label_encoder.transform([base])[0]

        for sibling in siblings_list: 
            encoded_sibling = node_label_encoder.transform([sibling])[0]
            adj_matrix[encoded_sibling][encoded_base] = -1
            adj_matrix[encoded_base][encoded_sibling] = -1

    ego_features = np.zeros((n_nodes, 50))
    encoded_nodes_list = node_label_encoder.transform(all_nodes_list)

    for i, node in enumerate(all_nodes_list):
        # Masking base node, setting the embedding to all 0's
        if (node == base):
            embedding_avg = glove['MASK']
        else:
            feature = feature_array[node]
            split_words_list = re.sub( r"([A-Z])", r" \1", feature[0]).split()
            n_words = len(split_words_list)
            embedding_avg = np.array([glove[word.lower()].numpy() for word in split_words_list]).sum(axis=0)/n_words

        ego_features[encoded_nodes_list[i]] = embedding_avg

    return Graph(a=adj_matrix, x=ego_features, y=(l1_topic, l2_topic, l3_topic))

graph_list = []

for l1_topic in graph_dict:
    for l2_topic in graph_dict[l1_topic]:
        for l3_topic in graph_dict[l1_topic][l2_topic]:
            graph_list.append(create_ego_graph(l1_topic, l2_topic, l3_topic, graph_dict, feature_array))
        graph_list.append(create_ego_graph(l1_topic, l2_topic, None, graph_dict, feature_array))
    graph_list.append(create_ego_graph(l1_topic, None, None, graph_dict, feature_array))

graph_list = np.array(graph_list)

In [13]:
from spektral.data import Dataset

class MyDataset(Dataset):
    """
    """
    def __init__(self, graph_list: list[Graph], **kwargs):
        self.graph_list = graph_list
        super().__init__(**kwargs)

    def read(self):
        # We must return a list of Graph objects
        num_l =  np.random.permutation(len(self.graph_list))
        return [self.graph_list[i] for i in num_l]
    
dataset = MyDataset(graph_list)

In [14]:
from layers.Bilinear import Bilinear
from layers.ContextEmbedding import ContextEmbedding
from layers.TopicAttentiveEmbedding import TopicAttentiveEmbedding

from utils.TopicExpanTrainGen import TopicExpanTrainGen

def sequence_to_document_embedding(sequence_embedding: tf.Tensor):
    # gets the document representation/embedding from a BERT sequence embedding
    # by getting the mean-pooling of the sequence 
    return tf.math.reduce_mean(sequence_embedding, axis=1)

In [15]:
from tensorflow import int64
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Lambda
from spektral.layers import GCNConv, GlobalAvgPool
import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
import tensorflow as tf
from transformers import TFBertModel
import keras_nlp

learning_rate = 5e-5  # Learning rate
epochs = 4  # Number of training epochs
batch_size = 32  # Batch size
weight_decay = 5e-6
mini_batch_size = 4 # a mini-batch will always have 1 positive triple, and (n-1) negative triples
                    # i.e. with a mini_batch_size of 4, we have 1 pos. doc. and 3 neg. docs.
batch_ratio = int(batch_size / mini_batch_size)

max_len = 512
vocab_size = tokenizer.vocab_size
infoNCE_temprature = 0.1

optimizer = Adam(learning_rate, weight_decay=weight_decay)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
loss_fn_binary = tf.keras.losses.BinaryCrossentropy(from_logits=True)
loss_fn_crossentropy = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

n_out = dataset.n_labels
topic_embedding_dimension = 300

2023-04-04 03:48:36.302175: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-04 03:48:37.527261: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38220 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:31:00.0, compute capability: 8.0


In [16]:
import sys
# Custom training loop
class ModelWithNCE(Model):
    @tf.function(experimental_relax_shapes=True)
    def train_step(self, data):
        inputs, target = data
        with tf.GradientTape() as tape:
            similarity_prediction, phrase_prediction = self(inputs, training=True)
            similarity_prediction_infonce = tf.reshape(similarity_prediction / infoNCE_temprature, shape=(mini_batch_size, -1))

            # infoNCE_loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=tf.reshape(target[0], shape=(mini_batch_size, -1)), logits=similarity_prediction_infonce))
            # infoNCE_loss = loss_fn_crossentropy(tf.reshape(target[0], shape=(mini_batch_size, -1)), similarity_prediction_infonce)
            bce_loss = loss_fn_binary(target[0], similarity_prediction)
            phrase_loss = loss_fn(target[1], phrase_prediction)

            # tf.print(similarity_prediction[:4], bce_loss, phrase_loss, output_stream=sys.stderr)
        gradients = tape.gradient([bce_loss, phrase_loss], self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(target, (similarity_prediction, phrase_prediction))
        return {m.name: m.result() for m in self.metrics}

In [17]:
# topic_expan_generator = TopicExpanTrainGen(graph_list, documents, documents_labels, batch_size, mini_batch_size, encoded_topic_to_tokenized_dict)
#, arr = topic_expan_generator.__getitem__(2)


In [18]:
# arr[1][1][0]

In [19]:
shared_bilinear = Bilinear(topic_embedding_dimension, 768, 1)

# GNNs (Topic Encoder)
X_in = Input(shape=(dataset.n_node_features))
A_in = Input(shape=(None,), sparse=True)
I_in = Input(shape=(), dtype=int64)

topic_embedding = GCNConv(topic_embedding_dimension, activation='relu')([X_in, A_in])
topic_embedding = GCNConv(topic_embedding_dimension, activation='relu')([topic_embedding, A_in])
topic_embedding = GlobalAvgPool(name='topic_embedding')([topic_embedding, I_in])

# BERT Embedding (Document Encoder)
max_seq_length = max_len
encoder = TFBertModel.from_pretrained('bert-base-uncased')

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
embedding = encoder(input_ids)[0]

# Transformer Decoders (Phrase Generator)
decoder_tokens_input = Input(shape=(max_len,), name="decoder_phrase_input")
decoder_embedding = keras_nlp.layers.TokenAndPositionEmbedding(vocabulary_size=vocab_size, sequence_length=max_len, embedding_dim=768, mask_zero=True)(decoder_tokens_input)

# Getting context embedding for decoder
topic_attentive_embedding = TopicAttentiveEmbedding()(topic_embedding, embedding, shared_bilinear, training=False)
topic_attentive_embedding = tf.keras.layers.Reshape((max_len, 1))(topic_attentive_embedding)
context_embedding = ContextEmbedding()([topic_attentive_embedding, embedding])

transformer_decoder = keras_nlp.layers.TransformerDecoder(
    num_heads=16, 
    intermediate_dim=max_len,
    dropout=0.1
)(decoder_embedding, context_embedding)

# Transformer Output
out2 = Dense(vocab_size)(transformer_decoder)

# Output Bilinear layer (Similarity Predictor)
document_embedding = Lambda(sequence_to_document_embedding, name='document_embedding')(embedding)
out = shared_bilinear([topic_embedding, document_embedding])

# Outputs
model = ModelWithNCE(inputs=[X_in, A_in, I_in, input_ids, decoder_tokens_input], outputs=[out, out2])

# compiling model and adding metrics
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy', perplexity], run_eagerly=True)

topic_expan_generator = TopicExpanTrainGen(graph_list, documents[:-20000], documents_labels[:-20000], batch_size, mini_batch_size, encoded_topic_to_tokenized_dict)

2023-04-04 03:48:40.975226: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions withou

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [22]:
model.summary()

Model: "model_with_nce"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 gcn_conv (GCNConv)             (None, 300)          15300       ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 gcn_conv_1 (GCNConv)           (None, 300)          90300       ['gcn_conv[0][0]',  

In [None]:
from tqdm.keras import TqdmCallback

# NOTE: Ignore warning about gradients not existing for BERT's dense layer since 
# the dense layers are not used and are thus unconnected and do not need training

checkpoint_filepath = './checkpoint.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='dense_perplexity',
    save_weights_only=True,
    save_freq=1000,
    mode='min',
    save_best_only=True)

model.fit(topic_expan_generator, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[TqdmCallback(verbose=1), model_checkpoint_callback])

In [23]:
model.load_weights('checkpoint_april1.h5')

In [28]:
test_gen = TopicExpanTrainGen(graph_list, documents[-1000:], documents_labels[-1000:], batch_size, 2, encoded_topic_to_tokenized_dict)

In [29]:
model.evaluate(test_gen)



[nan,
 nan,
 0.12036581337451935,
 0.9875991940498352,
 nan,
 0.9819642901420593,
 1.1279094219207764]

In [20]:
import math
from spektral.data.utils import to_disjoint
import tensorflow as tf

class TopicExpanExpansionGen(tf.keras.utils.Sequence):
    def __init__(self, virtual_graph: Graph, documents: np.array, batch_size: int):
        self._virtual_graph = virtual_graph # ego-graph of the new virtual node
        self.documents = documents
        self.batch_size = batch_size
        
        # generating initial prompt with the start ID, this is the same prompt
        # for all batches since the initial prompt is always [101]
        unpadded_prompt = tf.fill((self.batch_size, 1), 101)
        self.initial_prompt = tf.pad(unpadded_prompt, [[0, 0], [0, max_len-tf.shape(unpadded_prompt)[1]]])
        
        # the same virtual node ego-graph is used for the whole batch,
        # so generating it in the init to speed up computation
        self.x_in, self.a_in, self.i_in = to_disjoint(
            x_list=[self._virtual_graph.x for _ in range(batch_size)],
            a_list=[self._virtual_graph.a for _ in range(batch_size)]
        )
        
        self.x_in = tf.convert_to_tensor(self.x_in)
        self.a_in = tf.sparse.SparseTensor(
            indices=np.array([self.a_in.row, self.a_in.col]).T,
            values=self.a_in.data,
            dense_shape=self.a_in.shape
        )
        self.i_in = tf.convert_to_tensor(self.i_in)
        
    def __len__(self):
        return math.ceil(len(self.documents) / (self.batch_size))
    
    def __getitem__(self, non_batch_index: int):
        idx = non_batch_index * self.batch_size
        idx_limiter = min(idx + self.batch_size, len(self.documents)) # for limiting batch size at end of list
        fixed_batch_size = idx_limiter - idx # gets the fixed batch size in case this is the last batch
        
        model_expansion_inputs = (
            self.x_in,
            self.a_in,
            self.i_in,
            tf.convert_to_tensor(self.documents[idx:idx_limiter]),
            self.initial_prompt
        )
            
        return model_expansion_inputs

In [21]:
x = np.arange(299)
x = labelEncoder.inverse_transform(x)
feature_array = x.reshape(299, -1)

graph_dict[100][212][298] = 1 # adding virtual node
l1_topic = 100
l2_topic = 212
virtual_node = 298

virtual_ego_graph = create_ego_graph(l1_topic, l2_topic, virtual_node, graph_dict, feature_array, is_virtual=True)

In [22]:
test_batch_size = 16
expan_gen = TopicExpanExpansionGen(virtual_ego_graph, documents, test_batch_size)

outputs = []
for model_expansion_input in expan_gen:
    batch_output = model(model_expansion_input)
    outputs.append(batch_output)
    
    for sim_pred in batch_output[0]:
        if sim_pred > 0:
            print('found positive prediction!')

found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
found positive prediction!
f

2023-04-04 03:49:02.305344: W tensorflow/tsl/framework/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 192.00MiB (rounded to 201326592)requested by op Softmax
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-04-04 03:49:02.305376: I tensorflow/tsl/framework/bfc_allocator.cc:1034] BFCAllocator dump for GPU_0_bfc
2023-04-04 03:49:02.305386: I tensorflow/tsl/framework/bfc_allocator.cc:1041] Bin (256): 	Total Chunks: 79, Chunks in use: 78. 19.8KiB allocated for chunks. 19.5KiB in use in bin. 2.5KiB client-requested in use in bin.
2023-04-04 03:49:02.305392: I tensorflow/tsl/framework/bfc_allocator.cc:1041] Bin (512): 	Total Chunks: 1, Chunks in use: 1. 768B allocated for chunks. 768B in use in bin. 640B client-requested in use in bin.
2023-04-04 03:49:02.305399: I tensorflow/tsl/framework/bfc_alloc

ResourceExhaustedError: Exception encountered when calling layer 'self' (type TFBertSelfAttention).

{{function_node __wrapped__Softmax_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[16,12,512,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Softmax]

Call arguments received by layer 'self' (type TFBertSelfAttention):
  • hidden_states=tf.Tensor(shape=(16, 512, 768), dtype=float32)
  • attention_mask=tf.Tensor(shape=(16, 1, 1, 512), dtype=float32)
  • head_mask=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_value=None
  • output_attentions=False
  • training=False

In [None]:
outputs

In [242]:
test_gen[3]

((<tf.Tensor: shape=(29, 50), dtype=float32, numpy=
  array([[ 0.23853   ,  0.34888   , -0.070754  , ...,  0.83062   ,
          -0.23929   ,  1.1943    ],
         [-1.5853    ,  1.3824    , -0.30704   , ..., -0.010681  ,
          -0.066724  ,  0.1998    ],
         [-1.41743   ,  0.111265  , -0.68006   , ...,  0.49370477,
           0.06240024, -0.17009124],
         ...,
         [-1.9629    ,  1.1863    , -0.11088   , ..., -1.1283    ,
          -0.073632  , -0.19008   ],
         [-1.09367   ,  0.93693995, -0.59236664, ..., -0.28081235,
          -0.01396667, -0.273598  ],
         [-1.4572    ,  0.87621   , -0.56166   , ..., -0.45689002,
          -0.33043998,  0.088323  ]], dtype=float32)>,
  SparseTensor(indices=tf.Tensor(
  [[ 0  0]
   [ 0  1]
   [ 0  2]
   ...
   [28 26]
   [28 27]
   [28 28]], shape=(841, 2), dtype=int64), values=tf.Tensor(
  [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  

In [205]:
siblings_list = list(graph_dict[100][212].keys())

siblings_list.remove(298)

In [203]:
x1, x2, x3, x4, x5 = test_gen.__getitem__(2)[0]
# model(test_gen.__getitem__(2)[0])[1]

In [148]:
test_gen = TopicExpanTrainGen(graph_list, documents[-1000:], documents_labels[-1000:], 1, 1, encoded_topic_to_tokenized_dict)
x1, x2, x3, x4, x5 = test_gen.__getitem__(3)[0]
sequence = x5.numpy()
sequence[:, 1:] = 0
# sequence[:, 1] = 2447

In [141]:
pred = model((x1, x2, x3, x4, sequence))[1]
pred[:, -1, :]

<tf.Tensor: shape=(1, 30522), dtype=float32, numpy=
array([[-7.874007, -8.135588, -8.380221, ..., -8.34035 , -8.491272,
        -8.455162]], dtype=float32)>

In [142]:
tf.math.argmax(pred[0][0])

<tf.Tensor: shape=(), dtype=int64, numpy=3539>

In [119]:
model(x1, x2, x3, x4, 

SyntaxError: unexpected EOF while parsing (2185693414.py, line 1)

In [143]:
tokenizer.decode([3539])

'prime'

In [144]:
from keras_nlp.utils import beam_search
from random import randint

test_batch_size = 4
test_gen = TopicExpanTrainGen(graph_list, documents[-20000:], documents_labels[-20000:], test_batch_size, 1, encoded_topic_to_tokenized_dict)
x1, x2, x3, x4, x5 = test_gen[randint(0, 4000)][0]

START_ID = 101
END_ID = 102

def token_probability_fn(inputs):
    padded_inputs = tf.pad(inputs, [[0, 0], [0, max_len-tf.shape(inputs)[1]]])
    repeats = int(padded_inputs.shape[0] / test_batch_size)
    # print(inputs.shape)
    preds = [
        model((
            x1, 
            x2, 
            x3, 
            x4, 
            padded_inputs[repeat_idx*test_batch_size:(repeat_idx+1)*test_batch_size]
        ))[1] for repeat_idx in range(repeats)]
    
    # print(preds)
    concatenated_preds = tf.concat(preds, axis=0)
    # print(concatenated_preds[:, 0, :].shape)
    
    # the first zero index is the position in the sequence we're trying to find to add to the sequence
    first_zero_index = (padded_inputs.numpy()[0]==0).argmax(axis=0)
    return concatenated_preds[:, first_zero_index, :]

prompt = tf.fill((test_batch_size, 1), START_ID)

predicted_phrases = keras_nlp.utils.beam_search(
    token_probability_fn,
    prompt,
    max_length=10,
    num_beams=3,
    end_token_id=END_ID,
    from_logits=True
)
[tokenizer.decode(phrase) for phrase in predicted_phrases]

['[CLS] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] an [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] tournament [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [128]:
prompt = tf.fill((batch_size, 1), START_ID)
padded_prompt = tf.pad(prompt, [[0, 0], [0, max_len-tf.shape(prompt)[1]]])
padded_prompt

<tf.Tensor: shape=(32, 512), dtype=int32, numpy=
array([[101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0],
       ...,
       [101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0],
       [101,   0,   0, ...,   0,   0,   0]], dtype=int32)>

In [129]:
print(model((x1, x2, x3, x4, prompt)), tokenizer.decode(x4[2], skip_special_tokens=True))

2023-04-03 18:51:23.138063: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at einsum_op_impl.h:498 : INVALID_ARGUMENT: Expected dimension 4 at axis 0 of the input shaped [32,1,16,48] but got dimension 32


InvalidArgumentError: Exception encountered when calling layer 'multi_head_attention_1' (type MultiHeadAttention).

{{function_node __wrapped__Einsum_N_2_device_/job:localhost/replica:0/task:0/device:GPU:0}} Expected dimension 4 at axis 0 of the input shaped [32,1,16,48] but got dimension 32 [Op:Einsum]

Call arguments received by layer 'multi_head_attention_1' (type MultiHeadAttention):
  • query=tf.Tensor(shape=(32, 1, 768), dtype=float32)
  • value=tf.Tensor(shape=(4, 512, 768), dtype=float32)
  • key=None
  • attention_mask=None
  • return_attention_scores=False
  • training=None
  • use_causal_mask=False

In [116]:
test_gen[0][1]

(<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
 array([[1],
        [1],
        [1],
        [1]])>,
 <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
 array([[ 2679,  3586,   102, ...,     0,     0,     0],
        [ 3137,  2846,   102, ...,     0,     0,     0],
        [14211,   102,     0, ...,     0,     0,     0],
        [ 2958,   102,     0, ...,     0,     0,     0]])>)

In [106]:
df[df['l3'].str.contains('Festival')]

Unnamed: 0,text,l1,l2,l3,l1_encoded,l2_encoded,l3_encoded
214,The Ardee Baroque Festival is a celebration of...,Event,SocietalEvent,MusicFestival,100,251,171
653,The Slamdance Film Festival is an annual film ...,Event,SocietalEvent,FilmFestival,100,251,105
774,"The Stan Rogers Folk Festival, informally know...",Event,SocietalEvent,MusicFestival,100,251,171
1350,The 5th Toronto International Film Festival (T...,Event,SocietalEvent,FilmFestival,100,251,105
1886,The 2010 Slamdance Film Festival took place in...,Event,SocietalEvent,FilmFestival,100,251,105
...,...,...,...,...,...,...,...
240049,The São Paulo International Film Festival (Por...,Event,SocietalEvent,FilmFestival,100,251,105
240163,The 40th annual Toronto International Film Fes...,Event,SocietalEvent,FilmFestival,100,251,105
240210,During the 19th century Trinidadians and other...,Event,SocietalEvent,MusicFestival,100,251,171
240226,New York Polish Film Festival (abbreviated to ...,Event,SocietalEvent,FilmFestival,100,251,105


In [184]:
topics_with_virtual = np.append(topics, 'ZZ_VIRTUAL')
le = LabelEncoder()
le.fit(topics_with_virtual)
le.inverse_transform([298])

array(['ZZ_VIRTUAL'], dtype=object)

In [160]:
################################################################################
# Build model
################################################################################
from tensorflow import int64
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
import tensorflow_ranking

from spektral.layers import GCNConv, GlobalAvgPool, GraphMasking

# n_out = dataset.n_labels

X_in = Input(shape=(50))
A_in = Input(shape=(None,), sparse=True)
I_in = Input(shape=(), dtype=int64)

X = GCNConv(32, activation='relu')([X_in, A_in])
X = GCNConv(32, activation='relu')([X, A_in])
X = GlobalAvgPool()([X, I_in])

shared_bilinear = tensorflow_ranking.keras.layers.Bilinear(32, 32)
X_1 = shared_bilinear([X, X])
X = shared_bilinear([X, X], training=False)

out = Dense(2, activation='softmax')(X)

model = Model(inputs=[X_in, A_in, I_in], outputs=out)


ModuleNotFoundError: No module named 'tensorflow_ranking'

In [None]:
from keras.utils.vis_utils import plot_model

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
import tensorflow as tf
import numpy as np

preds = tf.constant([[[-11.7803297],
 [-9.34260654],
 [-14.0992193],
 [-9.90242]],[[-11.7803297],
 [-9.34260654],
 [-14.0992193],
 [-9.90242]]], dtype=float)

target = tf.constant([[[0],
 [0],
 [0],
 [1]],[[0],
 [0],
 [0],
 [1]]], dtype=float)

In [None]:
sigmoided = tf.keras.activations.sigmoid(preds)

In [None]:
tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=tf.reshape(target, shape=(2, -1)), logits=tf.reshape(sigmoided, shape=(2, -1))))

In [None]:
tf.keras.activations.softmax(tf.reshape(preds, shape=(1, -1)))

In [None]:
tf.reshape(target, shape=(2, -1))