# **Import Libraries**

In [None]:
import os
import gc
import sys
import time
import shutil

import random
import pickle

from tqdm import tqdm as print_progress
from glob import glob

import dask.dataframe as dd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
# Set before the TensorFlow/Keras imports below.
# NOTE(review): presumably read by an add-on package that must target tf.keras
# instead of standalone Keras — confirm which import actually consumes it.
os.environ['TF_KERAS'] = '1'

import tensorflow as tf

from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [None]:
from tensorflow.keras.layers import (
    Layer, 
    Input, InputLayer, Embedding, 
    Dropout, Dense, 
    Dot, Concatenate, Average, Add,
    Bidirectional, LSTM,
    Lambda, Reshape
)
from tensorflow.keras.activations import softmax, sigmoid
from tensorflow.keras.initializers import Identity, GlorotNormal
from tensorflow.keras.utils import plot_model

In [None]:
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# Notebook command: installs StellarGraph (its `plot_history` helper is imported further down).
pip install stellargraph

In [None]:
# Notebook command: installs gradient-centralization-tf.
# NOTE(review): presumably the provider of the `gctf` module imported further down — confirm.
pip install gradient-centralization-tf

# **Load data**

In [None]:
# Read every dataset split into `sample_dfs`, keyed by split name.
datasets_path = '../input/hotel-comment-splitted'
sample_dfs = {}
for dataset in ('training', 'valuating', 'testing'):
    print(f'\n\n\nProcessing {dataset} ...')
    # A split may be stored as several CSV shards; the glob pattern picks up all of them
    # and `.compute()` materialises the lazy dask frame.
    shard_pattern = os.path.join(datasets_path, f'{dataset}_data*.csv')
    split_df = dd.read_csv(shard_pattern).compute()
    sample_dfs[dataset] = split_df
    print(f"{dataset}-set contains {len(split_df)} samples")
    print(split_df.sample(n=3))

In [None]:
# Load the label encoder stored next to the dataset and list its class names.
# NOTE(review): unpickling executes arbitrary code — only load this file from a trusted source.
filename = os.path.join(datasets_path, 'label_encoder.pkl')
with open(filename, 'rb') as encoder_file:  # context manager closes the handle deterministically
    label_encoder = pickle.load(encoder_file)
labels = list(label_encoder.classes_)

# **Pretrained Sentence-Transformer**

In [None]:
# Notebook command: installs sentence-transformers (imported in the next cell as `sentence_transformers`).
pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Local directory holding the pretrained sentence-transformer checkpoint.
model_version = '../input/sentence-transformers/distilUSE'
# Shared embedder: used below both for the label names and, inside DataGenerator, for comments.
embedder = SentenceTransformer(model_version)

In [None]:
# Embed every label name token-by-token, then mean-pool the tokens so that
# each label is represented by a single vector.
per_label_tokens = embedder.encode(labels,
                                   output_value='token_embeddings',
                                   convert_to_numpy=True)
labels_vector = [np.mean(tokens.cpu().numpy(), axis=0) for tokens in per_label_tokens]
# Stack into (n_labels, dim) and prepend a singleton batch axis -> (1, n_labels, dim).
labels_matrix = np.vstack(labels_vector)[np.newaxis, ...]
# np.save(os.path.join(datasets_path, 'labels_embeddings.npy'), labels_matrix)
labels_matrix.shape

# **Data Generator**

In [None]:
import sklearn
from ast import literal_eval
from tensorflow.keras.utils import Sequence, to_categorical

class DataGenerator(Sequence):
    """
    Keras `Sequence` that embeds comments on the fly and yields training batches.

    Every batch has the form `([token_embeddings, labels_fixed], targets)`:
      - `token_embeddings`: zero-padded array of shape
        (batch, max_seq_length, embedding_dim) built from `word_embedder.encode`;
      - `labels_fixed`: the constant 3D label-embedding array given at construction;
      - `targets`: smoothed multi-hot label vectors, one row per sample.

    Note: `__len__` uses floor division, so a trailing partial batch is never served.

    # Arguments
        data_df: frame with a `Comment` text column and a `label_encoder` column
            holding the label ids as a string (e.g. "[1, 3]" or "[1 3]").
        word_embedder: encoder exposing `encode`, `parameters` and `max_seq_length`.
        labels_fixed: 2D or 3D label-embedding array; a 2D array gets a leading
            singleton axis.
        batch_size: number of samples per batch.
        shuffle: whether to reshuffle the sample order after every epoch.

    # Raises
        ValueError: if `labels_fixed` is neither 2D nor 3D.
    """

    def __init__(self, data_df: pd.DataFrame,
                 word_embedder,
                 labels_fixed: np.ndarray,
                 batch_size: int = 64,
                 shuffle: bool = True):
        self.data_df = data_df
        if len(labels_fixed.shape) == 2:
            labels_fixed = np.expand_dims(labels_fixed, axis=0)
        elif len(labels_fixed.shape) != 3:
            raise ValueError("Shape of `labels_fixed` must be 2D or 3D")
        self.labels_fixed = labels_fixed
        self.word_embedder = word_embedder
        # Embedding width is read off the last axis of the second-to-last parameter tensor.
        self.embedding_dim = list(word_embedder.parameters())[-2].shape[-1]
        self.max_seq_length = word_embedder.max_seq_length
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.array(list(self.data_df.index))
        self.on_epoch_end()

    def __len__(self):
        " Denotes the number of batches per epoch "
        return int(len(self.data_df) // self.batch_size)

    def __getitem__(self, index):
        " Generate single batch of data "
        # Index labels of the samples that belong to this batch
        start_index = self.batch_size * index
        end_index = self.batch_size * (index + 1)
        indices = self.indices[start_index:end_index]

        # Fetch raw texts and label strings
        samples = self.data_df.loc[indices, ['Comment', 'label_encoder']]
        labels = samples.label_encoder.values.tolist()
        texts = samples.Comment.values.tolist()
        embeddings = self.word_embedder.encode(texts,
                                               batch_size=4,
                                               output_value='token_embeddings',
                                               convert_to_numpy=True,
                                               show_progress_bar=False)

        # 0-padding for word embeddings (written straight into the final buffer)
        embeddings_padded = np.zeros((len(samples), self.max_seq_length, self.embedding_dim))
        for e_i, e in enumerate(embeddings):
            # Truncate defensively so an over-long sequence cannot break the assignment.
            tokens = self._to_numpy(e)[:self.max_seq_length]
            embeddings_padded[e_i, :len(tokens), :] = tokens

        # Encoding multi-class labels: sum of one-hot rows -> multi-hot, then smoothing
        n_classes = self.labels_fixed.shape[-2]
        mClss_labels = np.array([
            self.smooth_labels(
                np.sum(to_categorical(self._parse_label(l), num_classes=n_classes), axis=0))
            for l in labels
        ])

        # Free the per-batch intermediates eagerly to keep peak memory down.
        del samples, labels, texts, embeddings
        _ = gc.collect()
        return [embeddings_padded, self.labels_fixed], mClss_labels

    @staticmethod
    def _to_numpy(embedding):
        " Convert one sample's token embeddings (tensor-like or array) to a NumPy array "
        if hasattr(embedding, 'cpu'):
            return embedding.cpu().numpy()
        return np.asarray(embedding)

    @staticmethod
    def _parse_label(raw):
        " Parse a label string such as '[1, 3]' or '[1 3]' into a list of ids "
        if ',' in raw:
            return literal_eval(raw)
        return [int(ch) for ch in raw[1:-1].split()]

    def smooth_labels(self, labels, factor=0.1):
        " Return a smoothed copy of the labels; the input array is left untouched "
        labels = np.asarray(labels)
        if not np.issubdtype(labels.dtype, np.floating):
            # Integer/bool labels cannot hold the fractional smoothed values.
            labels = labels.astype(np.float32)
        return labels * (1 - factor) + (factor / labels.shape[-1])

    def on_epoch_end(self):
        " Update indices after each epoch "
        if self.shuffle:
            self.indices = sklearn.utils.shuffle(self.indices)

In [None]:
# One generator per split; only the training split is reshuffled between epochs.
data_generator = {}
for dataset, dset in sample_dfs.items():
    reshuffle = (dataset == 'training')
    data_generator[dataset] = DataGenerator(
        dset,
        word_embedder=embedder,
        labels_fixed=labels_matrix,
        batch_size=64,
        shuffle=reshuffle,
    )

In [None]:
# Number of full batches per epoch for the training and validation generators.
len(data_generator['training']), len(data_generator['valuating'])

In [None]:
# Sanity check: pull the first validation batch, laid out as ([inputs...], targets).
X = data_generator['valuating'][0]
print(X[0][0].shape)  # padded token embeddings
print(X[0][1].shape)  # fixed label embeddings
print(X[1].shape)     # smoothed multi-hot targets

# **Load Model**

In [None]:
class CyclicLR(Callback):
    """
    This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with some constant frequency, 
        as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or per-cycle basis.
    
    This class has three built-in policies, as put forth in the paper.
    "original":
        A basic triangular cycle w/ no amplitude scaling.
        Any unrecognised `mode` value falls back to this policy.
    "halving":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exponential":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.

    For more detail, please read the paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='original')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {original, halving, exponential}.
            Default 'original'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exponential' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
            Only honoured together with a custom scale_fn; the built-in
            modes pick their own scale_mode.
    """
    def __init__(self, base_lr=0.001, max_lr=0.1, step_size=2000., mode='original',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            # Built-in policies: each one fixes both the scaling function and its argument.
            if self.mode == 'halving':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exponential':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None, new_step_size=None):
        """
        Resets cycle iterations.
            Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """
        Returns the learning rate for the current value of `clr_iterations`.
        """
        # 1-based index of the current cycle (one cycle = 2 * step_size iterations).
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Distance from the cycle's peak, normalised so that 0 is the peak and 1 a boundary.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        amplitude = (self.max_lr-self.base_lr)*np.maximum(0, (1-x))
        if self.scale_mode == 'cycle':
            return self.base_lr + amplitude*self.scale_fn(cycle)
        else:
            return self.base_lr + amplitude*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """
        Sets the optimizer's learning rate at the start of training:
        base_lr on a fresh cycle, otherwise the rate for the current iteration.
        """
        logs = logs or {}
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):        
        """
        Advances the iteration counters, records history and applies the next learning rate.
        The first positional argument is kept under its original name `epoch`.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # The recorded lr is the one that was in effect for the batch that just finished.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        new_lr = self.clr()
        K.set_value(self.model.optimizer.lr, new_lr)

In [None]:
class Adjacency(Layer):
    """
    Layer holding a trainable adjacency matrix of shape (1, nodes, nodes).

    The layer ignores the values of its input: calling it only places the
    adjacency variable in the Keras graph so it is trained with the model.

    Args:
        nodes (int): number of graph nodes.
        weights (array-like, optional): initial (nodes, nodes) matrix. When given,
            `init_method` is ignored.
        init_method (str): 'identity', or 'xavier' / 'glorot' (Glorot-normal),
            used when `weights` is None. Case-insensitive.
        **kwargs: forwarded to the base `Layer` constructor (e.g. `name`).

    Raises:
        AssertionError: if `weights` does not have shape (nodes, nodes).
        ValueError: if `init_method` is not one of the supported names.
    """

    def __init__(self, nodes=1, weights=None, init_method='identity', **kwargs):
        super(Adjacency, self).__init__(**kwargs)

        self.shape = (1, nodes, nodes)

        if weights is not None:
            assert weights.shape==(nodes, nodes), \
                f'Adjacency Matrix must have shape ({nodes}, {nodes})' + \
                f' while its shape is {weights.shape}'
            # Cast explicitly: a float64 tensor cannot initialise a float32 variable.
            w_init = tf.cast(tf.convert_to_tensor(weights), tf.float32)
        else:
            init_method = init_method.lower()
            if init_method == 'identity':
                initializer = tf.initializers.Identity()
            elif init_method in ('xavier', 'glorot'):
                initializer = tf.initializers.GlorotNormal()
            else:
                raise ValueError(
                    "`init_method` must be 'identity', 'xavier' or 'glorot'; "
                    f"received {init_method!r}"
                )
            w_init = initializer(shape=(nodes, nodes))

        # Leading singleton axis matches the batch dimension of 1 used for the label graph.
        self.w = tf.Variable(
            initial_value=tf.expand_dims(w_init, axis=0), 
            dtype="float32", trainable=True
        )

    def call(self, inputs):
        """Return the adjacency matrix as a tensor; `inputs` is not used."""
        return tf.convert_to_tensor(self.w)

    def compute_output_shape(self, input_shape=None):
        """Return the fixed (1, nodes, nodes) output shape; `input_shape` is ignored."""
        return self.shape

In [None]:
from tensorflow.keras import activations, initializers, constraints, regularizers
from tensorflow.keras.layers import Layer, Dropout, LeakyReLU


class GraphAttention(Layer):
    """
    Graph Attention (GAT) layer. The base implementation is taken from
            https://github.com/danielegrattarola/keras-gat,
    with some modifications added for ease of use.
    Based on the original paper: Graph Attention Networks. P. Veličković et al. ICLR 2018 https://arxiv.org/abs/1710.10903
    Notes:
      - The inputs are tensors with a batch dimension of 1:
        Keras requires this batch dimension, and for full-batch methods
        we only have a single "batch".
      - There are two inputs required, the node features,
        and the graph adjacency matrix
      - This does not add self loops to the adjacency matrix, you should preprocess
        the adjacency matrix to add self-loops
    .. seealso:: :class:`.GAT` combines several of these layers, and :class:`.GraphAttentionSparse` supports a sparse adjacency matrix.
        (Both classes belong to the library this layer was taken from; they are not defined in this file.)
    
    Args:
        units (int): dimensionality of output feature vectors per attention head
        attn_heads (int): number of attention heads
        attn_heads_reduction (str): reduction applied to output features of each attention head, ``concat`` or ``average``.
            ``average`` should be applied in the final prediction layer of the model (Eq. 6 of the paper).
        in_dropout_rate (float): dropout rate applied to features
        attn_dropout_rate (float): dropout rate applied to attention coefficients
        activation (str): nonlinear activation applied to layer's output to obtain output features (eq. 4 of the GAT paper)
        final_layer (bool): Deprecated, use ``tf.gather`` or :class:`.GatherIndices`.
            Passing anything other than None raises ValueError.
        use_bias (bool): toggles an optional bias
        saliency_map_support (bool): If calculating saliency maps using the tools in
            stellargraph.interpretability.saliency_maps this should be True. Otherwise this should be False (default).
        kernel_initializer (str or func, optional): The initialiser to use for the head weights.
        kernel_regularizer (str or func, optional): The regulariser to use for the head weights.
        kernel_constraint (str or func, optional): The constraint to use for the head weights.
        bias_initializer (str or func, optional): The initialiser to use for the head bias.
        bias_regularizer (str or func, optional): The regulariser to use for the head bias.
        bias_constraint (str or func, optional): The constraint to use for the head bias.
        attn_kernel_initializer (str or func, optional): The initialiser to use for the attention weights.
        attn_kernel_regularizer (str or func, optional): The regulariser to use for the attention weights.
        attn_kernel_constraint (str or func, optional): The constraint to use for the attention weights.

    Raises:
        ValueError: if ``attn_heads_reduction`` is not ``concat`` or ``average``,
            or if ``final_layer`` is given.
    """

    def __init__(
        self,
        units,
        attn_heads=1,
        attn_heads_reduction="concat",  # {'concat', 'average'}
        in_dropout_rate=0.0,
        attn_dropout_rate=0.0,
        activation="relu",
        use_bias=True,
        final_layer=None,
        saliency_map_support=False,
        kernel_initializer="glorot_uniform",
        kernel_regularizer=None,
        kernel_constraint=None,
        bias_initializer="zeros",
        bias_regularizer=None,
        bias_constraint=None,
        attn_kernel_initializer="glorot_uniform",
        attn_kernel_regularizer=None,
        attn_kernel_constraint=None,
        **kwargs,
    ):

        # Reject unknown reduction modes up front, before any state is set.
        if attn_heads_reduction not in {"concat", "average"}:
            raise ValueError(
                "{}: Possible heads reduction methods: concat, average; received {}".format(
                    type(self).__name__, attn_heads_reduction
                )
            )

        self.units = units  # Number of output features (F' in the paper)
        self.attn_heads = attn_heads  # Number of attention heads (K in the paper)
        self.attn_heads_reduction = attn_heads_reduction  # Eq. 5 and 6 in the paper
        self.in_dropout_rate = in_dropout_rate  # dropout rate for node features
        self.attn_dropout_rate = attn_dropout_rate  # dropout rate for attention coefs
        self.activation = activations.get(activation)  # Eq. 4 in the paper
        self.use_bias = use_bias
        # `final_layer` is kept in the signature only to fail loudly for old callers.
        if final_layer is not None:
            raise ValueError(
                "'final_layer' is not longer supported, use 'tf.gather' or 'GatherIndices' separately"
            )

        self.saliency_map_support = saliency_map_support

        # Populated by build()
        self.kernels = []  # Layer kernels for attention heads
        self.biases = []  # Layer biases for attention heads
        self.attn_kernels = []  # Attention kernels for attention heads

        if attn_heads_reduction == "concat":
            # Output will have shape (..., K * F')
            self.output_dim = self.units * self.attn_heads
        else:
            # Output will have shape (..., F')
            self.output_dim = self.units

        # Resolve string identifiers (or None) into Keras initializer/regularizer/constraint objects.
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_initializer = initializers.get(bias_initializer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.bias_constraint = constraints.get(bias_constraint)
        self.attn_kernel_initializer = initializers.get(attn_kernel_initializer)
        self.attn_kernel_regularizer = regularizers.get(attn_kernel_regularizer)
        self.attn_kernel_constraint = constraints.get(attn_kernel_constraint)

        super().__init__(**kwargs)

    def get_config(self):
        """
        Gets class configuration for Keras serialization.
        Returns:
            dict: the base layer config merged with this layer's constructor arguments
            (``final_layer`` is not included).
        """
        config = {
            "units": self.units,
            "attn_heads": self.attn_heads,
            "attn_heads_reduction": self.attn_heads_reduction,
            "in_dropout_rate": self.in_dropout_rate,
            "attn_dropout_rate": self.attn_dropout_rate,
            "activation": activations.serialize(self.activation),
            "use_bias": self.use_bias,
            "saliency_map_support": self.saliency_map_support,
            "kernel_initializer": initializers.serialize(self.kernel_initializer),
            "kernel_regularizer": regularizers.serialize(self.kernel_regularizer),
            "kernel_constraint": constraints.serialize(self.kernel_constraint),
            "bias_initializer": initializers.serialize(self.bias_initializer),
            "bias_regularizer": regularizers.serialize(self.bias_regularizer),
            "bias_constraint": constraints.serialize(self.bias_constraint),
            "attn_kernel_initializer": initializers.serialize(
                self.attn_kernel_initializer
            ),
            "attn_kernel_regularizer": regularizers.serialize(
                self.attn_kernel_regularizer
            ),
            "attn_kernel_constraint": constraints.serialize(
                self.attn_kernel_constraint
            ),
        }
        base_config = super().get_config()
        return {**base_config, **config}

    def compute_output_shape(self, input_shapes):
        """
        Computes the output shape of the layer.
        Assumes the following inputs:
        Args:
            input_shapes (sequence of shape tuples): the node-feature shape first,
                followed by the adjacency shape(s).
                Shape tuples can include None for free dimensions, instead of an integer.
        Returns:
            An output shape tuple: (batch dim, second dim of the feature shape, output_dim).
        """
        feature_shape, *As_shapes = input_shapes

        batch_dim = feature_shape[0]
        out_dim = feature_shape[1]

        return batch_dim, out_dim, self.output_dim

    def build(self, input_shapes):
        """
        Builds the layer: creates two non-trainable scalars used by the saliency
        branch of ``call`` and, for every attention head, a feature kernel,
        an optional bias and a pair of attention kernels.
        Args:
            input_shapes (list of shape tuples): shapes of the layer's inputs (node features and adjacency matrix)
        """
        feat_shape = input_shapes[0]
        input_dim = int(feat_shape[-1])

        # Variables to support integrated gradients
        # (delta is initialised to one, non_exist_edge to zero)
        self.delta = self.add_weight(
            name="ig_delta", shape=(), trainable=False, initializer=initializers.ones()
        )
        self.non_exist_edge = self.add_weight(
            name="ig_non_exist_edge",
            shape=(),
            trainable=False,
            initializer=initializers.zeros(),
        )

        # Initialize weights for each attention head
        for head in range(self.attn_heads):
            # Layer kernel
            kernel = self.add_weight(
                shape=(input_dim, self.units),
                initializer=self.kernel_initializer,
                regularizer=self.kernel_regularizer,
                constraint=self.kernel_constraint,
                name="kernel_{}".format(head),
            )
            self.kernels.append(kernel)

            # Layer bias
            if self.use_bias:
                bias = self.add_weight(
                    shape=(self.units,),
                    initializer=self.bias_initializer,
                    regularizer=self.bias_regularizer,
                    constraint=self.bias_constraint,
                    name="bias_{}".format(head),
                )
                self.biases.append(bias)

            # Attention kernels
            attn_kernel_self = self.add_weight(
                shape=(self.units, 1),
                initializer=self.attn_kernel_initializer,
                regularizer=self.attn_kernel_regularizer,
                constraint=self.attn_kernel_constraint,
                name="attn_kernel_self_{}".format(head),
            )
            attn_kernel_neighs = self.add_weight(
                shape=(self.units, 1),
                initializer=self.attn_kernel_initializer,
                regularizer=self.attn_kernel_regularizer,
                constraint=self.attn_kernel_constraint,
                name="attn_kernel_neigh_{}".format(head),
            )
            self.attn_kernels.append([attn_kernel_self, attn_kernel_neighs])
        self.built = True

    def call(self, inputs):
        """
        Creates the layer as a Keras graph.
        Note that the inputs are tensors with a batch dimension of 1:
        Keras requires this batch dimension, and for full-batch methods
        we only have a single "batch".
        There are two inputs required, the node features,
        and the graph adjacency matrix
        Notes:
            This does not add self loops to the adjacency matrix.
        Args:
            inputs (list): list of inputs with 2 items:
            node features (size 1 x N x F),
            graph adjacency matrix (size 1 x N x N),
            where N is the number of nodes in the graph,
                  F is the dimensionality of node features
        Returns:
            Tensor of size 1 x N x output_dim holding the activated node features.
        Raises:
            ValueError: if the batch dimension of the node features is not 1.
        """
        X = inputs[0]  # Node features (1 x N x F)
        A = inputs[1]  # Adjacency matrix (1 X N x N)
        N = K.int_shape(A)[-1]

        batch_dim, n_nodes, _ = K.int_shape(X)
        if batch_dim != 1:
            raise ValueError("Currently full-batch methods only support a batch dimension of one")
        else:
            # Remove singleton batch dimension
            X = K.squeeze(X, 0)
            A = K.squeeze(A, 0)

        outputs = []
        for head in range(self.attn_heads):
            kernel = self.kernels[head]  # W in the paper (F x F')
            attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (N x F')

            # Compute feature combinations
            # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_2]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
            attn_for_self = K.dot(
                features, attention_kernel[0]
            )  # (N x 1), [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(
                features, attention_kernel[1]
            )  # (N x 1), [a_2]^T [Wh_j]

            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
            dense = attn_for_self + K.transpose(
                attn_for_neighs
            )  # (N x N) via broadcasting

            # Add nonlinearity
            dense = LeakyReLU(alpha=0.2)(dense)

            # Mask values before activation (Vaswani et al., 2017)
            #   YT: this only works for 'binary' A, not for 'weighted' A!
            #   YT: if A does not have self-loops, the node itself will be masked, so A should have self-loops
            #   NOTE(review): the original comment said the diagonal of A is set to 1 "above";
            #   no such step is visible in this layer, so the caller must supply self-loops.
            if not self.saliency_map_support:
                # Entries where A is 0 receive a very large negative offset so that
                # the softmax assigns them (almost) zero attention.
                mask = -10e9 * (1.0 - A)
                dense += mask
                dense = K.softmax(dense)  # (N x N), Eq. 3 of the paper
            else:
                # dense = dense - tf.reduce_max(dense)
                # GAT with support for saliency calculations:
                # an explicit, max-shifted softmax whose terms are weighted by A,
                # `delta` and `non_exist_edge` instead of using the additive mask above.
                W = (self.delta * A) * K.exp(
                    dense - K.max(dense, axis=1, keepdims=True)
                ) * (1 - self.non_exist_edge) + self.non_exist_edge * (
                    A + self.delta * (tf.ones((N, N)) - A) + tf.eye(N)
                ) * K.exp(
                    dense - K.max(dense, axis=1, keepdims=True)
                )
                dense = W / K.sum(W, axis=1, keepdims=True)

            # Apply dropout to features and attention coefficients
            dropout_feat = Dropout(self.in_dropout_rate)(features)  # (N x F')
            dropout_attn = Dropout(self.attn_dropout_rate)(dense)  # (N x N)

            # Linear combination with neighbors' features [YT: see Eq. 4]
            node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            # Add output of attention head to final output
            outputs.append(node_features)

        # Aggregate the heads' output according to the reduction method
        if self.attn_heads_reduction == "concat":
            output = K.concatenate(outputs)  # (N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (N x F')

        # Nonlinear activation function
        output = self.activation(output)

        # Add batch dimension back if we removed it
        if batch_dim == 1:
            output = K.expand_dims(output, 0)

        return output

In [None]:
def buil_MAGNET(n_labels,
                embedding_dim: int,
                sequence_length: int=512, 
                lstm_units: int=64,
                dropout_rates=(0.2, 0.3),
                attention_heads=(4, 2),
                adjacency_matrix=None,
                adjacency_generation='xavier', # 'identity' or 'xavier' or 'glorot'
                feed_text_embeddings=True, # if False, add additional Embedding layer
                text_embeddings_matrix=None, # initialized weights for text Embedding layer
                feed_label_embeddings=True, # if False, add additional Embedding layer
                label_embeddings_matrix=None, # initialized weights for label Embedding layer
                ) -> Model:
    """
    Build the MAGNET model: a BiLSTM sentence encoder scored against label
    representations produced by two graph-attention layers over a trainable
    label adjacency matrix.

    Args:
        n_labels: number of labels (graph nodes).
        embedding_dim: width of the word / label embeddings. The label branch
            works at `embedding_dim // 4`, which must be divisible by
            `attention_heads[0]` so the residual connection has matching shapes.
        sequence_length: number of tokens per input sequence.
        lstm_units: units of each LSTM direction (sentence vector is 2 * lstm_units wide).
        dropout_rates: pair `(embedding dropout, graph-attention dropout)`.
        attention_heads: int (used for both layers) or pair of head counts for
            the first and second graph-attention layer.
        adjacency_matrix: optional initial (n_labels, n_labels) adjacency weights.
        adjacency_generation: initialisation used when `adjacency_matrix` is None.
        feed_text_embeddings: if True the model takes precomputed token embeddings,
            otherwise integer inputs passed through an Embedding layer.
        text_embeddings_matrix: optional initial weights for that Embedding layer;
            silently ignored unless its shape is (sequence_length, embedding_dim).
        feed_label_embeddings: same switch for the label input.
        label_embeddings_matrix: optional initial weights for the label Embedding
            layer; silently ignored unless its shape is (n_labels, embedding_dim).

    Returns:
        A Keras `Model` named 'MAGNET' with inputs `[word_inputs, label_inputs]`
        and one sigmoid score per (sample, label) pair.

    Raises:
        ValueError: on malformed `attention_heads` / `dropout_rates`, or when
            `embedding_dim // 4` is not divisible by `attention_heads[0]`.
    """

    if isinstance(attention_heads, int):
        attention_heads = [attention_heads, attention_heads]
    if not isinstance(attention_heads, (list, tuple)):
        raise ValueError('`attention_heads` must be INT, LIST or TUPLE')
    if len(attention_heads) < 2:
        raise ValueError('`attention_heads` must provide a head count for both graph-attention layers')
    if len(dropout_rates) < 2:
        raise ValueError('`dropout_rates` must provide 2 rates: (embeddings, graph attention)')

    # The first attention layer concatenates its heads and is then added to its own
    # input, so heads * units must reproduce the reduced label dimension exactly.
    label_dim = embedding_dim // 4
    if label_dim % attention_heads[0] != 0:
        raise ValueError(
            f'`embedding_dim // 4` ({label_dim}) must be divisible by '
            f'`attention_heads[0]` ({attention_heads[0]})'
        )

    # 1. Sentence Representation
    if feed_text_embeddings:
        sentence_model = Sequential(name='sentence_model')
        sentence_model.add(Dropout(dropout_rates[0], input_shape=(sequence_length, embedding_dim), name='word_embeddings'))
        word_inputs, word_embeddings = sentence_model.inputs, sentence_model.outputs
    else:
        word_inputs = Input(shape=(sequence_length, ), name='word_inputs')
        embedding_args = {
            'input_dim': sequence_length,
            'output_dim': embedding_dim,
            'name': 'word_embeddings'
        }
        if text_embeddings_matrix is not None \
            and text_embeddings_matrix.shape==(sequence_length, embedding_dim):
            embedding_args['weights'] = [text_embeddings_matrix]
        word_embeddings = Embedding(**embedding_args)(word_inputs)
        word_embeddings = Dropout(dropout_rates[0], name='WE_dropout')(word_embeddings)

    forward_rnn = LSTM(units=lstm_units, return_sequences=True, name='forward_rnn')
    backward_rnn = LSTM(units=lstm_units, return_sequences=True, name='backward_rnn', go_backwards=True)
    bidir_rnn = Bidirectional(layer=forward_rnn, backward_layer=backward_rnn, merge_mode="concat", name='bidir_rnn')
    
    # Mean-pool the BiLSTM states over the time axis -> (batch, 2 * lstm_units)
    sentence_repr = bidir_rnn(word_embeddings)
    sentence_repr = K.mean(sentence_repr, axis=1)
    # print(f"sentence_repr: {K.int_shape(sentence_repr)}")

    # 2. Labels Representation
    if feed_label_embeddings:
        label_inputs = Input(batch_shape=(1, n_labels, embedding_dim), name='label_embeddings')
        label_embeddings = label_inputs
    else:
        label_inputs = Input(batch_shape=(1, n_labels), name='label_inputs')
        embedding_args = {'input_dim': n_labels,
                          'output_dim': embedding_dim,
                          'name': 'label_embeddings'}
        if label_embeddings_matrix is not None \
            and label_embeddings_matrix.shape==(n_labels, embedding_dim):
            embedding_args['weights'] = [label_embeddings_matrix]
        label_embeddings = Embedding(**embedding_args)(label_inputs)
        label_embeddings = Dropout(rate=dropout_rates[0], name='LE_dropout')(label_embeddings)
    label_embeddings = Dense(units=label_dim, name='label_embeddings_reduced')(label_embeddings)
    # print(f"label_inputs: {K.int_shape(label_inputs)}")

    # Trainable label-correlation graph shared by both attention layers
    label_correlation = Adjacency(nodes=n_labels, 
                                  weights=adjacency_matrix,
                                  init_method=adjacency_generation)(label_embeddings)
    # print(f"label_correlation: {K.int_shape(label_correlation)}")

    label_attention = GraphAttention(units=label_dim//attention_heads[0],
                                     activation='tanh',
                                     attn_heads=attention_heads[0],
                                     in_dropout_rate=dropout_rates[1],
                                     attn_dropout_rate=dropout_rates[1], )([label_embeddings, label_correlation])
    # print(f"label_attention: {K.int_shape(label_attention)}")

    label_residual = Add(name='label_residual')([label_attention, label_embeddings])
    # print(f"label_residual: {K.int_shape(label_residual)}")

    # Second layer averages its heads so every label ends up 2 * lstm_units wide,
    # matching the sentence representation.
    label_repr = GraphAttention(units=2*lstm_units,
                                activation='tanh',
                                attn_heads_reduction='average',
                                attn_heads=attention_heads[1],
                                in_dropout_rate=dropout_rates[1],
                                attn_dropout_rate=dropout_rates[1], )([label_residual, label_correlation])

    # Summing over the singleton batch axis drops it -> (n_labels, 2 * lstm_units)
    label_repr = K.sum(label_repr, axis=0, keepdims=False)
    # print(f"label_repr: {K.int_shape(label_repr)}")

    # 3. Prediction: dot product between every sentence and every label vector
    prediction = tf.einsum('Bk,Nk->BN', sentence_repr, label_repr)
    prediction = sigmoid(prediction)
    # print(f"prediction: {K.int_shape(prediction)}")

    return Model(inputs=[word_inputs, label_inputs], outputs=prediction, name='MAGNET')

In [None]:
def weighted_cross_entropy(y_true, y_pred, pos_weight=1.618):
    """Element-wise binary cross-entropy with an extra weight on the positive term.

    Args:
        y_true: target tensor with values in {0, 1}, same shape as ``y_pred``.
        y_pred: predicted probabilities (this loss takes ``log`` of them directly,
            so they must already lie in [0, 1], e.g. sigmoid outputs).
        pos_weight: multiplier applied to the ``y_true * -log(y_pred)`` term.

    Returns:
        Scalar tensor: mean over all elements of the per-element loss, where each
        per-element loss is clipped to the range [0.0, 9.7].
    """
    # Keep probabilities strictly inside (0, 1): a saturated output of exactly
    # 0.0 or 1.0 would otherwise yield log(0) = -inf and then 0 * inf = NaN,
    # which the clip on the loss below cannot repair.
    y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
    # Integer / boolean targets cannot be multiplied with float log-probabilities.
    y_true = K.cast(y_true, y_pred.dtype)

    positive_term = y_true * -K.log(y_pred) * pos_weight
    negative_term = (1 - y_true) * -K.log(1 - y_pred)

    losses = K.clip(positive_term + negative_term, 0.0, 9.7)
    return K.mean(losses)

In [None]:
import gctf
from stellargraph.utils import plot_history

class MAGNET:
    """Training / inference wrapper around the Keras model built by ``buil_MAGNET``.

    The wrapper owns the compiled model (``self.model``) and provides:
      * ``compile``   - compile the model and build the training callbacks,
      * ``finetune``  - staged training that alternates which layers are trainable,
      * ``train``     - plain end-to-end training,
      * ``load_weights`` / ``predict`` - inference helpers.
    """

    def __init__(self, n_labels: int, embedding_dim: int,
                 sequence_length: int = 512, lstm_units: int = 32):
        """Build the underlying model and print its summary.

        Args:
            n_labels: number of labels, forwarded to ``buil_MAGNET``.
            embedding_dim: size of the last axis of the sentence embeddings.
            sequence_length: number of time steps per sentence (default 512,
                the value that used to be hard-coded).
            lstm_units: forwarded to ``buil_MAGNET`` (default 32, the value that
                used to be hard-coded).
        """
        self.embedding_dim = embedding_dim
        # Remembered so that `predict` reshapes to the same length the model was built with.
        self.sequence_length = sequence_length

        # Build model(s)
        print(f"\n\n\nBuilding MAGNET ...\n\n\n")
        self.model = buil_MAGNET(n_labels, embedding_dim=embedding_dim,
                                 sequence_length=sequence_length, lstm_units=lstm_units)
        self.model.summary()

    def _compile_model(self):
        """(Re)compile ``self.model`` with the training optimizer, metrics and loss.

        A fresh optimizer instance is created on every call, so optimizer state
        (e.g. Adam moments) does not carry over a recompile.
        """
        self.model.compile(optimizer=gctf.optimizers.adam(learning_rate=0.001),
                           metrics=["accuracy", TopKCategoricalAccuracy(k=3)],
                           loss=weighted_cross_entropy)

    def _set_trainable(self, layer_names=None):
        """Select which layers are trainable and recompile if anything changed.

        Keras only honours a changed ``trainable`` flag after ``compile()`` is
        called again, so flipping flags on an already-compiled model without
        recompiling leaves the training step unchanged.

        Args:
            layer_names: collection of layer names to train; every other layer
                is frozen. ``None`` means "train every layer".
        """
        changed = False
        for layer in self.model.layers:
            wanted = layer_names is None or layer.name in layer_names
            if layer.trainable != wanted:
                layer.trainable = wanted
                changed = True
        # Skip the recompile when nothing changed, to keep the optimizer state.
        if changed:
            self._compile_model()

    def compile(self, model_saved: str, logs_path: str, schedule_step: int, verbose: int = 1):
        """Compile the model and return the list of training callbacks.

        Args:
            model_saved: checkpoint file path (may contain Keras format fields),
                passed to ``ModelCheckpoint``.
            logs_path: currently unused (only referenced by a disabled
                TensorBoard callback); kept for interface compatibility.
            schedule_step: ``step_size`` given to ``CyclicLR``.
            verbose: currently unused (only referenced by disabled callbacks).

        Returns:
            ``[CyclicLR, ModelCheckpoint]`` - the callbacks to pass to ``fit``.
        """
        # Compile optimizer, loss & metric functions
        print(f"Compiling MAGNET using \n\tgrad-centralized ADAM, \n\ttop-k Accuracy, \n\tweighted Cross-Entropy \n...")
        self._compile_model()

        # Define Callbacks
        # (TensorBoard, ReduceLROnPlateau, LearningRateScheduler and EarlyStopping
        #  are intentionally not enabled.)
        return [
            CyclicLR(mode='exponential', base_lr=1e-7, max_lr=1e-3, step_size=schedule_step),
            ModelCheckpoint(filepath=model_saved, monitor='accuracy', save_weights_only=True, save_best_only=False, save_freq='epoch'),
        ]

    def finetune(self, train_generator, val_generator, model_saved: str, logs_path: str, n_loops: int = 3, verbose: int = 1):
        """Run ``n_loops`` rounds of staged fine-tuning.

        Each round spans 5 epochs split into four stages:
          1. all layers (2 epochs), 2. graph layers only (1 epoch),
          3. word/embedding layers only (1 epoch), 4. all layers (1 epoch).
        Every stage trains on ``len(train_generator) // 3`` steps per epoch.

        Args:
            train_generator: sized batch generator used for training
                (``len()`` is called on it).
            val_generator: sized batch generator used for validation.
            model_saved: checkpoint path pattern, see ``compile``.
            logs_path: forwarded to ``compile``.
            n_loops: number of 5-epoch rounds.
            verbose: forwarded to ``compile``.

        Returns:
            List with one Keras ``History`` per stage (``4 * n_loops`` entries).
        """
        # Compile
        schedule_step = len(train_generator) // 2
        custom_callbacks = self.compile(model_saved, logs_path, schedule_step, verbose)

        # Define part(s) of layers for fine-tuning.
        # NOTE(review): the graph-layer names look auto-generated by Keras; they
        # would get a numeric suffix if the model were built more than once in
        # the same session - hence the warning below.
        graph_layers = ['adjacency', 'graph_attention', 'graph_attention_1']
        word_layers = ['bidir_rnn', 'label_embeddings_reduced']

        known_names = {layer.name for layer in self.model.layers}
        for group in (graph_layers, word_layers):
            missing = sorted(set(group) - known_names)
            if missing:
                print(f"[Warning] layers not found in model (they will not be trained): {missing}")

        # (title, layers to train or None for all, first epoch offset, end epoch offset)
        epochs_per_loop = 5
        stages = [
            ("Step 1: Training ALL layers", None, 0, 2),
            ("Step 2: Training GRAPH layers", graph_layers, 2, 3),
            ("Step 3: Training EMBEDDING layers", word_layers, 3, 4),
            ("Step 4: Training ALL layers", None, 4, 5),
        ]

        ######################################
        #             FINE-TUNING            #
        ######################################

        print(f"[Fine-tuning MAGNET]")
        train_args = {
            'x': train_generator,
            'steps_per_epoch': len(train_generator)//3,
            'validation_data': val_generator,
            'validation_steps': len(val_generator),
            'callbacks': custom_callbacks
        }
        train_histories = []
        for loop in range(n_loops):

            print(f"Training loop {loop+1}")
            first_epoch = loop * epochs_per_loop

            for title, layer_names, start, stop in stages:
                # Recompiles when the trainable set changes so the change takes effect.
                self._set_trainable(layer_names)

                print(f"\t{title} ...")
                # `fit` accepts generators / Sequences; `fit_generator` is deprecated.
                history = self.model.fit(initial_epoch=first_epoch+start,
                                         epochs=first_epoch+stop,
                                         **train_args)
                train_histories.append(history)

        return train_histories

    def train(self, train_generator, val_generator,
                    model_saved: str, logs_path: str,
                    max_epochs: int = 50, verbose: int = 1):
        """Train all layers for ``max_epochs`` epochs over the full generator.

        Args:
            train_generator: sized batch generator used for training.
            val_generator: sized batch generator used for validation.
            model_saved: checkpoint path pattern, see ``compile``.
            logs_path: forwarded to ``compile``.
            max_epochs: total number of epochs to run.
            verbose: forwarded to ``compile``.

        Returns:
            The Keras ``History`` object returned by ``fit``.
        """
        # Compile
        schedule_step = len(train_generator) // 2
        custom_callbacks = self.compile(model_saved, logs_path, schedule_step, verbose)

        # Training (`fit` accepts generators / Sequences; `fit_generator` is deprecated)
        train_history = self.model.fit(x=train_generator,
                                       steps_per_epoch=len(train_generator),
                                       validation_data=val_generator,
                                       validation_steps=len(val_generator),
                                       callbacks=custom_callbacks,
                                       epochs=max_epochs,
                                       initial_epoch=0)
        return train_history

    def load_weights(self, weight_path: str):
        """Load model weights from ``weight_path`` into ``self.model``."""
        self.model.load_weights(weight_path)

    def predict(self, label_embeddings: np.ndarray, sent_embeddings: np.ndarray) -> list:
        """Predict label scores for a single sentence.

        Args:
            label_embeddings: passed to the model unchanged as its second input.
                NOTE(review): presumably it must already carry a leading batch
                axis of size 1 - confirm against the data generator.
            sent_embeddings: embeddings of one sentence; reshaped to
                ``(1, sequence_length, embedding_dim)``.

        Returns:
            The first row of the model output converted to a plain Python list.
        """
        sent_embeddings = np.reshape(sent_embeddings, (1, self.sequence_length, self.embedding_dim))
        preds = self.model.predict([sent_embeddings, label_embeddings]).tolist()
        return preds[0]

In [None]:
# Number of labels = size of axis 1 of `labels_matrix` (defined in an earlier cell).
N_LABELS = labels_matrix.shape[1]
# Embedding width is read from the last axis of the embedder's second-to-last parameter.
# NOTE(review): presumably that parameter is the weight of the embedder's final layer - confirm.
embedding_dim = list(embedder.parameters())[-2].shape[-1]

# Builds the Keras model and prints its summary (see MAGNET.__init__).
model = MAGNET(n_labels=N_LABELS, embedding_dim=embedding_dim)

In [None]:
def noam_scheme(global_step, init_lr, warmup_steps=16):
    """
    Learning-rate schedule with linear warm-up followed by inverse-sqrt decay.

    The (1-based) step ``s = global_step + 1`` is mapped to
    ``init_lr * sqrt(warmup_steps) * min(s * warmup_steps**-1.5, s**-0.5)``,
    which rises linearly, equals ``init_lr`` at ``s == warmup_steps`` and
    decays as ``1 / sqrt(s)`` afterwards.

        global_step: (scalar) current training step, counted from 0
        init_lr: (scalar) learning rate reached at the end of warm-up
        warmup_steps: (scalar) number of warm-up steps
    """
    current_step = tf.cast(global_step+1, dtype=tf.float32, name="global_step")

    # Linear ramp used during warm-up, and inverse-sqrt decay used afterwards;
    # whichever is smaller at the current step applies.
    warmup_branch = current_step*(warmup_steps**-1.5)
    decay_branch = current_step**-0.5
    schedule_factor = tf.minimum(warmup_branch, decay_branch)

    return init_lr * (warmup_steps**0.5) * schedule_factor

# **Train**

In [None]:
# Output locations under the Kaggle working directory.
models_path = '/kaggle/working/models'
logs_path = '/kaggle/working/logs'
pred_dir = '/kaggle/working/predictions'

# exist_ok=True makes creation idempotent without a separate isdir() check
# (no window between "check" and "create" when the cell is re-run).
os.makedirs(models_path, exist_ok=True)
os.makedirs(logs_path, exist_ok=True)
os.makedirs(pred_dir, exist_ok=True)

# Checkpoint file name pattern; the {fields} are filled in by Keras'
# ModelCheckpoint from the epoch number and the logged metrics.
model_format = 'ep={epoch:03d}_acc={accuracy:.3f}_val_acc={val_accuracy:.3f}_topk={top_k_categorical_accuracy:.3f}_val_topk={val_top_k_categorical_accuracy:.3f}.h5'
model_saved = os.path.join(models_path, model_format)

In [None]:
# Staged fine-tuning: 3 rounds over the training split, validated on the
# 'valuating' split. `finetune` returns one History object per training stage.
train_history = model.finetune(
    train_generator=data_generator['training'],
    val_generator=data_generator['valuating'],
    model_saved=model_saved,
    logs_path=logs_path,
    n_loops=3,
    verbose=1,
)

In [None]:
# Plot the training curves. `train_history` is whatever `model.finetune` returned
# above (a list of History objects), handed to stellargraph's helper as-is.
plot_history(train_history)

In [None]:
import tempfile

os.chdir(r'/kaggle/working')
dir_path = '/kaggle/working/'

# Drop an archive left over from a previous run so it is not packed into the new one.
try:
    os.remove(dir_path + "data.zip")
except FileNotFoundError:
    pass

# Build the zip OUTSIDE the directory being archived and move it in afterwards:
# writing data.zip directly into `dir_path` would make the archiver pick up the
# half-written archive itself while walking the tree.
with tempfile.TemporaryDirectory() as staging_dir:
    staged_archive = shutil.make_archive(os.path.join(staging_dir, "data"), 'zip', dir_path)
    shutil.move(staged_archive, dir_path + "data.zip")