# **Import Libraries**

In [None]:
import os
import gc
import sys
import time
import shutil

import random
import pickle

from ast import literal_eval
from tqdm import tqdm as print_progress
from glob import glob

import dask.dataframe as dd
import seaborn as sns
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
os.environ['TF_KERAS'] = '1'

import tensorflow as tf

from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [None]:
from tensorflow.keras.layers import (
    Layer, 
    Input, InputLayer, Embedding, 
    Dropout, Dense, 
    Dot, Concatenate, Average, Add,
    Bidirectional, LSTM,
    Lambda, Reshape
)
from tensorflow.keras.activations import softmax, sigmoid
from tensorflow.keras.initializers import Identity, GlorotNormal
from tensorflow.keras.utils import plot_model

In [None]:
pip install stellargraph

In [None]:
pip install gradient-centralization-tf

# **Load data**

In [None]:
# Load the fitted label encoder and list its class names in encoder order.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted dataset.
with open('../input/hotel-comment/label_encoder.pkl', 'rb') as encoder_file:
    # The context manager closes the handle, which the bare open() call leaked.
    label_encoder = pickle.load(encoder_file)
labels = list(label_encoder.classes_)
len(labels)

# **Data Generator**

In [None]:
# Fixed label-embedding matrix. DataGenerator reads the number of labels from
# shape[-2] and the embedding size from shape[-1], and returns this matrix as
# the second model input of every batch.
labels_matrix = np.load('../input/hotel-comment-valtest-distiluse/labels_embeddings.npy')
labels_matrix.shape  # displayed as the notebook cell output

In [None]:
import sklearn
from ast import literal_eval
from tensorflow.keras.utils import Sequence, to_categorical


class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves batches read from ``sample_*.npz`` files.

    Every sample file must contain two arrays: ``emb`` (the word embeddings of
    one text) and ``mtl`` (its multi-label target). Each batch is returned as
    ``([word_embeddings, labels_fixed], smoothed_targets)``, where
    ``word_embeddings`` is zero-padded (or truncated) to ``max_seq_len`` rows
    and ``labels_fixed`` is the same label-embedding matrix for every batch.

    Args:
        data_root: a directory path, or a list/tuple of directory paths, that
            contain the ``sample_*.npz`` files.
        labels_fixed: label-embedding matrix; ``shape[-2]`` is taken as the
            number of labels and ``shape[-1]`` as the embedding size.
        max_seq_len: number of rows every word-embedding matrix is padded or
            truncated to.
        batch_size: number of samples per batch.
        shuffle: whether to reshuffle the sample order at construction time
            and whenever ``on_epoch_end`` is called.

    Raises:
        TypeError: if ``data_root`` is neither a string nor a list/tuple.
    """

    def __init__(self,
                 data_root,
                 labels_fixed: np.array,
                 max_seq_len: int=512,
                 batch_size: int = 256,
                 shuffle: bool = True):

        self.data_root = data_root
        self.num_labels = labels_fixed.shape[-2]
        self.max_seq_len = max_seq_len
        self.labels_fixed = labels_fixed
        self.embedding_dim = labels_fixed.shape[-1]

        # Collect the files that contain both word-embeddings and multi-labels.
        if isinstance(data_root, str):
            data_dirs = [data_root]
        elif isinstance(data_root, (list, tuple)):
            data_dirs = list(data_root)
        else:
            # Previously this case left `self.files` undefined and failed
            # later with an unrelated AttributeError.
            raise TypeError('`data_root` must be STR, LIST or TUPLE')

        self.files = []
        for data_dir in data_dirs:
            # glob() order depends on the file system; sort each directory so
            # the sample order is reproducible between runs.
            self.files += sorted(glob(os.path.join(data_dir, 'sample_*.npz')))

        self.batch_size = batch_size
        self.shuffle = shuffle

        self.indices = np.arange(len(self.files))
        self.on_epoch_end()

    def __len__(self):
        """
        Denotes the number of batches per epoch (the last one may be partial)
        """
        n_samples = len(self.files)
        return (n_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """
        Generate one batch of data: ([word_embeddings, labels_fixed], targets)
        """
        # Indices of the samples that belong to this batch
        start_index = self.batch_size * index
        end_index = self.batch_size * (index+1)
        batch_indices = self.indices[start_index:end_index]

        # Zero-initialised buffer doubles as the padding
        wb_batch = np.zeros((len(batch_indices), self.max_seq_len, self.embedding_dim))
        mtl_batch = []
        for row, idx in enumerate(batch_indices):
            # Open each archive once and close it again; the original opened
            # it twice per sample and never released the handle.
            with np.load(self.files[idx]) as sample:
                wb = sample['emb']
                mtl = sample['mtl']

            # Truncate over-long sequences instead of failing on the
            # shape-mismatched assignment.
            seq_len = min(wb.shape[0], self.max_seq_len)
            wb_batch[row, :seq_len, :] = wb[:seq_len]

            mtl_batch.append(self.smooth_labels(mtl))
        return [wb_batch, self.labels_fixed], np.array(mtl_batch)

    def smooth_labels(self, labels, factor=0.1):
        """
        Return a label-smoothed copy: labels * (1 - factor) + factor / n_labels
        """
        labels = np.asarray(labels)
        if not np.issubdtype(labels.dtype, np.floating):
            # Integer / boolean targets cannot hold the smoothed values
            labels = labels.astype(np.float64)
        # Out-of-place arithmetic keeps the caller's array untouched
        return labels * (1 - factor) + (factor / labels.shape[-1])

    def on_epoch_end(self):
        """
        Update indices after each epoch
        """
        if self.shuffle:
            self.indices = sklearn.utils.shuffle(self.indices)

In [None]:
# One list of source directories per evaluation split, in the same order as
# the split names zipped with it below.
dset_paths = [
    ['../input/hotel-comment-valtest-distiluse/valuatingdata'],
    ['../input/hotel-comment-testset-distiluse/testingdata'],
]

# Map each split name to an unshuffled generator so that batches are always
# produced in the same order.
data_generator = {}
for dataset, dset_path in zip(('valuating', 'testing'), dset_paths):
    data_generator[dataset] = DataGenerator(
        data_root=dset_path,
        labels_fixed=labels_matrix,
        batch_size=128,
        shuffle=False,
    )

In [None]:
# Number of batches in each split, displayed as (testing, valuating).
len(data_generator['testing']), len(data_generator['valuating'])

In [None]:
# Sanity check on the first testing batch: print the shapes of the word
# embeddings, the fixed label embeddings and the targets, one per line.
X = data_generator['testing'][0]
print(X[0][0].shape, X[0][1].shape, X[1].shape, sep='\n')

# **Load Model**

In [None]:
class Adjacency(Layer):
    """Trainable adjacency matrix exposed as a Keras layer.

    The layer owns a single float32 variable of shape ``(1, nodes, nodes)``
    and returns it from ``call`` regardless of the input it receives.

    Args:
        nodes: number of graph nodes (rows/columns of the matrix).
        weights: optional initial matrix of shape ``(nodes, nodes)``.
        init_method: initializer used when ``weights`` is None; one of
            ``'identity'``, ``'xavier'`` or ``'glorot'`` (case-insensitive).
        **kwargs: forwarded to ``Layer.__init__`` (e.g. ``name``).

    Raises:
        ValueError: if ``weights`` has the wrong shape or ``init_method`` is
            not recognised.
    """

    def __init__(self, nodes=1, weights=None, init_method='identity', **kwargs):
        super(Adjacency, self).__init__(**kwargs)

        self.shape = (1, nodes, nodes)

        if weights is not None:
            # Explicit raise instead of `assert`, which is stripped under -O.
            if tuple(weights.shape) != (nodes, nodes):
                raise ValueError(
                    f'Adjacency Matrix must have shape ({nodes}, {nodes})' + \
                    f' while its shape is {weights.shape}'
                )
            # Cast so that e.g. a float64 NumPy matrix matches the float32
            # dtype requested for the variable below.
            w_init = tf.cast(tf.convert_to_tensor(weights), tf.float32)
        else:
            init_method = init_method.lower()
            if init_method == 'identity':
                initializer = tf.initializers.Identity()
            elif init_method in ('xavier', 'glorot'):
                initializer = tf.initializers.GlorotNormal()
            else:
                # Previously an unknown name fell through and failed on the
                # unbound `initializer` variable.
                raise ValueError(
                    "`init_method` must be 'identity', 'xavier' or 'glorot'"
                    f" while it is '{init_method}'"
                )
            w_init = initializer(shape=(nodes, nodes))

        # Leading axis of size 1 so the matrix looks like a one-item batch.
        self.w = tf.Variable(
            initial_value=tf.expand_dims(w_init, axis=0),
            dtype="float32", trainable=True
        )

    def call(self, inputs):
        # `inputs` is ignored; the layer always emits its own matrix.
        return tf.convert_to_tensor(self.w)

    def compute_output_shape(self, input_shape=None):
        # Keras calls this with `input_shape`; the argument is accepted (and
        # optional, for existing zero-argument callers) but the output shape
        # does not depend on it.
        return self.shape

In [None]:
import gctf
from stellargraph.layer import GraphAttention
from stellargraph.utils import plot_history


def buil_MAGNET(n_labels,
                embedding_dim: int,
                sequence_length: int=512, 
                lstm_units: int=64,
                dropout_rates=[0.3, 0.2],
                attention_heads=[4, 2],
                adjacency_matrix=None,
                adjacency_generation='xavier', # 'identity' or 'xavier' or 'glorot'
                feed_text_embeddings=True, # if False, add additional Embedding layer
                text_embeddings_matrix=None, # initialized weights for text Embedding layer
                feed_label_embeddings=True, # if False, add additional Embedding layer
                label_embeddings_matrix=None, # initialized weights for label Embedding layer
                ) -> Model:
    """Build the two-input Keras model named 'MAGNET'.

    The model scores every label for every input text:

    1. Sentence branch: input dropout (or Embedding + dropout), a
       bidirectional LSTM with concatenated directions, then a mean over
       axis 1 of the LSTM output.
    2. Label branch: label embeddings reduced by a Dense layer to
       ``embedding_dim//2``, a trainable ``Adjacency`` matrix, and two
       ``GraphAttention`` layers with a residual ``Add`` in between.
    3. Prediction: dot product between each sentence vector and each label
       vector (``einsum('Bk,Nk->BN')``) followed by a sigmoid.

    Args:
        n_labels: number of labels (graph nodes).
        embedding_dim: size of the word and label embeddings.
        sequence_length: number of time steps of the text input.
        lstm_units: units per LSTM direction; the second GraphAttention layer
            uses ``2*lstm_units`` units.
        dropout_rates: ``[input dropout, graph-attention dropout]``.
        attention_heads: heads of the first and second GraphAttention layer;
            a single int is used for both.
        adjacency_matrix: optional initial ``(n_labels, n_labels)`` matrix
            passed to ``Adjacency``.
        adjacency_generation: initializer name passed to ``Adjacency`` as
            ``init_method``.
        feed_text_embeddings: if True the text input already holds embeddings
            of shape ``(sequence_length, embedding_dim)``; otherwise an
            Embedding layer is added.
        text_embeddings_matrix: optional initial weights for that Embedding
            layer; silently ignored unless its shape is
            ``(sequence_length, embedding_dim)``.
        feed_label_embeddings: same switch for the label input.
        label_embeddings_matrix: optional initial weights for the label
            Embedding layer; silently ignored unless its shape is
            ``(n_labels, embedding_dim)``.

    Returns:
        A ``Model`` with inputs ``[word_inputs, label_inputs]`` and the
        sigmoid scores as output.

    Raises:
        ValueError: if ``attention_heads`` is not an int, list or tuple.

    NOTE(review): the list defaults are shared between calls; they are only
    read here, never mutated. The function name is misspelled ("buil") but is
    kept because callers use it.
    """

    # A single head count applies to both attention layers.
    if isinstance(attention_heads, int):
        attention_heads = [attention_heads, attention_heads]
    if not isinstance(attention_heads, (list, tuple)):
        raise ValueError('`attention_heads` must be INT, LIST or TUPLE')

    # 1. Sentence Representation
    if feed_text_embeddings:
        # Pre-computed embeddings: a one-layer Sequential applies the input dropout.
        sentence_model = Sequential(name='sentence_model')
        sentence_model.add(Dropout(dropout_rates[0], input_shape=(sequence_length, embedding_dim), name='word_embeddings'))
        # NOTE(review): `.inputs` / `.outputs` look list-valued in Keras, so
        # `word_inputs` would be nested inside the list given to Model below —
        # confirm this is intended.
        word_inputs, word_embeddings = sentence_model.inputs, sentence_model.outputs
    else:
        word_inputs = Input(shape=(sequence_length, ), name='word_inputs')
        # NOTE(review): `input_dim` is set to `sequence_length`; for an
        # Embedding layer this is normally the vocabulary size — TODO confirm.
        embedding_args = {
            'input_dim': sequence_length,
            'output_dim': embedding_dim,
            'name': 'word_embeddings'
        }
        # Initial weights are used only when their shape matches exactly.
        if text_embeddings_matrix is not None \
            and text_embeddings_matrix.shape==(sequence_length, embedding_dim):
            embedding_args['weights'] = [text_embeddings_matrix]
        word_embeddings = Embedding(**embedding_args)(word_inputs)
        word_embeddings = Dropout(dropout_rates[0], name='WE_dropout')(word_embeddings)
    
    forward_rnn = LSTM(units=lstm_units, return_sequences=True, name='forward_rnn')
    backward_rnn = LSTM(units=lstm_units, return_sequences=True, name='backward_rnn', go_backwards=True)
    bidir_rnn = Bidirectional(layer=forward_rnn, backward_layer=backward_rnn, merge_mode="concat", name='bidir_rnn')
    
    sentence_repr = bidir_rnn(word_embeddings)
    # Average the per-step LSTM outputs over axis 1 to get one vector per text.
    sentence_repr = K.mean(sentence_repr, axis=1)
    # print(f"sentence_repr: {K.int_shape(sentence_repr)}")

    # 2. Labels Representation
    if feed_label_embeddings:
        # Fixed batch size of 1: one label matrix is shared by the whole batch of texts.
        label_inputs = Input(batch_shape=(1, n_labels, embedding_dim), name='label_embeddings')
        label_embeddings = label_inputs
    else:
        label_inputs = Input(batch_shape=(1, n_labels), name='label_inputs')
        embedding_args = {'input_dim': n_labels,
                          'output_dim': embedding_dim,
                          'name': 'label_embeddings'}
        # Initial weights are used only when their shape matches exactly.
        if label_embeddings_matrix is not None \
            and label_embeddings_matrix.shape==(n_labels, embedding_dim):
            embedding_args['weights'] = [label_embeddings_matrix]
        label_embeddings = Embedding(**embedding_args)(label_inputs)
        label_embeddings = Dropout(rate=dropout_rates[0], name='LE_dropout')(label_embeddings)
    label_embeddings = Dense(units=embedding_dim//2, name='label_embeddings_reduced')(label_embeddings)
    # print(f"label_inputs: {K.int_shape(label_inputs)}")

    # Adjacency ignores the tensor it is called on and returns its own
    # trainable (1, n_labels, n_labels) matrix.
    label_correlation = Adjacency(nodes=n_labels, 
                                  weights=adjacency_matrix,
                                  init_method=adjacency_generation)(label_embeddings)
    # print(f"label_correlation: {K.int_shape(label_correlation)}")

    # NOTE(review): units * heads must equal embedding_dim//2 for the residual
    # Add below to work — presumably the heads are concatenated by default;
    # confirm against the stellargraph version in use.
    label_attention = GraphAttention(units=embedding_dim//2//attention_heads[0],
                                     activation='tanh',
                                     attn_heads=attention_heads[0],
                                     in_dropout_rate=dropout_rates[1],
                                     attn_dropout_rate=dropout_rates[1], )([label_embeddings, label_correlation])
    # print(f"label_attention: {K.int_shape(label_attention)}")

    label_residual = Add(name='label_residual')([label_attention, label_embeddings])
    # print(f"label_residual: {K.int_shape(label_residual)}")

    # 2*lstm_units units so the label vectors share the last dimension `k`
    # with the concatenated BiLSTM output in the einsum below.
    label_repr = GraphAttention(units=2*lstm_units,
                                activation='tanh',
                                attn_heads_reduction='average',
                                attn_heads=attention_heads[1],
                                in_dropout_rate=dropout_rates[1],
                                attn_dropout_rate=dropout_rates[1], )([label_residual, label_correlation])

    # Summing over axis 0 drops the leading axis (presumably the size-1 batch axis).
    label_repr = K.sum(label_repr, axis=0, keepdims=False)
    # print(f"label_repr: {K.int_shape(label_repr)}")

    # 3. Prediction
    # Dot product of every sentence vector (B) with every label vector (N).
    prediction = tf.einsum('Bk,Nk->BN', sentence_repr, label_repr)
    prediction = sigmoid(prediction)
    # print(f"prediction: {K.int_shape(prediction)}")

    return Model(inputs=[word_inputs, label_inputs], outputs=prediction, name='MAGNET')

In [None]:
def weighted_cross_entropy(y_true, y_pred, pos_weight=1.69):
    """Binary cross-entropy whose positive term is scaled by ``pos_weight``.

    Args:
        y_true: target tensor with values in [0, 1].
        y_pred: predicted probabilities (the model output after its sigmoid).
        pos_weight: multiplier applied to the positive-class term.

    Returns:
        The mean over all elements of the per-element loss, each clipped to
        the range [0, 11.27].
    """
    # Keep probabilities strictly inside (0, 1): log(0) is -inf, and
    # multiplying that by a hard 0 target yields NaN, which the clip on the
    # losses below cannot repair.
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1.0 - eps)

    positive_term = y_true * -K.log(y_pred) * pos_weight
    negative_term = (1-y_true) * -K.log(1-y_pred)
    losses = K.clip(positive_term + negative_term, 0.0, 11.27)
    return K.mean(losses)

In [None]:
import gctf
from stellargraph.utils import plot_history


class MAGNET:
    """Convenience wrapper around the Keras model built by ``buil_MAGNET``.

    Builds the model for 512-step inputs with 32 LSTM units per direction,
    optionally restores a checkpoint, and offers compile / train / predict
    helpers.

    Args:
        n_labels: number of labels the model scores.
        embedding_dim: size of the word and label embeddings.
        model_ckpt: optional path of a weights file to restore. A failed load
            is reported but does not abort construction.
    """

    def __init__(self, n_labels: int, embedding_dim: int, model_ckpt=None):

        self.embedding_dim = embedding_dim
        # Single source of truth for the sequence length used by both the
        # model definition and `predict`.
        self.sequence_length = 512

        # Build model(s)
        print(f"\n\n\nBuilding MAGNET ...\n\n\n")
        self.model = buil_MAGNET(n_labels,
                                 embedding_dim=embedding_dim,
                                 sequence_length=self.sequence_length,
                                 lstm_units=32)
        self.model.summary()

        # Load weights (best effort)
        if model_ckpt:
            print(f"Try to load checkpoint @{model_ckpt} ...")
            try:
                self.model.load_weights(model_ckpt)
            except Exception as err:
                # A bare `except:` also swallowed KeyboardInterrupt and hid
                # the reason; report it so random weights are not mistaken
                # for a restored model.
                print(f"\t==> Loading Fail !!!")
                print(f"\t    {type(err).__name__}: {err}")
                print("\t    The model keeps its freshly initialized weights.")

    def compile(self, model_saved: str, logs_path: str, schedule_step: int, verbose: int=1):
        """Compile the model and return the training callbacks.

        Args:
            model_saved: file path (pattern) handed to ``ModelCheckpoint``.
            logs_path: currently unused; kept for interface compatibility.
            schedule_step: currently unused; kept for interface compatibility.
            verbose: currently unused; kept for interface compatibility.

        Returns:
            A list with one ``ModelCheckpoint`` that saves the weights after
            every epoch.
        """
        # Compile optimizer, loss & metric functions
        print(f"Compiling MAGNET using \n\tgrad-centralized ADAM, \n\ttop-k Accuracy, \n\tweighted Cross-Entropy \n...")
        self.model.compile(optimizer=gctf.optimizers.adam(learning_rate=0.00169),
                           metrics=["accuracy", TopKCategoricalAccuracy(k=3)],
                           loss=weighted_cross_entropy)

        # Define Callbacks
        return [
            ModelCheckpoint(filepath=model_saved, monitor='accuracy', save_weights_only=True, save_best_only=False, save_freq='epoch'),
        ]

    def train(self, train_generator, val_generator,
                    model_saved: str, logs_path: str,
                    max_epochs: int=50, verbose: int=1):
        """Compile the model and fit it on the given generators.

        Args:
            train_generator: ``Sequence`` yielding training batches.
            val_generator: ``Sequence`` yielding validation batches.
            model_saved: checkpoint path (pattern) for ``ModelCheckpoint``.
            logs_path: forwarded to ``compile`` (unused there).
            max_epochs: number of epochs to train.
            verbose: Keras verbosity level for ``fit``.

        Returns:
            The ``History`` object returned by ``Model.fit``.
        """
        # Compile
        schedule_step = len(train_generator) // 2
        custom_callbacks = self.compile(model_saved, logs_path, schedule_step, verbose)

        # Training. `fit` accepts Sequence objects directly; `fit_generator`
        # is deprecated.
        train_history = self.model.fit(train_generator,
                                       steps_per_epoch=len(train_generator),
                                       validation_data=val_generator,
                                       validation_steps=len(val_generator),
                                       callbacks=custom_callbacks,
                                       epochs=max_epochs,
                                       initial_epoch=0,
                                       verbose=verbose)
        return train_history

    def load_weights(self, weight_path: str):
        """Restore the model weights from ``weight_path`` (errors propagate)."""
        self.model.load_weights(weight_path)

    def predict(self, sent_embeddings: np.array, label_embeddings: np.array):
        """Score every label for every text in inference mode.

        Args:
            sent_embeddings: word embeddings; reshaped to
                ``(-1, sequence_length, embedding_dim)``.
            label_embeddings: label-embedding input of the model.

        Returns:
            The tensor returned by calling the model with ``training=False``.
        """
        sent_embeddings = np.reshape(sent_embeddings, (-1, self.sequence_length, self.embedding_dim))
        preds = self.model([sent_embeddings, label_embeddings], training=False)
        return preds

In [None]:
# The label-embedding matrix determines both the number of labels (second to
# last axis) and the embedding width (last axis).
N_LABELS, embedding_dim = labels_matrix.shape[-2], labels_matrix.shape[-1]

# Build the model and restore the chosen checkpoint.
model = MAGNET(
    n_labels=N_LABELS,
    embedding_dim=embedding_dim,
    model_ckpt='../input/magnet-distiluse-checkpoints/weightedCE/ep027_acc0.388_val_acc0.370_topk0.753_val_topk0.793.h5',
)

# **Predict**

In [None]:
# Output locations for the evaluation artefacts.
eval_dir = '/kaggle/working/evaluations'
pred_dir = '/kaggle/working/predictions'
cfs_mtrx_dir = os.path.join(eval_dir, 'confusion_matrix')

# exist_ok=True replaces the separate isdir() check: one call per directory
# and no window between checking and creating.
for output_dir in (eval_dir, pred_dir, cfs_mtrx_dir):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Run the model over every testing batch and collect the scores together with
# the matching ground-truth labels.
test_generator = data_generator['testing']
preds, ys_true = [], []
for i in print_progress(range(len(test_generator))):
    # The batch targets get their own name: the module-level `labels` holds
    # the class-name list taken from the label encoder and must not be
    # overwritten by this loop.
    Xs, y_batch = test_generator[i]
    # The generator returns label-smoothed targets; thresholding at 0.5 turns
    # them back into hard 0/1 labels (assuming the stored targets are 0/1).
    y_batch = np.where(y_batch>0.5, 1., 0.)
    # Xs is [word_embeddings, label_embeddings].
    Y = model.predict(*Xs).numpy()
    preds.extend(Y)
    ys_true.extend(y_batch)

# **Evaluate**

In [None]:
import string
from sklearn.metrics import f1_score, multilabel_confusion_matrix


def Precision_Recall_at_K(y_true: np.array, y_pred: np.array, K: int=-1, threshold: float=0.5):
    """Precision, recall and relevance at every rank of one sample.

    Labels are ranked by descending score; entry ``k`` of each returned array
    describes the top ``k+1`` labels.

    Args:
        y_true: multi-hot vector; entries equal to 1 are the relevant labels.
        y_pred: one score per label.
        K: number of leading ranks to return. ``None`` or a negative value
            (the default) returns every rank.
        threshold: accepted for interface compatibility. It has never
            influenced the ranking (the thresholded scores were computed and
            immediately discarded), so it is ignored here.

    Returns:
        ``(precisions, recalls, relevances)`` as float arrays truncated to the
        first ``K`` ranks. Recall is all zeros when ``y_true`` has no
        relevant label.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Rank label indices from the highest to the lowest score
    ranking = np.argsort(y_pred)[::-1]

    # Convert multi-hot into categories
    positives = np.where(y_true==1)[0]

    # relevances[k] is 1 when the label at rank k is a true label
    relevances = np.isin(ranking, positives).astype(float)
    n_corrects = np.cumsum(relevances)

    precisions = n_corrects / np.arange(1, len(ranking) + 1)
    if len(positives) > 0:
        recalls = n_corrects / len(positives)
    else:
        # No relevant label: recall is defined as 0 instead of dividing by 0
        recalls = np.zeros(len(ranking))

    # A plain [:K] with the default K=-1 silently dropped the last rank
    if K is None or K < 0:
        K = len(ranking)
    return precisions[:K], recalls[:K], relevances[:K]


def AP_at_K(y_true: np.array, y_pred: np.array, K: int=3, post_norm: bool=False):
    """Average precision of one sample over its top-``K`` ranked labels.

    Args:
        y_true: multi-hot vector; entries equal to 1 are the relevant labels.
        y_pred: one score per label.
        K: number of top-ranked labels taken into account.
        post_norm: if True, return the un-normalised sum of precision values
            at the relevant ranks so the caller can normalise later;
            otherwise divide that sum by ``min(K, number of relevant labels)``.

    Returns:
        The (optionally un-normalised) average precision; ``0.0`` when the
        sample has no relevant label.
    """
    L = len(np.where(np.asarray(y_true)==1)[0])
    if L == 0:
        # Nothing can be retrieved, and normalising would divide by zero
        return 0.0

    # Restrict the ranking to the top K labels; without passing K the sum ran
    # over the whole ranking and AP@K could exceed 1.
    precision, _, relevance = Precision_Recall_at_K(y_true, y_pred, K=K)
    precision_sum = np.dot(precision, relevance)
    if post_norm:
        return precision_sum

    return precision_sum / min(K,L)


def MAP_at_K(ys_true: list, ys_pred: list, K=3):
    """Mean average precision over a batch of samples.

    The un-normalised AP of every sample is summed and divided by
    ``B * min(L, K)``, where ``B`` is the batch size and ``L`` the largest
    number of relevant labels found in any sample of the batch.

    Args:
        ys_true: one multi-hot vector per sample.
        ys_pred: one score vector per sample, aligned with ``ys_true``.
        K: number of top-ranked labels taken into account.

    Returns:
        The batch MAP@K; ``0.0`` for an empty batch, for a batch without any
        relevant label, or for ``K <= 0``.
    """
    B = len(ys_true)
    if B == 0:
        # np.max over an empty list raises; an empty batch scores 0
        return 0.0

    L = max(int(np.count_nonzero(np.asarray(y_true)==1)) for y_true in ys_true)
    normaliser = B * min(L, K)
    if normaliser <= 0:
        # No relevant label anywhere (or K <= 0): avoid dividing by zero
        return 0.0

    # Mean Average Precision at batch's top-k predictions
    APs = [AP_at_K(y_true, y_pred, K, post_norm=True)
           for y_true, y_pred in zip(ys_true, ys_pred)]
    return np.sum(APs) / normaliser


def TP_multilabel(ys_true: np.array, ys_pred: np.array, threshold=0.5):
    """Count the true positives of every class.

    A sample counts for a class when its score is ``>=`` the class threshold
    and its true label equals 1.

    Args:
        ys_true: 2-D array (samples x classes) of 0/1 labels.
        ys_pred: 2-D array (samples x classes) of scores.
        threshold: one scalar for all classes, or one threshold per class.

    Returns:
        A list with one count per class.

    Raises:
        ValueError: if the number of thresholds differs from the number of
            classes.
    """
    ys_true = np.asarray(ys_true)
    ys_pred = np.asarray(ys_pred)
    n_classes = ys_true.shape[1]

    # Any scalar (float, int, NumPy scalar) applies to every class; a
    # float-only check would make e.g. `threshold=1` fail on len().
    if np.ndim(threshold) == 0:
        threshold = [threshold] * n_classes
    if len(threshold) != n_classes:
        raise ValueError('Number of thresholds doesnt match number of classes')
    thresholds = np.asarray(threshold)

    # Column-wise comparison; the thresholds broadcast over the sample axis
    hits = np.logical_and(ys_pred[:, :n_classes]>=thresholds, ys_true==1)
    return list(np.sum(hits, axis=0))


def TN_multilabel(ys_true: np.array, ys_pred: np.array, threshold=0.5):
    """Count the true negatives of every class.

    A sample counts for a class when its score is ``<`` the class threshold
    and its true label equals 0.

    Args:
        ys_true: 2-D array (samples x classes) of 0/1 labels.
        ys_pred: 2-D array (samples x classes) of scores.
        threshold: one scalar for all classes, or one threshold per class.

    Returns:
        A list with one count per class.

    Raises:
        ValueError: if the number of thresholds differs from the number of
            classes.
    """
    ys_true = np.asarray(ys_true)
    ys_pred = np.asarray(ys_pred)
    n_classes = ys_true.shape[1]

    # Any scalar (float, int, NumPy scalar) applies to every class; a
    # float-only check would make e.g. `threshold=1` fail on len().
    if np.ndim(threshold) == 0:
        threshold = [threshold] * n_classes
    if len(threshold) != n_classes:
        raise ValueError('Number of thresholds doesnt match number of classes')
    thresholds = np.asarray(threshold)

    # Column-wise comparison; the thresholds broadcast over the sample axis
    hits = np.logical_and(ys_pred[:, :n_classes]<thresholds, ys_true==0)
    return list(np.sum(hits, axis=0))


def FP_multilabel(ys_true: np.array, ys_pred: np.array, threshold=0.5):
    """Count the false positives of every class.

    A sample counts for a class when its score is ``>=`` the class threshold
    and its true label equals 0.

    Args:
        ys_true: 2-D array (samples x classes) of 0/1 labels.
        ys_pred: 2-D array (samples x classes) of scores.
        threshold: one scalar for all classes, or one threshold per class.

    Returns:
        A list with one count per class.

    Raises:
        ValueError: if the number of thresholds differs from the number of
            classes.
    """
    ys_true = np.asarray(ys_true)
    ys_pred = np.asarray(ys_pred)
    n_classes = ys_true.shape[1]

    # Any scalar (float, int, NumPy scalar) applies to every class; a
    # float-only check would make e.g. `threshold=1` fail on len().
    if np.ndim(threshold) == 0:
        threshold = [threshold] * n_classes
    if len(threshold) != n_classes:
        raise ValueError('Number of thresholds doesnt match number of classes')
    thresholds = np.asarray(threshold)

    # Column-wise comparison; the thresholds broadcast over the sample axis
    hits = np.logical_and(ys_pred[:, :n_classes]>=thresholds, ys_true==0)
    return list(np.sum(hits, axis=0))


def FN_multilabel(ys_true: np.array, ys_pred: np.array, threshold=0.5):
    """Count the false negatives of every class.

    A sample counts for a class when its score is ``<`` the class threshold
    and its true label equals 1.

    Args:
        ys_true: 2-D array (samples x classes) of 0/1 labels.
        ys_pred: 2-D array (samples x classes) of scores.
        threshold: one scalar for all classes, or one threshold per class.

    Returns:
        A list with one count per class.

    Raises:
        ValueError: if the number of thresholds differs from the number of
            classes.
    """
    ys_true = np.asarray(ys_true)
    ys_pred = np.asarray(ys_pred)
    n_classes = ys_true.shape[1]

    # Any scalar (float, int, NumPy scalar) applies to every class; a
    # float-only check would make e.g. `threshold=1` fail on len().
    if np.ndim(threshold) == 0:
        threshold = [threshold] * n_classes
    if len(threshold) != n_classes:
        raise ValueError('Number of thresholds doesnt match number of classes')
    thresholds = np.asarray(threshold)

    # Column-wise comparison; the thresholds broadcast over the sample axis
    hits = np.logical_and(ys_pred[:, :n_classes]<thresholds, ys_true==1)
    return list(np.sum(hits, axis=0))


def hamming_loss(y_true, y_pred, mode='multilabel'):
    """Per-sample Hamming loss computed along the last axis.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, same shape as ``y_true``.
        mode: ``'multiclass'`` or ``'multilabel'`` (case-insensitive).

    Returns:
        A float32 tensor with one value per sample:
        multilabel -- the fraction of positions where ``y_true`` and
        ``y_pred`` differ; multiclass -- ``1 - count_nonzero(y_true * y_pred)``.

    Raises:
        TypeError: if ``mode`` is not one of the two supported values.
    """
    mode = mode.lower()
    if mode not in ('multiclass', 'multilabel'):
        # TypeError is kept for existing callers; the message now lists the
        # values that are actually accepted.
        raise TypeError('`mode` must be: [multiclass, multilabel]')

    if mode == 'multiclass':
        # Non-zero products mark positions where both arrays are non-zero
        non_zero_values = tf.cast(tf.math.count_nonzero(y_true*y_pred, axis=-1), tf.float32)
        return 1.0 - non_zero_values

    # Non-zero differences mark the mismatching positions
    non_zero_values = tf.cast(tf.math.count_nonzero(y_true-y_pred, axis=-1), tf.float32)
    return non_zero_values / y_true.shape[-1]

In [None]:
# Load the raw testing rows and keep as many of them as there are predictions.
fn = '../input/hotel-comment/testing_data*.csv'
dataset = dd.read_csv(fn).compute().loc[:len(preds)-1, :]
dataset = dataset[['Comment', 'Sections_Name', 'label_encoder']]

ys_true, ys_pred = np.array(ys_true), np.array(preds)
print(ys_true.shape, ys_pred.shape)
# The loop variable must not be called `K`: at module level that name is the
# Keras backend alias used by weighted_cross_entropy and buil_MAGNET, and
# rebinding it to an int breaks those functions when their cells are re-run.
for top_k in range(1, 4):
    print(f"MAP@{top_k} = {MAP_at_K(ys_true, ys_pred, top_k)}")

print('Calculating Confusion Matrix ...')
TP_by_classes = TP_multilabel(ys_true, ys_pred)
TN_by_classes = TN_multilabel(ys_true, ys_pred)
FP_by_classes = FP_multilabel(ys_true, ys_pred)
FN_by_classes = FN_multilabel(ys_true, ys_pred)

print('\n\n\nConfusion Matrix\n')
for clss, (TP, TN, FP, FN) in enumerate(zip(TP_by_classes, TN_by_classes, FP_by_classes, FN_by_classes)):
    # Class name without punctuation so it can be used as a file name
    clss_name = label_encoder.inverse_transform([clss])[0]
    clss_name = clss_name.translate(str.maketrans('', '', string.punctuation))
    print(f'\tClass {clss_name}\n')
    # The FN field previously printed the FP count
    print(f'\t\tTP = {TP:05d}\tFP = {FP:05d}\n\t\tFN = {FN:05d}\tTN = {TN:05d}\n')

    # One heat-map image per class
    df_cm = pd.DataFrame([[TP, FP], [FN, TN]], ['True', 'False'], ['True', 'False'])
    sns.set(font_scale=1.4) # for label size
    sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size
    plt.savefig(os.path.join(cfs_mtrx_dir, f'{clss_name}.png'))
    plt.clf()  # start the next class on an empty figure

# Calculate micro-metrics (counts are pooled over all classes)
print('Calculating other metrics ...')
print('\n\n\nMicro Metrics\n')

Precision_micro = np.sum(TP_by_classes) / (np.sum(TP_by_classes) + np.sum(FP_by_classes))
print(f'\tPrecision_micro = {Precision_micro}\n')

Recall_micro = np.sum(TP_by_classes) / (np.sum(TP_by_classes) + np.sum(FN_by_classes))
print(f'\tRecall_micro = {Recall_micro}\n')

F1_micro = 2*np.sum(TP_by_classes) / (2*np.sum(TP_by_classes) + np.sum(FP_by_classes) + np.sum(FN_by_classes))
print(f'\tF1_micro = {F1_micro}\n')

# Calculate Hamming loss on the scores binarised at 0.5
ys_pred = np.where(ys_pred<0.5, 0, 1)
loss = hamming_loss(ys_true, ys_pred)
loss_ = np.sum(loss) / len(loss)
print(f'\n\n\nhamming_loss = {loss_}\n')

In [None]:
# os.chdir(r'/kaggle/working')
# dir_path = '/kaggle/working/'
# shutil.make_archive(dir_path+"data", 'zip', dir_path)