In [None]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import re,os
import tensorflow
import tensorflow.compat.v1 as tf
from tqdm import tqdm

# Switch TensorFlow to TF1-style behaviour; the code below relies on
# graph-mode APIs such as tf.placeholder and tf.Session.
tf.disable_v2_behavior()
# Print the TensorFlow version and whether a GPU is reported as available.
print(tf.__version__)
print(tf.test.is_gpu_available())

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import numpy as np
from skimage.util import view_as_windows
import warnings, logging
import tensorflow.compat.v1 as tf
from collections import OrderedDict
import numpy as np


def make_batches(size, batch_size):
    """
    Split ``size`` items into consecutive index ranges of at most ``batch_size`` items
    :param size: total number of items
    :param batch_size: maximum number of items per batch
    :return: list of ``(start, stop)`` tuples; the last range is truncated at ``size``
    """
    total_batches = (size + batch_size - 1) // batch_size  # ceiling division
    bounds = []
    for index in range(total_batches):
        start = index * batch_size
        stop = min(size, start + batch_size)
        bounds.append((start, stop))
    return bounds


def to_list(x, allow_tuple=False):
    """
    Normalize a value to a list
    :param x: any value
    :param allow_tuple: when True, a tuple is converted to a list instead of being wrapped
    :return: ``x`` itself if it is already a list, otherwise a new list
    """
    if isinstance(x, list):
        return x
    unpack_tuple = allow_tuple and isinstance(x, tuple)
    return list(x) if unpack_tuple else [x]


def unpack_singleton(x):
    """
    Convert a one-element sequence to a numpy array, leave anything else untouched
    :param x: sized sequence
    :return: ``np.array(x)`` when ``x`` has exactly one element, otherwise ``x`` itself
    """
    return np.array(x) if len(x) == 1 else x


def slice_arrays(arrays, start=None, stop=None):
    """
    Slice one sliceable object, or every element of a list, with ``[start:stop]``
    :param arrays: None, a list of sliceable objects (None entries allowed) or one sliceable object
    :param start: slice start index
    :param stop: slice stop index
    :return: ``[None]`` for None input, a list of slices for a list, a single slice otherwise
    """
    if arrays is None:
        return [None]
    if not isinstance(arrays, list):
        return arrays[start:stop]
    sliced = []
    for item in arrays:
        # None entries are passed through unchanged
        sliced.append(item if item is None else item[start:stop])
    return sliced


def placeholder_from_data(numpy_array):
    """
    Create a float placeholder shaped like the given array, with a free first dimension
    :param numpy_array: object exposing ``.shape``, or None
    :return: placeholder of shape ``[None] + shape[1:]``, or None when the input is None
    """
    if numpy_array is not None:
        trailing_dims = list(numpy_array.shape[1:])
        return tf.placeholder('float', [None] + trailing_dims)
    return None

# Op types whose gradient DeepExplain.get_override_map() remaps to 'DeepExplainGrad'.
# activation() and original_grad() emit a warning for any type outside this list.
SUPPORTED_ACTIVATIONS = [
    'Relu', 'Elu', 'Sigmoid', 'Tanh', 'Softplus'
]

# Op types that DeepExplain._check_ops() warns about when found in the graph.
UNSUPPORTED_ACTIVATIONS = [
    'CRelu', 'Relu6', 'Softsign'
]

# Module-level state written by DeepExplain.get_explainer():
# the attribution class currently being constructed (None otherwise) ...
_ENABLED_METHOD_CLASS = None
# ... and a flag that get_explainer() resets to 0 and later tests to decide whether
# to warn that the original gradient was used (the code that sets it is not shown here).
_GRAD_OVERRIDE_CHECKFLAG = 0


# -----------------------------------------------------------------------------
# UTILITY FUNCTIONS
# -----------------------------------------------------------------------------


def activation(type):
    """
    Look up the Tensorflow activation function matching an op type
    :param type: string, op type such as 'Relu'; resolved as the lower-cased attribute of tf.nn
    :return: callable found on tf.nn (a warning is emitted first if the type is not supported)
    """
    is_supported = type in SUPPORTED_ACTIVATIONS
    if not is_supported:
        warnings.warn('Activation function (%s) not supported' % type)
    return getattr(tf.nn, type.lower())


def original_grad(op, grad):
    """
    Return original Tensorflow gradient for an op
    :param op: op
    :param grad: Tensor
    :return: Tensor
    """
    # Local import: these internal TensorFlow modules hold the gradient functions
    # (named '_<OpType>Grad') and are not imported at the top of this file.
    from tensorflow.python.ops import math_grad, nn_grad

    if op.type not in SUPPORTED_ACTIVATIONS:
        warnings.warn('Activation function (%s) not supported' % op.type)
    opname = '_%sGrad' % op.type
    # The gradient function lives in nn_grad for some activations and in math_grad
    # for others, so try nn_grad first and fall back to math_grad.
    f = getattr(nn_grad, opname, None)
    if f is None:
        f = getattr(math_grad, opname)
    return f(op, grad)


# -----------------------------------------------------------------------------
# ATTRIBUTION METHODS BASE CLASSES
# -----------------------------------------------------------------------------


class AttributionMethod(object):
    """
    Attribution method base class.

    Stores the target tensor T (multiplied by a weight placeholder Y), the input tensor(s) X
    and the session, and provides helpers to validate input data and to evaluate tensors,
    optionally in batches. Subclasses override explain_symbolic(), run() and
    _init_references() as needed.
    """
    def __init__(self, T, X, session, keras_learning_phase=None):
        """
        :param T: target Tensor
        :param X: input Tensor, or list/tuple of input Tensors
        :param session: session used to evaluate tensors
        :param keras_learning_phase: optional placeholder that is fed with 0 on every run
        """
        self.T = T  # target Tensor
        self.X = X  # input Tensor
        self.Y_shape = [None,] + T.get_shape().as_list()[1:]
        # Most often T contains multiple output units. In this case, it is often necessary to select
        # a single unit to compute contributions for. This can be achieved passing 'ys' as weight for the output Tensor.
        self.Y = tf.placeholder(tf.float32, self.Y_shape)
        # placeholder_from_data(ys) if ys is not None else 1.0  # Tensor that represents weights for T
        self.T = self.T * self.Y
        self.symbolic_attribution = None
        self.session = session
        self.keras_learning_phase = keras_learning_phase
        self.has_multiple_inputs = type(self.X) is list or type(self.X) is tuple
        logging.info('Model with multiple inputs: %s' % self.has_multiple_inputs)

        # Set baseline
        # TODO: now this sets a baseline also for those methods that does not require it
        self._set_check_baseline()

        # References
        self._init_references()

        # Create symbolic explanation once during construction (affects only gradient-based methods)
        self.explain_symbolic()

    def explain_symbolic(self):
        """Hook: build the symbolic attribution. The base implementation returns None."""
        return None

    def run(self, xs, ys=None, batch_size=None):
        """Hook: compute attributions for the data xs. The base implementation does nothing."""
        pass

    def _init_references(self):
        """Hook called during construction, after the baseline is set. Does nothing here."""
        pass

    def _check_input_compatibility(self, xs, ys=None, batch_size=None):
        """
        Validate that xs, ys and batch_size are mutually compatible
        :param xs: input data (a list with one entry per input when the model has multiple inputs)
        :param ys: optional weights for the target tensor, one row per sample
        :param batch_size: optional batch size; checked against statically known first dimensions
        :raises RuntimeError: if the batch sizes do not match
        """
        if ys is not None:
            if not self.has_multiple_inputs and len(xs) != len(ys):
                raise RuntimeError('When provided, ys must have the same batch size as xs (xs has batch size {} and ys {})'.format(len(xs), len(ys)))
            # A single mismatching input is enough to reject the data
            elif self.has_multiple_inputs and any(len(i) != len(ys) for i in xs):
                raise RuntimeError('When provided, ys must have the same batch size as all elements of xs')
        if batch_size is not None and batch_size > 0:
            # Compare by value: identity ('is not') is unreliable for integers
            if self.T.shape[0].value is not None and self.T.shape[0].value != batch_size:
                raise RuntimeError('When using batch evaluation, the first dimension of the target tensor '
                                   'must be compatible with the batch size. Found %s instead' % self.T.shape[0].value)
            if isinstance(self.X, (list, tuple)):
                for x in self.X:
                    if x.shape[0].value is not None and x.shape[0].value != batch_size:
                        raise RuntimeError('When using batch evaluation, the first dimension of the input tensor '
                                           'must be compatible with the batch size. Found %s instead' % x.shape[
                                               0].value)
            else:
                if self.X.shape[0].value is not None and self.X.shape[0].value != batch_size:
                    raise RuntimeError('When using batch evaluation, the first dimension of the input tensor '
                                       'must be compatible with the batch size. Found %s instead' % self.X.shape[0].value)

    def _session_run_batch(self, T, xs, ys=None):
        """
        Evaluate T in the session for a single batch
        :param T: tensor (or list of tensors) to evaluate
        :param xs: input data fed to self.X
        :param ys: optional weights fed to self.Y; ones are fed when omitted
        :return: result of session.run
        """
        feed_dict = {}
        if self.has_multiple_inputs:
            for k, v in zip(self.X, xs):
                feed_dict[k] = v
        else:
            feed_dict[self.X] = xs

        # If ys is not passed, produce a vector of ones that will be broadcasted to all batch samples
        feed_dict[self.Y] = ys if ys is not None else np.ones([1,] + self.Y_shape[1:])

        if self.keras_learning_phase is not None:
            feed_dict[self.keras_learning_phase] = 0
        return self.session.run(T, feed_dict)

    def _session_run(self, T, xs, ys=None, batch_size=None):
        """
        Evaluate T on xs, splitting the data in batches when batch_size is set and smaller
        than the number of samples
        :param T: tensor (or list of tensors) to evaluate
        :param xs: input data
        :param ys: optional weights for the target tensor
        :param batch_size: optional batch size; None or <= 0 evaluates everything at once
        :return: session output; in the batched case the per-batch outputs are stitched together
                 and passed through unpack_singleton
        :raises RuntimeError: if multi-input data is inconsistent
        """
        num_samples = len(xs)
        if self.has_multiple_inputs is True:
            num_samples = len(xs[0])
            if len(xs) != len(self.X):
                raise RuntimeError('List of input tensors and input data have different lengths (%s and %s)'
                                   % (str(len(xs)), str(len(self.X))))
            if batch_size is not None:
                for xi in xs:
                    if len(xi) != num_samples:
                        raise RuntimeError('Evaluation in batches requires all inputs to have '
                                           'the same number of samples')

        if batch_size is None or batch_size <= 0 or num_samples <= batch_size:
            return self._session_run_batch(T, xs, ys)
        else:
            outs = []
            batches = make_batches(num_samples, batch_size)
            for batch_index, (batch_start, batch_end) in enumerate(batches):
                # Get a batch from data
                xs_batch = slice_arrays(xs, batch_start, batch_end)
                # If the target tensor has one entry for each sample, we need to batch it as well
                ys_batch = None
                if ys is not None:
                    ys_batch = slice_arrays(ys, batch_start, batch_end)
                batch_outs = self._session_run_batch(T, xs_batch, ys_batch)
                batch_outs = to_list(batch_outs)
                if batch_index == 0:
                    # Pre-allocate the results arrays.
                    for batch_out in batch_outs:
                        shape = (num_samples,) + batch_out.shape[1:]
                        outs.append(np.zeros(shape, dtype=batch_out.dtype))
                for i, batch_out in enumerate(batch_outs):
                    outs[i][batch_start:batch_end] = batch_out
            return unpack_singleton(outs)

    def _set_check_baseline(self):
        """
        Create a zero baseline when none was given, otherwise validate the provided one and
        prepend a batch dimension of size 1 to it
        :raises RuntimeError: if a provided baseline does not match the input shape (batch excluded)
        """
        # Do nothing for those methods that have no baseline required
        if not hasattr(self, "baseline"):
            return

        if self.baseline is None:
            if self.has_multiple_inputs:
                self.baseline = [np.zeros([1,] + xi.get_shape().as_list()[1:]) for xi in self.X]
            else:
                self.baseline = np.zeros([1,] + self.X.get_shape().as_list()[1:])

        else:
            if self.has_multiple_inputs:
                # Build a new list so the sequence passed in by the caller is not modified
                expanded = []
                for i, xi in enumerate(self.X):
                    expected_shape = xi.get_shape().as_list()[1:]
                    if list(self.baseline[i].shape) == expected_shape:
                        expanded.append(np.expand_dims(self.baseline[i], 0))
                    else:
                        # Report the shape of the mismatching input (self.X is a list here)
                        raise RuntimeError('Baseline shape %s does not match expected shape %s'
                                           % (self.baseline[i].shape, expected_shape))
                self.baseline = expanded
            else:
                if list(self.baseline.shape) == self.X.get_shape().as_list()[1:]:
                    self.baseline = np.expand_dims(self.baseline, 0)
                else:
                    raise RuntimeError('Baseline shape %s does not match expected shape %s'
                                       % (self.baseline.shape, self.X.get_shape().as_list()[1:]))


class GradientBasedMethod(AttributionMethod):
    """
    Base class for gradient-based attribution methods
    """
    def get_symbolic_attribution(self):
        """Return the symbolic gradients of the target with respect to the input(s)."""
        return tf.gradients(self.T, self.X)

    def explain_symbolic(self):
        """Build the symbolic attribution on first use and return the cached value afterwards."""
        cached = self.symbolic_attribution
        if cached is None:
            cached = self.get_symbolic_attribution()
            self.symbolic_attribution = cached
        return cached

    def run(self, xs, ys=None, batch_size=None):
        """
        Evaluate the symbolic attribution on the given data
        :param xs: input data
        :param ys: optional weights for the target tensor
        :param batch_size: optional batch size
        :return: list of attributions for multiple inputs, otherwise the first (only) entry
        """
        self._check_input_compatibility(xs, ys, batch_size)
        symbolic = self.explain_symbolic()
        results = self._session_run(symbolic, xs, ys, batch_size)
        if self.has_multiple_inputs:
            return results
        return results[0]

    @classmethod
    def nonlinearity_grad_override(cls, op, grad):
        """Gradient used for overridden activations; by default the original Tensorflow gradient."""
        return original_grad(op, grad)


class PerturbationBasedMethod(AttributionMethod):
    """
    Base class for perturbation-based attribution methods
    """
    def __init__(self, T, X, session, keras_learning_phase):
        """Run the base-class setup, then initialise base_activation to None."""
        super(PerturbationBasedMethod, self).__init__(
            T, X, session, keras_learning_phase=keras_learning_phase)
        self.base_activation = None



# -----------------------------------------------------------------------------
# ATTRIBUTION METHODS
# -----------------------------------------------------------------------------
"""
Returns zero attributions. For testing only.
"""


class DummyZero(GradientBasedMethod):
    """Gradient-based method whose activation gradient override is always zero."""

    def get_symbolic_attribution(self,):
        """Return the symbolic gradients of the target with respect to the input(s)."""
        return tf.gradients(self.T, self.X)

    @classmethod
    def nonlinearity_grad_override(cls, op, grad):
        """Return a zero tensor shaped like the first input of the op."""
        return tf.zeros_like(op.inputs[0])

"""
Saliency maps
https://arxiv.org/abs/1312.6034
"""


class Saliency(GradientBasedMethod):
    """Attribution as the absolute value of the gradient."""

    def get_symbolic_attribution(self):
        """Return a list with the element-wise absolute value of each gradient."""
        gradients = tf.gradients(self.T, self.X)
        return list(map(tf.abs, gradients))


"""
Gradient * Input
https://arxiv.org/pdf/1704.02685.pdf - https://arxiv.org/abs/1611.07270
"""


class GradientXInput(GradientBasedMethod):
    """Attribution as the gradient multiplied element-wise by the input."""

    def get_symbolic_attribution(self):
        """Return a list with one ``gradient * input`` tensor per input."""
        gradients = tf.gradients(self.T, self.X)
        inputs = self.X if self.has_multiple_inputs else [self.X]
        attributions = []
        for gradient, tensor in zip(gradients, inputs):
            attributions.append(gradient * tensor)
        return attributions


"""
Integrated Gradients
https://arxiv.org/pdf/1703.01365.pdf
"""


class IntegratedGradients(GradientBasedMethod):
    """Sums gradients at points interpolated between a baseline and the input, scaled by (input - baseline) / steps."""

    def __init__(self, T, X, session, keras_learning_phase, steps=100, baseline=None):
        """
        :param steps: number of interpolation points between baseline and input
        :param baseline: optional baseline; the base class creates a zero baseline when None
        """
        # Both attributes must exist before the base constructor runs (it checks the baseline)
        self.steps = steps
        self.baseline = baseline
        super(IntegratedGradients, self).__init__(T, X, session, keras_learning_phase)

    def run(self, xs, ys=None, batch_size=None):
        """
        :param xs: input data
        :param ys: optional weights for the target tensor
        :param batch_size: optional batch size
        :return: list of attributions for multiple inputs, otherwise a single attribution
        """
        self._check_input_compatibility(xs, ys, batch_size)
        multi = self.has_multiple_inputs

        accumulated = None
        alphas = np.linspace(1. / self.steps, 1.0, self.steps)
        for alpha in alphas:
            # Interpolate between baseline and input
            if multi:
                xs_mod = [b + (x - b) * alpha for x, b in zip(xs, self.baseline)]
            else:
                xs_mod = self.baseline + (xs - self.baseline) * alpha
            step_attr = self._session_run(self.explain_symbolic(), xs_mod, ys, batch_size)
            if accumulated is None:
                accumulated = step_attr
            else:
                accumulated = [g + a for g, a in zip(accumulated, step_attr)]

        inputs = xs if multi else [xs]
        baselines = self.baseline if multi else [self.baseline]
        results = []
        for g, x, b in zip(accumulated, inputs, baselines):
            results.append(g * (x - b) / self.steps)

        if multi:
            return results
        return results[0]


"""
Layer-wise Relevance Propagation with epsilon rule
http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0130140
"""


class EpsilonLRP(GradientBasedMethod):
    """
    Epsilon-LRP expressed as a gradient override for activation ops.
    """
    # Stabiliser used by nonlinearity_grad_override. Kept on the class (not the instance)
    # because the override is a classmethod and is invoked without an instance.
    eps = None

    def __init__(self, T, X, session, keras_learning_phase, epsilon=1e-4):
        """
        :param epsilon: positive stabiliser added (with the sign of the input) to the denominator
        """
        assert epsilon > 0.0, 'LRP epsilon must be greater than zero'
        # Store on the class attribute declared above instead of an implicit module global.
        # Must be set before the base constructor builds the symbolic attribution.
        EpsilonLRP.eps = epsilon
        super(EpsilonLRP, self).__init__(T, X, session, keras_learning_phase)

    def get_symbolic_attribution(self):
        """Return a list with one ``gradient * input`` tensor per input."""
        return [g * x for g, x in zip(
            tf.gradients(self.T, self.X),
            self.X if self.has_multiple_inputs else [self.X])]

    @classmethod
    def nonlinearity_grad_override(cls, op, grad):
        """
        Replace the activation gradient with ``grad * output / (input + eps * sign)``,
        where sign is +1 for input >= 0 and -1 otherwise
        :param op: activation op
        :param grad: incoming gradient Tensor
        :return: Tensor
        """
        output = op.outputs[0]
        input = op.inputs[0]
        return grad * output / (input + cls.eps *
                                tf.where(input >= 0, tf.ones_like(input), -1 * tf.ones_like(input)))

"""
DeepLIFT
This reformulation only considers the "Rescale" rule
https://arxiv.org/abs/1704.02685
"""


class DeepLIFTRescale(GradientBasedMethod):
    """
    DeepLIFT (Rescale rule) expressed as a gradient override for activation ops.
    """

    # Maps activation op name -> value of the op's first input when the baseline is fed.
    # Filled by _init_references() and read by nonlinearity_grad_override().
    _deeplift_ref = {}

    def __init__(self, T, X, session, keras_learning_phase, baseline=None):
        """
        :param baseline: optional baseline; the base class creates a zero baseline when None
        """
        self.baseline = baseline
        super(DeepLIFTRescale, self).__init__(T, X, session, keras_learning_phase)

    def get_symbolic_attribution(self):
        """Return a list with one ``gradient * (input - baseline)`` tensor per input."""
        return [g * (x - b) for g, x, b in zip(
            tf.gradients(self.T, self.X),
            self.X if self.has_multiple_inputs else [self.X],
            self.baseline if self.has_multiple_inputs else [self.baseline])]

    def _init_references(self):
        """
        Populate _deeplift_ref with the reference input of every supported activation op.

        Called by the base constructor after the baseline has been set. Without it the
        lookup in nonlinearity_grad_override() fails with KeyError for every op.
        NOTE(review): evaluates the op inputs by feeding only the baseline (and Y);
        confirm the graph needs no other placeholders to be fed.
        """
        self._deeplift_ref.clear()
        ops = [op for op in tf.get_default_graph().get_operations()
               if len(op.inputs) > 0
               and not op.name.startswith('gradients')
               and op.type in SUPPORTED_ACTIVATIONS]
        if not ops:
            return
        reference_inputs = self._session_run([op.inputs[0] for op in ops], self.baseline)
        for op, reference in zip(ops, reference_inputs):
            self._deeplift_ref[op.name] = reference

    @classmethod
    def nonlinearity_grad_override(cls, op, grad):
        """
        Replace the activation gradient with ``grad * delta_out / delta_in`` relative to the
        reference; where ``|delta_in|`` is not above 1e-5, use the original gradient evaluated
        at the midpoint between reference and input instead
        :param op: activation op
        :param grad: incoming gradient Tensor
        :return: Tensor
        """
        output = op.outputs[0]
        input = op.inputs[0]
        ref_input = cls._deeplift_ref[op.name]
        ref_output = activation(op.type)(ref_input)
        delta_out = output - ref_output
        delta_in = input - ref_input
        instant_grad = activation(op.type)(0.5 * (ref_input + input))
        return tf.where(tf.abs(delta_in) > 1e-5, grad * delta_out / delta_in,
                        original_grad(instant_grad.op, grad))


"""
Occlusion method
Generalization of the grey-box method presented in https://arxiv.org/pdf/1311.2901.pdf
This method performs a systematic perturbation of contiguous hyperpatches in the input,
replacing each patch with a user-defined value (by default 0).
window_shape : integer or tuple of length xs_ndim
Defines the shape of the elementary n-dimensional orthotope of the rolling window view.
If an integer is given, the shape will be a hypercube of sidelength given by its value.
step : integer or tuple of length xs_ndim
Indicates step size at which extraction shall be performed.
If integer is given, then the step is uniform in all dimensions.
"""


class Occlusion(PerturbationBasedMethod):
    """
    Occlusion: replace patches of the input with a fixed value and attribute the change
    in the target to the features of each patch.
    """

    def __init__(self, T, X, session, keras_learning_phase, window_shape=None, step=None):
        """
        :param window_shape: integer or sequence with one entry per input dimension (batch
                             excluded); an integer is expanded to a hypercube. Default: all ones
        :param step: integer or sequence with one entry per input dimension. Default: 1
        :raises RuntimeError: if the model has multiple inputs
        """
        super(Occlusion, self).__init__(T, X, session, keras_learning_phase)
        if self.has_multiple_inputs:
            raise RuntimeError('Multiple inputs not yet supported for perturbation methods')

        # X[0] drops the batch dimension, so this is the shape of a single sample
        input_shape = X[0].get_shape().as_list()
        if window_shape is not None:
            if isinstance(window_shape, int):
                # An integer means the same side length along every dimension
                window_shape = (window_shape,) * len(input_shape)
            assert len(window_shape) == len(input_shape), \
                'window_shape must have length of input (%d)' % len(input_shape)
            self.window_shape = tuple(window_shape)
        else:
            self.window_shape = (1,) * len(input_shape)

        if step is not None:
            assert isinstance(step, int) or len(step) == len(input_shape), \
                'step must be integer or tuple with the length of input (%d)' % len(input_shape)
            self.step = step
        else:
            self.step = 1
        self.replace_value = 0.0
        logging.info('Input shape: %s; window_shape %s; step %s' % (input_shape, self.window_shape, self.step))

    def run(self, xs, ys=None, batch_size=None):
        """
        :param xs: input data, array with the batch on the first axis
        :param ys: optional weights for the target tensor
        :param batch_size: optional batch size used when evaluating the model
        :return: array of attributions with the same shape as xs
        """
        self._check_input_compatibility(xs, ys, batch_size)
        input_shape = xs.shape[1:]
        # Keep the sample count separate from batch_size so that the caller's
        # batch_size is still honoured by _session_run
        num_samples = xs.shape[0]
        # np.asscalar is no longer available in recent NumPy releases
        total_dim = int(np.prod(input_shape))

        # Create mask
        index_matrix = np.arange(total_dim).reshape(input_shape)
        idx_patches = view_as_windows(index_matrix, self.window_shape, self.step).reshape((-1,) + self.window_shape)
        heatmap = np.zeros_like(xs, dtype=np.float32).reshape((-1), total_dim)
        w = np.zeros_like(heatmap)

        # Compute original output
        eval0 = self._session_run(self.T, xs, ys, batch_size)

        # Start perturbation loop
        for p in idx_patches:
            patch_idx = p.flatten()
            mask = np.ones(input_shape).flatten()
            mask[patch_idx] = self.replace_value
            masked_xs = mask.reshape((1,) + input_shape) * xs
            delta = eval0 - self._session_run(self.T, masked_xs, ys, batch_size)
            delta_aggregated = np.sum(delta.reshape((num_samples, -1)), -1, keepdims=True)
            heatmap[:, patch_idx] += delta_aggregated
            w[:, patch_idx] += p.size

        # Features never covered by a patch have w == 0 and end up as nan
        attribution = np.reshape(heatmap / w, xs.shape)
        if np.isnan(attribution).any():
            warnings.warn('Attributions generated by Occlusion method contain nans, '
                          'probably because window_shape and step do not allow to cover the all input.')
        return attribution


"""
Shapley Value sampling
Computes approximate Shapley Values using "Polynomial calculation of the Shapley value based on sampling",
Castro et al, 2009 (https://www.sciencedirect.com/science/article/pii/S0305054808000804)
samples : integer (default 5)
Defines the number of samples for each input feature.
Notice that evaluating a model samples * n_input_feature times might take a while.
sampling_dims : list of dimension indexes to run sampling on (feature dimensions).
By default, all dimensions except the batch dimension will be sampled.
For example, with a 4-D tensor that contains color images, single color channels are sampled.
To sample pixels, instead, use sampling_dims=[1,2]
"""


class ShapleySampling(PerturbationBasedMethod):
    """
    Approximate Shapley values by zeroing features one at a time in random order and
    averaging the resulting changes of the target over several permutations.
    """

    def __init__(self, T, X, session, keras_learning_phase, samples=5, sampling_dims=None):
        """
        :param samples: number of random permutations to average over
        :param sampling_dims: list of dimension indexes (batch dimension 0 excluded) to sample;
                              defaults to every dimension except the batch
        :raises RuntimeError: on multiple inputs or invalid sampling_dims
        """
        super(ShapleySampling, self).__init__(T, X, session, keras_learning_phase)
        if self.has_multiple_inputs:
            raise RuntimeError('Multiple inputs not yet supported for perturbation methods')
        dims = len(X.shape)
        if sampling_dims is not None:
            if not 0 < len(sampling_dims) <= (dims - 1):
                raise RuntimeError('sampling_dims must be a list containing 1 to %d elements' % (dims-1))
            if 0 in sampling_dims:
                raise RuntimeError('Cannot sample batch dimension: remove 0 from sampling_dims')
            if any([x < 1 or x > dims-1 for x in sampling_dims]):
                raise RuntimeError('Invalid value in sampling_dims')
        else:
            sampling_dims = list(range(1, dims))

        self.samples = samples
        self.sampling_dims = sampling_dims

    def run(self, xs, ys=None, batch_size=None):
        """
        :param xs: input data, array with the batch on the first axis
        :param ys: optional weights for the target tensor
        :param batch_size: optional batch size used when evaluating the model
        :return: array of shape ``[n_samples] + [xs.shape[d] for d in sampling_dims]``
        """
        xs_shape = list(xs.shape)
        # Keep the sample count separate from batch_size so that the caller's
        # batch_size is still honoured by _session_run
        num_samples = xs.shape[0]
        # np.asscalar is no longer available in recent NumPy releases
        n_features = int(np.prod([xs.shape[i] for i in self.sampling_dims]))
        result = np.zeros((xs_shape[0], n_features))

        # Shape with the sampled dimensions collapsed into a single axis at position 1
        run_shape = list(xs_shape)  # a copy
        run_shape = np.delete(run_shape, self.sampling_dims).tolist()
        run_shape.insert(1, -1)

        reconstruction_shape = [xs_shape[0]]
        for j in self.sampling_dims:
            reconstruction_shape.append(xs_shape[j])

        for r in range(self.samples):
            p = np.random.permutation(n_features)
            x = xs.copy().reshape(run_shape)
            y = None
            for i in p:
                if y is None:
                    y = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
                x[:, i] = 0
                y0 = self._session_run(self.T, x.reshape(xs_shape), ys, batch_size)
                delta = y - y0
                delta_aggregated = np.sum(delta.reshape((num_samples, -1)), -1, keepdims=False)
                result[:, i] += delta_aggregated
                # The perturbed output becomes the reference for the next feature
                y = y0

        shapley = result / self.samples
        return shapley.reshape(reconstruction_shape)


# -----------------------------------------------------------------------------
# END ATTRIBUTION METHODS
# -----------------------------------------------------------------------------


# Registry of attribution methods: name -> (implementing class, numeric id).
# DeepExplain.get_explainer() looks methods up here by name.
attribution_methods = OrderedDict([
    ('zero', (DummyZero, 0)),
    ('saliency', (Saliency, 1)),
    ('grad*input', (GradientXInput, 2)),
    ('intgrad', (IntegratedGradients, 3)),
    ('elrp', (EpsilonLRP, 4)),
    ('deeplift', (DeepLIFTRescale, 5)),
    ('occlusion', (Occlusion, 6)),
    ('shapley_sampling', (ShapleySampling, 7)),
])

class DeepExplain(object):
    """
    Context manager that makes a graph the default one and overrides the gradient of the
    supported activation ops with 'DeepExplainGrad', and factory for attribution methods.

    NOTE(review): the gradient function registered under the name 'DeepExplainGrad' is not
    defined in the visible part of this file - confirm it is registered elsewhere.
    """

    def __init__(self, graph=None, session=None):
        """
        :param graph: graph to work on; defaults to the graph of the session
        :param session: session to use; defaults to the default session at call time
        :raises RuntimeError: if no session is given and there is no default session
        """
        # Resolve the default session when the object is created, not when the class is defined
        if session is None:
            session = tf.get_default_session()
        # Fail before session.graph is dereferenced
        if session is None:
            raise RuntimeError('DeepExplain: could not retrieve a session. Use DeepExplain(session=your_session).')
        self.method = None
        self.batch_size = None
        self.session = session
        self.graph = session.graph if graph is None else graph
        self.graph_context = self.graph.as_default()
        self.override_context = self.graph.gradient_override_map(self.get_override_map())
        self.keras_phase_placeholder = None
        self.context_on = False

    def __enter__(self):
        # Override gradient of all ops created in context
        self.graph_context.__enter__()
        self.override_context.__enter__()
        self.context_on = True
        return self

    def __exit__(self, type, value, traceback):
        # Leave the contexts in the reverse order they were entered
        self.override_context.__exit__(type, value, traceback)
        self.graph_context.__exit__(type, value, traceback)
        self.context_on = False

    def get_explainer(self, method, T, X, **kwargs):
        """
        Build an attribution method object
        :param method: key of attribution_methods
        :param T: target Tensor
        :param X: input Tensor or list of input Tensors
        :param kwargs: forwarded to the constructor of the attribution method
        :return: instance of the selected attribution method
        :raises RuntimeError: outside of the context, for an unknown method, or for non-Tensor T / X
        """
        if not self.context_on:
            raise RuntimeError('Explain can be called only within a DeepExplain context.')
        global _ENABLED_METHOD_CLASS, _GRAD_OVERRIDE_CHECKFLAG
        self.method = method
        if self.method in attribution_methods:
            method_class, method_flag = attribution_methods[self.method]
        else:
            raise RuntimeError('Method must be in %s' % list(attribution_methods.keys()))
        if isinstance(X, list):
            for x in X:
                if 'tensor' not in str(type(x)).lower():
                    raise RuntimeError('If a list, X must contain only Tensorflow Tensor objects')
        else:
            if 'tensor' not in str(type(X)).lower():
                raise RuntimeError('X must be a Tensorflow Tensor object or a list of them')

        if 'tensor' not in str(type(T)).lower():
            raise RuntimeError('T must be a Tensorflow Tensor object')

        logging.info('DeepExplain: running "%s" explanation method (%d)' % (self.method, method_flag))
        self._check_ops()
        _GRAD_OVERRIDE_CHECKFLAG = 0

        # The enabled class is published through a module global while the explainer is built
        _ENABLED_METHOD_CLASS = method_class
        explainer = _ENABLED_METHOD_CLASS(T, X,
                                          self.session,
                                          keras_learning_phase=self.keras_phase_placeholder,
                                          **kwargs)

        if issubclass(_ENABLED_METHOD_CLASS, GradientBasedMethod) and _GRAD_OVERRIDE_CHECKFLAG == 0:
            warnings.warn('DeepExplain detected you are trying to use an attribution method that requires '
                          'gradient override but the original gradient was used instead. You might have forgot to '
                          '(re)create your graph within the DeepExlain context. Results are not reliable!')
        _ENABLED_METHOD_CLASS = None
        _GRAD_OVERRIDE_CHECKFLAG = 0
        self.keras_phase_placeholder = None
        return explainer

    def explain(self, method, T, X, xs, ys=None, batch_size=None, **kwargs):
        """Build the explainer for ``method`` and run it on ``xs`` (see get_explainer)."""
        explainer = self.get_explainer(method, T, X, **kwargs)
        return explainer.run(xs, ys, batch_size)

    @staticmethod
    def get_override_map():
        """Return a dict mapping every supported activation op type to 'DeepExplainGrad'."""
        return dict((a, 'DeepExplainGrad') for a in SUPPORTED_ACTIVATIONS)

    def _check_ops(self):
        """
        Heuristically check if any op is in the list of unsupported activation functions.
        This does not cover all cases where explanation methods would fail, and must be improved in the future.
        Also, check if the placeholder named 'keras_learning_phase' exists in the graph. This is used by Keras
         and needs to be passed in feed_dict.
        :return:
        """
        g = tf.get_default_graph()
        for op in g.get_operations():
            if len(op.inputs) > 0 and not op.name.startswith('gradients'):
                if op.type in UNSUPPORTED_ACTIVATIONS:
                    warnings.warn('Detected unsupported activation (%s). '
                                  'This might lead to unexpected or wrong results.' % op.type)
            elif 'keras_learning_phase' in op.name:
                self.keras_phase_placeholder = op.outputs[0]


In [None]:
def batch_norm(data, is_train, trainable=True, name=None, data_format='channels_last',
               USE_FUSED_BN = True, BN_EPSILON = 0.001, BN_MOMENTUM = 0.99):
    """
    Apply tf.layers.batch_normalization inside a name scope
    :param data: tensor to normalize
    :param is_train: passed as the ``training`` argument
    :param trainable: passed as the ``trainable`` argument
    :param name: used both for the name scope and as the layer name
    :param data_format: 'channels_last' normalizes over the last axis, anything else over axis 1
    :param USE_FUSED_BN: passed as the ``fused`` argument
    :param BN_EPSILON: passed as the ``epsilon`` argument
    :param BN_MOMENTUM: passed as the ``momentum`` argument
    :return: output of tf.layers.batch_normalization
    """
    if data_format == 'channels_last':
        bn_axis = -1
    else:
        bn_axis = 1

    with tf.name_scope(name):
        normalized = tf.layers.batch_normalization(
            data,
            training=is_train,
            name=name,
            momentum=BN_MOMENTUM,
            axis=bn_axis,
            trainable=trainable,
            epsilon=BN_EPSILON,
            reuse=None,
            fused=USE_FUSED_BN)
        return normalized

class config:
    """Plain container for model settings: number of outputs and maximum sequence length."""

    def __init__(self, num_output, max_seq_length = 33):
        # Store both settings unchanged
        self.num_output, self.max_seq_length = num_output, max_seq_length
    
class create():
    """Transformer-style encoder/decoder sequence classifier built with the TF1 graph API.

    Constructing an instance builds the whole model in the current default graph and
    opens a `tf.Session` on it:

    * `phase='train'`: `is_train` and `lr` are placeholders, a softmax cross-entropy
      loss and a training op are created, and all global variables are initialized.
    * `phase='test'`: `is_train` is the Python constant False and no loss/optimizer is
      built. Variables are NOT initialized in this phase; pass `ckpt_path` so they are
      restored before the graph is run.

    Main attributes:
        encoder_inputs: float32 placeholder [batch, config.max_seq_length, model_dim].
        targets: int32 placeholder [batch] holding class indices.
        logits: output of the final dense + batch-norm layer.
        class_logits: `logits` averaged over axis 1.
        loss, train_op, lr: only defined when phase == 'train'.
        sess, graph: the session and the graph the model lives in.
    """

    def __init__(self, config, seed=None, use_bias=False,
                 num_layers=8, num_heads=8, linear_key_dim=32,
                 linear_value_dim=32, model_dim=90, ffn_dim=50, num_vox_roi=90,
                 gpu_memory_fraction=None, optimizer_type='adam', phase='train',
                 freeze_encoder=False, ckpt_path=None):
        """Build the graph, open the session and initialize and/or restore variables.

        :param config: object exposing `num_output` and `max_seq_length`.
        :param seed: optional graph-level random seed.
        :param use_bias: bias flag for the feed-forward dense layers of encoder and decoder.
        :param num_layers: number of encoder layers and of decoder layers.
        :param num_heads: number of attention heads.
        :param linear_key_dim: total width of the query/key projections.
        :param linear_value_dim: total width of the value projection.
        :param model_dim: feature width of the inputs and of each layer's output.
        :param ffn_dim: hidden width of the decoder feed-forward block.
        :param num_vox_roi: accepted for interface compatibility; not used.
        :param gpu_memory_fraction: None enables `allow_growth`; otherwise the value is
            used as `per_process_gpu_memory_fraction`.
        :param optimizer_type: 'adam', 'adagrad', 'sgd', 'momentum' or 'adadelta'.
        :param phase: 'train' or 'test'.
        :param freeze_encoder: if True, variables whose name contains 'encode_layer'
            are excluded from the training op.
        :param ckpt_path: optional checkpoint restored after the graph is built.
        :raises ValueError: on an invalid `phase`, an unknown optimizer, or projection
            widths that cannot be split evenly across `num_heads`.
        """
        self.config = config
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.linear_key_dim = linear_key_dim
        self.linear_value_dim = linear_value_dim
        self.model_dim = model_dim
        self.ffn_dim = ffn_dim
        self.optimizer_type = optimizer_type
        self.freeze_encoder = freeze_encoder

        if phase not in ['train', 'test']:
            raise ValueError("phase must be 'train' or 'test'.")
        # The head split reshapes the projections to [num_heads, dim // num_heads];
        # a non-divisible width would otherwise fail late (or reshape silently).
        if num_heads < 1 or linear_key_dim % num_heads or linear_value_dim % num_heads:
            raise ValueError("linear_key_dim and linear_value_dim must be divisible by num_heads.")

        self.graph = tf.get_default_graph()
        # Named `session_config` so the `config` argument is not shadowed.
        session_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
        if gpu_memory_fraction is None:
            session_config.gpu_options.allow_growth = True
        else:
            session_config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
        self.sess = tf.Session(config=session_config, graph=self.graph)

        with self.graph.as_default():
            if seed is not None:
                tf.set_random_seed(seed)

            self.encoder_inputs = tf.placeholder(
                tf.float32, [None, self.config.max_seq_length, self.model_dim], name="encoder_input")
            self.targets = tf.placeholder(tf.int32, [None,], name='target')
            self.batch_size = tf.shape(self.encoder_inputs)[0]
            if phase == 'train':
                self.is_train = tf.placeholder(tf.bool)
                self.lr = tf.placeholder(tf.float32, name="lr")
            else:
                self.is_train = False

            # Create Encoder
            self.encoder_outputs = self.__create_embed(self.encoder_inputs, encoder=True)
            self.encoder_outputs = self.__create_encoder(self.encoder_outputs, use_bias=use_bias)

            # Create Decoder (fed with the positionally re-encoded encoder output)
            self.decoder_inputs = self.__create_embed(self.encoder_outputs, encoder=False)
            self.decoder_outputs = self.__create_decoder(self.decoder_inputs, use_bias=use_bias)

            self.logits = self.__create_output(self.decoder_outputs, self.is_train)
            # One prediction per sequence: average the per-position logits.
            self.class_logits = tf.reduce_mean(self.logits, 1)

            if phase == 'train':
                with tf.variable_scope('loss'):
                    self.target_one_hot = tf.one_hot(self.targets, depth=self.config.num_output, axis=-1)
                    self.loss = tf.losses.softmax_cross_entropy(self.target_one_hot, self.class_logits)
                    self.train_op = self.__set_op(self.loss, self.lr, optimizer_type=self.optimizer_type)
                    self.sess.run(tf.global_variables_initializer())

            if ckpt_path is not None:
                saver = tf.train.Saver()
                saver.restore(self.sess, ckpt_path)

    def positional_encoding(self, sentence_length, dtype=tf.float32):
        """Return a constant [sentence_length, model_dim] sinusoidal position table.

        Entry (pos, i) is sin(pos / 10000**(2*i/model_dim)) for even i and the cosine of
        the same angle for odd i. Parity is taken on the feature index i, so the table is
        also correct when `model_dim` is odd; for even `model_dim` the values are the
        same as those of the previous flat-array implementation.
        """
        positions = np.arange(sentence_length)[:, np.newaxis]
        feature_idx = np.arange(self.model_dim)
        angles = positions / np.power(10000, 2 * feature_idx / self.model_dim)[np.newaxis, :]

        encoded_vec = np.empty_like(angles)
        encoded_vec[:, 0::2] = np.sin(angles[:, 0::2])
        encoded_vec[:, 1::2] = np.cos(angles[:, 1::2])

        return tf.convert_to_tensor(encoded_vec, dtype=dtype)

    def __create_embed(self, inputs, encoder=True, name='Embeddings'):
        """Add the positional encoding to `inputs` and return the sum.

        `encoder` is accepted by the signature but not used.
        """
        with tf.variable_scope(name):

            # Positional Encoding
            with tf.variable_scope('positional_encoding'):
                positional_encoded = self.positional_encoding(self.config.max_seq_length)

            # Position ids 0..max_seq_length-1 repeated for every example of the batch.
            position_inputs = tf.tile(tf.range(0, self.config.max_seq_length), [self.batch_size])
            position_inputs = tf.reshape(position_inputs, [self.batch_size, self.config.max_seq_length])

            encoded_inputs = tf.add(inputs, tf.nn.embedding_lookup(positional_encoded, position_inputs))

            return encoded_inputs

    def __create_encoder(self, enc_input, use_bias=False):
        """Stack `num_layers` encoder layers (self-attention + feed-forward, each followed
        by a residual add and batch norm) on top of `enc_input`.

        NOTE(review): `residual` is not reset at the top of layers 2..N, so the first
        skip connection of those layers adds the previous layer's post-attention
        activation rather than the layer input -- confirm this is intended.
        NOTE(review): the feed-forward hidden width here is `model_dim`, whereas the
        decoder uses `ffn_dim` -- confirm.
        """
        tensor = residual = tf.identity(enc_input)
        for i in range(1, self.num_layers+1):
            with tf.variable_scope("encode_layer{}".format(i)):
                tensor = self.self_attention(q=tensor, k=tensor, v=tensor)
                tensor = tf.add(tensor, residual)
                tensor = residual = batch_norm(tensor, self.is_train)

                tensor = tf.layers.dense(tensor, self.model_dim, use_bias=use_bias, activation=tf.nn.relu)
                tensor = tf.layers.dense(tensor, self.model_dim, use_bias=use_bias)

                tensor = tf.add(tensor, residual)
                tensor = batch_norm(tensor, self.is_train)

        return tensor

    def __create_decoder(self, dec_input, use_bias=False):
        """Stack `num_layers` decoder layers on top of `dec_input`.

        Each layer applies masked self-attention, a second masked attention whose keys
        and values are `dec_input`, and a feed-forward block of hidden width `ffn_dim`;
        every sub-block is followed by a residual add and batch norm. With
        `num_layers == 0` the input is returned unchanged.

        NOTE(review): as in the encoder, `residual` carries over from the middle of the
        previous layer into the next layer's first skip connection -- confirm.
        """
        # Initialized before the loop so the variables exist even with zero layers.
        tensor = residual = encoder_inputs = dec_input
        for i in range(1, self.num_layers+1):
            with tf.variable_scope("decode_layer{}".format(i)):

                tensor = self.self_attention(q=tensor, k=tensor, v=tensor, model_dim=self.model_dim,
                                             masked=True, name='masked_self_attention')
                tensor = tf.add(tensor, residual)
                tensor = residual = batch_norm(tensor, self.is_train)

                tensor = self.self_attention(q=tensor, k=encoder_inputs, v=encoder_inputs,
                                             model_dim=self.model_dim, masked=True,
                                             name='encoder_decoder_attention')
                tensor = tf.add(tensor, residual)
                tensor = residual = batch_norm(tensor, self.is_train)

                tensor = tf.layers.dense(tensor, self.ffn_dim, use_bias=use_bias, activation=tf.nn.relu)
                tensor = tf.layers.dense(tensor, self.model_dim, use_bias=use_bias)

                tensor = tf.add(tensor, residual)
                tensor = batch_norm(tensor, self.is_train)

        return tensor

    def __create_output(self, decoder_outputs, is_train, use_bias=False, reuse=False):
        """Project `decoder_outputs` to `config.num_output` units and batch-normalize."""
        with tf.variable_scope("Output", reuse=reuse):
            logits = tf.layers.dense(decoder_outputs, self.config.num_output, use_bias=use_bias)
            logits = tf.layers.batch_normalization(logits, training=is_train, fused=True)

        return logits

    def self_attention(self, q, k, v, model_dim=None, masked=False, use_bias=False, name='self_attention'):
        """Multi-head scaled dot-product attention.

        :param q, k, v: rank-3 tensors (the head split reshapes them to rank 4).
        :param model_dim: width of the final projection; defaults to `self.model_dim`.
        :param masked: if True, scores above the diagonal are replaced by -1e9 before
            the softmax so a position cannot attend to later positions.
        :param use_bias: bias flag for the projection layers.
        :param name: variable scope of the block.
        :return: tensor whose last dimension is `model_dim`.
        """
        if model_dim is None:
            model_dim = self.model_dim

        with tf.variable_scope(name):
            # linear_projection
            q = tf.layers.dense(q, self.linear_key_dim, use_bias=use_bias, name='q_proj')
            k = tf.layers.dense(k, self.linear_key_dim, use_bias=use_bias, name='k_proj')
            v = tf.layers.dense(v, self.linear_value_dim, use_bias=use_bias, name='v_proj')

            # split_heads
            q = self.__split_last_dimension(q, self.linear_key_dim)
            k = self.__split_last_dimension(k, self.linear_key_dim)
            v = self.__split_last_dimension(v, self.linear_value_dim)

            # scaled dot product
            key_dim_per_head = self.linear_key_dim // self.num_heads

            output = tf.matmul(q, k, transpose_b=True)
            output = output / (key_dim_per_head**0.5)

            if masked:
                diag_vals = tf.ones_like(output[0, 0, :, :])  # one (query, key) score matrix
                tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # lower-triangular ones
                masks = tf.tile(tf.reshape(tril, [1, 1] + tril.get_shape().as_list()),
                                [tf.shape(output)[0], tf.shape(output)[1], 1, 1])
                paddings = tf.ones_like(masks) * -1e9
                output = tf.where(tf.equal(masks, 0), paddings, output)

            output = tf.nn.softmax(output)
            output = tf.matmul(output, v)

            # concatenate heads
            output = tf.transpose(output, [0, 2, 1, 3])  # move the head axis next to the per-head features
            t_shape = output.get_shape().as_list()
            num_heads, dim = t_shape[-2:]
            output = tf.reshape(output, [-1] + t_shape[1:-2] + [num_heads * dim])

            output = tf.layers.dense(output, model_dim, use_bias=use_bias)
            return output

    def __split_last_dimension(self, tensor, dim):
        """Reshape the last axis (width `dim`) into [num_heads, dim // num_heads] and
        move the head axis to position 1."""
        t_shape = tensor.get_shape().as_list()
        tensor = tf.reshape(tensor, [-1] + t_shape[1:-1] + [self.num_heads, dim // self.num_heads])
        return tf.transpose(tensor, [0, 2, 1, 3])

    def __set_op(self, loss_op, learning_rate, optimizer_type="adam"):
        """Create the optimizer and return the op minimizing `loss_op`.

        The training op is made to depend on the UPDATE_OPS collection, and variables
        whose name contains 'encode_layer' are left out when `freeze_encoder` is set.
        :raises ValueError: if `optimizer_type` is not one of the supported names.
        """
        with self.graph.as_default():
            if optimizer_type == "adam":
                optimizer = tf.train.AdamOptimizer(learning_rate)
            elif optimizer_type == "adagrad":
                optimizer = tf.train.AdagradOptimizer(learning_rate, initial_accumulator_value=0.0001)
            elif optimizer_type == "sgd":
                optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            elif optimizer_type == "momentum":
                optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
            elif optimizer_type == "adadelta":
                optimizer = tf.train.AdadeltaOptimizer(learning_rate, rho=0.95, epsilon=1e-09)
            else:
                raise ValueError("{} optimizer doesn't exist.".format(optimizer_type))

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            vars_to_train = tf.trainable_variables()
            if self.freeze_encoder:
                vars_to_train = [var for var in vars_to_train if 'encode_layer' not in var.name]

            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss_op, var_list=vars_to_train)

            self.sess.run(tf.variables_initializer(optimizer.variables()))

        return train_op

In [None]:
# Read both competition splits; missing cells become empty strings so that
# the string operations in later cells never see NaN.
dataset, testset = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in ('/kaggle/input/tweet-sentiment-extraction/train.csv',
                     '/kaggle/input/tweet-sentiment-extraction/test.csv')
)

In [None]:
# Vocabulary = every whitespace-delimited token that occurs in either split.
train_vocab = set(' '.join(dataset['text']).split())
test_vocab = set(' '.join(testset['text']).split())

# Union instead of list concatenation so a token shared by both splits appears once;
# sorted so the token -> vector assignment does not depend on set iteration order.
total_vocab = sorted(train_vocab | test_vocab)

# Each token gets a fixed random 100-d vector. A dedicated seeded generator makes
# the table identical across kernel restarts.
EMBED_SEED = 0
embedding_rng = np.random.RandomState(EMBED_SEED)
eng_word_vec = dict(zip(total_vocab, embedding_rng.randn(len(total_vocab), 100)))

In [None]:
def preprocess_word(sentense, max_len=33, word_vectors=None, embed_dim=100):
    """Turn a sentence into a fixed-size (max_len, embed_dim) float matrix.

    The sentence is split on whitespace, each token is replaced by its vector and
    the remaining rows are zero padding. Sentences with more than `max_len` tokens
    are truncated so every result has the same shape and can be stacked in a batch.

    :param sentense: input text.
    :param max_len: number of rows of the result.
    :param word_vectors: mapping token -> vector of length `embed_dim`; defaults to
        the notebook-level `eng_word_vec` table.
    :param embed_dim: length of each word vector.
    :return: numpy array of shape (max_len, embed_dim).
    :raises KeyError: if a token is missing from `word_vectors`.
    """
    if word_vectors is None:
        word_vectors = eng_word_vec

    result = np.zeros((max_len, embed_dim))
    for row, w in enumerate(sentense.split()[:max_len]):
        result[row] = word_vectors[w]
    return result
        

In [None]:
# Class index of a sentiment label = its position in this list.
sentiment_cls = ['neutral','negative','positive']
train_target = np.array(list(map(sentiment_cls.index, dataset['sentiment'])))
test_target = np.array(list(map(sentiment_cls.index, testset['sentiment'])))

In [None]:
# Vectorize every tweet of both splits.
train_input = list(map(preprocess_word,dataset['text']))
test_input = list(map(preprocess_word,testset['text']))
# The lookup table and the raw training frame are not referenced by later cells;
# drop them to free memory (this cell therefore cannot be re-run on its own).
del eng_word_vec, dataset

batch_size = 500

# Slice the training set into consecutive batches. Stepping through the data by
# `batch_size` yields a shorter last batch when needed and never an empty one,
# even when the number of examples is an exact multiple of `batch_size`.
input_sentences = []
input_sentiments = []
for start in range(0, len(train_input), batch_size):
    stop = start + batch_size
    input_sentences.append(np.array(train_input[start:stop]))
    input_sentiments.append(np.array(train_target[start:stop]))

In [None]:
tr_config = config(num_output=3, max_seq_length=33)

NUM_EPOCHS = 40
LEARNING_RATE = 1e-3

tf.reset_default_graph()
net = create(tr_config,
             phase = 'train',
             num_heads=8,
             num_layers=4,
             linear_key_dim=8,
             linear_value_dim=8,
             model_dim=100,
             optimizer_type='adam')

# Integer labels of the held-out split, derived from the frame itself so this cell
# does not depend on whatever `test_target` currently holds.
test_labels = np.array([sentiment_cls.index(s) for s in testset['sentiment']])

with tqdm(total=NUM_EPOCHS) as pbar:
    pbar.set_description('Training')
    for _ in range(NUM_EPOCHS):
        total_pred = []
        for batch_inputs, batch_targets in zip(input_sentences, input_sentiments):
            feed_dict={net.encoder_inputs : batch_inputs,
                       net.targets : batch_targets,
                       net.lr : LEARNING_RATE,
                       net.is_train : True}

            pred, cost, _ = net.sess.run([net.class_logits, net.loss, net.train_op], feed_dict=feed_dict)
            total_pred.append(pred)

        total_pred = np.concatenate(total_pred).argmax(1)
        total_targets = np.concatenate(input_sentiments)
        train_acc = round(float(np.mean(total_pred == total_targets)), 5)

        # Held-out evaluation: is_train=False so batch normalization runs in inference
        # mode instead of normalizing with the statistics of the evaluated batch.
        feed_dict = {net.encoder_inputs: test_input, net.is_train : False}
        pred = net.sess.run(net.class_logits, feed_dict=feed_dict)
        test_acc = round(float(np.mean(pred.argmax(1) == test_labels)), 5)

        pbar.set_postfix_str(f'train_ACC : {train_acc}, test_ACC : {test_acc}')
        pbar.update(1)

In [None]:
ckpt_path = './sentiment_transformer'
# Create the target directory explicitly rather than relying on the summary
# writer to do it as a side effect.
os.makedirs(ckpt_path, exist_ok=True)

saver = tf.train.Saver()
# Write the graph definition for inspection, then close the writer so the event
# file is flushed and its handle released.
summary_writer = tf.summary.FileWriter(ckpt_path, net.sess.graph)
summary_writer.close()
saver.save(net.sess, f"{ckpt_path}/model.ckpt")

In [None]:
# Release the training session before rebuilding the model for inference;
# otherwise it stays open for as long as the kernel lives.
net.sess.close()

tf.reset_default_graph()
net = create(tr_config,
             ckpt_path = './sentiment_transformer/model.ckpt',
             phase = 'test',
             num_heads=8,
             num_layers=4,
             linear_key_dim=8,
             linear_value_dim=8,
             model_dim=100,
             optimizer_type='adam')


one_hot_value = np.eye(3)

# Kept under its own name so `test_target` still holds the integer labels and this
# cell can be executed more than once.
test_target_one_hot = one_hot_value[test_target]

feed_dict = {net.encoder_inputs: test_input}
pred = net.sess.run(net.class_logits, feed_dict=feed_dict)

# Multiplying by the one-hot labels zeroes every logit except the labelled class, so
# the attribution target for each example is the logit of its own sentiment.
# NOTE(review): the model ops are created before entering the DeepExplain context --
# confirm that the gradient overrides used by 'elrp' still apply to them.
with DeepExplain(session=net.sess) as explainer:
    explained_result = explainer.explain('elrp', net.class_logits * test_target_one_hot, net.encoder_inputs, test_input)

In [None]:
from scipy.stats.mstats import winsorize

# Number of whitespace tokens per test tweet; used to cut the padding rows off
# each attribution matrix.
all_seq_len = [len(text.split()) for text in testset['text']]
relevance_score = [scores[:n_tokens] for n_tokens, scores in zip(all_seq_len, explained_result)]

# Per-tweet standard deviation of the attributions (0 for tweets without tokens),
# min-max scaled over the whole test set.
r_std = np.array([0 if len(scores) == 0 else scores.std() for scores in relevance_score])
std_min, std_max = r_std.min(), r_std.max()
r_std = (r_std-std_min)/(std_max-std_min)

# Winsorize only the lower tail, using the scaled std as the clipped fraction.
# Tweets without tokens are dropped, so the list is shorter than the test set
# whenever such tweets exist.
# NOTE(review): the 1e-16 offset presumably keeps the limit strictly below 1 -- confirm.
relevance_score = [
    winsorize(scores, [abs(std-1e-16), 0])
    for scores, std in zip(relevance_score, r_std)
    if len(scores) != 0
]

In [None]:
def _select_span(text, token_scores):
    """Return the contiguous token span of `text` covering every high-scoring token.

    A token is high-scoring when its score exceeds mean + one standard deviation of
    `token_scores`. The span runs from the first to the last such token (a single
    token when only one qualifies); when none qualifies the whole `text` is returned.
    `token_scores` must hold one value per whitespace token of `text`.
    """
    tokens = text.split()
    # Positions are taken straight from the mask; looking tokens up by value would
    # map a repeated word to its first occurrence and give wrong span bounds.
    selected = np.flatnonzero(token_scores > (token_scores.mean() + token_scores.std()))
    if selected.size == 0:
        return text
    return ' '.join(tokens[selected[0]:selected[-1] + 1])


candidate_list = []
sentiment_cls = ['neutral','negative','positive']
idx = 0  # position in `relevance_score`, which only has entries for tweets with tokens
for text, sentiment in zip(testset['text'], testset['sentiment']):
    has_tokens = len(text.split()) > 0
    cls = sentiment_cls.index(sentiment)

    if cls == 0 or not has_tokens:
        # Neutral tweets keep the full text; token-less tweets have nothing to select.
        candidate = text
    else:
        # Relevance of a token = sum of the attributions over its embedding dimensions.
        d_score = relevance_score[idx].data.sum(axis=1)
        candidate = _select_span(text, d_score)

    # Exactly one entry per test row, so the list lines up with `testset`.
    candidate_list.append(candidate)
    if has_tokens:
        idx += 1

In [None]:
import shutil

# Remove the checkpoint directory without going through a shell;
# ignore_errors=True keeps the "don't fail if it is already gone" behaviour.
shutil.rmtree(ckpt_path, ignore_errors=True)

assert len(candidate_list) == len(testset), "expected one selected_text per test row"
testset['selected_text'] = candidate_list
testset[['textID','selected_text']].to_csv('submission.csv',index=False)
print("Submission successful")