In [None]:
!pip install bertviz

In [None]:
from bertviz import head_view
from transformers import XLNetTokenizer, XLNetModel

**XL-NET classifier**

In [None]:
from transformers import XLNetTokenizer, XLNetModel

# Load pretrained XLNet; output_attentions=True asks the forward pass to also return attention weights.
model_version = 'xlnet-base-cased'
model = XLNetModel.from_pretrained(model_version, output_attentions=True)
tokenizer = XLNetTokenizer.from_pretrained(model_version)

text = "The quick brown fox jumps over the lazy dogs."
inputs = tokenizer.encode_plus(text, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']
# The attention weights are taken as the last element of the model output.
attention = model(input_ids)[-1]
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
# Print a compact summary instead of dumping every attention tensor in full.
print("attention tensors:", len(attention), "| shape of first:", tuple(attention[0].shape))
head_view(attention, tokens)

In [None]:
# Sanity check: sum of a single attention row, attention[0][0][-1][-1].
# Assuming the Hugging Face layout (one tensor per layer, each indexed batch, head, query, key),
# this selects first layer / batch 0 / last head / last query token -- not "last layer, head 0".
# NOTE(review): XLNet is expected to place its CLS token last in the sequence -- confirm.
import torch
print(torch.sum(attention[0][0][-1][-1]))

tensor(1.0000, grad_fn=<SumBackward0>)


**Calculating attention for the sentence**

In [None]:
# Element-wise maximum, taken over every attention tensor returned by the model, of the row
# [0, -1, -1] (batch 0, last head, last query token under the usual
# (batch, head, query, key) layout -- TODO confirm). Each row and its sum are printed on the way.
attention_max = attention[0][0, -1, -1]
for layer_attention in attention:
    last_token_row = layer_attention[0, -1, -1]
    print(last_token_row)
    print(torch.sum(last_token_row))
    # Keep, per position, the larger of the running maximum and this row.
    attention_max = torch.max(last_token_row, attention_max)
# L2 norm of the accumulated maxima.
print(torch.norm(attention_max))

tensor([0.0738, 0.0747, 0.0648, 0.0884, 0.0515, 0.0537, 0.1108, 0.2174, 0.0812,
        0.0711, 0.0300, 0.0304, 0.0279, 0.0243], grad_fn=<SelectBackward0>)
tensor(1.0000, grad_fn=<SumBackward0>)
tensor([0.0161, 0.0084, 0.0039, 0.0142, 0.0025, 0.0145, 0.0307, 0.0252, 0.0585,
        0.0152, 0.0581, 0.2815, 0.3163, 0.1549], grad_fn=<SelectBackward0>)
tensor(1.0000, grad_fn=<SumBackward0>)
tensor([6.8712e-02, 5.3705e-02, 2.9311e-03, 5.1748e-01, 8.5324e-04, 2.1432e-02,
        3.2033e-02, 1.1316e-02, 2.0100e-01, 3.3107e-03, 3.6151e-04, 5.0048e-02,
        4.6718e-03, 3.2146e-02], grad_fn=<SelectBackward0>)
tensor(1., grad_fn=<SumBackward0>)
tensor([0.0245, 0.0052, 0.0373, 0.0512, 0.0131, 0.0185, 0.0101, 0.0105, 0.0578,
        0.0163, 0.0063, 0.5281, 0.1442, 0.0770], grad_fn=<SelectBackward0>)
tensor(1., grad_fn=<SumBackward0>)
tensor([0.1672, 0.0268, 0.0764, 0.0581, 0.0390, 0.0157, 0.0473, 0.0188, 0.0762,
        0.0104, 0.0295, 0.3122, 0.0839, 0.0387], grad_fn=<SelectBackward0>)
tensor(1

In [None]:
# Re-normalise the element-wise maxima into a distribution that sums to 1.
# dim is passed explicitly: calling softmax without it is deprecated and emits a warning.
normalized_attention_max = torch.nn.functional.softmax(attention_max, dim=-1)
print(normalized_attention_max)
# detach() instead of torch.tensor(<tensor>): same detached value, without the copy-construct warning.
print(torch.sum(normalized_attention_max.detach()))

tensor([0.1165, 0.0557, 0.0578, 0.0867, 0.0544, 0.0546, 0.0578, 0.0643, 0.0632,
        0.0763, 0.0548, 0.0877, 0.0709, 0.0993], grad_fn=<SoftmaxBackward0>)
tensor(1.)


  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Number of attention tensors returned by the model (presumably one per layer -- confirm).
len(attention)

12

**Calculating Reverse Attention**

In [None]:
# Reverse attention: 1 - p for every entry p of the normalised attention row.
import torch
# detach() yields a tensor without autograd history, as torch.tensor(<tensor>) did,
# but without triggering the copy-construct warning.
reverse_attention = 1 - normalized_attention_max.detach()
length = normalized_attention_max.shape[0]
print(length)
# Normalising: the entries of (1 - p) sum to length - 1 when p sums to 1,
# so dividing by (length - 1) makes the reverse attention sum to 1 again.
normalized_reverse_attention = reverse_attention / (length - 1)
print(torch.sum(normalized_reverse_attention))

14
tensor(1.)


  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# out = model(**inputs)
# print(out)

## *Getting the initial input embeddings*

In [None]:
# Look up the model's input-embedding vectors for the token ids.
# inputs['input_ids'] is already a tensor (the tokenizer was called with return_tensors='pt'),
# so it is passed as-is; re-wrapping it in torch.tensor() only produced a copy-construct warning.
input_embeddings = model.get_input_embeddings()(inputs['input_ids'])
print("Initial shape of input embeddings: ", input_embeddings.shape)
print("Initial shape of normalized_reverse_attention",normalized_reverse_attention.shape)


Initial shape of input embeddings:  torch.Size([1, 14, 768])
Initial shape of normalized_reverse_attention torch.Size([14])


  """Entry point for launching an IPython kernel.


In [None]:
# Drop the leading size-1 axis from the embeddings and turn the reverse-attention
# weights into a single column, so the two can be multiplied row by row.
input_embeddings = torch.squeeze(input_embeddings, 0)
normalized_reverse_attention = torch.reshape(normalized_reverse_attention, (-1, 1))
print("Initial shape of input embeddings: ", input_embeddings.shape)
print("Initial shape of normalized_reverse_attention",normalized_reverse_attention.shape)

Initial shape of input embeddings:  torch.Size([14, 768])
Initial shape of normalized_reverse_attention torch.Size([14, 1])


In [None]:
# Scale every token's embedding row by that token's reverse-attention weight:
# the single-column weight tensor broadcasts across the embedding matrix.
embeddings_updated = normalized_reverse_attention * input_embeddings
print(embeddings_updated)

tensor([[-2.2942e-03, -2.2092e-03, -1.9920e-03,  ..., -2.3187e-03,
          2.0584e-03, -9.5169e-04],
        [ 5.8976e-03, -7.2265e-05, -2.9719e-03,  ...,  1.6594e-03,
          4.7470e-03,  4.4126e-03],
        [ 1.5321e-03, -1.3371e-04,  5.8807e-03,  ...,  2.0599e-03,
          2.6373e-03,  2.1585e-03],
        ...,
        [-1.6712e-03, -4.2549e-04, -1.1416e-03,  ..., -1.1254e-03,
          3.0278e-03, -2.1672e-03],
        [ 5.6310e-03, -4.1641e-03, -6.4672e-03,  ...,  3.5257e-03,
          4.5281e-03, -3.7160e-03],
        [ 1.2563e-03, -1.0350e-04, -1.0353e-02,  ...,  8.0741e-05,
         -6.4669e-05,  1.2999e-03]], grad_fn=<MulBackward0>)


In [None]:
# Show the shape of the re-weighted embedding matrix.
print(embeddings_updated.shape)

torch.Size([14, 768])


**Adding GRU Layer**

In [None]:
# Run a single-layer bidirectional GRU over the re-weighted embeddings.
with torch.no_grad():
    # Created inside no_grad, i.e. without gradient tracking; adds a leading axis of size 1.
    input_gru = torch.unsqueeze(embeddings_updated, 0)
print("input shape: ",input_gru.shape)
# Random initial hidden state; the leading 2 is one state per direction (bidirectional).
hidden_gru = torch.rand(2, 1, 250)
print("hidden state shape: ",hidden_gru.shape)
gru = torch.nn.GRU(
    input_size=768,
    hidden_size=250,
    num_layers=1,
    batch_first=True,
    bidirectional=True,
    dropout=0,
)
gru_output, h_n = gru(input_gru, hidden_gru)
print("output shape: ", gru_output.shape)
print("final hidden state shape: ", h_n.shape)

input shape:  torch.Size([1, 14, 768])
hidden state shape:  torch.Size([2, 1, 250])
output shape:  torch.Size([1, 14, 500])
final hidden state shape:  torch.Size([2, 1, 250])


In [None]:
# Collapse the final hidden states of both directions into one row vector.
# NOTE(review): flattening everything into a single row presumes batch size 1 -- confirm.
last_h = h_n.reshape(1, -1)
print(last_h.shape)

torch.Size([1, 500])


In [None]:
import torch
import numbers
from torch.nn.parameter import Parameter
from torch.nn import LayerNorm
from torch import Tensor, Size
from typing import Union, List, Tuple
from torch.nn import  init

class CLN (torch.nn.LayerNorm):
    """Layer normalisation with two independent sets of affine parameters.

    Each item passed to ``forward`` carries a flag; flag ``0`` is normalised
    with (``weight1``, ``bias1``), any other flag with (``weight2``, ``bias2``).

    Args:
        normalized_shape: int or sequence of ints giving the trailing
            dimensions that are normalised.
        eps: value added to the variance for numerical stability.
        elementwise_affine: when True, both parameter sets are learnable
            (weights start at one, biases at zero); when False all of them
            are registered as ``None`` and no affine transform is applied.
        device, dtype: forwarded to the parameter allocation.
    """
    __constants__ = ['normalized_shape', 'eps', 'elementwise_affine']
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool
    _shape_t = Union[int, List[int], Size]

    def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        # Start the lookup after LayerNorm in the MRO: LayerNorm.__init__ would
        # allocate its own single weight/bias pair, which this class replaces.
        super(LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            # mypy error: incompatible types in assignment
            normalized_shape = (normalized_shape,)  # type: ignore[assignment]
        self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight1 = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            self.bias1 = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            self.weight2 = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
            self.bias2 = Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
        else:
            # Register every name forward() reads, so the non-affine mode does not
            # fail with AttributeError; 'weight'/'bias' are kept as before.
            for name in ('weight', 'bias', 'weight1', 'bias1', 'weight2', 'bias2'):
                self.register_parameter(name, None)

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Reset both affine parameter sets to the identity (weight 1, bias 0)."""
        if self.elementwise_affine:
            init.ones_(self.weight1)
            init.zeros_(self.bias1)
            init.ones_(self.weight2)
            init.zeros_(self.bias2)

    def forward(self, input: List[Tuple[Tensor, int]]) -> List[Tensor]:
        """Normalise every item with the parameter set chosen by its flag.

        Args:
            input: iterable of items where ``item[0]`` is the tensor to
                normalise and ``item[1]`` is the flag compared against 0.

        Returns:
            A list with one normalised tensor per item, in input order.
        """
        outputs = []
        for input_i in input:
            if input_i[1] == 0:
                weight, bias = self.weight1, self.bias1
            else:
                weight, bias = self.weight2, self.bias2
            outputs.append(torch.nn.functional.layer_norm(
                input_i[0], self.normalized_shape, weight, bias, self.eps))
        return outputs

In [None]:
# Linear projection of the 500-wide concatenated GRU state down to 200 features.
linear_layer = torch.nn.Linear(in_features=500, out_features=200)
last_h_squeezed = linear_layer(last_h)

In [None]:
# Display the shape of the projected hidden state.
last_h_squeezed.shape

torch.Size([1, 200])

**Applying Layernorm**

In [None]:
from torch.nn import LayerNorm

# Normalise over the feature axis only. The earlier normalized_shape=[1, 200] also covered
# the leading axis, tying the layer and its affine parameters to a leading size of exactly 1;
# for the (1, 200) input used here the normalised output is the same.
ln_neutral = LayerNorm(normalized_shape=200)
# NOTE(review): no tensor with trailing shape (2, 1, 500) is produced in the visible cells and
# ln_right is only referenced in the commented-out code below -- confirm its intended input.
ln_right = LayerNorm(normalized_shape=[2, 1, 500])

#check class: TODO decide how to store which data the batch belongs to 
x_cln = ln_neutral(last_h_squeezed)
# if X[0] ==0:
#     X = ln_neutral(X[1:])
# else:
#     X = ln_right(X[1:])

In [None]:
# Display the shape of the layer-normalised projection.
x_cln.shape

torch.Size([1, 200])

In [None]:
# Display the shape of the flattened GRU hidden state before concatenation.
last_h.shape

torch.Size([1, 500])

In [None]:
# Build the state handed to the next GRU: the flattened hidden state concatenated
# with its layer-normalised projection along the last axis, then given a new leading axis.
intial_hidden_state = torch.unsqueeze(torch.cat([last_h, x_cln], dim=-1), 0)
intial_hidden_state.shape

torch.Size([1, 1, 700])

In [None]:
# Single-layer, unidirectional GRU with hidden size 700.
# NOTE: this rebinds the name `gru`, replacing the bidirectional GRU created earlier.
gru = torch.nn.GRU(
    input_size=768,
    hidden_size=700,
    num_layers=1,
    batch_first=True,
    bidirectional=False,
    dropout=0,
)


**Adding the decoder GRU layer**

In [None]:
# Run the GRU currently bound to `gru` over the same input sequence,
# starting from the concatenated initial hidden state.
gru_output_decoder, h_n_decoder = gru(input_gru, hx=intial_hidden_state)

In [None]:
# Display the shape of the decoder GRU's final hidden state.
h_n_decoder.shape

torch.Size([1, 1, 700])

In [None]:
# Display the shape of the decoder GRU's per-step outputs.
gru_output_decoder.shape

torch.Size([1, 14, 700])

In [None]:
# Project every decoder output step from 700 to 768 features.
# NOTE(review): 768 equals the GRU input width used above rather than a vocabulary size,
# despite the name output_to_vocab -- confirm the intended output dimension.
linear_layer_2 = torch.nn.Linear(in_features=700, out_features=768)
output_to_vocab = linear_layer_2(gru_output_decoder)

In [None]:
# Softmax along axis 2 of the projected decoder outputs.
gru_output_softmax = output_to_vocab.softmax(dim=2)
gru_output_softmax.shape

torch.Size([1, 14, 768])