# Base

This contains basic layers used for various functions.

In [29]:
%pip install --upgrade torch

import math
import unittest
import warnings
import random
import numpy as np
from collections.abc import Iterable

import torch
from torch import nn
from torch.nn import functional as F

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\chris\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


# Core

Core is used everywhere.


### View

A better form of torch's view.


### DimensionalTransform

Many, many functions of pytorch expect the dimension of interest to be on the last dimension, or at least located relative to it. I do not expect that. To accomodate nonstandard data loadouts, this class contains some functions to transform a tensor to have a particular dimension last, let you transparently do things to it, including increasing the number of dimensions, and transform it back

In [31]:
def craft_transforms(self, dim):
    """
    A method to calculate, then return, fuss free transforms and inverse transforms
    to allow editing of a particular dimension.

    """


    def roll_dims(tensor, direction):

      total_dims = len(tensor.shape)

      roll = dim %(- total_dims) #distance left to reach point from the end. Indexing starts at -1. Goes down
      roll = - roll #Indexing now starts at 1, goes up
      roll = (roll - 1) % total_dims #indexing now starts at 0. Goes up by roll needed.
      roll = roll*direction

      permute = torch.arange(len(tensor.shape))
      permute = permute.roll(roll).numpy()
      permute = [*permute]
      return tensor.permute(permute)

    #Create transform, inverse transform

    transform = lambda x: roll_dims(x, 1)
    inv_transform = lambda x: roll_dims(x, -1)

    return transform, inv_transform

def craft_mass_exchange(self, target_dims):
  """
  A method to make transforms to swap any dimensions, in any particular order,
  to the end of a tensor, or swap it back
  """
  if isinstance(target_dims, int):
    target_dims = [target_dims]

  #Calculate permuter. Do this by identifying target ending region,
  #target source region, then using a buffer to transfer entities
  #between these regions.

  swap_a = torch.arange(-len(target_dims), 0)
  swap_b = target_dims
  #Fortunately, this also builds the inverse permuter too.
  def exchange(tensor):
    total_dims = len(tensor.shape)

    source = torch.arange(total_dims)
    endings = [*torch.arange(-len(target_dims), 0).numpy()]

    permute = source.clone()
    permute[endings] = source[target_dims]
    permute[target_dims] = source[endings]
    permute = [*permute.numpy()]

    #Create transforms. Return

    return tensor.permute(permute)

  return exchange, exchange

def DimHandler(cls):
  """
  A decorator to apply this logic to a particular class
  """
  cls.craft_transforms = craft_transforms
  cls.craft_mass_exchange = craft_mass_exchange
  return cls



In [32]:
import torch

test_range = torch.zeros([1, 2, 3, 4, 5])
print(test_range.shape)
transform, inverse = craft_mass_exchange(None, [-1, -2])
intermediate = transform(test_range)
print(intermediate.shape)
final = inverse(intermediate)
print(final.shape)

torch.Size([1, 2, 3, 4, 5])
torch.Size([1, 2, 3, 5, 4])
torch.Size([1, 2, 3, 4, 5])


### ModularPadClip

A layer to perform ModularPadClip. This ensures an indicated dimension, by default -1, is divisible by modulo.

<img src="https://docs.google.com/drawings/d/e/2PACX-1vTmp0Iag-1dKXntJ6LUftoaagASaEAt_thOkFOVr_7XxKHOjNhAeA6bSjXcupHYsHdXYRpD8uLxREau/pub?w=960&amp;h=720">


In [33]:
@DimHandler
class ModularPadClip(nn.Module):
  """
  A class containing methods needed to do
  block padclip. This enlarges, or clips, a dimension to cleanly
  divide under modular divsion by modulo

  If not instantized, the static method
  .functional can be used to utilized the logic directly.

  Else, can be instantized with values to apply that to subsequent calls.


  Parameters are:

  :param: moduol: The size of the modulo to enforce
  :param variety: The kind of ModularPadClip to do.
    Allowed are "constant, reflect, replicate, circular, clip"
  :param value: Any auxilary values for padding purposes.


  """
  _pad_varieties = ("constant", "circular")
  _clip_varieties = ("clip")

  @classmethod
  def functional(cls, x, modulo, variety, value, dim=-1):
    """
    A function to perform block PadClip.

    This will reshape the last dimension

  :param x: The tensor to operate on
  :param modulo: The size of the modulo to enforce
  :param variety: The kind of ModularPadClip to do.
    Allowed are "constant, reflect, replicate, circular, clip"
  :param value: Any auxilary values for padding purposes.
  :param dim: The dimension to PadClip
    """

    #Create Transforms. Apply
    transform, inverse = cls.craft_transforms(dim, len(x.shape))

    #Begin Logic
    x = transform(x)
    item_length = x.shape[-1]

    ##If no reshape is needed, do not perform
    if item_length % modulo == 0:
      output = x

    ##Handle pad, clip cases
    elif variety in cls._pad_varieties:

      ###Figure out how much we need to pad the end by.

      required_blocks =  (modulo+item_length)//modulo    #### Perform integer ceil
      required_length = modulo*required_blocks #### Find required length
      padding_length = required_length-item_length #### Find padding needs

      ### Apply padding
      if variety == "constant":
        output = F.pad(x, (0, padding_length), variety, value)
      if variety == "circular":
        roll_slice = x[..., :padding_length] #Slice out first padding entries and concat to end.
        output = torch.concat([x, roll_slice], dim=-1)
    elif variety in cls._clip_varieties:
      required_blocks = item_length//modulo #### integer floor
      required_length = modulo*required_blocks #### How long the slice will be

      ### Apply slice and return
      output = x[..., :required_length]
    else:
      raise ValueError("Expected variety to be one of %s)" % (*cls._pad_varieties, *cls._clip_varieties))

    #Finish with restore
    return inverse(output)
  def __init__(self, modulo, variety="constant", value=0, dim=-1):
    """"

    :param modulo: The size of the modulo to enforce
    :param variety: The kind of ModularPadClip to do.
      Allowed are "constant, reflect, replicate, circular, clip"
    :param value: Any auxilary values for padding purposes.
    :param dim: The dimension to PadClip
    """

    super().__init__()

    #store

    self._modulo = modulo
    self._variety = variety
    self._value = value
    self._dim = dim
  def forward(self, x):
    return self.functional(x, self._modulo, self._variety, self._value, self._dim)

#Unitcode

##Define test function
def TestFunc(tensor, modulo, variety, value, dim):
  #Introduce test
  print("Testing ModularPadClip. The input parameters are:")
  variable_message = "tensor: %s, modulo: %s, variety: %s, value: %s, dim: %s" % (tensor.shape, modulo, variety, value, dim)

  print(variable_message)

  #Run test
  test_layer = ModularPadClip(modulo, variety, value, dim)
  output = test_layer(tensor)

  #Say results
  print("The output shape was:")
  print(output.shape)

#Perform some tests
test_vec = torch.zeros([30, 67, 38, 91])
TestFunc(test_vec, 7, "constant", 0, -1)
TestFunc(test_vec, 7, "constant", 0, -2)
TestFunc(test_vec, 7, "constant", 0, -2)
TestFunc(test_vec, 5, "circular", 0, -3)
TestFunc(test_vec, 7, "clip", 0, -2)


Testing ModularPadClip. The input parameters are:
tensor: torch.Size([30, 67, 38, 91]), modulo: 7, variety: constant, value: 0, dim: -1
The output shape was:
torch.Size([35, 67, 38, 91])
Testing ModularPadClip. The input parameters are:
tensor: torch.Size([30, 67, 38, 91]), modulo: 7, variety: constant, value: 0, dim: -2
The output shape was:
torch.Size([35, 67, 38, 91])
Testing ModularPadClip. The input parameters are:
tensor: torch.Size([30, 67, 38, 91]), modulo: 7, variety: constant, value: 0, dim: -2
The output shape was:
torch.Size([35, 67, 38, 91])
Testing ModularPadClip. The input parameters are:
tensor: torch.Size([30, 67, 38, 91]), modulo: 5, variety: circular, value: 0, dim: -3
The output shape was:
torch.Size([30, 67, 38, 91])
Testing ModularPadClip. The input parameters are:
tensor: torch.Size([30, 67, 38, 91]), modulo: 7, variety: clip, value: 0, dim: -2
The output shape was:
torch.Size([28, 67, 38, 91])


### RelPosEncoding

Absolute encodings are not used here. Rather, everything is relative, and as a result a modified version of PosEncoding is needed.

The standard PosEncoding using the sinosoid is utilized, with a few new twists. It is possible for the zero point, or start point, to be defined later in the sequence - allowing the clear definition of the concept of "prior" sections. The prior concept can be utilized to embed local block information. Or it can be made to operate in standard format instead.


In [34]:

class RelPosEncoding(nn.Module):
  """
  A class to perform RelPosEncoding along two dimensions.

  Due to frequency being a required quantity for buffering to work,
  this does not have a functional variety. It will, however, resize
  its buffer if it is too small, and throw a warning when doing it.

  It is, however, required that channel width stay consistent. One additional
  complication which should also be noted is that since positional encodings
  will be added in frequently, the combination strength is a learnable parameter.
  """

  @property
  def length(self):
    return self._offset + self._standard
  @staticmethod
  def calculate_buffer(offset, standard_length, channel_width, period, dilation=1, suppress_errors=False):
    """
    Perform a bog-standard sinosoidal posencoding calculation, with the notable
    distinction of permitting negative item values

    """
    if suppress_errors is False:
      warnings.warn("RelPosEncoding layer is recalculating buffer. Consider choosing better defaults if this happens often")

    #Create item indices, create channel indices
    pos_enc = torch.arange(-offset, standard_length)
    pos_enc = pos_enc*dilation
    channel_indices = torch.arange(channel_width)

    #Unsqueeze for cross multiplication
    pos_enc = pos_enc.unsqueeze(-1)
    channel_indices = channel_indices.unsqueeze(-2)

    #Create eval input table, create buffer
    inputs = 2*math.pi*pos_enc*(1/period)**(channel_indices/channel_width)
    encodings = torch.zeros([offset + standard_length, channel_width])

    #Calculate buffer. Note that I want the offset=0 entry to be a sin instance. I jump through
    #a few extra hoops to ensure this.

    if offset % 2 == 0:
      encodings[::2, :] = torch.sin(inputs[::2, :])
      encodings[1::2, :] = torch.cos(inputs[1::2, :])
    else:
      encodings[1::2, :] = torch.sin(inputs[1::2, :])
      encodings[::2, :] = torch.cos(inputs[::2, :])

    #Finally, return

    return encodings


  def _recalculate_needed(self, length, offset):
    if offset > self._offset:
      return True
    if length > self.length:
      return True



  def _read_buffer(self, length, offset):
    """
    A method to read out of the buffer. If needed, recalculates the buffer

    """

    #Figure out if I need to enlarge my buffer. If so, double dimensions which are too small.

    if self._recalculate_needed(length, offset):
      print("recalculate")
      if offset > self._offset:
        self._offset = 2*self._offset
      if length > self.length:
        self._standard = 2*length
      self.encodings = self.calculate_buffer(self._offset, self._standard, self._width, self._period)


    #Access buffer. Slice out needed segment. Return it
    buffer_shift = self._offset - offset
    encoding_slice = self.encodings[buffer_shift:(length + buffer_shift)]
    return encoding_slice

  def __init__(self, offset=0, pos_length=1000, channel_width=100,  period=10000, dilation=1):
    #Warm up torch
    super().__init__()

    #Store variables.
    self._offset = offset
    self._standard = pos_length
    self._width = channel_width
    self._period = period

    #Store parameters
    self._alpha = nn.Parameter(torch.zeros(1))

    #Create the buffer
    buffer = self.calculate_buffer(offset, pos_length, channel_width, period, suppress_errors=True)
    self.register_buffer('encodings', buffer)

  def forward(self, x, offset):
    """
    The forward pass. Retrieves needed information from buffer,
    and then mixes it where requested

    """

    x = x + torch.sigmoid(self._alpha)*self._read_buffer(x.shape[-2], offset)
    return x

#Unit testing

def TestFunc(tensor, init_offset, eval_offset, init_pos_length, channel_width, period):
  print("Performing unit test for RelPosEncoding")


  items = (tensor.shape, init_offset, eval_offset, init_pos_length, channel_width, period)
  msg = "Parameters are: tensorshape: %s, init_offset: %s, eval_offset: %s, init_pos_length: %s, channel_width: %s, period: %s" % items
  print(msg)
  test_layer = RelPosEncoding(init_offset, init_pos_length, channel_width, period)
  outcome = test_layer(tensor, eval_offset)
  print("Output shape is:")
  print(outcome.shape)
  return outcome

#Run

small_vector = torch.zeros([5, 4])
test_vector = torch.zeros([10, 100, 150, 5])

#Test offset behavior by manual view

print("Manual inspection: \n")
print(TestFunc(small_vector, 0, 0, 100, 4, 100))
print(TestFunc(small_vector, 1, 1, 100, 4, 100))
print(TestFunc(small_vector, 2, 1, 100, 4, 100))

#Test handling of buffer resize

print("Buffer resize: \n")
TestFunc(test_vector, 5, 10, 200, 5, 10000)
TestFunc(test_vector, 5, 10, 50, 5, 10000)
'done'


Manual inspection: 

Performing unit test for RelPosEncoding
Parameters are: tensorshape: torch.Size([5, 4]), init_offset: 0, eval_offset: 0, init_pos_length: 100, channel_width: 4, period: 100
Output shape is:
torch.Size([5, 4])
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 5.0000e-01, -2.0211e-01,  4.0451e-01,  4.9016e-01],
        [ 1.7485e-07, -3.6972e-01,  4.7553e-01,  1.9350e-01],
        [ 5.0000e-01,  4.7423e-01, -1.5451e-01,  4.1377e-01],
        [ 3.4969e-07,  4.9781e-01,  2.9389e-01,  3.5685e-01]],
       grad_fn=<AddBackward0>)
Performing unit test for RelPosEncoding
Parameters are: tensorshape: torch.Size([5, 4]), init_offset: 1, eval_offset: 1, init_pos_length: 100, channel_width: 4, period: 100
Output shape is:
torch.Size([5, 4])
tensor([[ 5.0000e-01, -2.0211e-01,  4.0451e-01,  4.9016e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 5.0000e-01, -2.0211e-01,  4.0451e-01,  4.9016e-01],
        [ 1.7485e-07, -3.6972e-01



'done'

### LocalView

LocalView is an extremely important class used to efficiently construct a tensor representing local information between tenser entries, even across differing parameters. It places this information in up to two new dimensions, centered on where the original dimension was.

**Output**

The class expands a dimension, in place, to include three extra dimensions, collocolally referred to as the head, primary, and local dimension. Head is in front of the initial dimension; local behind

For example, a tensor of shape (..., view_dim, ...) would
have an output shape analogous to (..., head_dim, primary_dim, local_dim, ...)

**Parameters: Prior, Post, dim**

The parameters prior and post indicate to the class how many neighbors to include in the slicing behavior. Meanwhile, dim is an integer that, straightforwardly, chooses which dimension to perform a local view on. Knowing only this, one can already extract a reasonably accurate local view of a problem.


Using just these methods, one can easily extract a local view of a particular dimension, while changing no defaults. For instance:

(..., view_dim, ...) -> (..., 1, view_dim, prior + 1 + post)

The last dimension ends up giving first all prior items in order, then the current dimension, then the post dimensions. Notably, any missing sections, as would be seen near the start and end, are padded with zeros. The heading dimensions, since only one dilation is used, is one.

**Parameters: Stride, Dilations**

Stride and Dilation makes things significantly more complicated.

Stride controls the rate of traversal along the middle output dimension. That is, how fast does the center viewpiece move along. An insufficient number of viewing items will result in the edges simply being clipped, so keep this in mind.

Dilations, on the other hand, control two items. Number one, it controls the dilation parameter of the local dimension. . This is how many units away from the center focus we jump when performing each lookup. Here, any missing information IS handled by padding.


Number two, it controls how many heads are produced. Dilations can be a list of ints, or an int. In the case of a list of ints, each dilation parameter will be evaluated independently, and the results stacked and returned along the head dimension.

**Performance**

A lot of tricks are used to keep the performance high. First, an underlying buffer is constructed which contains all the padding, and the parts declared in the right order. Then, for each dilation a combination of offset, shape, and striding parameters is taken to get the appropriate view. Finally, all the views are stacked then returned.

I expect this to be a significant part of the model, and thus it must be fast.



**Buffer**

The first thing LocalView

**Buffer**


In [35]:
class LocalView(nn.Module):
  @staticmethod
  def split_three_ways(listobject,index):
    """

    Extracts all elements up to an entry, the entry itself, and all elements
    after an entry into three separate tensors

    """
    #Calculate implicit, handle negative indexing
    listobject = [*listobject[:]]
    total_dimensions = len(listobject)
    index = index % total_dimensions
    #Calculate split quantities

    prior_sliceout = index
    post_sliceout = prior_sliceout + 1

    prior = listobject[:prior_sliceout]
    center = listobject[prior_sliceout:post_sliceout]
    post = listobject[post_sliceout:]

    return prior, center, post

  @staticmethod
  def merge_three_ways(lista, listb, listc):
    """
    The reverse of s  plit three ways
    """
    return lista + listb + listc

  @staticmethod
  def merge_three_ways(lista, listb, listc):
    """
    The reverse of s  plit three ways
    """
    return lista + listb + listc
  @classmethod
  def functional(cls, tensor, prior, post, stride, dilation, dim, pad=True):
    """
    The functional definition of the LocalView Process.

    Expects the last dimension to be the one we wish to view

    A stride, then pad, then another stride is performed internally.

    The extra local channel is defined as the second to last entry in the
    tensor dimensions
    """
    #Turn dim into a list if it is not one already

    dim = dim % len(tensor.shape)
    
    #Calculate the undershoot and overshoot. This will be used to either pad to the appropriate length,
    #or change the output size.
    
    dilation_undershoot = prior*dilation #The number of prior segments, times the factor we are extending each segment
    dilation_overshoot = post*dilation #The number of post segments, times the factor by which we are extending each segment

    #Create the underlying data buffer. Do this by calculating padding quantities, if padding will be used, or else
    #just rearranging where the buffer starts
    
    if pad:
        pad_op = (dilation_undershoot, dilation_overshoot)
    else:
        pad_op = (0, 0)

    buffer_tensor = tensor.swapdims(-1, dim)
    buffer_tensor = F.pad(buffer_tensor, pad_op)
    buffer_tensor = buffer_tensor.swapdims(dim, -1)

    #Preprocess as much as possible. The individual dimension lengths and stride shapes
    #will change, but these will not. Also, carefully handle the cases with padding, or no padding


    shape_priors, shape_peeloff, shape_post = cls.split_three_ways(buffer_tensor.shape, index=dim)
    stride_priors, stride_peeloff, stride_post = cls.split_three_ways(buffer_tensor.stride(), index=dim)

    if pad:
        shape_peeloff = tensor.shape[dim]
    else:
        shape_peeloff = tensor.shape[dim] - dilation_undershoot - dilation_overshoot
    
    stride_peeloff = stride_peeloff[0]

    ##

    dim_shape_length = shape_peeloff//stride #Integer Floor ensures stride jump stops early enough
    local_shape_length = prior + 1 + post #Should be self, plus priors and posts. Mind padding is needed!

       

    #calculate the shape and stride parameters. Then create and return the tensor

    dim_stride_size = stride_peeloff*stride #Increase advancement by stride rate
    local_stride_size = stride_peeloff*dilation #Increase dilation buffer by dilation rate.

    update_shape = [dim_shape_length, local_shape_length]
    update_stride = [dim_stride_size, local_stride_size]

    reshape = cls.merge_three_ways(shape_priors, update_shape, shape_post)
    restride = cls.merge_three_ways(stride_priors, update_stride, stride_post)

    output_tensor = buffer_tensor.as_strided(reshape, restride)



    return output_tensor

  def __init__(self, prior, post, stride=1, dilation=1, dimension=-1, pad=True):
    """
    The initializer for layer access. Unlike the
    functional variety, this will happily view and restore channels
    other than the last one.
    """
    #Spin up torch

    super().__init__()

    #Store parameters

    self._prior = prior
    self._post = post
    self._stride = stride
    self._dilations = dilation
    self._dim = dimension
    self._pad = pad
  def forward(self, x):

    return self.functional(x, self._prior, self._post, self._stride, self._dilations, self._dim, self._pad)

#Create unit testing


def TestFunc(tensor, prior, post, stride, dilation, dimension, pad=True):
  print("Performing unit test for LocalView")


  items = (tensor.shape, prior, post, stride, dilation, dimension)
  msg = "Inputs are: tensor: %s, prior: %s, post: %s, stride: %s, dilation: %s, dimension: %s" % items
  print(msg)
  test_layer = LocalView(prior, post, stride,  dilation, dimension, pad)
  outcome = test_layer(tensor)
  print("Output shape is:")
  print(outcome.shape)
  return outcome

simple_test = torch.arange(20)
simple_test = simple_test.view([10,2,1])

print("Performing manual test. Verify sequence is dilating")
print("input")
print(simple_test.squeeze())
print(simple_test.shape)
test_layer = LocalView(1, 1, 1, 1, -3, False)
test_output = test_layer(simple_test)
test_output = test_output.squeeze()
print("output, (squeezed)")
print(test_output)
print(test_output[0])
print(test_output[1])
print(test_output.shape)


complex_test = torch.zeros([30, 24, 4, 6])

#Test base complex cases
TestFunc(complex_test, 1, 1, 1, 1, -1)
TestFunc(complex_test, 1, 2, 1, 1, -1)
TestFunc(complex_test, 3, 2, 1, 1, -1)
TestFunc(complex_test, 3, 2, 4, 3, -1)
TestFunc(complex_test, 3, 2, 4, 3, -3)
cap = TestFunc(complex_test, 3, 2, 1, 1, -1, False)

"done"

Performing manual test. Verify sequence is dilating
input
tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17],
        [18, 19]])
torch.Size([10, 2, 1])
output, (squeezed)
tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 2,  3],
         [ 4,  5],
         [ 6,  7]],

        [[ 4,  5],
         [ 6,  7],
         [ 8,  9]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[ 8,  9],
         [10, 11],
         [12, 13]],

        [[10, 11],
         [12, 13],
         [14, 15]],

        [[12, 13],
         [14, 15],
         [16, 17]],

        [[14, 15],
         [16, 17],
         [18, 19]]])
tensor([[0, 1],
        [2, 3],
        [4, 5]])
tensor([[2, 3],
        [4, 5],
        [6, 7]])
torch.Size([8, 3, 2])
Performing unit test for LocalView
Inputs are: tensor: torch.Size([30, 24, 4, 6]), prior: 1, post: 1, stride: 1, dilation: 1, d

'done'

### Pad

One relatively annoying thing about torch is that the padding behavior of torch is difficult to use on off target dimensions.

This is just a little function to make life a little easier, allowing for the padding of a target dimension

In [36]:
def Pad(tensor, dimension, paddings):
  tensor = tensor.swapdims(-1, dimension)
  tensor = F.pad(tensor, paddings)
  tensor = tensor.swapdims(-1, dimension)
  return tensor

## Select_Mask

## Linear Reshape

The LinearReshape layer combines a View with a Linear layer in order to permit the seamless reshaping of layers. It has particular utility performing actions such as generating or eliminating heads. 

In [37]:
class LinearReshape(nn.Module):
  def __init__(self, input_shape, output_shape):
    super().__init__()
    
    input_size, output_size = np.prod(input_shape), np.prod(output_shape)
    self._preprocessing = View(input_shape, input_size)
    self._linear = nn.Linear(input_size, output_size)
    self._postprocessing = View(output_size, output_shape)
    
  def forward(self, x):
    x = self._preprocessing(x)
    x = self._linear(x)
    x = self._postprocessing(x)
    return x

#Unit testing



def TestLinearReshape(tensor, input_shape, output_shape):
    print("Test Linear Reshape")
    items = (tensor.shape, input_shape, output_shape)
    print("Inputs are: shape: %s, input_shape: %s, output_shape: %s" % items) 
    layer = LinearReshape(input_shape, output_shape)
    output = layer(tensor)
    print("output shape is:")
    print(output.shape)
    return output

tensor = torch.zeros((30, 20, 15, 5, 3))

TestLinearReshape(tensor, (15, 5, 3), 225)
"done"

    
    

Test Linear Reshape
Inputs are: shape: torch.Size([30, 20, 15, 5, 3]), input_shape: (15, 5, 3), output_shape: 225
output shape is:
torch.Size([30, 20, 225])


'done'

## Feedforward

Feedforward is a fairly standard, two layer feedforward process for allowing connectons to occur between channel entries in a transformer, or wherever else this might be needed. It works by performing a higher dimensional projection, then allowing the machine itself to choose where and how to slice. 

It consists of a linear layer, followed by a relu, followed by another linear layer


In [38]:
class Feedforward(nn.Module):
    pass


## Transformer

Transformer contains the basic essencials needed in order to make the transformer process into a reality. The transformer utilized in these projects consist of a query, content input which can be simaltaniously driven for self attention, or separately driven for focus attention. Inside the layer, the query, key, and value head projection are located, attention is performed, and the results returned.

Notably, masking is determined by the mask parameter, which can be fed in at initiation time, or run time. If fed during initiation, it must be provided as "lower", "upper", or "None". Runtime can be anything, but must be compatible under matrix multip

In [39]:
class Transformer(nn.Module):
    __permitted = (None, "lower", "upper")
    @property
    def mask(self):
        return self._mask
    @mask.setter
    def mask(self, value):
        assert value in self.__permitted, "mask cannot be set to this"
        self._mask = value
    def __init__(self, channel_dim, head_width, mask=None):
        
        """
        
        Accepted mask is "lower", "upper", or none
        
        """
        
        #Spin up torch
        super().__init__()
        
        #Create action generators
        QueryGen = LinearReshape(channel_dim, (head_width, channel_dim))
        KeyGen = LinearReshape(channel_dim, (head_width, channel_dim))
        ValGen = LinearReshape(channel_dim, (head_width, channel_dim))
        
        CollapseGen = LinearReshape((head_width, channel_dim), channel_dim)
        
        #Create actions. Note the swap is needed to get the head in front of the items. 
        
        self._query = lambda x : QueryGen(x).swapdims(-2, -3) 
        self._key = lambda x : KeyGen(x).swapdims(-2, -3)
        self._value = lambda x : ValGen(x).swapdims(-2, -3)
        self._dehead = lambda x : CollapseGen(x.transpose(-2, -3))
        
        self.mask = mask
        
    def forward(self, query, content, mask=None):
        #Create query, key, value
        
        query = self._query(query)
        key = self._key(content).swapdims(-1, -2)
        value = self._value(content)
        
        #Create focus matrix. Mask. Softmax.
        
        focus = query.matmul(key)
        focus_dims = focus.shape[-2:]
        if mask is None:
            #Runs only if not provided a mask.
            if self.mask == "lower":
                mask = torch.tril(torch.ones(focus_dims))
                focus = focus.masked_fill(mask == 0, -1e9)
            if self.mask == "upper":
                mask = torch.triu(torch.ones(focus_dims))
                focus = focus.masked_fill(mask == 0, -1e9)

        focus = F.softmax(focus, dim=-1)
        
        #Apply focus matrix to values. Then compact head
        
        output = focus.matmul(value)
        output = self._dehead(output)
        
        return output
    
#Unit code.

def TestFunc(query, content, channel_width, head_width, mask_type, mask_value = None):
    print("Performing test of Transformer")
    items = (query.shape, content.shape, channel_width, head_width, mask_type, mask_value)
    print("Inputs are: shape: %s, shape: %s channel_width: %s, head_width: %s, mask_type: %s, mask_value: %s" % items)
    test_layer = Transformer(channel_width, head_width, mask_type)
    test_output = test_layer(query, content, mask_value)
    print("output shape is: %s"  % str(test_output.shape))
    return test_output

test_query = torch.zeros(10, 30, 40, 10)

item = torch.zeros(10, 30, 40, 10, 10)
item.view((10, 30, 40, 100))
test_content = torch.zeros(10, 30, 20, 10)
TestFunc(test_query, test_content, 10, 10, None)
TestFunc(test_query, test_content, 10, 20, None)
TestFunc(test_query, test_content, 10, 10, "lower")

"done"
    

Performing test of Transformer
Inputs are: shape: torch.Size([10, 30, 40, 10]), shape: torch.Size([10, 30, 20, 10]) channel_width: 10, head_width: 10, mask_type: None, mask_value: None
output shape is: torch.Size([10, 30, 40, 10])
Performing test of Transformer
Inputs are: shape: torch.Size([10, 30, 40, 10]), shape: torch.Size([10, 30, 20, 10]) channel_width: 10, head_width: 20, mask_type: None, mask_value: None
output shape is: torch.Size([10, 30, 40, 10])
Performing test of Transformer
Inputs are: shape: torch.Size([10, 30, 40, 10]), shape: torch.Size([10, 30, 20, 10]) channel_width: 10, head_width: 10, mask_type: lower, mask_value: None
output shape is: torch.Size([10, 30, 40, 10])


'done'

# Breakthrough

Extremely potent algorithms which will be used for many purposes

## Feedback Transformer

Transformers are good when you want it to be the case that the items of the output should be equal to the query. Unfortunately, they are not quite as great when the items in the output should be equal to the items in the content. The feedback transformer is my solution to this problem.



## Resize Transformer

Transformers will be utilized to resize the layers, but that still leaves a problem: Generating the queue. While it will be the case that the Key and Value can be generated from the incoming stream directly, the Queue needs to be the length of the 
output to work correctly. 



**Queue**

The queue is generated by performing an interleave expansion, a view, and then either a max or a mean. 

Let K be the initial embeddings size, and L be the target embeddings size. Perform a repeat interleave along embeddings of length L. The embeddings now cleanly divide into L. Go and perform a view on the embeddings, looking at (L, K). It is now the case that one dimension is the right shape. 

At this point, any summarizing statistic can be performed on the last embedding dimension to get something of length L. Mean is predicted to be particularly helpful when operating locally, but max might be useful as well.

**Transformer**

With a Queue tensor made, Transfomer can be executed. Take the Queue, take the Content, expand heads and transform. Resize is then complete. 



## Optimized Transformers

**Introduction**

Transformers are quite effective, however they suffer from some flaws, particularly when working with extremely large text corpuses. The primary problem is that though one might have more than enough computational capacity, the memory interchange becomes burdensome as the input widths continue to increase. This is by virtue of the fact that every input must condition every output in order for the transformer process to work properly.

The Local Philosophy is a way around this. It consists of creating locally relevant views of the incoming data, and performing transformer on that. In this way, the same computation can be performed with relatively little intercommunication. 

Meanwhile, a global feedback mechanism ensures that global feedback is generated in several stages, and fed back into the model. This global feedback mechanism is meant to contain far fewer channels of information than a standard transformer. 

Both an ability to resize, along with an ability to consider things locally, is needed for this to work. Thus the development of the local resize transformer.



# Local Transformer

## Local Resize

Local Resize is more or less just a mechanism 

## Local Resize Transformer

The Local Resize Transformer consists of a transfomer capable of elegantly performing local resize operations. It consists of two primary features. These are:

* Resize Generation
* Query, Content operation



## Local Resize Transfomer

Local transfomer consists of a set of methods for local transformer, with possible resize, to occur

**Local Transformer**

Local transformer is a variety of transformer developed for two purposes. One of these is to take advantage of local information, such as the relative positioning of words. The other is to enable incredibly parallel computations.

Overlapping views are created of an input 

. One problem which traditional transformers are faced with is that 

optimized

## 

## Decisive Resize  Gradient

The Decisive Virtual Gradient is a philosophy and set of classes and methods designed to allow the smooth recovery of a differential gradient in a situation which might otherwise not permit it. It is designed particularly to function in discard situations - where information will be permanently lost. This scenario is not an altogether uncommon situation, appearing everywhere in life. Despite this, there are few tools to smoothly handle the situation. As a result, I developed the Decisive virtual gradient. 

The conditions required for this algorithm to function are related to the loss, which must be labeled, and to the type of decision, which must be one of sorting, where ensuring the proper number of items are sorted into a particular catagory is what matters. 
### Explanation


Let there be some circumstance in an algorithm in which one wants to make a non-differentiable decision affecting the configuration of the rest of the model. This might occur if, for example, you are deciding whether to expand the number of words in a string, or discard information down a relu.

Let it be the case that there are a clear set of decision labels y_d capable of telling us what the proper configuration at the end should be, in either a statistical or direct manner. Let there be a hidden layer, H, with input parameters h, at which we need to make a decision. Let it be the case that the decision is made by taking h and using it to, for each batch entry, form a set of logits representing each entry, with the argmax indicating the actual decision. 
This is difficult to handle using standard methods. Although policy gradients can help select proper parameters, they are exaustive and slow. Standard gradient descent will fail utterly without a manifold. Instead, we do the following.

#### Virtual Gradient Resize

The decision process is designed to produce, at the end, a "handle" of sorts in the neural network through which, by knowing label information regarding required tensor length later on, we can expand or contract at will while utilizing gradient descent. The process works something like follows. 

There exists any number of standard tensors, and a special tensor, known as history. There also exists a layer, known as decisive resize, which performs the following action. Channels are reduced down to a sequence of logits by a linear layer, and from this sequence whether to pass this forward or not into the corrosponding output tensor is determined by whether the logit is positive or negative. Then, the logits are split into two, by their positive and negative sections, and 



Once this decision is made, the logit tensors are split into positve and negative versions, then abs, summed, and normalized, then concatenated together. This tensor is then known as local history. Local history is then added to the incoming global history tensor, producing a new global history. The output tensor is then fed forward, along with the global history, until the loss endpoint is reached

#### Loss endpoint

The loss endpoint is fed two items. First, there are the tensors about which we actually want to make a decision on. Second, there is the history tensor, a backpropagation handle. The endpoint first evaluates the shape of the output - primarily whether it is acceptable. If it is not, it uses the handle to produce a loss and indicate this. Else, the

Local history has the property that if you wanted to shut off some of the individual logits, one could penalize the true logit portion and incentivise the false logit portion. Vise Versa is also true. This allows the control, on a local level, of parameters which do not have nice gradients. 

Further control may then be gained by 


* Take in History, Tensor.
* use tensor. Create binary decision,
* Separate logits by true, false. Sum results
* Normalize logits by number of true/false cases
* Add to history. 


#### Decision 

We start by making a decision kernel. For each individual decision, and each incoming channel of whatever nature, we run it through a linear which outputs options according to the number of decisions which are available. We then split it into two sections, based on the sign of the logits. If the sign is positive, the case is said to be accepted, and will be passed to that output. If negative, it is said to be negative, and will not be passed forward out that channel

Internal decision order is preserved.

#### Decision Graph Backpropagation.

This is, of course, impossible to perform gradient descent against, so I introduce decision graph backpropogation. The idea behind this technique is to turn the logits into probabilities, update these as we go along, and use them  . First, we split the logits up into two tensors, containing only the positive




#### Decision

We begin by making a decision kernel, which for each individual decision option runs it through a linear to form a logits representing the choices and their weights. This is then utilized for a maxarg based decision, telling us where to place each input. The inputs are then routed to the appropriate output, and returned

Meanwhile, back at the logits point, an additional action occurs. An input into this layer should consist of any previous 
routing action. Once the logits are developed, each logit is sorted into two catagories, consisting of the null representitives and active representitives
At each decision point, we begin by defining a decision kernel, and decision space, consisting of taking the input 



We construct a graph using a sort of "routing" logic fairly similar to 

#### Forward


Accept in the values of h. Out of these, make the decisive logits. Using the decisive logits, perform the decision. Then we cache the decisive logits.

Then it is time to create the representation. The represention is created by summing up the logits per catagory, and is then conditioned by hinge loss later.


####


#### Loss

Loss is calculated in one of two ways. It is first calculated according to decision parameters. If this is passed, it is then calculated in terms of standard gradient descent. Due to labels frequently being of a particular size, it was felt this is needed. 

When loss is calculating in decision mode, the gradient parameters are the maximum and minimum counts for particular categories. The layers then must know what to do with this.

==

#### Backwards

During back propogation, additional tensor information is 



The decisive virtual gradient consists of 







Let it be the case that the actual decision is made by turning 

In [40]:
test = torch.arange(3000).view(100, 30).type(torch.float32)
layer = nn.LayerNorm(30)
output = layer(test)
torch.square(output).sum(dim=-1)

tensor([30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000,
        30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0001,
        30.0000, 30.0000, 29.9999, 30.0000, 30.0000, 30.0000, 30.0000, 29.9999,
        29.9999, 30.0000, 30.0000, 30.0000, 30.0000, 29.9999, 29.9999, 30.0000,
        30.0001, 30.0000, 30.0000, 30.0000, 30.0000, 30.0000, 30.0001, 30.0001,
        30.0000, 30.0000, 30.0000, 30.00

## Pyramid

It is the case that many of the models I am trying to develop follow a particular pattern. This pattern is that they are located orthogonally, with escalation, then higher order processing, then finally lower order processing occuring. It is the case that operating according to this pattern consistently without some framework will prove quite difficult. 

### Motivation

It is the case that the further you get from the input source, the harder a model gets to train. Gradient descent tends to blow up or decay to nothing, or even just have trouble getting started, if an excessively deep model is used. Nonetheless, it is the case that deep models are needed to handle complex behavior, but needs the shallow parts to handle the simple cases. 

This begs the question: What would be an ideal model construction? One might say that an ideal model might handle simple and shallow behavior with simple but extensive lower layers, and use higher but more compact layers to represent global state. Additionally, an ideal model might not force the entirety of the model to be used at all for more simple cases.

One method of constructing something such as this might be the escalating pyramid seen in (item). This is, without a doubt, functional. It is excellent in cases with few priors about the arrangement of information.  However, it is the case this may be significantly suboptimal for the case where local relations are present between the items. Additional problems also present themselves: frequently, it is the case that the big picture will subsequently inform the way the little details needed to be examined, and such a model does not efficiently take this into account, instead forcing such analysis on the hgiher levels of the model. Ideally, we would instead like tbhe model itself to possess the ability to condition the output using it's higher level items, in a somewhat recursive process.

It also is scarcely designed for transformer networks. An additional framework is needed, one which can deal with the local, while being informed by the global.

This section provides such framework. The class interface consists of three parts, and an init. These are the Processing, Integration, and Finally methods.
### Interface

**Init**

Init allows the definition of the model as an terminal or intermediary layer. It also controls whether feedback
is included per layer or not.

**Processing**

condition will accept the input and any feedback; in the case of no feedback, the feedback slot will be provided with None. It will develop two outputs which are the conditioning and processing tensor respectively. The processing tensor will be escalated deeper into the pyramid, while the conditioning tensor will be fed forward into the feedback layer

**Integration**

The integration layer is responsible for making the results more local. The integration layer is fed the conditioning tensor and the results of processing. It may then return either a single results tensor, or a results tensor and a feedback tensor. In the event of being fed a feedback tensor, this tensor will then be utilized to 

**Finally**

The finally method is fed the initial input and the integration output. It primarily exists for the purpose of normalization tricks. It should return a single output.




In [41]:
class PyramidMixIn():
    """
    
    This is a class for mixing in pyramid
    
    
    
    """
    
    
    
    
    
    
    def __init__(self, terminal=False):
        super().__init__() #Start up whatever environment we are in
        self.__terminal = terminal
        self.__made = False
        self.__graph = None
        
    #Abstract methods.
    def Processing(self, stream, feedback):
        """
        
        return: conditioning, processing
        
        """
        raise NotImplimentedError("processing must be implemented")
    def Primary(self, stream, feedback):
        raise NotImplimentedError("Primary must be implemented")
    def Integration(self, stream, conditioning):
        raise NotImplementedError("integration must be implemented")
    def Finally(self, input_stream, output_stream):
        raise NotImplementedError("Finally must be implemented")
        
        
    #Logical method. Contains the actual logic for the class.
    def logical(self, superior, stream, status):
        """
        
        The primary logic for the class is located here. 
        
        First, feedback is defined if not already in existance. Then
        """

        #Get the feedback. If there is not enough, set it to none.
        if len(status) > 0:
            feedback = status.pop()
        else:
            feedback = None

        #Perform processing. Catch any errors
        try:
            conditioning, processing = self.Processing(stream, feedback)
        except Exception as err:
            msg = "Problem running Processing:" + err
            raise Exception(msg) from err

        #Perform escalation or primary. Catch error
        try:
            if superior is None:
                if len(status) > 0:
                    feedback = status.pop()
                else:
                    feedback = None
                    
                items  = self.Primary(processing, feedback)
                #Take apart, form initial feedback
                if isinstance(items, tuple):
                    commands, feedback = items
                else:
                    commands = items
                    feedback = None
                
                #Start status.
                
                status = [feedback] #Starts here, at the terminal layer.
            else:
                commands, status = superior(processing)
        except Exception as err:
            msg = "Problem running escalation: " + error
            raise Exception(msg) from err

        #Perform integration, catch error, attach feedback.
        try:
            integration = self.Integration(commands, conditioning)
            
            if isinstance(integration, tuple):
                integration, status = integration
                status.append(item)
            else:
                status.append(None)
                
        except Exception as err:
            msg = "Problem running Integration: "  + err
            raise Exception(msg) from err

        #Perform finally. Catch error

        try:
            final = self.Finally(stream, integration)
        except Exception as err:
            msg = "Problem runnning Finally: " + err
            raise Exception(msg) from err
        
        #Finished. Return

        return final, status
    def construct(self, superior):
        def graph(stream, status):
            self.logical(superior, stream, status)
        return graph
            
    def run(self, item):
        """
        
        The run function contains the construction logic and application logic for the problem.

        
        """
        if not isinstance(item, (tuple, list)) and not callable(item):
            item = [item, []]        

        
        #Handle the construction logic.
        if self.__made == False: ## If I still need to construct my graph.
            if not callable(item):
                 
                item = lambda func : func(item) # Will return the final construction.
            
            
            if self.__terminal is True:
                #It is the case this is the terminal layer. Go and run all the factories. Then store, on last layer.
                self.__made = True
                print("bere")
                self.__graph = item(self.construct(None))
                print("there")
            else:
                #This is not the terminal. Make a factory.
                def factory(superior):
                    #I just got told what my superior's constructor is. 
                    self.__made = True #I will now know  I have already had my graph constructed.
                    return item(self.construct(superior)) #Construct it, then feed it up the chain.
                return factory
        
        elif self.__graph is not None:
            return self.__graph(*item) #At final layer. Execute graph
        else:
            return item #Wait until final layer to execute.
        


In [42]:
class Foo():
    def __init__(self):
        print("foo")
        super().__init__()

class Fum():
    def __init__(self):
        print("fum")
    def test(self):
        print("fum2")
        
class Fee(Foo, Fum):
    def __init__(self):
        super().__init__()
        
Fee()
        
print("test")

print(callable(3))

foo
fum
test
False


## 