In [142]:
import torch
from torch import nn
import tensorflow as tf
import numpy as np
import torch.nn.functional as F

In [240]:
x = np.random.randn(4, 32, 60)
conv = nn.Conv1d(32, 64, 2, stride=1, padding="same")

In [241]:
conv(torch.from_numpy(x).float()).shape

torch.Size([4, 64, 60])

In [98]:
params = 0
for x in conv.parameters():
    curr = 1
    for y in x.shape:
        curr *= y
        
    params += curr
params

320

In [242]:
x = np.random.randn(4, 60, 32)

In [243]:
tf.keras.layers.Conv1D(64, 2, 2)(x).shape

TensorShape([4, 30, 64])

In [244]:
class TorchMyMultiHeadAttention(nn.Module):
    def __init__(self,
            embed_dim,
            out_dim,
            qk_dim,
            v_dim,
            num_head,
            kernel_size=2,
            stride=2
        ):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.embed_dim = embed_dim
        self.num_head  = num_head
        self.qk_dim = qk_dim
        self.v_dim  = v_dim

        self.q = nn.Conv1d(embed_dim, qk_dim*num_head,kernel_size, stride)
        self.k = nn.Conv1d(embed_dim, qk_dim*num_head,kernel_size, stride) # stride=2 for token reduction, kernel>1 for mixing
        self.v = nn.Conv1d(embed_dim, v_dim*num_head,kernel_size, stride)

        self.out = nn.Conv1d(v_dim*num_head//kernel_size, out_dim, 1)
        self.scale = 1/(qk_dim**0.5)

    #https://github.com/pytorch/pytorch/issues/40497
    def forward(self, x):
        B,dim,L= x.shape

        num_head = self.num_head
        qk_dim = self.qk_dim
        v_dim = self.v_dim

        q = self.q(x) #B,qk_dim,L
        k = self.k(x)
        v = self.v(x)
        # B, N, L, Q
        q = q.reshape(B, num_head, qk_dim//self.kernel_size, L).permute(0,1,3,2).contiguous()
        k = k.reshape(B, num_head, qk_dim//self.kernel_size, L)#.permute(0,1,2,3).contiguous()
        v = v.reshape(B, num_head, v_dim//self.kernel_size,  L).permute(0,1,3,2).contiguous()

        dot = torch.matmul(q, k) * self.scale  # H L L
        attn = F.softmax(dot, -1)    # L L
        print(attn.shape, v.shape)
        v = torch.matmul(attn, v)  # L H dim
        v = v.permute(0,1,3,2).reshape(B, v_dim*num_head//self.kernel_size,L).contiguous()
        print(v.shape)
        out = self.out(v)
        return out

In [245]:
class MyMultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, out_dim, qk_dim, v_dim, num_head, kernel_size=2, stride=2):
        super(MyMultiHeadAttention, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.embed_dim = embed_dim
        self.num_head = num_head
        self.qk_dim = qk_dim
        self.v_dim = v_dim

        self.q = tf.keras.layers.Conv1D(qk_dim*num_head, kernel_size, strides=stride)
        self.k = tf.keras.layers.Conv1D(qk_dim*num_head, kernel_size, strides=stride)
        self.v = tf.keras.layers.Conv1D(v_dim*num_head, kernel_size, strides=stride)

        self.out = tf.keras.layers.Conv1D(out_dim, 1)
        self.scale = 1 / np.sqrt(qk_dim)

    def call(self, x):
        B, L, dim = x.shape

        num_head = self.num_head
        qk_dim = self.qk_dim
        v_dim = self.v_dim

        q = self.q(x)
        k = self.k(x)
        v = self.v(x)

        print("q shape:", q.shape)
        
        q = tf.reshape(q, (B, num_head, L, qk_dim // self.kernel_size))
        
        
        k = tf.reshape(k, (B, num_head, qk_dim // self.kernel_size, L))
        
        v = tf.reshape(v, (B, num_head, L, v_dim // self.kernel_size))
        
        dot = tf.matmul(q, k) * self.scale # H Q Q
        attn = tf.nn.softmax(dot, axis=-1)
        print(attn.shape, v.shape)
        v = tf.matmul(attn, v)
        v = tf.transpose(v, perm=(0, 2, 1, 3))
        v = tf.reshape(v, (B, L, v_dim*num_head//self.kernel_size))
        print(v.shape)
        out = self.out(v)
        return out

In [246]:
embed_dim = 512
out_dim   = 512
qk_dim    = 512//4#for one head
v_dim     = 512//4
num_head  = 4
max_length = 96
batch_size = 4

mha = MyMultiHeadAttention(
    embed_dim,
    out_dim,
    qk_dim,
    v_dim,
    num_head,
    kernel_size=2,
    stride=2
)
x  = np.random.uniform(-1,1,(batch_size, max_length, embed_dim))

In [250]:
tf_mha = mha(x)
tf_mha.shape, x.shape

q shape: (4, 48, 512)
(4, 4, 96, 96) (4, 4, 96, 64)
(4, 96, 256)


(TensorShape([4, 96, 512]), (4, 96, 512))

In [248]:
tmha = TorchMyMultiHeadAttention(
    embed_dim,
    out_dim,
    qk_dim,
    v_dim,
    num_head,
    kernel_size=1,
    stride=1
)

In [249]:
t_mha = tmha(torch.from_numpy(x.reshape(4, 512, 96)).float())
t_mha.shape

torch.Size([4, 4, 96, 96]) torch.Size([4, 4, 96, 128])
torch.Size([4, 512, 96])


torch.Size([4, 512, 96])

In [216]:
t_mha.shape

torch.Size([4, 512, 96])

In [217]:
tf_mha.shape

TensorShape([4, 96, 512])

In [198]:
t_mha.detach().numpy().reshape(4, 512, 96)

array([[[ 2.80867741e-02,  2.98190303e-02,  2.72386409e-02, ...,
          2.99011786e-02,  2.98229046e-02,  2.96823550e-02],
        [ 3.80886160e-02,  4.00520228e-02,  3.79090272e-02, ...,
          3.76826152e-02,  3.85475196e-02,  3.96766551e-02],
        [-7.68204685e-03, -8.99784546e-03, -9.86262318e-03, ...,
         -1.00490991e-02, -8.41784570e-03, -1.12638930e-02],
        ...,
        [-3.00749671e-02, -2.66081281e-02, -2.96260230e-02, ...,
         -2.78310794e-02, -3.06129511e-02, -2.95400303e-02],
        [ 6.34287521e-02,  6.58256114e-02,  6.29289597e-02, ...,
          6.65333048e-02,  6.73640817e-02,  6.86542392e-02],
        [ 3.19390371e-03,  3.80220124e-03,  1.92262977e-03, ...,
          5.13247959e-03,  3.90551286e-03,  3.53754871e-03]],

       [[-2.07740813e-05, -2.30960920e-03, -2.21466459e-03, ...,
          3.02448869e-04, -2.20946968e-05, -2.13022530e-03],
        [ 2.29673237e-02,  1.92526672e-02,  2.05304157e-02, ...,
          2.08583903e-02,  2.04178561e

In [251]:
class ECA(tf.keras.layers.Layer):
    def __init__(self, kernel_size=5, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.kernel_size = kernel_size
        self.conv = tf.keras.layers.Conv1D(1, kernel_size=kernel_size, strides=1, padding="same", use_bias=False)

    def call(self, inputs, mask=None):
        nn = tf.keras.layers.GlobalAveragePooling1D()(inputs, mask=mask)
        nn = tf.expand_dims(nn, -1)
        nn = self.conv(nn)
        nn = tf.squeeze(nn, -1)
        nn = tf.nn.sigmoid(nn)
        nn = nn[:,None,:]
        return inputs * nn

class CausalDWConv1D(tf.keras.layers.Layer):
    def __init__(self, 
        kernel_size=17,
        dilation_rate=1,
        use_bias=False,
        depthwise_initializer='glorot_uniform',
        name='', **kwargs):
        super().__init__(name=name,**kwargs)
        self.causal_pad = tf.keras.layers.ZeroPadding1D((dilation_rate*(kernel_size-1),0),name=name + '_pad')
        self.dw_conv = tf.keras.layers.DepthwiseConv1D(
                            kernel_size,
                            strides=1,
                            dilation_rate=dilation_rate,
                            padding='valid',
                            use_bias=use_bias,
                            depthwise_initializer=depthwise_initializer,
                            name=name + '_dwconv')
        self.supports_masking = True
        
    def call(self, inputs):
        x = self.causal_pad(inputs)
        x = self.dw_conv(x)
        return x

def Conv1DBlock(channel_size,
          kernel_size,
          dilation_rate=1,
          drop_rate=0.0,
          expand_ratio=2,
          se_ratio=0.25,
          activation='swish',
          name=None):
    '''
    efficient conv1d block, @hoyso48
    '''
    if name is None:
        name = str(tf.keras.backend.get_uid("mbblock"))
    # Expansion phase
    def apply(inputs):
        channels_in = tf.keras.backend.int_shape(inputs)[-1]
        channels_expand = channels_in * expand_ratio

        skip = inputs

        x = tf.keras.layers.Dense(
            channels_expand,
            use_bias=True,
            activation=activation,
            name=name + '_expand_conv')(inputs)

        # Depthwise Convolution
        x = CausalDWConv1D(kernel_size,
            dilation_rate=dilation_rate,
            use_bias=False,
            name=name + '_dwconv')(x)

        x = tf.keras.layers.BatchNormalization(momentum=0.95, name=name + '_bn')(x)

        x  = ECA()(x)

        x = tf.keras.layers.Dense(
            channel_size,
            use_bias=True,
            name=name + '_project_conv')(x)

        if drop_rate > 0:
            x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1), name=name + '_drop')(x)

        if (channels_in == channel_size):
            x = tf.keras.layers.add([x, skip], name=name + '_add')
        return x

    return apply

In [272]:
x = np.random.randn(4, 128, 384)
conv = Conv1DBlock(10, kernel_size=5)

In [273]:
conv(x).shape

TensorShape([4, 128, 10])

In [278]:
tf.keras.layers.Conv1D(10, 8)(x).shape

TensorShape([4, 121, 10])

In [304]:
class MyMultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, out_dim, num_heads, kernel_size=17):
        super(MyMultiHeadAttention, self).__init__()
        self.kernel_size = kernel_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.qk_dim = embed_dim // num_heads
        self.v_dim = embed_dim // num_heads

        assert (self.qk_dim * num_heads == embed_dim), "num_heads should be divisible by embed_dim"
        assert (self.v_dim * num_heads == embed_dim), "num_heads should be divisible by embed_dim"

        self.q = Conv1DBlock(qk_dim*num_head, kernel_size=kernel_size)
        self.k = Conv1DBlock(qk_dim*num_head, kernel_size=kernel_size)
        self.v = Conv1DBlock(v_dim*num_head, kernel_size=kernel_size)

        self.out = tf.keras.layers.Conv1D(out_dim, 1)
        self.scale = 1 / np.sqrt(qk_dim)

        self.drop = tf.keras.layers.Dropout(dropout)

    def call(self, x):
        B, L, dim = x.shape

        num_heads = self.num_heads
        qk_dim = self.qk_dim
        v_dim = self.v_dim

        q = self.q(x)
        k = self.k(x)
        v = self.v(x)

#        print("q shape:", q.shape)
        
        q = tf.reshape(q, (B, num_heads, L, qk_dim))
        
        
        k = tf.reshape(k, (B, num_heads, qk_dim, L))
        
        v = tf.reshape(v, (B, num_heads, L, v_dim))
        
        dot = tf.matmul(q, k) * self.scale # H Q Q
        attn = tf.nn.softmax(dot, axis=-1)
        print(attn.shape, v.shape)
        v = tf.matmul(attn, v)
        v = tf.transpose(v, perm=(0, 2, 1, 3))
        v = tf.reshape(v, (B, L, v_dim*num_heads))
#        print(v.shape)
        out = self.out(v)
        return out

In [306]:
mha = MyMultiHeadAttention(embed_dim, 384, num_head)
out = mha(x)
out.shape, x.shape

(4, 4, 128, 128) (4, 4, 128, 128)


(TensorShape([4, 128, 384]), (4, 128, 384))

In [309]:
def TransformerBlock(embed_dim=512, dim=256, num_heads=4, expand=4, attn_dropout=0.2, drop_rate=0.2, activation='swish'):
    def apply(inputs):
        x = inputs
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
        x = MyMultiHeadAttention(embed_dim, out_dim=dim, num_heads=num_heads)(x) # add attentioin dropout 
        x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1))(x)
        x = tf.keras.layers.Add()([inputs, x])
        attn_out = x

        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
        x = tf.keras.layers.Dense(dim*expand, use_bias=False, activation=activation)(x)
        x = tf.keras.layers.Dense(dim, use_bias=False)(x)
        x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1))(x)
        x = tf.keras.layers.Add()([attn_out, x])
        return x
    return apply

In [310]:
transformer = TransformerBlock(dim=384)
out = transformer(x)
out.shape, x.shape

(4, 4, 128, 128) (4, 4, 128, 128)


(TensorShape([4, 128, 384]), (4, 128, 384))

In [311]:
x = tf.reshape(tf.range(12), (3,4))

p, q, r = tf.unstack(x)
p.shape.as_list()


[4]

In [312]:
p

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 1, 2, 3], dtype=int32)>

In [313]:
q

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 5, 6, 7], dtype=int32)>

In [314]:
r

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 8,  9, 10, 11], dtype=int32)>

In [315]:
p, q, r = x
p, q, r

(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 1, 2, 3], dtype=int32)>,
 <tf.Tensor: shape=(4,), dtype=int32, numpy=array([4, 5, 6, 7], dtype=int32)>,
 <tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 8,  9, 10, 11], dtype=int32)>)

In [316]:
i, j, k, l = tf.unstack(x, axis=1)
i.shape.as_list()


[3]

In [317]:
i

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([0, 4, 8], dtype=int32)>

In [320]:
import numpy as np

# Create your 2D and 1D arrays
array_2d = np.array([[1, 2, 0],
                     [1, 0, 1],
                     [2, 2, 0]])
array_1d = np.array([2, 0, 1])

# Use advanced indexing to directly assign values in array_2d
# array_2d[np.arange(len(array_1d)), array_1d] = -5

# print(array_2d)

In [322]:
array_2d[np.arange(len(array_1d)), array_1d] = np.array([4, 2, 3])