# The TensorDictModule

Make sure to first read the tensordict tutorial

How do we use the TensorDict in practice? We introduce the TensorDictModule. The TensorDictModule is an nn.Module that takes a TensorDict as the input of its forward method. The user defines the keys that the module will read as inputs and the keys under which it will write its outputs in the same TensorDict.

In [1]:
from torchrl.modules import TensorDictModule, TensorDictSequence
from torchrl.data import TensorDict
import torch.nn as nn
import torch

### Example: Simple Linear layer

Let's imagine we have a TensorDict with 2 entries, a and b, and we only want to affect a.

In [2]:
# Two entries sharing a leading batch dimension of 5.
tensordict = TensorDict(
    {
        "a": torch.randn(5, 3),
        "b": torch.randn(5, 4, 3),
    },
    batch_size=[5],
)
# Read entry "a", apply the linear layer, and store the result under "a_out".
linear = TensorDictModule(
    nn.Linear(3, 10),
    in_keys=["a"],
    out_keys=["a_out"],
)
linear(tensordict)

TensorDict(
    fields={
        a: Tensor(torch.Size([5, 3]), dtype=torch.float32),
        a_out: Tensor(torch.Size([5, 10]), dtype=torch.float32),
        b: Tensor(torch.Size([5, 4, 3]), dtype=torch.float32)},
    batch_size=torch.Size([5]),
    device=cpu,
    is_shared=False)

We can also do it in place, by writing the output under the same key as the input:

In [3]:
# Same two-entry TensorDict as in the previous example.
tensordict = TensorDict(
    {
        "a": torch.randn(5, 3),
        "b": torch.randn(5, 4, 3),
    },
    batch_size=[5],
)
# Input and output key are both "a", so the result replaces the original entry.
linear = TensorDictModule(
    nn.Linear(3, 10),
    in_keys=["a"],
    out_keys=["a"],
)
linear(tensordict)

TensorDict(
    fields={
        a: Tensor(torch.Size([5, 10]), dtype=torch.float32),
        b: Tensor(torch.Size([5, 4, 3]), dtype=torch.float32)},
    batch_size=torch.Size([5]),
    device=cpu,
    is_shared=False)

### Example: Merging 2 inputs with 2 linear layers

Now let's imagine a more complex network that takes 2 entries and averages them into a single output.

In [4]:
class MergeLinear(nn.Module):
    """Project two inputs to a common width and return their element-wise mean."""

    def __init__(self, in_1, in_2, out):
        super().__init__()
        # One projection per input; both produce `out` features so they can be averaged.
        self.linear_1 = nn.Linear(in_features=in_1, out_features=out)
        self.linear_2 = nn.Linear(in_features=in_2, out_features=out)

    def forward(self, x_1, x_2):
        """Return the average of the two projected inputs."""
        first = self.linear_1(x_1)
        second = self.linear_2(x_2)
        return (first + second) / 2

In [5]:
# Three entries; only "a" and "c" are consumed by the module below.
tensordict = TensorDict(
    {
        "a": torch.randn(5, 3),
        "b": torch.randn(5, 4, 3),
        "c": torch.randn(5, 4),
    },
    batch_size=[5],
)
# in_keys are passed to MergeLinear.forward in order: "a" -> x_1, "c" -> x_2.
mergelinear = TensorDictModule(
    MergeLinear(3, 4, 10),
    in_keys=["a", "c"],
    out_keys=["output"],
)
mergelinear(tensordict)

TensorDict(
    fields={
        a: Tensor(torch.Size([5, 3]), dtype=torch.float32),
        b: Tensor(torch.Size([5, 4, 3]), dtype=torch.float32),
        c: Tensor(torch.Size([5, 4]), dtype=torch.float32),
        output: Tensor(torch.Size([5, 10]), dtype=torch.float32)},
    batch_size=torch.Size([5]),
    device=cpu,
    is_shared=False)

### Example: 1 input to 2 outputs linear layer
We can also map to multiple outputs

In [6]:
class MultiHeadLinear(nn.Module):
    """Feed a single input through two independent linear heads."""

    def __init__(self, in_1, out_1, out_2):
        super().__init__()
        # Both heads read the same input width but may emit different widths.
        self.linear_1 = nn.Linear(in_features=in_1, out_features=out_1)
        self.linear_2 = nn.Linear(in_features=in_1, out_features=out_2)

    def forward(self, x):
        """Return the pair (first head output, second head output)."""
        head_1 = self.linear_1(x)
        head_2 = self.linear_2(x)
        return head_1, head_2

In [7]:
# Fresh two-entry TensorDict; only "a" is read.
tensordict = TensorDict(
    {
        "a": torch.randn(5, 3),
        "b": torch.randn(5, 4, 3),
    },
    batch_size=[5],
)
# The two values returned by MultiHeadLinear.forward are written, in order,
# under "output_1" and "output_2".
mergelinear = TensorDictModule(
    MultiHeadLinear(3, 4, 10),
    in_keys=["a"],
    out_keys=["output_1", "output_2"],
)
mergelinear(tensordict)

TensorDict(
    fields={
        a: Tensor(torch.Size([5, 3]), dtype=torch.float32),
        b: Tensor(torch.Size([5, 4, 3]), dtype=torch.float32),
        output_1: Tensor(torch.Size([5, 4]), dtype=torch.float32),
        output_2: Tensor(torch.Size([5, 10]), dtype=torch.float32)},
    batch_size=torch.Size([5]),
    device=cpu,
    is_shared=False)

As shown previously, the TensorDictModule can take any nn.Module and perform its operations inside a TensorDict. When there are multiple input keys and output keys, make sure their order matches the order of the module's arguments and return values.
The TensorDictModule allows us to use only the tensors that we want and to keep the output inside the same object. It can even perform the operations in place by setting the output key to be the same as an already existing key.

### Example: A transformer with TensorDict?
Let's attempt to create a transformer with TensorDict and TensorDictModule

Disclaimer: This implementation doesn't claim to be "better" than a classical tensor-based implementation. It is just meant to showcase the TensorDictModule features.
For simplicity we will not have positional encoders.

Let's first implement the classical transformer blocks

In [8]:
class TokensToQKV(nn.Module):
    """Linear projections producing queries from `X_to` and keys/values from `X_from`."""

    def __init__(self, to_dim, from_dim, latent_dim):
        super().__init__()
        # Queries come from the "to" tokens; keys and values from the "from" tokens.
        self.q = nn.Linear(in_features=to_dim, out_features=latent_dim)
        self.k = nn.Linear(in_features=from_dim, out_features=latent_dim)
        self.v = nn.Linear(in_features=from_dim, out_features=latent_dim)

    def forward(self, X_to, X_from):
        """Return the tuple (Q, K, V)."""
        return self.q(X_to), self.k(X_from), self.v(X_from)

class SplitHeads(nn.Module):
    """Split the last dimension of Q, K and V into `num_heads` heads."""

    def __init__(self, num_heads):
        super().__init__()
        self.num_heads = num_heads

    def forward(self, Q, K, V):
        """Reshape each 3-D input to (batch, num_heads, length, head_dim)."""
        batch_size, to_num, latent_dim = Q.shape
        _, from_num, _ = K.shape
        head_dim = latent_dim // self.num_heads

        def split(tokens, length):
            # Expose the head axis, then move it in front of the token axis.
            per_head = tokens.reshape(batch_size, length, self.num_heads, head_dim)
            return per_head.transpose(1, 2)

        return split(Q, to_num), split(K, from_num), split(V, from_num)
class Attention(nn.Module):
    """Scaled dot-product attention over per-head tensors, followed by an output projection.

    Args:
        latent_dim: width of the concatenated heads fed to the output projection.
        to_dim: width of the projected output.
    """

    def __init__(self, latent_dim, to_dim):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)
        self.out = nn.Linear(latent_dim, to_dim)

    def forward(self, Q, K, V):
        """Return `(out, attn)`.

        `Q` is unpacked as (batch, heads, to_num, d_in); `attn` holds the softmax
        weights over the last axis of `Q @ K^T`, and `out` is the attended values
        with the heads merged back together and passed through `self.out`.
        """
        batch_size, n_heads, to_num, d_in = Q.shape
        # Scale the logits by sqrt(d_in) (standard scaled dot-product attention),
        # not by d_in itself, so the softmax is not flattened as the head size grows.
        scores = Q @ K.transpose(2, 3) / d_in ** 0.5
        attn = self.softmax(scores)
        out = attn @ V
        # Move the head axis next to the feature axis and merge them before projecting.
        out = self.out(out.transpose(1, 2).reshape(batch_size, to_num, n_heads * d_in))
        return out, attn
class SkipLayerNorm(nn.Module):
    """Residual connection followed by layer normalization over the last two dimensions."""

    def __init__(self, to_len, to_dim):
        super().__init__()
        # The normalized shape spans both the token and the feature axes.
        self.layer_norm = nn.LayerNorm(normalized_shape=(to_len, to_dim))

    def forward(self, x_0, x_1):
        """Return the layer-normalized sum of the two inputs."""
        residual = x_0 + x_1
        return self.layer_norm(residual)
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout."""

    def __init__(self, to_dim, hidden_dim, dropout_rate=0.2):
        super().__init__()
        # Expand to `hidden_dim`, apply the non-linearity, project back to `to_dim`.
        layers = [
            nn.Linear(to_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, to_dim),
            nn.Dropout(dropout_rate),
        ]
        self.FFN = nn.Sequential(*layers)

    def forward(self, X):
        """Apply the feed-forward stack to `X`."""
        transformed = self.FFN(X)
        return transformed


Now, we can build the TransformerBlock thanks to the TensorDictModule. Since the changes affect the TensorDict, we just need to map the outputs to the right names so that they are picked up by the next block.

In [9]:
class TransformerBlockTensorDict(TensorDictSequence):
    """One transformer block expressed as a sequence of TensorDictModules.

    The block reads the entries `to_name` and `from_name`, and writes its result
    back under `to_name`. The intermediate entries "Q", "K", "V", "Attn" and
    "X_out" are also written to the TensorDict.

    Args:
        to_name: key of the tokens that produce the queries and receive the output.
        from_name: key of the tokens that produce the keys and values.
        to_dim: feature width of the `to_name` tokens.
        to_len: number of `to_name` tokens (used by the layer norm shape).
        from_dim: feature width of the `from_name` tokens.
        latent_dim: width of the Q/K/V projections.
        num_heads: number of attention heads.
    """

    def __init__(self, to_name, from_name, to_dim, to_len, from_dim, latent_dim, num_heads):
        super().__init__(
            TensorDictModule(TokensToQKV(to_dim, from_dim, latent_dim), in_keys=[to_name, from_name], out_keys=["Q", "K", "V"]),
            TensorDictModule(SplitHeads(num_heads), in_keys=["Q", "K", "V"], out_keys=["Q", "K", "V"]),
            TensorDictModule(Attention(latent_dim, to_dim), in_keys=["Q", "K", "V"], out_keys=["X_out", "Attn"]),
            # The residual and feed-forward stages must use `to_name` rather than a
            # hard-coded "X_to"; otherwise any other `to_name` would read and
            # overwrite the wrong entry.
            TensorDictModule(SkipLayerNorm(to_len, to_dim), in_keys=[to_name, "X_out"], out_keys=[to_name]),
            TensorDictModule(FFN(to_dim, 4 * to_dim), in_keys=[to_name], out_keys=["X_out"]),
            TensorDictModule(SkipLayerNorm(to_len, to_dim), in_keys=[to_name, "X_out"], out_keys=[to_name]),
        )

In [10]:
# Hyper-parameters of the toy problem.
to_dim, from_dim, latent_dim = 5, 6, 10
to_len, from_len = 3, 10
batch_size, num_heads = 8, 2

# "X_to" provides the queries, "X_from" the keys and values.
tokens = TensorDict(
    {
        "X_to": torch.randn(batch_size, to_len, to_dim),
        "X_from": torch.randn(batch_size, from_len, from_dim),
    },
    batch_size=[batch_size],
)

transformer_block = TransformerBlockTensorDict(
    to_name="X_to",
    from_name="X_from",
    to_dim=to_dim,
    to_len=to_len,
    from_dim=from_dim,
    latent_dim=latent_dim,
    num_heads=num_heads,
)

# The block writes its results into `tokens`.
transformer_block(tokens)

tokens

TensorDict(
    fields={
        Attn: Tensor(torch.Size([8, 2, 3, 10]), dtype=torch.float32),
        K: Tensor(torch.Size([8, 2, 10, 5]), dtype=torch.float32),
        Q: Tensor(torch.Size([8, 2, 3, 5]), dtype=torch.float32),
        V: Tensor(torch.Size([8, 2, 10, 5]), dtype=torch.float32),
        X_from: Tensor(torch.Size([8, 10, 6]), dtype=torch.float32),
        X_out: Tensor(torch.Size([8, 3, 5]), dtype=torch.float32),
        X_to: Tensor(torch.Size([8, 3, 5]), dtype=torch.float32)},
    batch_size=torch.Size([8]),
    device=cpu,
    is_shared=False)

The output of the transformer layer can now be found at tokens["X_to"]

In [11]:
# Display the entry the transformer block wrote its result to.
tokens["X_to"]

tensor([[[ 0.1732,  1.0383, -1.5069, -0.8307, -0.5521],
         [-0.8138,  0.8951, -0.6802, -1.3608,  0.3892],
         [-0.3947,  0.7221, -0.4580,  1.4660,  1.9133]],

        [[-0.6783,  0.5550,  0.4649,  1.5167,  1.6206],
         [ 0.6388, -0.3218,  0.7403,  0.2852,  0.1950],
         [-0.6361, -1.8308,  0.0806, -1.9018, -0.7281]],

        [[ 0.9160, -0.4022, -0.8094,  0.6584, -1.5115],
         [ 0.9658, -1.1464, -0.9496, -1.4521,  0.9001],
         [ 1.2254,  1.4921, -0.1453,  0.7521, -0.4933]],

        [[-0.3162,  1.2837,  0.3168,  1.7858,  0.0061],
         [-0.1342, -0.4649, -1.2371, -0.1789, -0.4095],
         [ 1.8135, -0.2081, -0.6934, -1.9989,  0.4353]],

        [[-0.4506,  0.7247, -1.1347, -0.1918,  1.0423],
         [-0.4250,  0.6152, -0.7718, -1.5260,  0.6411],
         [ 2.3155,  0.8685, -0.9266,  0.1958, -0.9766]],

        [[ 1.4929, -0.4170, -0.6244,  0.0319, -0.4917],
         [-0.9035,  0.1552,  0.6767, -0.2369,  1.3653],
         [ 0.0061, -1.9266, -1.4868,  

We can now create a transformer easily

In [12]:
class TransformerTensorDict(TensorDictSequence):
    """A stack of `num_blocks` TransformerBlockTensorDict modules applied in sequence.

    Every block is built with the same arguments.
    """

    def __init__(self, num_blocks, to_name, from_name, to_dim, to_len, from_dim, latent_dim, num_heads):
        blocks = [
            TransformerBlockTensorDict(
                to_name,
                from_name,
                to_dim,
                to_len,
                from_dim,
                latent_dim,
                num_heads,
            )
            for _ in range(num_blocks)
        ]
        super().__init__(*blocks)

In [13]:
# Hyper-parameters of the toy problem.
to_dim, from_dim, latent_dim = 5, 6, 10
to_len, from_len = 3, 10
batch_size, num_heads = 8, 2

# Fresh random tokens for the encoder/decoder examples.
tokens = TensorDict(
    {
        "X_to": torch.randn(batch_size, to_len, to_dim),
        "X_from": torch.randn(batch_size, from_len, from_dim),
    },
    batch_size=[batch_size],
)

For an encoder, we just need to take the same tokens for queries, keys and values.

In [14]:
# Encoder: "X_to" is both the source and the target, so `from_dim` equals `to_dim`.
transformer_encoder = TransformerTensorDict(
    num_blocks=6,
    to_name="X_to",
    from_name="X_to",
    to_dim=to_dim,
    to_len=to_len,
    from_dim=to_dim,
    latent_dim=latent_dim,
    num_heads=num_heads,
)

transformer_encoder(tokens)
tokens["X_to"]

tensor([[[ 0.1218,  1.6380,  0.5283, -1.2097, -2.1976],
         [ 0.2963,  0.8917,  0.7497, -0.1242, -0.3194],
         [ 0.5004,  1.3847, -0.8797, -0.3203, -1.0600]],

        [[ 0.3451,  1.8204,  1.1999, -1.6162, -0.2519],
         [-0.1433,  1.0322, -0.6018, -0.6799, -0.5113],
         [ 0.4102,  0.7934,  0.8460, -0.9515, -1.6914]],

        [[-0.2757,  1.4416,  0.8119, -0.6991, -1.2287],
         [ 1.0713,  1.3480, -0.0162, -1.2294, -2.0809],
         [-0.1340,  0.6964,  0.6220,  0.3045, -0.6318]],

        [[ 0.1715,  2.3998,  1.4253, -1.2790, -1.2869],
         [ 0.1807,  0.6704,  0.4473, -1.1606, -0.7941],
         [-0.0601,  0.0541,  0.4753, -0.2883, -0.9554]],

        [[ 0.2194,  1.3032, -0.0246, -0.1858, -1.3101],
         [-0.5537,  0.8703, -1.1531, -0.8894, -1.7302],
         [ 1.3058,  0.2501,  1.4498,  1.0468, -0.5985]],

        [[ 0.8845,  0.2856,  0.9751, -0.6513,  0.4614],
         [-0.5029,  1.6529,  1.3564, -0.8560, -1.0680],
         [-0.7042,  0.5084,  0.6723, -

For a decoder, we now can extract info from X_from into X_to. X_to will map to queries whereas X_from will map to keys and values.

In [15]:
# Decoder: queries come from "X_to", keys and values from "X_from".
transformer_decoder = TransformerTensorDict(
    num_blocks=6,
    to_name="X_to",
    from_name="X_from",
    to_dim=to_dim,
    to_len=to_len,
    from_dim=from_dim,
    latent_dim=latent_dim,
    num_heads=num_heads,
)

transformer_decoder(tokens)
tokens["X_to"]

tensor([[[-0.5958,  1.9269,  1.4385, -1.7099, -1.2024],
         [-0.1923,  0.6554,  0.8646, -0.9089,  0.4366],
         [-0.1544,  0.5690,  0.1171, -1.3729,  0.1287]],

        [[-0.6611,  1.6326,  0.9590, -1.3373,  0.1869],
         [-0.5724,  0.9246,  0.3437, -1.2203, -0.2542],
         [-0.2347,  0.9846,  1.5354, -1.5329, -0.7540]],

        [[-1.0947,  1.5092,  1.3402, -1.0084, -0.4252],
         [-0.3901,  1.3286,  1.2193, -1.4316, -1.3246],
         [-0.7211,  0.7870,  0.2914, -0.2573,  0.1774]],

        [[-1.0461,  2.1735,  1.8569, -0.7953, -0.6545],
         [-0.3190,  0.8452,  0.8193, -1.0471, -0.7304],
         [-0.2703,  0.0935,  0.5782, -0.6885, -0.8154]],

        [[-0.3828,  1.8378,  0.1272, -0.4188, -0.1151],
         [-0.9871,  0.3603, -1.4740, -1.2111, -1.6092],
         [ 0.2361,  0.9122,  1.5031,  0.4612,  0.7603]],

        [[ 0.1466,  0.6786,  1.1725, -0.5156,  0.1942],
         [-0.9457,  1.7856,  1.7606, -0.9622, -0.9245],
         [-0.8863,  0.1064,  0.6751, -

Now we can look at both models:

In [16]:
# Display the module tree of the encoder.
transformer_encoder

TransformerTensorDict(
    module=ModuleList(
      (0): TransformerBlockTensorDict(
          module=ModuleList(
            (0): TensorDictModule(
                module=TokensToQKV(
                  (q): Linear(in_features=5, out_features=10, bias=True)
                  (k): Linear(in_features=5, out_features=10, bias=True)
                  (v): Linear(in_features=5, out_features=10, bias=True)
                ), 
                device=cpu, 
                in_keys=['X_to', 'X_to'], 
                out_keys=['Q', 'K', 'V'])
            (1): TensorDictModule(
                module=SplitHeads(), 
                device=cpu, 
                in_keys=['Q', 'K', 'V'], 
                out_keys=['Q', 'K', 'V'])
            (2): TensorDictModule(
                module=Attention(
                  (softmax): Softmax(dim=-1)
                  (out): Linear(in_features=10, out_features=5, bias=True)
                ), 
                device=cpu, 
                in_keys=['Q', 'K', 'V'

In [17]:
# Display the module tree of the decoder.
transformer_decoder

TransformerTensorDict(
    module=ModuleList(
      (0): TransformerBlockTensorDict(
          module=ModuleList(
            (0): TensorDictModule(
                module=TokensToQKV(
                  (q): Linear(in_features=5, out_features=10, bias=True)
                  (k): Linear(in_features=6, out_features=10, bias=True)
                  (v): Linear(in_features=6, out_features=10, bias=True)
                ), 
                device=cpu, 
                in_keys=['X_to', 'X_from'], 
                out_keys=['Q', 'K', 'V'])
            (1): TensorDictModule(
                module=SplitHeads(), 
                device=cpu, 
                in_keys=['Q', 'K', 'V'], 
                out_keys=['Q', 'K', 'V'])
            (2): TensorDictModule(
                module=Attention(
                  (softmax): Softmax(dim=-1)
                  (out): Linear(in_features=10, out_features=5, bias=True)
                ), 
                device=cpu, 
                in_keys=['Q', 'K', '