# Investigating Adaptive Chunk-based Composition in RNNs

In this project, we investigate how common elements (e.g. sub-sequences) shared across training examples are reused, by probing the hidden activity of gated recurrent units (GRUs) while they regenerate the original sequences.

In [60]:
import numpy as np
import matplotlib.pyplot as plt
import os
import torch as th
from torch import nn

In [59]:
# load dataset: n_words, n_repeats, n_samples, 2 (x,y)
all_words = np.load('./data/all_words.npz')

# validate dataset: every array stored in the archive must have the same shape
# as the first one. An explicit exception is raised (rather than `assert`) so
# the check still runs under `python -O` and reports which entries disagree.
keys = list(all_words.keys())
if len(keys) > 1:
    reference_shape = all_words[keys[0]].shape
    mismatched = {}
    for key in keys[1:]:
        shape = all_words[key].shape
        if shape != reference_shape:
            mismatched[key] = shape
    if mismatched:
        raise ValueError(
            f"Inconsistent array shapes in dataset: expected {reference_shape} "
            f"(from '{keys[0]}'), but got {mismatched}"
        )

In [None]:
class RNN(nn.Module):
    """
    Single-layer GRU with trainable initial hidden states (one per word),
    followed by a fully connected read-out passed through a sigmoid.

    Args:
        input_dim: size of each input vector fed to the GRU.
        hidden_dim: size of the GRU hidden state.
        output_dim: size of the read-out vector.
        num_words: number of trainable initial hidden states stored in `h0`.
        device: device the module is moved to at the end of construction.
    """
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_words: int, device):
        super().__init__()

        # member variables
        self.device = device
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.n_layers = 1
        self.gru = nn.GRU(input_dim, hidden_dim, self.n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

        # initialize params: every parameter registered so far must have an
        # explicit initializer, otherwise construction fails loudly
        initializers = {
            "gru.weight_ih_l0": nn.init.xavier_uniform_,
            "gru.weight_hh_l0": nn.init.orthogonal_,
            "gru.bias_ih_l0": nn.init.zeros_,
            "gru.bias_hh_l0": nn.init.zeros_,
            "fc.weight": nn.init.xavier_uniform_,
            "fc.bias": nn.init.zeros_,
        }
        for name, param in self.named_parameters():
            init_fn = initializers.get(name)
            if init_fn is None:
                raise ValueError(f"Parameter {name} not initialized")
            init_fn(param)

        # trainable initial hidden states, shape (n_layers, num_words, hidden_dim);
        # registered after the loop above, so that loop never sees this parameter
        self.h0 = nn.Parameter(th.zeros(self.n_layers, num_words, hidden_dim), requires_grad=True)

        # move to device
        self.to(device)

    def forward(self, x: th.Tensor, h0: th.Tensor):
        """
        Advance the GRU by one time step and read out the result.

        Args:
            x: input for a single step; a length-1 sequence axis is inserted
                at dim 1 before it is passed to the batch-first GRU, so `x`
                is expected to be 2-D, (batch, input_dim).
            h0: hidden state handed to the GRU for this step. This is the
                argument, not `self.h0`; see `init_hidden` for building it.

        Returns:
            (u, h): `u` is the sigmoid of the read-out with the length-1
            sequence axis squeezed out again; `h` is the hidden state
            returned by the GRU.
        """
        z, h = self.gru(x[:, None, :], h0)  # todo: see if this will work!
        u = self.sigmoid(self.fc(z)).squeeze(dim=1)
        return u, h

    def init_hidden(self, batch_size: int):
        """
        Tile the trainable initial hidden states along the batch axis.

        `h0` has shape (n_layers, num_words, hidden_dim); the result has
        shape (n_layers, num_words * batch_size, hidden_dim). `repeat` tiles
        the whole block of words, so entry `i` along the batch axis holds the
        state of word `i % num_words`.

        The result stays on whatever device `h0` currently lives on, so it
        remains consistent with the other parameters even if the module was
        moved after construction.
        """
        # todo: will this work???? (open question: does the tiling order
        #       match the batch layout the caller feeds to `forward`?)
        return self.h0.repeat(1, batch_size, 1)

In [None]:
# todo: how to ensure that the h0 is kept separate for each minibatch (i.e. per word)?
# todo: or that training happens over the entire dataset (all words) at once per epoch?
# todo: the environment has to provide (mini)batchwise words, n_repeats, T steps, and as feedback the cumsum of actions 
#       (mimicking visual feedback)