# **Gated Transformer Network for Sequence Classification**
Thomas Hopkins

## **Reference**

[1] Liu, M., Ren, S., Ma, S., Jiao, J., Chen, Y., Wang, Z., & Song, W. (2021). Gated Transformer Networks for Multivariate Time Series Classification. arXiv preprint arXiv:2103.14438. ([link](https://arxiv.org/pdf/2103.14438.pdf))

## **Imports and Data**

In [None]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

# Seed the PyTorch and NumPy random number generators for reproducibility.
torch.manual_seed(32)
np.random.seed(0)

# Directory prefix that every CSV path below is built from.
BASE_DIR = '/kaggle/input/tabular-playground-series-apr-2022/'

In [None]:
# Load the sequences, their labels and the test sequences.
train_sequences = pd.read_csv(BASE_DIR + "train.csv")
train_labels = pd.read_csv(BASE_DIR + "train_labels.csv")
test_sequences = pd.read_csv(BASE_DIR + "test.csv")

# Hold out the first 23340 rows (389 sequences of 60 rows each) for validation.
# Each split is an explicit copy: new columns are assigned to these frames
# later, and assigning to a bare `iloc` slice triggers pandas'
# SettingWithCopyWarning.
val_sequences = train_sequences.iloc[:23340].copy()
train_sequences = train_sequences.iloc[23340:].copy()

# Every sequence spans 60 rows, so rows / 60 is the number of sequences.
print('training size: ', len(train_sequences) / 60)
print('validation size: ', len(val_sequences) / 60)
print('testing size: ', len(test_sequences) / 60)

In [None]:
# Re-base each split's sequence ids so they start at 0. The resulting
# `sequence_local` column makes DataLoader indexing easier, because a sequence
# can be looked up by its position within the split.
train_min_seq_num = train_sequences['sequence'].min()
val_min_seq_num = val_sequences['sequence'].min()
test_min_seq_num = test_sequences['sequence'].min()

train_sequences['sequence_local'] = train_sequences['sequence'] - train_min_seq_num
val_sequences['sequence_local'] = val_sequences['sequence'] - val_min_seq_num
test_sequences['sequence_local'] = test_sequences['sequence'] - test_min_seq_num

In [None]:
# Preview the first rows of the training sequences.
train_sequences.head()

In [None]:
# Preview the first rows of the validation sequences.
val_sequences.head()

In [None]:
# Preview the first rows of the test sequences.
test_sequences.head()

In [None]:
# Preview the first rows of the label table.
train_labels.head()

## **Gated Transformer Network - Overview of the Architecture [1]**
The Gated Transformer Network (GTN) for time-series classification is an architecture with two encodings (also known as towers) that are merged by a gating mechanism. The two encodings are for the channel-wise and the step-wise correlations.

In this case, we have 13 different sensors (or channels) and 60 time-steps per observation. Since the channels do not have any defined ordering, we only use positional encoding for the step-wise correlations. Furthermore, since our data is continuous, the embedding layer is simply a fully connected layer with *tanh* activation. 

Here is the full architecture from the paper:

![GTN architecture image](https://media.arxiv-vanity.com/render-output/5404421/Two_Tower_V5.png)

Now we will implement this in PyTorch along with a data loader.

In [None]:
class SequenceDataset(Dataset):
    ''' Serves one sensor sequence (and, when available, its label) per item.

    Args:
        sequences: DataFrame with a `sequence` id column, a `sequence_local`
            column (the key items are indexed by) and one column per sensor
            (every column whose name contains 'sensor'). The lookup tables are
            built at construction time, so `sequence_local` must already be
            present and the frame should not be modified afterwards.
        labels: optional DataFrame with `sequence` and `state` columns. When
            omitted, every item carries `np.nan` in place of a label.
        steps_per_sequence: number of rows that make up one sequence. Only
            used to compute the dataset length.
    '''
    def __init__(self, sequences, labels=None, steps_per_sequence=60):
        super().__init__()
        self.sequences = sequences
        self.labels = labels
        self.steps_per_sequence = steps_per_sequence
        self.sensor_cols = [c for c in self.sequences.columns if 'sensor' in c]

        # Build every lookup once so that fetching an item no longer scans
        # the whole DataFrame.
        self._sensor_values = sequences[self.sensor_cols].to_numpy()
        # sequence id -> positions of its rows, in file order
        self._rows_by_sequence = sequences.groupby('sequence').indices
        # local key -> sequence id of the first row carrying that key
        first_rows = sequences.drop_duplicates(subset='sequence_local', keep='first')
        self._sequence_by_local = dict(zip(first_rows['sequence_local'],
                                           first_rows['sequence']))
        if labels is None:
            self._state_by_sequence = None
        else:
            # the first label listed for a sequence wins
            first_labels = labels.drop_duplicates(subset='sequence', keep='first')
            self._state_by_sequence = dict(zip(first_labels['sequence'],
                                               first_labels['state']))

    def __getitem__(self, seq_key):
        ''' Returns a single sequence (shape (steps, num_features)) and its label (0 or 1).

        Raises IndexError when `seq_key` (or its label) does not exist.
        '''
        try:
            actual_seq_key = self._sequence_by_local[seq_key]
        except KeyError as err:
            raise IndexError(f'no sequence with local index {seq_key!r}') from err

        if self._state_by_sequence is None:
            label_tensor = np.nan
        else:
            try:
                label = self._state_by_sequence[actual_seq_key]
            except KeyError as err:
                raise IndexError(f'no label for sequence {actual_seq_key!r}') from err
            label_tensor = torch.tensor(label, dtype=torch.long)

        sensors_arr = self._sensor_values[self._rows_by_sequence[actual_seq_key]]
        sensor_tensor = torch.tensor(sensors_arr, dtype=torch.float32)
        return sensor_tensor, label_tensor

    def __len__(self):
        ''' Number of complete sequences in the DataFrame. '''
        return len(self.sequences) // self.steps_per_sequence
    

class PositionalEncoding(nn.Module):
    ''' Sinusoidal positional encoding followed by dropout.

    Adapted from https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    Args:
        d_model: embedding size of the inputs (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: longest sequence length the precomputed table covers.
    '''
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer cosine slot than sine slots, so the
        # last frequency is dropped for the cosine half. For an even d_model
        # the slice keeps every column.
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        # stored as a buffer: follows the module across devices, not trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape: `x` plus the encoding of its first
            `seq_len` positions, passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
    
class Gate(nn.Module):
    ''' Gating mechanism that merges the two towers as described in [1].

    Both tower outputs are averaged over their first dimension, two softmax
    weights are computed from the concatenated averages, and each average is
    scaled by its own weight before the two are concatenated again.
    '''
    def __init__(self, c_dim, s_dim):
        super().__init__()
        # two outputs: one gating weight per tower
        self.linear_project = nn.Linear(c_dim + s_dim, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, c, s):
        """
        Args:
            c: channel-wise tower output, averaged over dim 0 to (N, c_dim)
            s: step-wise tower output, averaged over dim 0 to (N, s_dim)

        Returns:
            Tensor of shape (N, c_dim + s_dim) holding the weighted averages.
        """
        pooled_c = c.mean(dim=0)
        pooled_s = s.mean(dim=0)
        logits = self.linear_project(torch.cat((pooled_c, pooled_s), dim=1))
        weights = self.softmax(logits)
        # slicing with 0:1 / 1:2 keeps a trailing axis for broadcasting
        weight_c = weights[:, 0:1]
        weight_s = weights[:, 1:2]
        return torch.cat((pooled_c * weight_c, pooled_s * weight_s), dim=1)


class GTN(nn.Module):
    ''' Gated Transformer Network [1]: a step-wise and a channel-wise
    transformer encoder whose outputs are merged by a gate and classified by
    a linear layer with two outputs.

    Args:
        num_channels: number of channels (sensors) per time-step.
        num_steps: number of time-steps per sequence.
        step_embed_dim: embedding size of the step-wise tower.
        channel_embed_dim: embedding size of the channel-wise tower.
        step_nheads: attention heads in the step-wise tower; must divide
            `step_embed_dim`.
        channel_nheads: attention heads in the channel-wise tower; must divide
            `channel_embed_dim`.
        step_ff_dim: feed-forward size inside the step-wise encoder layers.
        channel_ff_dim: feed-forward size inside the channel-wise encoder layers.
        num_layers: number of encoder layers in each tower.
        dropout: dropout rate used by the positional encoding and by both
            transformer encoders.
    '''
    # The head-count defaults are 8 because the previous default of 5 does not
    # divide the default embedding size of 128, which made `GTN()` fail inside
    # nn.MultiheadAttention.
    def __init__(self, num_channels=13, num_steps=60, step_embed_dim=128,
                 channel_embed_dim=128, step_nheads=8, channel_nheads=8,
                 step_ff_dim=2048, channel_ff_dim=2048,
                 num_layers=4, dropout=0.1):
        super().__init__()
        # step-wise embedding: each step gets its own embedding
        self.step_embed = nn.Linear(num_channels, step_embed_dim)
        # channel-wise embedding: each channel gets its own embedding
        self.channel_embed = nn.Linear(num_steps, channel_embed_dim)
        self.tanh = nn.Tanh()
        # only step-wise embeddings are ordered
        self.pe = PositionalEncoding(step_embed_dim, dropout=dropout, max_len=num_steps)
        # step-wise transformer encoder
        step_encoder_layer = nn.TransformerEncoderLayer(step_embed_dim, step_nheads,
                                                        dim_feedforward=step_ff_dim,
                                                        dropout=dropout)
        self.step_transf = nn.TransformerEncoder(step_encoder_layer, num_layers)

        # channel-wise transformer encoder
        channel_encoder_layer = nn.TransformerEncoderLayer(channel_embed_dim, channel_nheads,
                                                           dim_feedforward=channel_ff_dim,
                                                           dropout=dropout)
        self.channel_transf = nn.TransformerEncoder(channel_encoder_layer, num_layers)
        # gating
        self.gate = Gate(channel_embed_dim, step_embed_dim)
        # linear output
        self.out = nn.Linear(channel_embed_dim + step_embed_dim, 2)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [num_steps, batch_size, num_channels]

        Returns:
            Tensor, shape [batch_size, 2], the unnormalized class scores.
        """
        # input is (60, N, 13)
        # x_channel should be (13, N, 60)
        # x_step should be (60, N, 13)
        x_step = x
        x_channel = x.transpose(0, 2).contiguous()
        # embedding (linear) layers
        x_step = self.tanh(self.step_embed(x_step))
        x_channel = self.tanh(self.channel_embed(x_channel))
        # positional encoding (only for step embeds)
        x_step = self.pe(x_step)
        # transformer encodings
        x_step = self.step_transf(x_step)
        x_channel = self.channel_transf(x_channel)
        # gating
        x = self.gate(x_channel, x_step)
        return self.out(x)
    

def train_one_epoch(train_loader, optimizer, model, loss_func, avg_losses, device, disable=True):
    """Run one optimization pass over `train_loader`.

    Args:
        train_loader: iterable of (inputs, targets) batches, batch first.
        optimizer: optimizer that updates the parameters of `model`.
        model: network being trained; switched to training mode here.
        loss_func: callable mapping (predictions, targets) to a scalar loss.
        avg_losses: list that receives one loss value per batch (mutated).
        device: device the batches are moved to.
        disable: when True the tqdm progress bar is hidden.
    """
    model.train()
    batches = tqdm(train_loader, disable=disable)
    for inputs, targets in batches:
        # the transformer expects the batch on the second axis
        inputs = inputs.transpose(0, 1).contiguous()
        optimizer.zero_grad()
        inputs, targets = inputs.to(device), targets.to(device)
        batch_loss = loss_func(model(inputs), targets)
        avg_losses.append(batch_loss.item())
        batch_loss.backward()
        optimizer.step()


def validate(val_loader, model, criterion, device, disable=True):
    """Score `model` on every batch of `val_loader` at once.

    The targets and class-1 probabilities of all batches are collected and
    `criterion` is evaluated a single time over the whole set. Scoring inside
    the loop would only return the score of the last batch.

    Args:
        val_loader: iterable of (inputs, targets) batches, batch first.
        model: network to evaluate; switched to evaluation mode here.
        criterion: callable taking (targets, class-1 probabilities).
        device: device the inputs are moved to.
        disable: when True the tqdm progress bar is hidden.

    Returns:
        Whatever `criterion` returns for the full validation set.

    Raises:
        ValueError: if `val_loader` yields no batches.
    """
    model.eval()
    all_targets = []
    all_probs = []
    with torch.no_grad():
        for x, y in tqdm(val_loader, disable=disable):
            # transformer takes batch as second dim
            x = x.transpose(0, 1).contiguous()
            x = x.to(device)
            preds = model(x)
            all_probs.append(F.softmax(preds.cpu(), dim=1)[:, 1])
            all_targets.append(torch.as_tensor(y).cpu())
    if not all_probs:
        raise ValueError('val_loader produced no batches to validate on')
    return criterion(torch.cat(all_targets), torch.cat(all_probs))

Now we set up the training hyperparameters and prepare the data. The values below are untuned starting points: I am not yet sure what works well here, so I will most likely compare a few different settings on a subset of the training data.

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# --- data dimensions ---
num_channels = 13          # number of sensors
num_steps = 60             # number of time-steps

# --- model size ---
step_embed_dim = 256       # embedding size for steps
channel_embed_dim = 256    # embedding size for channels
step_nheads = 16           # number of heads in step-wise attention module
channel_nheads = 16        # number of heads in channel-wise attention module
step_ff_dim = 2048         # linear layer size within step-wise transformer
channel_ff_dim = 2048      # linear layer size within channel-wise transformer
num_layers = 4             # number of transformer layers for each
dropout = 0.1              # dropout regularization rate

# --- optimization ---
epochs = 50                # number of passes over training data
batch_size = 256           # number of observations in model input
lr = 0.001                 # optimizer learning rate
avg_losses = []            # per-batch training losses, filled during training

# Datasets: train and validation share the label table, test has no labels.
train_dataset = SequenceDataset(train_sequences, labels=train_labels)
val_dataset = SequenceDataset(val_sequences, labels=train_labels)
test_dataset = SequenceDataset(test_sequences)

# Loaders: only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=400, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Build the network and move it to the chosen device.
model = GTN(
    num_channels=num_channels,
    num_steps=num_steps,
    step_embed_dim=step_embed_dim,
    channel_embed_dim=channel_embed_dim,
    step_nheads=step_nheads,
    channel_nheads=channel_nheads,
    step_ff_dim=step_ff_dim,
    channel_ff_dim=channel_ff_dim,
    num_layers=num_layers,
    dropout=dropout,
).to(device)

# Training uses cross-entropy; validation is scored with ROC AUC, and the
# scheduler lowers the learning rate when that score stops increasing.
optimizer = optim.Adagrad(model.parameters(), lr=lr)
criterion = roc_auc_score
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=10)
loss_func = nn.CrossEntropyLoss()

In [None]:
# Train for `epochs` passes. After each pass the validation set is scored and
# the score is handed to the scheduler, which was created with mode='max' and
# therefore reacts when the score stops increasing.
for e in tqdm(range(epochs), total=epochs, disable=False):
    train_one_epoch(train_loader, optimizer, model, loss_func, avg_losses, device, disable=True)
    val_auc = validate(val_loader, model, criterion, device, disable=True)
    print(f'ROC score: {val_auc}')
    lr_scheduler.step(val_auc)

In [None]:
# Plot the recorded training loss of every batch, in the order the batches
# were processed.
plt.plot(range(len(avg_losses)), avg_losses)
plt.xlabel('Batch #')
plt.ylabel('Avg Loss')
plt.title('Average Loss Per Batch During Training')
plt.show()

In [None]:
# Predict the probability of state 1 for every test sequence.
model.eval()
state_probabilities = []
with torch.no_grad():
    for x, _ in tqdm(test_loader):
        # (N, steps, channels) -> (steps, N, channels), the same layout that
        # training and validation feed to the model. A reshape only matches
        # this for a batch size of 1; a transpose is correct for any batch size.
        x = x.transpose(0, 1).contiguous()
        x = x.to(device)
        preds = model(x)
        probs = F.softmax(preds, dim=1)[:, 1]
        # one probability per sequence in the batch
        state_probabilities.extend(probs.cpu().tolist())

In [None]:
# Load the sample submission file and preview its first rows.
submission = pd.read_csv(BASE_DIR + 'sample_submission.csv')
submission.head()

In [None]:
# Store the predicted probabilities in the `state` column. Item assignment is
# used because attribute assignment only updates a column that already exists;
# otherwise it sets a plain attribute that never reaches the CSV.
submission['state'] = state_probabilities
submission.head()

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)