# **Bi-Directional LSTM for Sequence Classification**
Thomas Hopkins

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Seed the PyTorch and NumPy random number generators so repeated runs start
# from the same random state.
torch.manual_seed(32)
np.random.seed(0)

# Directory the competition CSV files are read from.
BASE_DIR = '/kaggle/input/tabular-playground-series-apr-2022/'

In [None]:
def _read_competition_csv(file_name):
    ''' Load one CSV file from BASE_DIR into a DataFrame '''
    return pd.read_csv(BASE_DIR + file_name)


train_sequences = _read_competition_csv("train.csv")
train_labels = _read_competition_csv("train_labels.csv")
test_sequences = _read_competition_csv("test.csv")

In [None]:
# Re-base the sequence ids of each split so that they start at 0. The shifted
# ids are stored in `sequence_local`, which lets a DataLoader address a
# sequence by a plain integer index later on.
train_min_seq_num = train_sequences['sequence'].min()
test_min_seq_num = test_sequences['sequence'].min()
train_sequences['sequence_local'] = train_sequences['sequence'].sub(train_min_seq_num)
test_sequences['sequence_local'] = test_sequences['sequence'].sub(test_min_seq_num)

In [None]:
# Preview the first rows of the training sequences (now including sequence_local).
train_sequences.head()

In [None]:
# Preview the first rows of the test sequences (now including sequence_local).
test_sequences.head()

In [None]:
# Preview the first rows of the label table (one `state` per `sequence`).
train_labels.head()

## **Viewing a full sequence**
Here we will look at how a single sequence varies over time for all of the different sensors. The first plot will be for a sequence with `state = 0` while the other will be for a sequence with `state = 1`.

In [None]:
def plot_sensor(data, sensor_num, ax=None):
    ''' Plot the readings of one sensor against the step index.

    data:       object indexable by the column name "sensor_<sensor_num>",
                e.g. the DataFrame rows belonging to a single sequence
    sensor_num: sensor suffix as a string (e.g. '00'); also used as the
                legend label of the plotted line
    ax:         matplotlib axes to draw on; the current axes are used when
                this is None
    '''
    if ax is None:
        ax = plt.gca()
    sensor_series = data["sensor_" + sensor_num]
    # Derive the x-axis from the data instead of assuming 60 steps so that
    # sequences of any length can be plotted.
    ax.plot(range(len(sensor_series)), sensor_series, label=sensor_num)

In [None]:
seq0 = 7  # expected to have state = 0
seq1 = 4  # expected to have state = 1

# Warn when a chosen sequence does not have the state its plot title claims.
seq0_state = train_labels.loc[train_labels.sequence == seq0, 'state'].iloc[0]
if seq0_state == 1:
    print(f'Warning: sequence {seq0} has state = 1')
seq1_state = train_labels.loc[train_labels.sequence == seq1, 'state'].iloc[0]
if seq1_state == 0:
    print(f'Warning: sequence {seq1} has state = 0')

sequence_0 = train_sequences[train_sequences.sequence == seq0]
sequence_1 = train_sequences[train_sequences.sequence == seq1]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 20))

# comment out any sensors you don't want to see in the output
sensors_to_plot = [
    '00',
    '01',
    '02',
    '03',
    '04',
    '05',
    '06',
    '07',
    '08',
    '09',
    '10',
    '11',
    '12',
]

# One panel per example sequence: (axes, rows of the sequence, panel title).
panels = (
    (ax1, sequence_0, f"Sensor data for Sequence {seq0}: state = 0"),
    (ax2, sequence_1, f"Sensor data for Sequence {seq1}: state = 1"),
)
for axis, sequence_rows, panel_title in panels:
    for sensor_num in sensors_to_plot:
        plot_sensor(sequence_rows, sensor_num, ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel("Step")
    axis.set_ylabel("Reading")
    axis.legend()

Cycling through a couple of different examples (by changing `seq0` and `seq1`), we can see some differences in pattern between sequences with `state = 0` and `state = 1`. Don't let this influence your feature engineering, because conclusions drawn from a handful of hand-picked sequences might bias your decisions. This is purely to see if there are *any* differences we can immediately notice. If you have any questions about this point, please leave a comment so we can discuss.

Let's see if a deep learning model can find these differences.

## **Forward Lag and Difference Feature Engineering**
Here we will add some basic features to make learning a bit easier. A lag feature takes the value from the previous time-step and makes it available at the current time-step. A difference feature is the current time-step's value minus the previous time-step's value. Both are first computed in the forward direction, i.e. in the order in which the sequence actually occurred. Since this is classification and not forecasting, we can also use the backward lag and difference features, which are computed from the next time-step instead of the previous one.

In [None]:
def create_new_features(dataset):
    ''' Create forward and backward lag and difference features for each sensor.

    For every column whose name contains 'sensor', four columns are added:
      <sensor>_lag1_forw: value of the previous step within the same sequence
      <sensor>_diff_forw: current value minus <sensor>_lag1_forw
      <sensor>_lag1_back: value of the next step within the same sequence
      <sensor>_diff_back: current value minus <sensor>_lag1_back
    The first (forward) or last (backward) step of a sequence has no
    neighbour; its lag is filled with the median of that sensor over the
    whole dataset.

    dataset: DataFrame with a 'sequence' column and the sensor columns.
    Returns the same DataFrame object, modified in place.
    '''
    # Snapshot the sensor columns first so the columns added below are not
    # picked up as sensors themselves.
    sensor_cols = [c for c in dataset.columns if 'sensor' in c]
    # The grouping depends only on 'sequence', so it is built once.
    by_sequence = dataset.groupby('sequence')
    for sensor in sensor_cols:
        sensor_median = dataset[sensor].median()
        grouped_sensor = by_sequence[sensor]

        # Assign the filled result directly. Calling fillna(inplace=True) on
        # a column selection is chained assignment, which does not update
        # the frame when pandas Copy-on-Write is active.
        dataset[f'{sensor}_lag1_forw'] = grouped_sensor.shift(1).fillna(sensor_median)
        dataset[f'{sensor}_diff_forw'] = dataset[sensor] - dataset[f'{sensor}_lag1_forw']

        dataset[f'{sensor}_lag1_back'] = grouped_sensor.shift(-1).fillna(sensor_median)
        dataset[f'{sensor}_diff_back'] = dataset[sensor] - dataset[f'{sensor}_lag1_back']
    return dataset

In [None]:
# create_new_features adds the new columns to the frame it is given and
# returns that same frame, so train_seq / test_seq refer to the same objects
# as train_sequences / test_sequences.
train_seq = create_new_features(train_sequences)
test_seq = create_new_features(test_sequences)

In [None]:
# Preview the training frame with the lag and difference columns added.
train_seq.head()

In [None]:
# Sanity check: show one sensor next to its four derived columns to confirm
# that the lag and difference features line up with the raw readings.
train_seq[['sequence', 'sensor_00', 'sensor_00_lag1_forw', 'sensor_00_diff_forw', 'sensor_00_lag1_back', 'sensor_00_diff_back']].head()

## **Bidirectional LSTM**
Here we will use a bidirectional LSTM for sequence classification. See https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM for details.

First we will define a dataset so the data is easier to work with for our training loop.

In [None]:
class SequenceDataset(Dataset):
    ''' Serves one sensor sequence (and, when available, its label) per index.

    sequences: DataFrame with a 'sequence_local' column (the integer key used
               for indexing), a 'sequence' column (needed only when labels
               are given) and the feature columns, i.e. every column whose
               name contains 'sensor'.
    labels:    optional DataFrame with 'sequence' and 'state' columns. When
               omitted, every item is returned with NaN in place of a label.

    The row positions of each sequence are indexed once at construction, so
    the rows of `sequences` must not be reordered afterwards.
    '''
    def __init__(self, sequences, labels=None):
        super().__init__()
        self.sequences = sequences
        self.labels = labels
        self.sensor_cols = [c for c in self.sequences.columns if 'sensor' in c]
        # Map sequence_local -> row positions once, instead of scanning the
        # whole frame with a boolean mask on every __getitem__ call.
        self._rows_by_key = self.sequences.groupby('sequence_local', sort=False).indices
        if self.labels is None:
            self._state_by_sequence = None
        else:
            # Keep the first label of a sequence if it is listed more than once.
            first_labels = self.labels.drop_duplicates(subset='sequence', keep='first')
            self._state_by_sequence = dict(zip(first_labels['sequence'], first_labels['state']))

    def __getitem__(self, seq_key):
        ''' Returns a single sequence (shape (num_steps, num_features)) and its label.

        seq_key is matched against 'sequence_local'. The label is looked up
        through the sequence's own 'sequence' id, so it is correct even when
        the original ids do not start at 0. Raises IndexError for an unknown
        seq_key and KeyError when a sequence has no entry in the labels.
        '''
        try:
            rows = self._rows_by_key[seq_key]
        except KeyError:
            raise IndexError(f'no sequence with sequence_local == {seq_key}') from None
        seq_df = self.sequences.iloc[rows]
        if self._state_by_sequence is None:
            label_tensor = np.nan
        else:
            sequence_id = seq_df['sequence'].iloc[0]
            label = self._state_by_sequence[sequence_id]
            label_tensor = torch.tensor(label, dtype=torch.long)
        sensors_arr = seq_df[self.sensor_cols].to_numpy()
        sensor_tensor = torch.tensor(sensors_arr, dtype=torch.float32)
        return sensor_tensor, label_tensor

    def __len__(self):
        ''' Number of labels when labels are given, else number of distinct sequences '''
        if self.labels is None:
            # Count the sequences directly rather than assuming 60 rows each.
            return len(self._rows_by_key)
        return len(self.labels)
    
    
class LSTMClassifier(nn.Module):
    ''' Bidirectional LSTM followed by a two-layer classification head.

    input_dim:    number of features per time-step
    hidden_size:  hidden size of each LSTM direction
    num_layers:   number of stacked LSTM layers
    dropout_rate: dropout applied between stacked LSTM layers
    seq_len:      number of time-steps in every input sequence; the head
                  flattens all time-steps, so inputs must have this length
    '''
    def __init__(self, input_dim, hidden_size, num_layers, dropout_rate, seq_len=60):
        super().__init__()
        # nn.LSTM only applies dropout between stacked layers. With a single
        # layer it has no effect and only triggers a warning, so pass 0 then.
        lstm_dropout = dropout_rate if num_layers > 1 else 0.0
        self.LSTM = nn.LSTM(input_dim, hidden_size, num_layers, batch_first=True,
                            dropout=lstm_dropout, bidirectional=True)
        # Both directions are concatenated, hence the factor of 2.
        self.l1 = nn.Linear(2 * hidden_size * seq_len, 128)
        self.l2 = nn.Linear(128, 2)

    def forward(self, x):
        ''' Maps x of shape (N, seq_len, input_dim) to logits of shape (N, 2) '''
        # LSTM output is (N, seq_len, 2 * hidden_size); flatten everything
        # but the batch dimension to (N, seq_len * 2 * hidden_size).
        x = self.LSTM(x)[0].flatten(start_dim=1)
        x = F.relu(self.l1(x))
        return self.l2(x)

Now we set up our training hyperparameters and get the data ready. I'm not sure what works well here but I will most likely test a few different parameters on a subset of the training data.

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Training hyperparameters.
EPOCHS = 40
BATCH_SIZE = 128
HIDDEN_SIZE = 256
NUM_LAYERS = 8
DROPOUT_RATE = 0.4
LEARNING_RATE = 0.0005
AVG_LOSSES = []  # filled by the training loop with one loss value per batch

# Datasets: the training set carries labels, the test set does not.
train_dataset = SequenceDataset(train_seq, labels=train_labels)
test_dataset = SequenceDataset(test_seq)

# Training batches are shuffled; test sequences are served one at a time and
# in order so predictions line up with the submission rows.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1)

num_features = len(train_dataset.sensor_cols)
model = LSTMClassifier(num_features, HIDDEN_SIZE, NUM_LAYERS, DROPOUT_RATE).to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Put the model in training mode explicitly. If the evaluation cell (which
# calls model.eval()) was run before this one, dropout would otherwise stay
# disabled for the whole training run.
model.train()
for e in tqdm(range(EPOCHS), total=EPOCHS):
    for x, y in train_loader:
        # Clear the gradients accumulated by the previous batch.
        optimizer.zero_grad()
        x = x.to(device)
        y = y.to(device)
        preds = model(x)
        loss = loss_func(preds, y)
        # Record the loss of this batch for the plot below.
        AVG_LOSSES.append(loss.item())
        loss.backward()
        optimizer.step()

In [None]:
# Close the current figure, then draw the loss recorded for every training batch.
plt.close()
batch_numbers = range(len(AVG_LOSSES))
plt.plot(batch_numbers, AVG_LOSSES)
plt.title('Average Loss Per Batch During Training')
plt.xlabel('Batch #')
plt.ylabel('Avg Loss')
plt.show()

In [None]:
# Switch to evaluation mode (disables dropout) and collect, for every test
# sequence, the predicted probability of state = 1.
model.eval()
state_probabilities = []
# Gradients are not needed for inference, so disable them once for the loop.
with torch.no_grad():
    for x, _ in tqdm(test_loader):
        x = x.to(device)
        preds = model(x)
        # Column 1 of the softmax output is the probability of state = 1.
        # Taking the whole column works for any batch size, not just 1.
        probs = F.softmax(preds, dim=1)[:, 1]
        state_probabilities.extend(probs.tolist())

In [None]:
# Load the sample submission; it serves as the template for our predictions.
sample_submission_path = BASE_DIR + 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission.head()

In [None]:
# Overwrite the template's `state` column with the predicted probabilities.
submission['state'] = state_probabilities
submission.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)