In [13]:
from utility_funcs import get_train_labels_test, split_train_data, scale_and_as_array

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [14]:
train, labels, test = get_train_labels_test()

In [15]:
features = [f for f in train.columns if 'sensor' in f]

In [16]:
groups = train["sequence"]
train = train.drop(["subject", "step",'sequence'], axis=1)
test = test.drop([ "subject", "step",'sequence'], axis=1)

In [17]:
scaler = StandardScaler()
scaler = scaler.fit(train[features])



In [18]:

X_train, X_valid, y_train, y_valid = split_train_data(train, labels)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

X_train = scale_and_as_array(X_train, features, scaler, scale_data = True)
X_valid = scale_and_as_array(X_valid,  features,scaler, scale_data = True)
X_test = scale_and_as_array(test,  features,scaler, scale_data = True)

(1402260, 13) (23371, 2)
(155820, 13) (2597, 2)


In [19]:
class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, X, y, seq_num):
        super().__init__()
        self.X = X
        self.y = y
        self.seq_num = seq_num
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx[0]//self.seq_num]



class TestDataset(torch.utils.data.Dataset):
    def __init__(self, X, seq_num):
        super().__init__()
        self.X = X
        self.seq_num = seq_num
    def __len__(self):
        return len(self.X) // 60
    def __getitem__(self, idx):
        return self.X[idx]



def prepare_data(data, data_labels, seq_num, data_num, mode='train'):
    if data_labels is not None:
        data_labels = data_labels['state'].values
    
    sampler = np.array([list(range(i * seq_num, (i + 1) * seq_num)) for i in range(data_num // seq_num)])
    if mode == 'train':
        dataset = TrainDataset(data, data_labels, seq_num)
    else:
        dataset = TestDataset(data, seq_num)

    dataloader = DataLoader(dataset, batch_size=64, sampler=sampler)
    return dataloader

train_dataloader = prepare_data(X_train, y_train, 60, X_train.shape[0])
valid_dataloader = prepare_data(X_valid, y_valid, 60, X_valid.shape[0])


In [20]:

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [21]:
class LSTM(nn.Module):
    def __init__(self,input_size,num_classes,hidden_size,num_layers):
        super().__init__()
        
        self.lstm1 = nn.LSTM(input_size,hidden_size,num_layers,batch_first=True,dropout=0,bidirectional=True)

        self.final = nn.Sequential(
            nn.ReLU(),
            nn.Linear(hidden_size*60*2, num_classes),
        )
            
    def forward(self,x):
        
        out, _ = self.lstm1(x)
        
        out = out.reshape(out.shape[0],-1)
        
        out = self.final(out)
        return out

In [22]:
input_size = len(features)
hidden_size = 128
num_classes = 1
learning_rate = 1e-4
num_epochs = 100

model = LSTM(input_size,num_classes,hidden_size,1)
model.to(device)

LSTM(
  (lstm1): LSTM(13, 128, batch_first=True, bidirectional=True)
  (final): Sequential(
    (0): ReLU()
    (1): Linear(in_features=15360, out_features=1, bias=True)
  )
)

In [23]:

criterion = nn.MSELoss() # seems to preform better than logits
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
num_warmup_steps = int(0.1 * num_epochs * len(train_dataloader))
num_training_steps = int(num_epochs * len(train_dataloader))
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps
)

In [24]:
torch.cuda.empty_cache()
for epoch in tqdm(range(num_epochs-1)): 
    model.train()
    for trainX, train_y in train_dataloader:
        outputs = model(trainX.to(device,dtype=torch.float32)).squeeze(-1)
        optimizer.zero_grad()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        loss = criterion(outputs, train_y.to(device,dtype=torch.float32))
        loss.backward()

        optimizer.step()
        scheduler.step()

    model.eval()
    for validX, valid_y in valid_dataloader:
        with torch.no_grad():
            val_out = model(validX.to(device,dtype=torch.float32)).squeeze(-1)
            vall_loss = criterion(val_out,valid_y.to(device,dtype=torch.float32))

    if epoch % 10 == 0:
        
          print("Epoch: %d, loss: %1.5f valid loss:  %1.5f " %(epoch, loss.cpu().item(),vall_loss.cpu().item()))

  1%|          | 1/99 [00:06<10:26,  6.39s/it]

Epoch: 0, loss: 0.21787 valid loss:  0.30503 


 11%|█         | 11/99 [01:09<09:24,  6.41s/it]

Epoch: 10, loss: 0.12338 valid loss:  0.16807 


 21%|██        | 21/99 [02:22<09:52,  7.60s/it]

Epoch: 20, loss: 0.14793 valid loss:  0.11872 


 31%|███▏      | 31/99 [03:39<08:44,  7.71s/it]

Epoch: 30, loss: 0.15276 valid loss:  0.11184 


 41%|████▏     | 41/99 [04:56<07:27,  7.72s/it]

Epoch: 40, loss: 0.13408 valid loss:  0.12084 


 52%|█████▏    | 51/99 [06:14<06:11,  7.75s/it]

Epoch: 50, loss: 0.12112 valid loss:  0.12610 


 62%|██████▏   | 61/99 [07:32<04:55,  7.77s/it]

Epoch: 60, loss: 0.10569 valid loss:  0.13577 


 72%|███████▏  | 71/99 [08:50<03:41,  7.91s/it]

Epoch: 70, loss: 0.09633 valid loss:  0.14291 


 82%|████████▏ | 81/99 [10:08<02:20,  7.82s/it]

Epoch: 80, loss: 0.08372 valid loss:  0.14745 


 92%|█████████▏| 91/99 [11:27<01:03,  7.97s/it]

Epoch: 90, loss: 0.07229 valid loss:  0.15044 


100%|██████████| 99/99 [12:31<00:00,  7.59s/it]


In [25]:
test_loader = prepare_data(X_test, None, 60, X_test.shape[0], 'test')

In [26]:
def predict(
    model,
    loader,
):
    model.eval()
    
    preds = []
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            pred = model(data.float())
            preds.append(pred.detach().cpu().numpy())

    preds = np.concatenate(preds, 0)
    
    return preds

soutions_test = predict(model, test_loader)

In [52]:
import pandas as pd
sub = pd.read_csv('../Data/sample_submission.csv')
sub['state'] = np.where(soutions_test >= 0.5, 1, 0)

In [54]:
sub.state.value_counts()

0    6217
1    6001
Name: state, dtype: int64

In [55]:
sub.to_csv("../submissions/lstm_submit.csv", index = False)