In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn, optim
import torch.nn.functional as F

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging setting that
# makes CUDA kernel launches synchronous; confirm it is still wanted here,
# since it is expected to slow GPU execution.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# On a GPU run, also print the properties of the selected device.
if device.type == 'cuda':
    print(torch.cuda.get_device_properties(device))

### Load Data

In [None]:
# Read the raw measurement rows and the per-series surface labels.
df_X_train = pd.read_csv('/kaggle/input/career-con-2019/X_train.csv')
df_y_train = pd.read_csv('/kaggle/input/career-con-2019/y_train.csv')

# Hold-out split keyed on series_id: every series whose id leaves remainder 2
# when divided by 4 goes to validation; all other series are used for training.
# The same rule is applied to both frames so features and labels stay aligned.
is_val_X = df_X_train['series_id'].mod(4).eq(2)
is_val_y = df_y_train['series_id'].mod(4).eq(2)

df_X_train_train, df_X_train_val = df_X_train[~is_val_X], df_X_train[is_val_X]
df_y_train_train, df_y_train_val = df_y_train[~is_val_y], df_y_train[is_val_y]

### Dataset Class and DataLoader

In [None]:
class RoboDataset(Dataset):
    """Map-style dataset of fixed-length sensor sequences with surface labels.

    Sample ``idx`` consists of rows ``[idx * seq_len, (idx + 1) * seq_len)`` of
    ``X_df`` (selected by position, not by index label) together with the
    ``surface`` value found in row ``idx`` of ``y_df``.  Both frames must
    therefore list the series in the same order, with ``sequence_length``
    consecutive rows per series in ``X_df``.

    The numeric feature matrix and the label list are extracted once at
    construction time, so later changes to ``X_df`` / ``y_df`` are not
    reflected in the samples returned.

    Args:
        X_df: measurement frame. The columns 'row_id', 'series_id' and
            'measurement_number' are dropped; every remaining column is cast
            to float32 and used as a feature.
        y_df: label frame with a 'surface' column, one row per series.
        sequence_length: number of consecutive ``X_df`` rows per sample.

    Raises:
        ValueError: if ``X_df`` has fewer than
            ``len(y_df) * sequence_length`` rows.
    """

    def __init__(self, X_df, y_df, sequence_length=128):
        self.X = X_df
        self.y = y_df
        self.seq_len = sequence_length
        # Surface name -> integer class index (despite the attribute name,
        # these are class ids, not one-hot vectors).
        self.class_onehot = {
            'carpet': 0,
            'concrete': 1,
            'fine_concrete': 2,
            'hard_tiles': 3,
            'hard_tiles_large_space': 4,
            'soft_pvc': 5,
            'soft_tiles': 6,
            'tiled': 7,
            'wood': 8
        }

        # Fail early instead of silently producing truncated/empty samples.
        rows_needed = len(self.y) * self.seq_len
        if len(self.X) < rows_needed:
            raise ValueError(
                f'X_df has {len(self.X)} rows but {len(self.y)} series of '
                f'length {self.seq_len} require {rows_needed}'
            )

        # Dropping the id columns and casting to float32 does not depend on
        # the sample index, so do it once here rather than on every access.
        self._features = self.X.drop(
            columns=['row_id', 'series_id', 'measurement_number']
        ).values.astype('float32')
        self._surfaces = self.y['surface'].tolist()

    def __len__(self):
        """Return the number of series (one sample per row of the label frame)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``{'data': float32 array (seq_len, n_features), 'label': int}``.

        Negative indices count from the end; an out-of-range index raises
        IndexError.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f'index {idx} is out of range for {n_samples} samples')

        start = self.seq_len * idx
        # Copy so callers may modify the sample without touching the cache.
        data = self._features[start:start + self.seq_len].copy()
        label = self.class_onehot[self._surfaces[idx]]

        return {'data': data, 'label': label}

In [None]:
# Wrap the train / validation frames as datasets (default sequence length).
trainset, valset = (
    RoboDataset(df_X_train_train, df_y_train_train),
    RoboDataset(df_X_train_val, df_y_train_val),
)

# Mini-batch loaders; only the training loader shuffles its samples.
batch_size = 64
trainloader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
valloader = DataLoader(dataset=valset, batch_size=batch_size, shuffle=False)

### LSTM Model

In [None]:
class LSTMClassifier(nn.Module):
    """Stacked LSTM whose last-time-step output feeds a linear classifier.

    Args:
        input_dim: number of features at each time step (the input size of
            the LSTM).
        hidden_dim: number of features produced by each LSTM layer.
        num_layers: number of stacked LSTM layers.
        output_dim: number of floor-texture classes, i.e. the width of the
            returned score vector.
        dropout: dropout value forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_dim=10, hidden_dim=256, num_layers=2, output_dim=9,
                 dropout=0):
        super().__init__()
        self.hidden_dim, self.num_layers = hidden_dim, num_layers

        # batch_first=True: inputs/outputs are laid out as (batch, time, features).
        recurrent_kwargs = dict(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.lstm = nn.LSTM(**recurrent_kwargs)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, X):
        """Return one row of ``output_dim`` class scores per input sequence."""
        # No initial (h_0, c_0) is supplied, so nn.LSTM falls back to zeros.
        per_step_output, _final_state = self.lstm(X)
        # Keep only what the LSTM emitted at the final time step.
        last_step = per_step_output[:, -1, :]
        return self.fc(last_step)
    

### Training/Validation

In [None]:
# --- Training hyperparameters ---
lr = 0.001                               # Adam learning rate
n_epochs = 1000                          # upper bound on epochs; the training loop may stop earlier
iterations_per_epoch = len(trainloader)  # number of mini-batches per epoch
best_acc = 0                             # best validation accuracy seen so far
patience, patience_counter = 50, 0       # epochs allowed without improvement / current streak

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation is repeatable across "Restart & Run All".  The training
# DataLoader was created without its own generator, so its shuffling should
# follow this seed as well.  GPU kernels may still introduce small
# run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# --- Model, loss and optimiser ---
model = LSTMClassifier(dropout=0.75)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train for up to n_epochs, evaluating on the validation loader after every
# epoch.  The weights with the best validation accuracy are written to
# 'best.pth'; training stops once `patience` consecutive epochs pass without
# a new best accuracy.
print('Start model training')

for epoch in range(1, n_epochs + 1):

    # ---- Training pass ----
    model.train()  # set once per epoch; enables dropout
    loss_train_total, n_train_samples = 0.0, 0
    for batch_data in trainloader:
        X_batch = batch_data['data'].to(device)
        y_batch = batch_data['label'].to(device).long()

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch, so weight it by the
        # number of samples actually in this batch.  Using the nominal batch
        # size would over-count the final, possibly smaller, batch.
        n_in_batch = y_batch.size(0)
        loss_train_total += loss.item() * n_in_batch
        n_train_samples += n_in_batch

    loss_train_total = loss_train_total / n_train_samples

    # ---- Validation pass: loss and accuracy in a single sweep ----
    model.eval()  # disables dropout
    loss_val_total, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for batch_data in valloader:
            X_batch = batch_data['data'].to(device)
            y_batch = batch_data['label'].to(device).long()

            y_pred = model(X_batch)
            n_in_batch = y_batch.size(0)
            loss_val_total += criterion(y_pred, y_batch).item() * n_in_batch

            # argmax over the raw scores picks the same class as argmax over
            # log-softmax, because log-softmax is monotonic within each row.
            class_predictions = y_pred.argmax(dim=1)
            total += n_in_batch
            correct += (class_predictions == y_batch).sum().item()

    loss_val_total = loss_val_total / total
    acc = correct / total

    # ---- Logging ----
    if epoch % 5 == 0:
        print(f'Epoch: {epoch:3d}. Train Loss: {loss_train_total:.4f}. Val Loss: {loss_val_total:.4f} Acc.: {acc:2.2%}')

    # ---- Checkpointing and early stopping on validation accuracy ----
    if acc > best_acc:
        patience_counter = 0
        best_acc = acc
        torch.save(model.state_dict(), 'best.pth')
        print(f'Epoch {epoch} best model saved with accuracy: {best_acc:2.2%}')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f'Early stopping on epoch {epoch}')
            break