In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import os
import random
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
from scipy.stats import skew
from tqdm import tqdm
from torch.nn.modules.loss import _WeightedLoss
import pickle
import seaborn as sns
# Notebook-wide plot defaults: figure size first, then the seaborn theme with enlarged fonts.
plt.rcParams.update({"figure.figsize": (12, 6)})
sns.set(font_scale=1.6)


def show_close_plt():
    """Lay out, display and then close the current matplotlib figure.

    Takes no arguments and returns nothing; it only acts on pyplot's
    current figure.
    """
    plt.tight_layout()
    plt.show()
    # Close rather than clf(): clf() keeps the figure registered and, if show()
    # has already closed it, creates a fresh empty figure that can be rendered
    # as a blank "0 Axes" output. close() is a no-op when no figure is left.
    plt.close()


def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer-like seed (Python int, numpy integer, or a float with an
            integral value such as ``43.0``). It is coerced to ``int`` because
            ``np.random.seed`` and ``torch.manual_seed`` reject floats.

    Side effects:
        Seeds ``random``, numpy and torch (CPU and all CUDA devices), sets
        ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    print('Setting Random Seed')
    seed = int(seed)
    random.seed(seed)
    # Only affects interpreters started after this point (subprocesses/workers);
    # the hash seed of the running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs.
    torch.backends.cudnn.benchmark = False

# Config

In [None]:
# Version 3: added spatial dropout; dropped the train sequences with the worst CV errors from the TRAIN dataset;
# changed folds to a group-fold style; set LSTM dim = 1024; features are now scaled after the 'shift' features are generated.

In [None]:
class CONFIG:
    """Static settings for the run: data location, CV layout and hyper-parameters."""

    # --- data location ---
    # KAGGLE records whether the competition input directory exists;
    # when it does not, PATH is replaced by the placeholder 'NA'.
    PATH = '..//input//tabular-playground-series-apr-2022//'
    KAGGLE = os.path.exists(PATH)
    if not KAGGLE:
        PATH = 'NA'

    # --- data / CV layout ---
    N_FOLDS = 5        # selects the `fold_subject_<N_FOLDS>` column of the fold file
    N_SENSORS = 13     # number of raw sensor_XX columns
    FOLD_LIMIT = 5     # only the first FOLD_LIMIT folds are trained and scored

    # --- optimisation ---
    N_EPOCHS = 12
    LR = 1e-04         # AdamW learning rate
    GAMMA = 0.9        # ExponentialLR decay factor
    BATCH_SIZE = 32
    INFERENCE_BATCH_SIZE = 128
    SMOOTHING = 0.0    # label smoothing passed to SmoothBCEwLogits

    # --- model ---
    LSTM_DIM = 1024
    C1D_CHANNELS = 128
    DENSE_DROPOUT = 0.5

In [None]:
# Raw sensor column names: sensor_00 ... sensor_<N_SENSORS - 1>.
SENSOR_LIST = [f'sensor_{S:02d}' for S in range(CONFIG.N_SENSORS)]

# Run on the GPU whenever torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load and process data

In [None]:
# Read the four competition CSVs from the configured input directory.
sample_submission = pd.read_csv(f'{CONFIG.PATH}sample_submission.csv')
train_labels = pd.read_csv(f'{CONFIG.PATH}train_labels.csv')
test = pd.read_csv(f'{CONFIG.PATH}test.csv')
train = pd.read_csv(f'{CONFIG.PATH}train.csv')

# Quick sanity check: shapes of the three main frames and a peek at the training rows.
print(*(frame.shape for frame in (train, test, train_labels)))
display(train.head(10))

In [None]:
# Preview the label table (first 10 rows).
display(train_labels.head(10))

In [None]:
# Lagged-difference features: for every sensor and every lag in `shifts`,
# delta = (value `shift` steps earlier in the same sequence) - (current value).
# The first `shift` steps of each sequence have no predecessor and are set to 0.
shifts = [1, 2]
for frame in (train, test):
    for S in SENSOR_LIST:
        for shift in shifts:
            lagged = frame.groupby(['sequence'])[S].shift(shift).values
            frame[f'delta_{shift}_{S}'] = (lagged - frame[S]).fillna(value=0)

# Register the new columns as model features, keeping the raw sensors first.
SENSOR_LIST = SENSOR_LIST + [f'delta_{shift}_{S}' for S in SENSOR_LIST for shift in shifts]

train[SENSOR_LIST].head(10)

In [None]:
# Display the full feature list (raw sensors followed by the delta columns).
SENSOR_LIST

In [None]:
# Standardise every feature column; the scaler statistics come from the
# training rows only and are then applied unchanged to the test rows.
stsc = StandardScaler()
stsc.fit(train[SENSOR_LIST])

train[SENSOR_LIST] = stsc.transform(train[SENSOR_LIST])
test[SENSOR_LIST] = stsc.transform(test[SENSOR_LIST])

In [None]:
# Collapse the long table to one row per sequence: each feature cell becomes
# the list of that feature's values within the sequence.
features_agg = dict((column, [list]) for column in SENSOR_LIST)
train = train.groupby(['sequence']).agg(features_agg)
# agg() with a list of functions yields (column, function) pairs; keep the column name only.
train.columns = [column for column, _func in train.columns]
print(train.shape)
train.head(10)

In [None]:
# Same per-sequence aggregation for the test rows (reuses `features_agg`).
test_by_sequence = test.groupby(['sequence'])
test = test_by_sequence.agg(features_agg)
# Drop the function level of the (column, function) column labels.
test.columns = [column for column, _func in test.columns]
print(test.shape)
test.head(10)

In [None]:
# Preview the label table again before joining it onto the aggregated frame.
train_labels.head(10)

In [None]:
# At this point `train` is indexed by sequence id (result of the groupby),
# so the label lookup is keyed on the index.
state_by_sequence = dict(zip(train_labels['sequence'], train_labels['state']))
train['target'] = train.index.map(state_by_sequence)

# Move the sequence id out of the index into a regular column.
train = train.reset_index(drop=False)

train['target'].value_counts()

In [None]:
# Name of the label column used throughout training and scoring.
target = 'target'

# Pre-computed fold assignment per sequence, read from another dataset.
fold_df = pd.read_csv('../input/tab-april22-seq-data-prep-v4/train_labels.csv')
fold_by_sequence = dict(zip(fold_df['sequence'], fold_df[f'fold_subject_{CONFIG.N_FOLDS}']))

# Key the lookup on the 'sequence' column. After the earlier reset_index the
# frame's index is just the row position, which only coincides with the
# sequence id when the ids happen to be exactly 0..n-1.
train['fold'] = train['sequence'].map(fold_by_sequence)

# Sequences missing from the fold file get NaN and would silently land in
# every training split; surface that instead of hiding it.
n_unassigned = int(train['fold'].isna().sum())
if n_unassigned:
    print(f'WARNING: {n_unassigned} sequences have no fold assignment')

print(train['fold'].value_counts())

# Load CV predictions

In [None]:
# Load an existing array of cross-validation predictions from another notebook's output.
# NOTE(review): the next cell assigns this array to `train` positionally, so it is
# assumed to hold one prediction per row of `train`, in the same order — TODO confirm.
cv_preds = np.load('../input/tps-april-tensorflow-bi-lstm-10f-spatialdoexpts/cv_preds.npy')

In [None]:
# Attach the true state (looked up by sequence id) next to the external CV
# predictions, then report how well those predictions rank the sequences.
label_lookup = dict(zip(train_labels['sequence'], train_labels['state']))
train['state'] = train['sequence'].map(label_lookup)
train['cv_preds'] = cv_preds
roc_auc_score(y_true=train['state'], y_score=train['cv_preds'])

In [None]:
# Absolute gap between the external CV prediction and the true label, per sequence.
train['error'] = (train['cv_preds'] - train['state']).abs()
# Distribution of those gaps.
sns.kdeplot(train['error'])

In [None]:
# Sequences whose CV absolute error exceeds LIMIT will be excluded from training.
LIMIT = 0.99
# Number of sequences above the limit.
(train['error'] > LIMIT).sum()

In [None]:
# Boolean mask (aligned with `train`) of the sequences kept for training.
allowed_sequences = train['error'].le(LIMIT)
# Fraction of sequences that survive the filter.
allowed_sequences.sum() / len(train)

# Define Dataset, Model

In [None]:
class SeqDataset(Dataset):
    """Torch dataset over a frame holding one sequence per row.

    Every feature cell is expected to hold the list of that feature's values
    over the time steps of the sequence.

    Args:
        csv: DataFrame with one row per sequence.
        training: When True each sample also carries the row's 'target'
            value as ``labels``.
        feature_cols: Optional list of feature column names. When omitted,
            the module-level ``SENSOR_LIST`` is read at access time.
    """

    def __init__(self, csv, training=False, feature_cols=None):
        self.csv = csv
        self.training = training
        self.feature_cols = feature_cols

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, item):
        """Return the sample at *position* ``item``.

        Returns:
            dict with 'features' (float tensor, shape (time steps, n features))
            and, when ``training`` is set, 'labels' (long scalar tensor).
        """
        columns = SENSOR_LIST if self.feature_cols is None else list(self.feature_cols)

        # Positional lookup: the DataLoader hands out positions 0..len-1, so a
        # label-based .loc would break on any frame whose index is not 0..n-1.
        row = self.csv.iloc[item]

        # One column per feature, one row per time step.
        features = np.stack([np.asarray(row[name]) for name in columns], axis=1)

        sample = {'features': torch.tensor(features, dtype=torch.float)}
        if self.training:
            sample['labels'] = torch.tensor(row['target'], dtype=torch.long)
        return sample


# Smoke test: wrap the full training frame and inspect the first sample
# (a dict with 'features' and, because training=True, 'labels').
example_data = SeqDataset(train, training=True)
example_data[0]

In [None]:
# Shape of one sample's feature tensor: (time steps, number of feature columns).
example_data[0]['features'].shape

In [None]:
class SeqModel(torch.nn.Module):
    """Conv1d front-end + three stacked bidirectional LSTMs + dense head.

    Args:
        input_dim: Number of feature channels per time step.
        lstm_dim: Hidden size of the first LSTM; the second and third use
            ``lstm_dim // 2`` and ``lstm_dim // 4``.
        logit_dim: Width of the hidden dense layer.
        num_classes: Number of outputs (1 gives a single logit per sequence).
        c1d_channels: Channels of the first convolution; the second doubles it.
        dense_dropout: Dropout probability inside the dense head.

    The dense head starts with a ``LazyLinear`` whose input width is inferred
    on the first forward pass from ``time steps * lstm3 output width``, so all
    later inputs must have the same number of time steps.
    """

    def __init__(self, input_dim=13*2, lstm_dim=512,
                 logit_dim=128, num_classes=1,c1d_channels=128,
                dense_dropout=0.5):
        super().__init__()

        # Two stacked convolutions over the time axis (no activation in between).
        self.c1d_module = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=c1d_channels,
                      kernel_size=2, padding='same', stride=1),
            nn.Conv1d(in_channels=c1d_channels, out_channels=c1d_channels*2,
                      kernel_size=2, padding='same', stride=1)

        )

        # Drops whole input channels (features) for a sample.
        self.spatial_dropout = torch.nn.Dropout2d(p=0.25)

        lstm2_hidden = lstm_dim // 2
        lstm3_hidden = lstm_dim // 4

        self.lstm1 = nn.LSTM(c1d_channels*2, lstm_dim, batch_first=True, bidirectional=True, dropout=0.0)
        self.lstm2 = nn.LSTM(lstm_dim * 2, lstm2_hidden, batch_first=True, bidirectional=True, dropout=0.0)
        # Input width is the real bidirectional output of lstm2. This equals
        # lstm_dim for even lstm_dim and also stays correct for odd values.
        self.lstm3 = nn.LSTM(lstm2_hidden * 2, lstm3_hidden, batch_first=True, bidirectional=True, dropout=0.0)

        self.logits = nn.Sequential(
            nn.LazyLinear(out_features=logit_dim),
            nn.ReLU(),
            nn.Dropout(p=dense_dropout),
            nn.Linear(logit_dim, num_classes),
        )

    def forward(self, x):
        """Map a batch shaped (batch, time, input_dim) to logits.

        Returns:
            Tensor of shape (batch,) when ``num_classes == 1``, otherwise
            (batch, num_classes).
        """
        # Conv1d and channel dropout expect (batch, channels, time).
        x = x.permute(0, 2, 1)

        # Give Dropout2d an explicit 4-D (batch, channels, time, 1) view: how it
        # treats 3-D input has changed between PyTorch releases and is deprecated.
        x = self.spatial_dropout(x.unsqueeze(-1)).squeeze(-1)

        x = self.c1d_module(x)

        # The LSTMs are batch_first, so go back to (batch, time, channels).
        features1, _ = self.lstm1(x.permute(0, 2, 1))
        features2, _ = self.lstm2(features1)
        features3, _ = self.lstm3(features2)

        # Flatten all time steps of the last LSTM into one vector per sample.
        features = features3.reshape(features3.shape[0], -1)

        pred = self.logits(features)

        return pred.squeeze(1)

In [None]:
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with optional label smoothing.

    Args:
        weight: Optional rescaling weight forwarded to
            ``binary_cross_entropy_with_logits``.
        reduction: 'mean' (default), 'sum' or 'none'.
        smoothing: Value in [0, 1); targets are pulled towards 0.5 by this amount.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The base class already stores `weight` (as a buffer) and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets: torch.Tensor, n_labels: int, smoothing=0.0):
        """Return ``targets * (1 - smoothing) + 0.5 * smoothing``.

        ``n_labels`` is accepted for interface compatibility but not used.
        """
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE loss for logits ``inputs`` and float ``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
                                           self.smoothing)

        # Pass the configured reduction through. The functional default is
        # 'mean', so omitting it made 'sum' and 'none' silently behave like 'mean'.
        return torch.nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, self.weight, reduction=self.reduction)

# Run Training

In [None]:
# Distinct fold ids present in the training frame, in ascending order.
FOLDS = list(train['fold'].unique())
FOLDS.sort()

# The test set is wrapped once and reused for inference after every fold.
test_dataset = SeqDataset(test.reset_index(drop=True), training=False)
test_loader_options = dict(
    batch_size=CONFIG.INFERENCE_BATCH_SIZE,
    shuffle=False,
    drop_last=False,
)
test_loader = DataLoader(test_dataset, **test_loader_options)

# Out-of-fold prediction column, filled fold by fold during training.
train['oof'] = 0.0

# Cross-validated training. For each of the first FOLD_LIMIT folds:
#   1. train a fresh SeqModel on the other folds (minus the filtered sequences),
#   2. validate on the held-out fold after every epoch and checkpoint the weights,
#   3. reload the best-AUC checkpoint and write OOF and test predictions.
for f in FOLDS[:CONFIG.FOLD_LIMIT]:
    # A different seed per fold (42 + fold id).
    seed_everything(seed=42 + f)

    # Training rows: every other fold, restricted to the sequences that passed the
    # CV-error filter. Validation rows: the whole held-out fold, unfiltered.
    trn_idx = (train['fold'] != f) & (allowed_sequences)
    val_idx = train['fold'] == f

    # Reference score: log loss of predicting the validation-fold mean target for every row.
    baseline_log_loss = log_loss(train.loc[val_idx, target], np.full(train.loc[val_idx, target].shape,
                                                                     fill_value=train.loc[val_idx, target].mean()))
    print(f'training fold {f}')
    print(f'baseline log loss {baseline_log_loss}')

    # Both datasets use training=True so that labels are returned with the features.
    train_dataset = SeqDataset(train.loc[trn_idx].reset_index(drop=True), training=True)
    val_dataset = SeqDataset(train.loc[val_idx].reset_index(drop=True), training=True)

    # drop_last=True: an incomplete final training batch is skipped each epoch.
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG.BATCH_SIZE,
        shuffle=True,
        drop_last=True
    )

    # shuffle=False keeps validation predictions in row order so they can be
    # written back through the boolean mask `val_idx`.
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG.INFERENCE_BATCH_SIZE,
        shuffle=False,
        drop_last=False
    )

    # Fresh model, optimizer and scheduler for every fold.
    model = SeqModel(input_dim = len(SENSOR_LIST),
                     lstm_dim=CONFIG.LSTM_DIM,
                     c1d_channels=CONFIG.C1D_CHANNELS,
                     dense_dropout=CONFIG.DENSE_DROPOUT)

    model = model.to(DEVICE)

    # NOTE(review): SeqModel contains a LazyLinear whose weights are only materialised
    # on the first forward pass, yet the optimizer is built before any forward pass.
    # Confirm this is handled correctly by the installed PyTorch version.
    optimizer = torch.optim.AdamW(params=model.parameters(),
                                  lr=CONFIG.LR, )

    # Learning rate is multiplied by GAMMA once per epoch (see scheduler.step() below).
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer,
                                                       gamma=CONFIG.GAMMA)

    # criterion = nn.BCEWithLogitsLoss(reduction='mean')
    criterion = SmoothBCEwLogits(smoothing=CONFIG.SMOOTHING)

    # Best validation AUC seen so far in this fold, plus per-epoch AUC histories for plotting.
    best_fold_score = -999
    train_aucs = []
    val_aucs = []

    for epoch in range(CONFIG.N_EPOCHS):
        tbar = tqdm(train_loader)

        model.train()
        # Accumulated but not reported anywhere in this loop.
        epoch_loss = 0

        tr_labels = []
        tr_predictions = []
        for count, batch in enumerate(tbar):
            optimizer.zero_grad()

            features = batch['features'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            preds = model(features)

            # Labels are stored as long; the BCE loss needs float targets.
            loss = criterion(preds, labels.float())

            loss.backward()

            epoch_loss += loss.item()

            optimizer.step()

            # Keep labels and probabilities for the epoch-level training metrics. These
            # come from train-mode forward passes made while the weights are still changing.
            tr_labels += [labels.detach().cpu().numpy()]
            tr_predictions += [torch.sigmoid(preds).detach().cpu().numpy()]

        scheduler.step()

        tr_labels = np.concatenate([x for x in tr_labels], axis=0)
        tr_predictions = np.concatenate([x for x in tr_predictions], axis=0)
        # Keep probabilities strictly inside (0, 1) before computing log loss.
        tr_predictions = np.clip(tr_predictions, 0.00001, 1 - 0.00001)

        tr_loss = log_loss(tr_labels, tr_predictions)
        tr_auc = roc_auc_score(tr_labels, tr_predictions)

        print(f'epoch {epoch} training log loss {tr_loss} training AUC {tr_auc}')
        train_aucs += [tr_auc]

        # Validation fold for epoch
        tbar = tqdm(val_loader)

        model.eval()
        val_labels = []
        val_predictions = []
        for count, batch in enumerate(tbar):
            features = batch['features'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            with torch.no_grad():
                preds = model(features)

            val_labels += [labels.detach().cpu().numpy()]
            val_predictions += [torch.sigmoid(preds).detach().cpu().numpy()]

        val_labels = np.concatenate([x for x in val_labels], axis=0)
        val_predictions = np.concatenate([x for x in val_predictions], axis=0)
        val_predictions = np.clip(val_predictions, 0.00001, 1 - 0.00001)

        # Provisional OOF values for this epoch; overwritten after the epoch loop
        # with the predictions of the best checkpoint.
        train.loc[val_idx, 'oof'] = val_predictions

        val_loss = log_loss(val_labels, val_predictions)
        val_auc = roc_auc_score(val_labels, val_predictions)
        print(f'epoch {epoch} validation log loss {val_loss},val auc {val_auc}')

        print('learning rate ', optimizer.param_groups[0]['lr'])

        # A checkpoint is written for every epoch, in addition to the "best" one below.
        torch.save(model.state_dict(), f'model_fold_{f}_epoch_{epoch}')

        val_aucs += [val_auc]

        # Model selection is by validation AUC (higher is better).
        if val_auc > best_fold_score:
            print(f'best ROC AUC improved to {val_auc}')
            best_fold_score = val_auc
            torch.save(model.state_dict(), f'model_fold_{f}_best')
        else:
            print('validation score not improved')

    # complete fold
    print(f'finished fold with best AUC score {best_fold_score} final AUC score {val_aucs[-1]}')

    # Curves of all folds are drawn onto the same figure; it is shown once after the fold loop.
    plt.plot(range(CONFIG.N_EPOCHS),
             train_aucs, color='Blue')
    plt.plot(range(CONFIG.N_EPOCHS),
             val_aucs, color='Red')

    print('reloading weights from best epoch for inference')
    model.load_state_dict(torch.load(f'model_fold_{f}_best'))

    # final out of fold predictions with best weights
    model.eval()
    val_labels = []
    val_predictions = []
    for count, batch in enumerate(val_loader):
        features = batch['features'].to(DEVICE)
        # NOTE(review): labels are moved to the device but not used in this loop.
        labels = batch['labels'].to(DEVICE)

        with torch.no_grad():
            preds = model(features)

        val_predictions += [torch.sigmoid(preds).detach().cpu().numpy()]

    # NOTE(review): unlike the per-epoch predictions, these are stored without clipping.
    val_predictions = np.concatenate([x for x in val_predictions], axis=0)
    train.loc[val_idx, 'oof'] = val_predictions

    # final test inference with best weights
    test_predictions = []
    for count, batch in enumerate(test_loader):
        features = batch['features'].to(DEVICE)

        with torch.no_grad():
            preds = model(features)

        test_predictions += [torch.sigmoid(preds).detach().cpu().numpy()]

    test_predictions = np.concatenate([x for x in test_predictions], axis=0)

    # One column of test probabilities per fold; they are averaged later for the submission.
    sample_submission[f'fold_{f}'] = test_predictions

plt.title('Train (Blue) and Validation (Red) AUC Scores')
show_close_plt()

# Review Results & Export Submission

In [None]:
# Score only the rows whose fold was actually trained (first FOLD_LIMIT folds).
scored_rows = train['fold'].isin(FOLDS[:CONFIG.FOLD_LIMIT])
total_auc = roc_auc_score(train.loc[scored_rows, target], train.loc[scored_rows, 'oof'])
print(f'total OOF AUC score {total_auc} from CV on {len(FOLDS[:CONFIG.FOLD_LIMIT])} folds')

# Persist the training frame together with its out-of-fold predictions.
train.to_csv('train_with_oof.csv')

In [None]:
# AUC of the out-of-fold predictions, reported separately for every trained fold.
for f in FOLDS[:CONFIG.FOLD_LIMIT]:
    in_fold = train['fold'] == f
    fold_score = roc_auc_score(train.loc[in_fold, target], train.loc[in_fold, 'oof'])
    print(f'fold {f} AUC CV score is {fold_score}')

In [None]:
# Spread of the out-of-fold predictions, split by the true target value
# (rows of the trained folds only).
scored_rows = train['fold'].isin(FOLDS[:CONFIG.FOLD_LIMIT])
sns.boxplot(x=train.loc[scored_rows, target], y=train.loc[scored_rows, 'oof'])
plt.title('Distribution of OOF by target value')
show_close_plt()

In [None]:
# One density curve of the test predictions per trained fold, on a shared figure.
trained_folds = FOLDS[:CONFIG.FOLD_LIMIT]
for f in trained_folds:
    sns.kdeplot(sample_submission[f'fold_{f}'])

plt.title('Test predictions by CV fold')
plt.legend(trained_folds)
show_close_plt()

In [None]:
# Average the per-fold test predictions into a single ensemble prediction.
fold_columns = [f'fold_{f}' for f in FOLDS[:CONFIG.FOLD_LIMIT]]
sample_submission['mean_prediction'] = sample_submission[fold_columns].mean(axis=1)

# Compare the distribution of the test ensemble with the out-of-fold predictions.
sns.kdeplot(sample_submission['mean_prediction'], color='Red')
scored_rows = train['fold'].isin(FOLDS[:CONFIG.FOLD_LIMIT])
sns.kdeplot(train.loc[scored_rows, 'oof'], color='Blue', shade=True)
plt.legend(['Test mean prediction distribution', 'CV prediction distribution'])
show_close_plt()

In [None]:
# The submitted 'state' is the fold-averaged test prediction.
sample_submission = sample_submission.assign(state=sample_submission['mean_prediction'])
submission_preview = sample_submission.loc[:, ['sequence', 'state']]
submission_preview.head(10)

In [None]:
# Write the submission file: only the sequence id and the predicted state, without the row index.
sample_submission[['sequence', 'state']].to_csv('submission.csv', index=False)