This is the PyTorch Lightning training code that I wrote and used for this competition.

Much of it was inspired by https://www.kaggle.com/luffy521/lstm-by-pytorch-with-unified-wi-fi-feats

The code will not work right away in Kaggle notebooks (you will have to change some details); I ran it on my local machine.

Here is the link to my github repository in case you want to see all of the original code.

https://github.com/dongkyuk/Kaggle_Indoor_Loc

The current scores for this code are 4.965 with postprocessing and 6.74 without postprocessing.

I used this notebook https://www.kaggle.com/higepon/visualize-submissions-with-post-processing for postprocessing.

Config

In [None]:
class Config:
    """Static settings shared by the data module, model and training loop.

    All values are plain class attributes and are read as ``Config.<name>``
    (or through an instance) by the rest of the file.
    """

    # Filesystem locations: input CSVs and checkpoint output root.
    SAVE_DIR = 'save'
    DATA_DIR = 'data'

    # Reproducibility and cross-validation.
    fold_num = 5
    seed = 42

    # Number of leading ``bssid_i`` / ``rssi_i`` columns fed to the model.
    num_wifi_feats = 20

    # Optimisation schedule.
    lr = 5e-3
    epochs = 300

    # DataLoader settings.
    num_workers = 16
    val_batch_size = 256
    train_batch_size = 256

    # NOTE(review): not read anywhere in the visible code.
    device = 'gpu'

DataModule

In [None]:
import pandas as pd
from pytorch_lightning import LightningDataModule
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
import numpy as np


class IndoorDataset(Dataset):
    """Row-wise dataset over a DataFrame of Wi-Fi features.

    Args:
        data: DataFrame holding the feature columns, ``site_id`` and, for
            training data, ``x``, ``y`` and ``floor``.
        bssid_feats: names of the BSSID columns to return.
        rssi_feats: names of the RSSI columns to return.
        flag: ``'TRAIN'`` to include the targets in each sample, ``'TEST'``
            to return features only.
    """

    _VALID_FLAGS = ('TRAIN', 'TEST')

    def __init__(self, data, bssid_feats, rssi_feats, flag='TRAIN'):
        self.data = data
        self.flag = flag
        self.bssid_feats = bssid_feats
        self.rssi_feats = rssi_feats

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return the sample at positional ``index`` as a dict.

        Raises:
            ValueError: if ``flag`` is neither ``'TRAIN'`` nor ``'TEST'``.
                Previously such a flag made this method return ``None``,
                which only surfaced later as an opaque collate error.
        """
        if self.flag not in self._VALID_FLAGS:
            raise ValueError(
                f"Unknown flag {self.flag!r}; expected one of {self._VALID_FLAGS}")

        row = self.data.iloc[index]
        # Features are common to both modes.
        sample = {
            'BSSID_FEATS': row[self.bssid_feats].values.astype(int),
            'RSSI_FEATS': row[self.rssi_feats].values.astype(np.float32),
            'site_id': row['site_id'].astype(int),
        }
        if self.flag == 'TRAIN':
            # Targets are only available for training rows.
            sample['x'] = row['x']
            sample['y'] = row['y']
            sample['floor'] = row['floor']
        return sample


class IndoorDataModule(LightningDataModule):
    """Lightning data module wrapping the train/test Wi-Fi feature frames.

    Typical use (as done by the training script in this file):
    ``prepare_data()`` once, then ``set_fold_num(k)`` + ``setup()`` per fold.

    Args:
        train_data: training DataFrame. Must contain ``bssid_0..bssid_99``,
            the ``rssi_i`` columns, ``site_id``, ``floor``, ``path``, ``x``
            and ``y``. It is modified in place by ``prepare_data``.
        test_data: test DataFrame with the same feature columns. Also
            modified in place.
        kfold: when true, ``prepare_data`` assigns a ``kfold`` column and
            ``setup`` splits on it.
    """

    # Number of ``bssid_i`` columns scanned to build the BSSID vocabulary.
    # This is independent of Config.num_wifi_feats (the columns actually fed
    # to the model) and is kept at the original value so the vocabulary size
    # (and therefore the embedding table) is unchanged.
    _NUM_VOCAB_BSSID_COLUMNS = 100

    def __init__(self, train_data, test_data, kfold=False):
        # The base class initialiser must run so Lightning's own bookkeeping
        # attributes exist on the instance.
        super().__init__()
        self.train_data = train_data
        self.test_data = test_data
        self.kfold = kfold
        self.fold_num = None
        self._indoor_prepared = False

    def set_fold_num(self, fold_num):
        """Select which fold ``setup`` uses as the validation split."""
        self.fold_num = fold_num

    def _init_feats(self):
        """Build the lists of BSSID / RSSI column names fed to the model."""
        self.bssid_feats = [f'bssid_{i}' for i in range(Config.num_wifi_feats)]
        self.rssi_feats = [f'rssi_{i}' for i in range(Config.num_wifi_feats)]

    def _init_wifi_bssids(self):
        """Collect every distinct BSSID value seen in train and test."""
        vocab = set()
        for i in range(self._NUM_VOCAB_BSSID_COLUMNS):
            column = f'bssid_{i}'
            vocab.update(self.train_data[column].values.tolist())
            vocab.update(self.test_data[column].values.tolist())

        self.wifi_bssids = list(vocab)
        self.wifi_bssids_size = len(self.wifi_bssids)

    def _init_transforms(self):
        """Fit the BSSID / site encoders and the RSSI scaler."""
        self.wifi_bssids_encoder = LabelEncoder()
        self.wifi_bssids_encoder.fit(self.wifi_bssids)

        # Site ids are fitted on the training frame only.
        self.site_id_encoder = LabelEncoder()
        self.site_id_encoder = self.site_id_encoder.fit(
            self.train_data['site_id'])

        self.rssi_normalizer = StandardScaler()
        self.rssi_normalizer.fit(self.train_data[self.rssi_feats])

    def _transform(self, data):
        """Encode/normalise ``data`` in place and return it."""
        for bssid_feat in self.bssid_feats:
            data[bssid_feat] = self.wifi_bssids_encoder.transform(
                data[bssid_feat])
        data['site_id'] = self.site_id_encoder.transform(data['site_id'])
        data[self.rssi_feats] = self.rssi_normalizer.transform(
            data[self.rssi_feats])
        return data

    def _kfold(self):
        """Assign a ``kfold`` column: grouped by path, stratified by site+floor.

        All rows of one ``path`` land in the same fold (so validation never
        contains rows from a path seen in training), while folds are balanced
        with respect to the combined ``site_id`` + ``floor`` label.

        The previous implementation stratified on ``path`` itself, which
        spreads every path across all folds - the opposite of the documented
        intent - and left ``site_id_f`` unused.
        """
        splitter = StratifiedGroupKFold(
            n_splits=Config.fold_num, shuffle=True, random_state=Config.seed)
        self.train_data['site_id_f'] = (
            self.train_data['site_id'] + self.train_data['floor'].astype(str))

        # split() yields positional indices, so collect the assignment in a
        # plain array instead of going through label-based ``.loc``; this is
        # correct for any DataFrame index, not only a default RangeIndex.
        fold_of_row = np.full(len(self.train_data), -1, dtype=int)
        folds = splitter.split(
            X=self.train_data,
            y=self.train_data['site_id_f'],
            groups=self.train_data['path'],
        )
        for n, (_, val_index) in enumerate(folds):
            fold_of_row[val_index] = n
        self.train_data['kfold'] = fold_of_row

    def prepare_data(self):
        """Assign folds, fit the transforms and encode both frames in place.

        Idempotent: later calls are no-ops. The frames are encoded in place,
        so running the transforms a second time (e.g. once manually and once
        from the trainer) would try to re-encode already-encoded values.
        """
        if self._indoor_prepared:
            return

        # Init cross validation
        if self.kfold:
            self._kfold()

        # Init preprocessing
        self._init_feats()
        self._init_wifi_bssids()
        self._init_transforms()
        self.site_id_dim = len(self.train_data['site_id'].unique())
        self.train_data = self._transform(self.train_data)
        self.test_data = self._transform(self.test_data)
        self._indoor_prepared = True

    def setup(self, stage=None):
        """Build the datasets for the requested stage.

        Args:
            stage: ``'fit'``, ``'test'`` or ``None`` (both).

        Raises:
            ValueError: for the fit stage when the module was created with
                ``kfold=False`` (no split is defined in that case) or when
                ``set_fold_num`` has not been called.
        """
        # Assign train/val datasets for use in dataloaders
        if stage == 'fit' or stage is None:
            if not self.kfold:
                raise ValueError(
                    "IndoorDataModule was created with kfold=False, so no "
                    "train/validation split is available for fitting.")
            if self.fold_num is None:
                raise ValueError(
                    "Call set_fold_num() before setup() to choose the "
                    "validation fold.")

            in_val_fold = self.train_data['kfold'] == self.fold_num
            train_df = self.train_data[~in_val_fold].reset_index(drop=True)
            val_df = self.train_data[in_val_fold].reset_index(drop=True)
            self.train = IndoorDataset(
                train_df, self.bssid_feats, self.rssi_feats, flag="TRAIN")
            self.val = IndoorDataset(
                val_df, self.bssid_feats, self.rssi_feats, flag="TRAIN")

        # Assign test dataset for use in dataloader(s)
        if stage == 'test' or stage is None:
            self.test = IndoorDataset(
                self.test_data, self.bssid_feats, self.rssi_feats, flag="TEST")

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(
            self.train, batch_size=Config.train_batch_size,
            num_workers=Config.num_workers, shuffle=True, pin_memory=True)

    def val_dataloader(self):
        """Loader over the validation split, in fixed order."""
        return DataLoader(
            self.val, batch_size=Config.val_batch_size,
            num_workers=Config.num_workers, shuffle=False, pin_memory=True)

    def test_dataloader(self):
        """Loader over the test frame, in fixed order."""
        return DataLoader(
            self.test, batch_size=Config.val_batch_size,
            num_workers=Config.num_workers, shuffle=False, pin_memory=True)

Model (Torch NN Module)

In [None]:
import numpy as np
import torch
import torch.nn as nn

class SeqLSTM(nn.Module):
    """Sequence model over the strongest Wi-Fi signals of one sample.

    Each of the ``wifi_num`` signals is described by its BSSID embedding, the
    (repeated) site embedding and a projected RSSI value. The resulting
    sequence is passed through two stacked LSTMs and the last time step is
    mapped to three outputs; the training code in this file reads them as
    ``(x, y, floor)``.
    """

    def __init__(self, wifi_num, bssid_dim, site_id_dim, embedding_dim=64):
        """SeqLSTM Model
        Args:
            wifi_num (int): number of wifi signals to use
            bssid_dim (int): total number of unique bssids
            site_id_dim (int): total number of unique site ids
            embedding_dim (int): Dimension of bssid embedding. Defaults to 64.
        """
        super().__init__()
        self.wifi_num = wifi_num
        self.feature_dim = 256

        # Embedding
        self.embd_bssid = nn.Embedding(bssid_dim, embedding_dim)
        self.embd_site_id = nn.Embedding(site_id_dim, embedding_dim)

        # Linear
        self.fc_rssi = nn.Linear(1, embedding_dim)
        self.fc_features = nn.Linear(embedding_dim * 3, self.feature_dim)
        self.fc_output = nn.Linear(16, 3)

        # Other
        self.bn_rssi = nn.BatchNorm1d(embedding_dim)
        self.bn_features = nn.BatchNorm1d(self.feature_dim)
        # Was accidentally a 1-tuple because of a trailing comma. It is now a
        # registered module but, as before, it is not applied in forward().
        self.dropout = nn.Dropout(0.3)

        # nn.LSTM's ``dropout`` only acts between stacked layers of the SAME
        # module; with the default single layer it is never applied and only
        # triggers a warning, so it is not passed here.
        self.lstm1 = nn.LSTM(input_size=self.feature_dim, hidden_size=128,
                             bidirectional=False)
        self.lstm2 = nn.LSTM(input_size=128, hidden_size=16,
                             bidirectional=False)

    def forward(self, x):
        """Predict three values per sample.

        Args:
            x: mapping with ``'BSSID_FEATS'`` (batch, wifi_num) integer ids,
                ``'RSSI_FEATS'`` (batch, wifi_num) floats and ``'site_id'``
                (batch,) integer ids.

        Returns:
            Tensor of shape (batch, 3). The batch dimension is always kept,
            also for a batch of one.
        """
        embd_bssid = self.embd_bssid(x['BSSID_FEATS'])  # (batch, wifi_num, embedding_dim)

        embd_site_id = self.embd_site_id(x['site_id'])  # (batch, embedding_dim)
        embd_site_id = torch.unsqueeze(embd_site_id, dim=1)  # (batch, 1, embedding_dim)
        embd_site_id = embd_site_id.repeat(
            1, self.wifi_num, 1)  # (batch, wifi_num, embedding_dim)

        rssi_feat = x['RSSI_FEATS']  # (batch, wifi_num)
        rssi_feat = torch.unsqueeze(rssi_feat, dim=-1)   # (batch, wifi_num, 1)
        rssi_feat = self.fc_rssi(rssi_feat)              # (batch, wifi_num, embedding_dim)
        # BatchNorm1d normalises dim 1, so move channels there and back.
        rssi_feat = self.bn_rssi(rssi_feat.transpose(1, 2)).transpose(1, 2)
        rssi_feat = torch.relu(rssi_feat)

        x = torch.cat([embd_bssid, embd_site_id, rssi_feat],
                      dim=-1)  # (batch, wifi_num, embedding_dim*3)

        x = self.fc_features(x)  # (batch, wifi_num, feature_dim)
        x = self.bn_features(x.transpose(1, 2)).transpose(1, 2)
        x = torch.relu(x)

        # The LSTMs use the default sequence-first layout.
        x = torch.transpose(x, 0, 1)  # (wifi_num, batch, feature_dim)
        x, _ = self.lstm1(x)          # (wifi_num, batch, 128)
        # lstm2 was defined but never called, which left 128 features going
        # into fc_output (a Linear(16, 3)) and raised a shape error.
        x, _ = self.lstm2(x)          # (wifi_num, batch, 16)

        x = x[-1]  # last time step: (batch, 16)
        x = torch.relu(x)

        # No squeeze(): it would turn a batch of one into shape (3,) and
        # break the ``output[:, ...]`` indexing done by callers.
        output = self.fc_output(x)  # (batch, 3)

        return output

LightningModule 

In [None]:
import torch.nn as nn
import torch
import numpy as np
from pytorch_lightning import LightningModule
from pytorch_lightning.metrics import Accuracy
from torch import optim

def xy_loss(xy_hat, xy_label):
    """Return the mean Euclidean distance between predictions and labels.

    Args:
        xy_hat: tensor whose columns 0 and 1 are the predicted x and y.
        xy_label: tensor whose columns 0 and 1 are the true x and y.

    Returns:
        Scalar tensor: the per-row distance averaged over all rows.
    """
    dx = xy_hat[:, 0] - xy_label[:, 0]
    dy = xy_hat[:, 1] - xy_label[:, 1]
    distances = torch.sqrt(dx ** 2 + dy ** 2)
    return distances.mean()


def floor_loss(floor_hat, floor_label):
    """Return 15 times the mean absolute floor error.

    Args:
        floor_hat: predicted floor values.
        floor_label: true floor values.

    Returns:
        Scalar tensor.
    """
    abs_error = (floor_hat - floor_label).abs()
    return 15 * abs_error.mean()


class IndoorLocModel(LightningModule):
    """Lightning wrapper that trains ``model`` on position + floor loss.

    The wrapped model is called with the whole batch dict and must return
    three values per sample: columns 0-1 are treated as ``(x, y)`` and
    column 2 as the floor. The optimised loss is ``xy_loss + floor_loss``.

    Args:
        model: network mapping a batch dict to a (batch, 3) tensor.
    """

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.lr = Config().lr

        self.criterion_xy = xy_loss
        # Original (misspelled) attribute name, kept as an alias so existing
        # references keep working.
        self.critertion_xy = self.criterion_xy
        self.criterion_floor = floor_loss

    def forward(self, x):
        """Run the wrapped model on a batch dict."""
        x = self.model(x)
        return x

    def _predict(self, batch):
        """Return ``(xy_label, xy_hat, f_hat, f)`` for one batch."""
        xy_label = torch.cat(
            [batch['x'].unsqueeze(-1), batch['y'].unsqueeze(-1)], dim=-1)

        output = self(batch)
        if output.dim() == 1:
            # A model that squeezes its output returns shape (3,) for a
            # batch of one; restore the batch dimension before slicing.
            output = output.unsqueeze(0)

        return xy_label, output[:, 0:2], output[:, 2], batch['floor']

    def training_step(self, batch, batch_nb):
        """Compute the training loss for one batch.

        Returns:
            Dict whose ``'loss'`` entry is back-propagated. Every other
            entry is detached, so keeping the per-step outputs for the
            epoch-end hook does not keep the autograd graph alive.
        """
        xy_label, xy_hat, f_hat, f = self._predict(batch)

        loss_xy = self.criterion_xy(xy_hat, xy_label)
        loss_floor = self.criterion_floor(f_hat, f)
        loss = loss_xy + loss_floor

        return {
            'loss': loss,
            'loss_xy': loss_xy.detach(),
            'loss_floor': loss_floor.detach(),
            'xy_label': xy_label,
            'xy_hat': xy_hat.detach(),
            'floor_hat': f_hat.detach(),
            'f': f,
        }

    def training_epoch_end(self, outputs):
        """Log the epoch means of the training losses.

        The means used to be computed and then thrown away; they are now
        logged as ``train_loss``, ``train_loss_xy`` and ``train_loss_floor``.
        """
        if not outputs:
            return
        for key in ('loss', 'loss_xy', 'loss_floor'):
            epoch_mean = torch.mean(torch.stack(
                [output[key].detach() for output in outputs], dim=0))
            self.log(f'train_{key}', epoch_mean)

    def validation_step(self, batch, batch_nb):
        """Return labels and predictions for one validation batch."""
        xy_label, xy_hat, f_hat, f = self._predict(batch)

        return {'xy_label': xy_label, 'xy_hat': xy_hat, 'f_hat': f_hat, 'f': f}

    def validation_epoch_end(self, outputs):
        """Compute the loss over the whole validation set and log it.

        The same value is logged as both ``val_loss`` and ``val_metric``.
        """
        xy_label = torch.cat([output['xy_label'] for output in outputs], dim=0)
        xy_hat = torch.cat([output['xy_hat'] for output in outputs], dim=0)
        f_hat = torch.cat([output['f_hat'] for output in outputs], dim=0)
        f = torch.cat([output['f'] for output in outputs], dim=0)

        loss_xy = self.criterion_xy(xy_hat, xy_label)
        loss_floor = self.criterion_floor(f_hat, f)
        loss = loss_xy + loss_floor

        self.log('val_loss', loss, prog_bar=True)
        self.log('val_metric', loss, prog_bar=True)

    def configure_optimizers(self):
        """Adam on the wrapped model, with LR reduced when val_loss plateaus."""
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)

        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss', }

In [None]:
import pandas as pd
import torch
import os
import logging
import warnings
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

def init_config(seed=42):
    """Set up process-wide logging, warning filters and random seeds.

    Args:
        seed: value handed to ``seed_everything``.
    """
    # INFO-level logging on the root logger.
    logging.basicConfig(level=logging.INFO)
    # Suppress every warning category for the rest of the run.
    warnings.simplefilter("ignore")
    seed_everything(seed=seed)

def load_data():
    """Read the train/test CSVs and return a prepared data module.

    The files ``train_all.csv`` and ``test_all.csv`` are read from
    ``Config.DATA_DIR``; the returned ``IndoorDataModule`` is created with
    k-fold enabled and has already had ``prepare_data()`` called on it.
    """
    train_csv = os.path.join(Config.DATA_DIR, 'train_all.csv')
    test_csv = os.path.join(Config.DATA_DIR, 'test_all.csv')

    train_frame = pd.read_csv(train_csv)
    test_frame = pd.read_csv(test_csv)

    datamodule = IndoorDataModule(train_frame, test_frame, kfold=True)
    datamodule.prepare_data()
    return datamodule


def train_model(idm: IndoorDataModule, fold: int):
    """Train a fresh model on one cross-validation fold.

    Args:
        idm: prepared data module; its validation fold is set to ``fold``.
        fold: index of the fold held out for validation. Checkpoints are
            written to ``<Config.SAVE_DIR>/<fold>``.
    """
    # Point the data module at this fold and build its datasets.
    idm.set_fold_num(fold)
    idm.setup()

    # A new network per fold, wrapped in the Lightning module.
    network = SeqLSTM(
        Config.num_wifi_feats, idm.wifi_bssids_size, idm.site_id_dim)
    model = IndoorLocModel(network)

    # Keep the five best checkpoints by validation loss.
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath=os.path.join(Config.SAVE_DIR, f'{fold}'),
        filename='{epoch:02d}-{val_loss:.2f}-{val_metric:.2f}.pth',
        save_top_k=5,
        mode='min',
    )
    # Stop once validation loss has not improved for ten checks.
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=10)

    trainer_kwargs = {
        'gpus': 1,
        # -1 runs the entire validation set as the pre-training sanity check.
        'num_sanity_val_steps': -1,
        'deterministic': True,
        'max_epochs': Config.epochs,
        'callbacks': [checkpoint_callback, early_stopping],
        # Optional extras left disabled: profiler="simple", trainer.tune(model, idm)
    }
    trainer = Trainer(**trainer_kwargs)

    trainer.fit(model, idm)


def main():
    """Seed the run, load the data once and train one model per fold."""
    # Seed from Config so the RNG seed and the CV-split seed come from the
    # same setting instead of a separately hard-coded default.
    init_config(Config.seed)

    # The data module is prepared once and shared by all folds.
    idm = load_data()

    for fold in range(Config.fold_num):
        train_model(idm, fold)


if __name__ == "__main__":
    main()