This notebook demonstrates the training and inference code for the [Indoor Location & Navigation](https://www.kaggle.com/c/indoor-location-navigation) competition.  
The following change has been made relative to the code that was executed on Google Colab:
- Changed the variable "inference_only" from False to True


Here are links to my teammates' great notebooks.
- [Minh Tri Phan](https://www.kaggle.com/shinomoriaoshi)'s training, inference  
  https://www.kaggle.com/shinomoriaoshi/iln-transformer-train-oof  
  https://www.kaggle.com/shinomoriaoshi/iln-transformer-inference  
- [Kouki](https://www.kaggle.com/kokitanisaka)'s training, inference, and post process   
  https://www.kaggle.com/kokitanisaka/self-attentintive-lstm-by-keras  
  https://www.kaggle.com/kokitanisaka/fix-snapped-waypoints  
  https://www.kaggle.com/kokitanisaka/create-arrayed-map
- [darich](https://www.kaggle.com/daaariiich)'s post process     
  https://www.kaggle.com/daaariiich/nonlinear-cost-minimization-with-geojson
  

# Overview of the model

In [None]:
# Display the model overview diagram (a PNG shipped with the attached dataset).
from IPython.display import Image
Image(filename="../input/iln-dataset/mymodel-1.png",
      format='png')

In [None]:
%pip install einops

In [None]:
import numpy as np
import pandas as pd

import os, random, pickle, gc, time, json, math, copy
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from matplotlib import pyplot as plt

import torch
from torch import nn, optim
from torch.nn.parameter import Parameter
from torch.nn.utils import rnn
from einops import rearrange

In [None]:
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and stores the value in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

In [None]:
# ---- run configuration ----
# Version tags used to build the input dataset file name and the
# checkpoint / history / submission file names.
dat_ver = 'ILN_631dat'
model_ver = 'ILN_632'
subm_file = '../input/indoor-location-navigation/sample_submission.csv'

N_SPLITS = 10          # number of cross-validation folds
NUM_FEATS = 40         # number of (id, strength) signal slots per row
SEED = 1605
batch_size = 128
inference_only = True  # True: skip training and load saved fold checkpoints

# In inference-only mode the checkpoints/history files are read from the
# attached dataset; otherwise they are written to / read from the working dir.
if inference_only:
    n_epoch = 1
    data_dir = '../input/iln-dataset/ILN_train_results/'
else:
    n_epoch = 400
    data_dir = ''

set_seed(SEED)

# Column names of the signal-ID and signal-strength slots.
ID_FEATS = [f'id_{i}' for i in range(NUM_FEATS)]
STRG_FEATS  = [f'strength_{i}' for i in range(NUM_FEATS)]
# Column names of the IMU summary statistics, generated instead of spelled out.
# Order: gyro/acce/ahrs (x, y, z per statistic), then the head_magn x/y
# statistics, then the magn_z statistics.
IMU_FEATS = [
    feat
    for stats in [('mean', 'std', 'max', 'min', 'skew')]
    for feat in (
        [f'{sens}_{axis}_{stat}'
         for sens in ('gyro', 'acce', 'ahrs')
         for stat in stats
         for axis in 'xyz']
        + [f'head_magn_{axis}_{stat}' for stat in stats for axis in 'xy']
        + [f'magn_z_{stat}' for stat in stats]
    )
]

In [None]:
# Mapping from floor-name strings to integer floor indices.
# Both spellings ("F1" and "1F") map to the same index; basements are negative.
floor_map = {"B2":-2, "B1":-1,
             "F1":0, "F2":1, "F3":2, "F4":3, "F5":4,
             "F6":5, "F7":6, "F8":7, "F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4,
             "6F":5, "7F":6, "8F":7, "9F":8}
# Site identifiers (24 entries).
# NOTE(review): presumably the sites that appear in the test set - confirm.
site_list = ['5d2709a003f801723c3251bf','5a0546857ecc773753327266',
             '5c3c44b80379370013e0fd2b','5d2709b303f801723c327472',
             '5d2709bb03f801723c32852c','5d2709c303f801723c3299ee',
             '5d2709d403f801723c32bd39','5d2709e003f801723c32d896',
             '5d27075f03f801723c2e360f','5d27096c03f801723c31e5e0',
             '5d27097f03f801723c320d97','5d27099f03f801723c32511d',
             '5da138b74db8ce0c98bd4774','5da958dd46f8266d0737457b',
             '5da1382d4db8ce0c98bbe92e','5da1383b4db8ce0c98bc11ab',
             '5da1389e4db8ce0c98bd0547','5da138274db8ce0c98bbd3d2',
             '5da138314db8ce0c98bbf3a0','5da138364db8ce0c98bc00f1',
             '5da138754db8ce0c98bca82f','5da138764db8ce0c98bcaa46',
             '5dbc1d84c1eb61796cf7c010','5dc8cea7659e181adb076a3f']

"df_TestTimeLag" in the cell below is a time-lag table for each path in the test data; it was created with [this notebook](https://www.kaggle.com/horsek/ilnpre2-create-test-ts-lag).   
  
"ILN_631dat_df_wifi_all.pkl" in the cell below is the signal sequence data created with [this notebook](https://www.kaggle.com/horsek/iln-preprocess-create-dataset-public).

In [None]:
# Per-path time-lag table for the test data (first CSV column is the index).
df_TestTimeLag = pd.read_csv('../input/iln-preprocess-time-lag-table-of-test-data/test_ts_lag.csv',index_col=0)
# Pre-built signal sequence table.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# from a trusted source.
with open(f'../input/iln-preprocess-create-dataset-public/{dat_ver}_df_wifi_all.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
'''
- Cutoff based on signal strength
  If "thresh" is set less than or equals to -100, cutoff is not performed.
- Divide long sequence into short ones (maximum length is max_seq_len)
  by GPU memory limitation.
'''
min_seq_len = 10
max_seq_len = 200
tmp = []
for gid, grp in tqdm(data.groupby('path'), ncols=60):
    if grp.shape[0] > min_seq_len:
        # min_seq_len-th largest 'strength_0' value of this path ...
        thresh = np.sort(grp['strength_0'].values)[-min_seq_len]
        # ... capped at -100, so the threshold used below is never above -100.
        thresh = min([-100, thresh])
        grp = grp[grp['strength_0']>=thresh].copy()
    # seq_label = "<path>_<chunk index>", where the chunk index is the row
    # position within the path integer-divided by max_seq_len.
    grp['seq_label'] = grp['path'].str.cat((np.arange(grp.shape[0])
                                            //max_seq_len).astype(str),
                                           sep='_')
    tmp.append(grp)
# Re-assemble in groupby order (paths sorted) with a fresh 0..n-1 index.
data = pd.concat(tmp)
data.reset_index(drop=True, inplace=True)
del tmp; gc.collect()
print(f'total num of path data: {data.shape[0]}')

In [None]:
# for s in data.columns:
#     tmp = [a for a in data.columns if a==s]
#     if len(tmp)>=2:
#         print(tmp)
#     # print((data[s].iloc[:,0]==data[s].iloc[:,1]).all())

In [None]:
# Build the 'site_path_timestamp' key: "<site>_<path>_<13-digit timestamp>".
tslabel ='sys_ts'
data['site_path_timestamp'] = (data['site']
                               .str.cat(data['path'], sep='_'))
# timestamp is subtract with time lag value (test data only)
mask = data['train/test']=='train'
data.loc[ mask,'timestamp'] = data.loc[mask,tslabel]
# Look up the lag of each test row by its path; reshape(-1) flattens the
# looked-up table to one value per row.
# NOTE(review): this assumes df_TestTimeLag has a single value column - confirm.
data.loc[~mask,'timestamp'] = (data.loc[~mask,tslabel]
                              -df_TestTimeLag.loc[data.loc[~mask,'path']]
                              .values.reshape(-1))
# Append the timestamp as an integer string left-padded with zeros to width 13.
data['site_path_timestamp'] = (data['site_path_timestamp']
                               .str.cat(data['timestamp']
                                        .astype('int64').astype(str)
                                        .str.zfill(13), sep='_'))

In [None]:
# Every column that is neither an id_/strength_ slot nor a
# *_mean/_std/_max/_min/_skew statistic column.
OTHER_FEATS = [itm for itm in data.columns
               if (itm[:3]!='id_')&(itm[:9]!='strength_')&
                  (itm[-5:]!='_mean')&(itm[-4:]!='_std')&
                  (itm[-4:]!='_max')&(itm[-4:]!='_min')&
                  (itm[-5:]!='_skew')]
# Keep only the columns used downstream, grouped by kind.
data = data[OTHER_FEATS+ID_FEATS+STRG_FEATS+IMU_FEATS]

In [None]:
# signal ID vocabulary for embedding layer
# Sorted list of the distinct values found across all signal-ID columns.
ID_vocab = sorted(set(data[ID_FEATS].to_numpy().ravel()))
print(f'ID vocabulary: {len(ID_vocab)}')

In [None]:
# ---- label encoding ----
le_id = LabelEncoder()
le_id.fit(ID_vocab)
le_site = LabelEncoder()
le_site.fit(data['site'].unique())
# Replace whole columns (data[col] = ...) rather than writing through
# data.loc[:, col]. Since pandas 2.0, .loc[:, col] sets values in place and
# keeps the existing (object) dtype, which leaves the encoded integers in
# object columns that torch.from_numpy cannot convert later on.
for col in ID_FEATS:
    # +1 shifts the codes so that 0 stays free for the padding token.
    data[col] = le_id.transform(data[col]) + 1
data['site'] = le_site.transform(data['site'])

# strength=nan -> token=0 (for zero padding)
data[ID_FEATS] *= (~(data[STRG_FEATS].isna()).values)

# Missing strength is treated as -100, then everything is shifted by +100
# so that a missing signal becomes 0.
data[STRG_FEATS] = data[STRG_FEATS].fillna(-100)+100

In [None]:
# ---- signal strength normalization (path wise) ----
# Min-max scale all strength columns of a path with that path's global
# minimum / maximum (taken over every strength column and every row).
data.sort_index(inplace=True)
path_key = data['path']
path_min = (data[STRG_FEATS].min(axis=1)
            .groupby(path_key, sort=False).transform('min'))
path_max = (data[STRG_FEATS].max(axis=1)
            .groupby(path_key, sort=False).transform('max'))
span = path_max - path_min
# A path whose strengths are all identical has span 0; dividing by 1 instead
# yields 0 for those rows rather than NaN (0/0).
span = span.where(span != 0, 1.0)
data[STRG_FEATS] = (data[STRG_FEATS]
                    .sub(path_min, axis=0)
                    .div(span, axis=0))
del path_key, path_min, path_max, span; gc.collect()

In [None]:
def replace_outlier(series, thresh=0.25, bias=1.5):
    """Clip a series to quantile-based fences.

    The fences are ``q_low - bias * (q_high - q_low)`` and
    ``q_high + bias * (q_high - q_low)``, where ``q_low`` / ``q_high`` are the
    ``thresh`` and ``1 - thresh`` quantiles of ``series``.

    Args:
        series: pandas Series to clip.
        thresh: Lower quantile level; the upper level is ``1 - thresh``.
        bias: Multiplier applied to the inter-quantile range.

    Returns:
        A new Series with values outside the fences replaced by the fence.
    """
    lower_q, upper_q = (series.quantile(level) for level in (thresh, 1 - thresh))
    margin = (upper_q - lower_q) * bias
    return series.clip(lower_q - margin, upper_q + margin)

In [None]:
# ---- IMU data standardization ----
# Subtract 9.80665 from the z-axis accelerometer statistics
# (presumably to remove the gravity offset - the constant is standard gravity).
data[['acce_z_mean','acce_z_max','acce_z_min']] = \
    data[['acce_z_mean','acce_z_max','acce_z_min']]-9.80665

# For every sensor and every statistic group, fit ONE scaler on the pooled
# values of all columns in the group (so the axes share a single mean/std),
# transform the columns, then clip outliers column by column.
# 'max' and 'min' are pooled together; 'magn' only has z-axis columns here.
for sens in ['gyro','acce','ahrs','magn']:
    axes = ['z'] if sens=='magn' else ['x', 'y', 'z']
    for stats in (['mean'], ['std'], ['skew'], ['max', 'min']):
        tmpFEATS = [f'{sens}_{axis}_{stat}' for axis in axes for stat in stats]
        ss = StandardScaler()
        # Tiling the flattened values gives every column identical statistics.
        ss.fit(np.tile(data[tmpFEATS].values.reshape(-1,1),
                       (1,len(tmpFEATS))))
        data[tmpFEATS] = ss.transform(data[tmpFEATS].values)
        for itm in tmpFEATS:
            data[itm]=replace_outlier(data[itm])

In [None]:
''' relative position '''
DELTA_FEATS = ['delta_x_hat', 'delta_y_hat',
               'delta_x_mag', 'delta_y_mag']
tmp = []
for gid, grp in tqdm(data.groupby('seq_label'), ncols=60):
    # Step-to-step difference of the relative position inside one sequence;
    # the first row of each sequence has no predecessor and gets 0.
    grp[DELTA_FEATS[0]] = grp['rel_x'].diff().fillna(0)
    grp[DELTA_FEATS[1]] = grp['rel_y'].diff().fillna(0)
    tmp.append(grp[DELTA_FEATS[0:2]])
# Sanity check: iterating groupby('seq_label') visits the rows in exactly the
# order they are stored in `data`.
# NOTE(review): later cells that collect groupby('seq_label') results and
# write them back positionally appear to rely on this ordering - keep it.
assert (pd.concat(tmp).index==data.index).all()
data[DELTA_FEATS[0:2]] = pd.concat(tmp)

# Step length, projected with the head_magn_x/y mean columns.
delta_l = np.sqrt(np.square(data[DELTA_FEATS[0:2]]).sum(axis=1))
data[DELTA_FEATS[2]] = delta_l * data['head_magn_x_mean']
data[DELTA_FEATS[3]] = delta_l * data['head_magn_y_mean']

del tmp, delta_l; gc.collect()
# One scaler fitted on the pooled values of all four delta columns
# (tiling gives every column the same mean/std), then per-column clipping.
ss = StandardScaler()
ss.fit(np.tile(data[DELTA_FEATS].values.reshape(-1,1),(1,4)))
data[DELTA_FEATS] = ss.transform(data[DELTA_FEATS])
for itm in DELTA_FEATS:
    data[itm]=replace_outlier(data[itm])

In [None]:
# for col in IMU_FEATS:
#     print(col)
#     print(data[col].describe())
#     data[col].hist(bins=50)
#     plt.show()

In [None]:
# for col in DELTA_FEATS:
#     print(col)
#     print(data[col].describe())
#     data[col].hist(bins=50)
#     plt.show()

In [None]:
''' DataFrame split '''
# Separate labelled (train) rows from test rows, then free the combined frame.
trvl_data = data[data['train/test']=='train'].copy()
test_data = data[data['train/test']=='test'].copy()
del data; gc.collect()

In [None]:
''' train val split '''
# One row per sequence: first site/floor of the sequence plus its length.
grb = trvl_data[['seq_label','site','floor']].groupby('seq_label')
df_stra = grb.first()
df_stra['seq_len'] = grb['seq_label'].count().values
# Sequence length binned into quartiles.
df_stra['seq_len_cut'] = pd.qcut(df_stra['seq_len'],4)
# Stratification key: "<length bin>_<site>_<floor>".
df_stra['stratify'] = df_stra.apply(lambda x:
                                    str(x['seq_len_cut'])+'_'+
                                    str(x['site'])+'_'+
                                    str(x['floor']),
                                    axis=1)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True,
                      random_state=SEED)
# fold_<i> is a boolean column that is True for the validation sequences
# of fold i.
for i, (_, val_idx) in enumerate(skf.split(df_stra['stratify'],
                                           df_stra['stratify'])):
    tmp = np.zeros(df_stra.shape[0]).astype(bool)
    tmp[val_idx] = True
    df_stra[f'fold_{i}'] = tmp
df_stra.shape

In [None]:
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('using device:', device)

In [None]:
''' create tensor for train data '''
# One variable-length tensor per sequence (groupby order), later padded to a
# common length and moved to `device`.
tr_id, tr_strg, tr_dlt, tr_imu, tr_site, tr_pos, tr_xy, tr_mask = \
    [],[],[],[],[],[],[],[]
for gid, grp in tqdm(trvl_data.groupby('seq_label'), ncols=60):
    tr_id.append(torch.from_numpy(grp[ID_FEATS].values).long())
    tr_strg.append(torch.from_numpy(grp[STRG_FEATS].values).float())
    tr_dlt.append(torch.from_numpy(grp[DELTA_FEATS].values).float())
    tr_imu.append(torch.from_numpy(grp[IMU_FEATS].values).float())
    tr_site.append(torch.from_numpy(grp['site'].values).long())
    tr_pos.append(torch.from_numpy(grp['len_pos'].values).float())
    tr_xy.append(torch.from_numpy(grp[['x','y']].values).float())
    ''' tr_mask: the tensor for padding '''
    # True for rows whose ID tokens are all 0 (no signal at all).
    tr_mask.append(tr_id[-1].sum(dim=-1)==0)
# Zero-pad every feature to the longest sequence: (n_seq, max_len, ...).
tr_id = rnn.pad_sequence(tr_id, batch_first=True).to(device)
tr_strg = rnn.pad_sequence(tr_strg, batch_first=True).to(device)
tr_dlt = rnn.pad_sequence(tr_dlt, batch_first=True).to(device)
tr_imu = rnn.pad_sequence(tr_imu, batch_first=True).to(device)
tr_site = rnn.pad_sequence(tr_site, batch_first=True).to(device)
tr_pos = rnn.pad_sequence(tr_pos, batch_first=True).to(device)
tr_xy = rnn.pad_sequence(tr_xy, batch_first=True).to(device)
# Padded time steps are marked True in the mask.
tr_mask = rnn.pad_sequence(tr_mask, batch_first=True,
                           padding_value=True).to(device)

In [None]:
class ILNdatasets(torch.utils.data.Dataset):
    """Dataset over parallel, index-aligned feature containers.

    Item ``idx`` is the tuple
    ``(x_id, x_strg, x_dlt, x_imu, x_site, x_pos, x_mask, y)`` with every
    container indexed at ``idx``. The dataset length is ``len(y)``.
    """

    def __init__(self, x_id, x_strg, x_dlt, x_imu, x_site, x_pos, x_mask,
                 y, train=True):
        # `train` is only stored; nothing in this class reads it.
        self.train = train
        (self.x_id, self.x_strg, self.x_dlt, self.x_imu,
         self.x_site, self.x_pos, self.x_mask, self.y) = (
            x_id, x_strg, x_dlt, x_imu, x_site, x_pos, x_mask, y)
        self.datanum = len(y)

    def __len__(self):
        return self.datanum

    def __getitem__(self, idx):
        fields = (self.x_id, self.x_strg, self.x_dlt, self.x_imu,
                  self.x_site, self.x_pos, self.x_mask, self.y)
        return tuple(field[idx] for field in fields)

In [None]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding computed from explicit positions.

    Unlike a fixed look-up table, the encoding is evaluated on the fly from a
    per-token position tensor, so positions need not be ``0..seq-1``.
    """

    def __init__(self, d_model, dropout=0.1):
        """
        Args:
            d_model: Size of the last dimension of the input.
            dropout: Dropout probability applied to the encoded output.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Plain attribute (not a parameter/buffer), so it does not appear in
        # the module's state_dict; it is moved to the input device in forward.
        self.div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

    def forward(self, x, position, mask):
        """Add the positional encoding to ``x`` and apply dropout.

        Args:
            x: Input of shape (batch, seq, d_model).
            position: Tensor of shape (batch, seq) with the position of each
                token, or ``None`` to count the unmasked tokens (0-based
                running count along the sequence).
            mask: Boolean tensor of shape (batch, seq); True marks padding.

        Returns:
            Tensor with the same shape as ``x``.
        """
        valid = mask.logical_not()
        if position is None:
            position = torch.cumsum(valid, dim=1)-1
        else:
            # Out-of-place on purpose: `position *= ...` would overwrite the
            # caller's tensor.
            position = position * valid
        position = position.unsqueeze(-1)  # (batch, seq) -> (batch, seq, 1)

        pe = torch.zeros(x.shape).to(x.device)
        div_term = self.div_term.to(x.device)
        # Even channels carry the sine, odd channels the cosine.
        pe[:,:,0::2] = torch.sin(position * div_term)
        pe[:,:,1::2] = torch.cos(position * div_term)
        x = x + pe

        return self.dropout(x)

In [None]:
''' Special Thanks to 
    https://www.kaggle.com/shinomoriaoshi/iln-transformer-train-k1?scriptVersionId=59645688 '''

def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

def attention(query, key, value, key_padding_mask=None, attn_weight=None, dropout=None):
    """Scaled dot-product attention with optional masking and subtractive bias.

    Args:
        query, key, value: Tensors whose last two dimensions are
            (sequence, feature).
        key_padding_mask: Optional boolean tensor; after inserting two
            singleton dimensions at positions 1 and 2 it is broadcast against
            the scores, and True entries are filled with -1e9.
        attn_weight: Optional tensor SUBTRACTED from the scores before the
            softmax. If its rank differs from the scores' rank, a singleton
            dimension is inserted at position -3.
        dropout: Optional callable applied to the attention probabilities.

    Returns:
        Tuple ``(attended values, attention probabilities)``.
    """
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale

    if key_padding_mask is not None:
        logits = logits.masked_fill(key_padding_mask[:, None, None], -1e9)

    if attn_weight is not None:
        penalty = (attn_weight if attn_weight.dim() == logits.dim()
                   else attn_weight.unsqueeze(-3))
        logits = logits - penalty

    weights = torch.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

class MultiHeadedAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    Holds four bias-free ``d_model -> d_model`` linear layers in ``linears``
    (query, key, value and output projection, in that order). The attention
    probabilities of the most recent forward pass are kept in ``self.attn``.
    """
    def __init__(self, d_model, nhead, dropout=0.1):
        """Take in model size and number of heads.

        Args:
            d_model: Model width; must be divisible by ``nhead``.
            nhead: Number of attention heads.
            dropout: Dropout probability applied to the attention probabilities.
        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % nhead == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // nhead
        self.nhead = nhead
        self.linears = clones(nn.Linear(d_model, d_model, bias=False), 4) # Q, K, V, last
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, key_padding_mask=None, attn_weight=None):
        """Project the inputs, attend per head, and merge the heads.

        Args:
            query, key, value: Tensors of shape (batch, seq, d_model).
            key_padding_mask: Optional mask forwarded to ``attention``.
            attn_weight: Optional subtractive score bias forwarded to
                ``attention``.

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        #    (zip stops after the first three linears; the fourth is the
        #    output projection used in step 3).
        query, key, value = \
            [l(x).view(nbatches, -1, self.nhead, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, key_padding_mask=key_padding_mask,
                                 attn_weight=attn_weight,
                                 dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
            .view(nbatches, -1, self.nhead * self.d_k)
        return self.linears[-1](x)


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width.
        dropout: Dropout probability applied after the ReLU.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = torch.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
    
class CustomEncoderLayer(nn.Module):
    """
    Single self-attention encoder block.

    Structure: multi-head self-attention -> dropout -> residual add ->
    LayerNorm, followed by position-wise feed-forward -> dropout ->
    residual add -> LayerNorm (normalisation is applied after each residual).
    """
    def __init__(self, d_model, nhead, dim_feedforward = 1024, dropout = 0.1):
        """
        Args:
            d_model: Model width.
            nhead: Number of attention heads.
            dim_feedforward: Hidden width of the feed-forward sub-layer.
            dropout: Dropout probability shared by all sub-layers.
        """
        super().__init__()
        self._self_attn = MultiHeadedAttention(d_model, nhead, dropout)
        self._ffn = PositionwiseFeedForward(d_model, dim_feedforward, dropout)
        self._layernorms = clones(nn.LayerNorm(d_model, eps=1e-6), 2)
        self._dropout = nn.Dropout(dropout)

    def forward(self, src, key_padding_mask = None, attn_weight = None):
        """
        Args:
            src: Input sequence; used as query, key and value.
            key_padding_mask: Optional padding mask passed to the attention.
            attn_weight: Optional subtractive score bias passed to the attention.

        Returns:
            Tensor produced by the second LayerNorm (same width as ``src``).
        """
        # self-attention block
        src2 = self._self_attn(query=src, key=src, value=src, key_padding_mask=key_padding_mask,
                               attn_weight=attn_weight)
        src = src + self._dropout(src2)
        src = self._layernorms[0](src)
        # feed-forward block
        src2 = self._ffn(src)
        src = src + self._dropout(src2)
        src = self._layernorms[1](src)
        return src

In [None]:
# class CorrectWithDelta(nn.Module):

#     def __init__(self):
#         super(CorrectWithDelta, self).__init__()

#     def forward(self, x, c, x_dlt, x_pos, x_mask):
#         x_dlt = (x_dlt - x_dlt[:,0:1,:]) * x_mask.unsqueeze(-1).logical_not()
#         x_rel = x_dlt.cumsum(dim=1)
#         XX = (rearrange(x_rel, 'bn seq d -> bn seq 1 d')
#              -rearrange(x_rel, 'bn seq d -> bn 1 seq d'))
#         XX += rearrange(x, 'bn seq d -> bn 1 seq d')
#         attn = rearrange(c, 'bn seq -> bn 1 seq 1')
#         attn_wt = (rearrange(x_pos, 'bn seq -> bn seq 1 1')
#                   -rearrange(x_pos, 'bn seq -> bn 1 seq 1')).abs()
#         attn = attn - torch.log(1.0 + attn_wt)
#         attn -= (1e8 * rearrange(x_mask, 'bn seq -> bn 1 seq 1'))
#         x = (XX * nn.functional.softmax(attn, dim=2)).sum(dim=2)
#         return x

In [None]:
''' Learning Model '''
class ILNnet(nn.Module):
    """Sequence model predicting an (x, y) position for every time step.

    Pipeline: signal-ID embeddings and projected signal strengths are
    concatenated and projected to ``ninp``; a site embedding and a projection
    of the delta/IMU features are added; two custom self-attention layers
    (with a learnable, per-head distance penalty) form a residual branch; two
    bidirectional LSTMs and a small head produce the 2-D output.

    Args:
        input_dim: Number of signal slots per time step.
        id_num_embd: Size of the signal-ID vocabulary (including padding id 0).
        id_embd_dim: Embedding width per signal ID.
        strg_dim: Width of the projected strength vector.
        site_num_embd: Number of distinct sites.
        imu_dim: Number of IMU features per time step.
        ninp: Model width.
        nhead: Number of attention heads.
        nhid: Hidden width of the feed-forward sub-layers.
        nlayers: Accepted for interface compatibility; not used (the model
            always builds two custom encoder layers).
        dropout: Dropout probability.
    """
    def __init__(self, input_dim, id_num_embd, id_embd_dim,
                 strg_dim, site_num_embd, imu_dim,
                 ninp, nhead, nhid, nlayers, dropout):
        super(ILNnet, self).__init__()

        # NOTE: attribute names and creation order are kept as-is so that
        # saved state_dicts and seeded initialisation stay compatible.

        # ---- embedding and concat ----
        self.id_embd = nn.Embedding(id_num_embd, id_embd_dim,
                                    padding_idx=0)
        self.strg_ln = nn.LayerNorm(input_dim)
        self.strg_lin = nn.Linear(input_dim, strg_dim)
        self.site_embd = nn.Embedding(site_num_embd, ninp)
        cat_dim = input_dim*id_embd_dim + strg_dim
        self.main_seq1 = nn.Sequential(nn.Linear(cat_dim, ninp),
                                       nn.LayerNorm(ninp),
                                       nn.ReLU(),
                                       nn.Dropout(dropout))
        # 4 = number of delta features concatenated in front of the IMU features.
        self.dltimu_lin = nn.Linear(4+imu_dim, ninp)

        # ---- main stream(1) TRANSFORMER with positional decay ----
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # Per-head coefficient of the distance penalty, initialised >= 1e-3.
        self.attn_wt_coef = nn.Parameter(torch.clamp(torch.randn(nhead), min=1e-3))
        self.customTR1 = CustomEncoderLayer(ninp, nhead, nhid, dropout)
        self.customTR2 = CustomEncoderLayer(ninp, nhead, nhid, dropout)

        # ---- main stream(2) LSTM ----
        self.lstm1 = nn.LSTM(batch_first=True,
                             bidirectional=True, num_layers=1,
                             input_size=ninp, hidden_size=ninp//2)
        self.lstm2 = nn.LSTM(batch_first=True,
                             bidirectional=True, num_layers=1,
                             input_size=ninp, hidden_size=ninp//2)
        self.main_seq2 = nn.Sequential(nn.Linear(ninp,ninp),
                                       nn.LayerNorm(ninp),
                                       nn.Dropout(dropout),
                                       nn.Linear(ninp,2))
        self.init_weights()

    def init_weights(self):
        """Start both embedding tables from all-zero weights."""
        self.id_embd.weight.data.zero_()
        self.site_embd.weight.data.zero_()

    def forward(self, x_id, x_strg, x_dlt, x_imu, x_site, x_pos, x_mask):
        """Predict one (x, y) pair per time step.

        Args:
            x_id: Long tensor (batch, seq, input_dim) of signal-ID tokens.
            x_strg: Float tensor (batch, seq, input_dim) of signal strengths.
            x_dlt: Float tensor (batch, seq, 4) of delta features.
            x_imu: Float tensor (batch, seq, imu_dim) of IMU features.
            x_site: Long tensor (batch, seq) of site indices.
            x_pos: Float tensor (batch, seq); pairwise absolute differences
                of this value drive the attention penalty.
            x_mask: Bool tensor (batch, seq); True marks padded steps.

        Returns:
            Float tensor (batch, seq, 2).
        """
        # ---- embedding and concat ----
        x_id = self.id_embd(x_id)
        x_id = rearrange(x_id, 'bn seq d1 d2 -> bn seq (d1 d2)')
        x_strg = self.strg_ln(x_strg)
        x_strg = torch.relu(self.strg_lin(x_strg))
        x_site = self.site_embd(x_site)
        x = torch.cat((x_id, x_strg), dim=-1)
        x = self.main_seq1(x) + x_site

        # ---- concat delta & IMU ----
        x_dltimu = torch.cat((x_dlt, x_imu),-1)
        x_dltimu = torch.relu(self.dltimu_lin(x_dltimu))
        x = x + x_dltimu

        # ---- main stream(1) TRANSFORMER with positional decay ----
        xt = self.pos_encoder(x, None, x_mask)
        # |pos_i - pos_j| for every pair of steps: (batch, 1, seq, seq).
        attn_wt = (rearrange(x_pos, 'bn seq -> bn 1 seq 1')
                  -rearrange(x_pos, 'bn seq -> bn 1 1 seq')).abs()
        attn_coef = rearrange(self.attn_wt_coef, 'nh -> 1 nh 1 1')
        # Both encoder layers use the same penalty, so compute it once.
        attn_bias = torch.log(1.0 + attn_wt) * attn_coef
        xt = self.customTR1(xt, x_mask, attn_bias)
        xt = self.customTR2(xt, x_mask, attn_bias)
        x = x + xt

        # ---- main stream(2) LSTM ----
        lengths = x_mask.logical_not().sum(dim=1).to('cpu')
        x = rnn.pack_padded_sequence(x, lengths, batch_first=True,
                                     enforce_sorted=False)
        x,_ = self.lstm1(x)
        x,_ = self.lstm2(x)
        # total_length restores the original padded length with zeros on the
        # tensor's own device, so no manual re-padding (and no dependency on
        # a global `device`) is needed.
        x = rnn.pad_packed_sequence(x, batch_first=True,
                                    total_length=x_mask.size(1))[0]
        x = self.main_seq2(x)

        return x

In [None]:
def ILN_loss(output, filter_, delta, label):
    """Mean Euclidean distance between predicted and true positions.

    Only the positions selected by ``filter_`` contribute. Each distance is
    computed as ``sqrt(dx^2 + dy^2 + 1e-6)``; the small constant keeps the
    gradient of the square root finite at zero distance.

    Args:
        output: Predictions; the first two channels of the last dimension
            are compared.
        filter_: Boolean tensor over the leading (batch, seq) dimensions,
            True for valid (non-padded) positions.
        delta: Unused; kept for interface compatibility.
        label: Targets; the first two channels of the last dimension are used.

    Returns:
        Tuple ``(loss, metric)``; both are the same scalar tensor.
    """
    sq_dist = (output[:,:,:2]-label[:,:,:2]).square().sum(dim=-1)+1e-6
    dist = sq_dist.sqrt()

    # Mask before summing: otherwise every padded position would add
    # sqrt(1e-6) to the numerator and bias the reported metric.
    metric = (dist * filter_.to(dist.dtype)).sum()/filter_.sum()
    loss = metric

    return loss, metric

In [None]:
class LearningRateScheduler:
    """Piecewise-constant schedule: pick ``lr[k]`` for the current epoch.

    ``k`` is the index of the first entry of ``switch_epoch`` (with a
    sentinel of 1e9 appended) that is strictly greater than the epoch.

    Args:
        lr: Values to return, one per stage.
        switch_epoch: Epochs at which the next stage begins.
    """
    def __init__(self, lr:list, 
                 switch_epoch: list):
        self.lr = lr
        self.switch_epoch = switch_epoch

    def __call__(self, epoch:int):
        boundaries = self.switch_epoch + [1e9]
        for stage, boundary in enumerate(boundaries):
            if boundary > epoch:
                return self.lr[stage]
        # No boundary (not even the sentinel) exceeds the epoch.
        raise ValueError('True is not in list')

In [None]:
''' This function plays the same role as model.summary() of keras '''
def param_count(model, print_all=False):
    """Print the number of trainable parameters of ``model``.

    Args:
        model: Object exposing ``named_parameters()``.
        print_all: When True, also print name, element count and shape of
            every trainable parameter, framed by two 80-character rules.
    """
    rule = '-'*80
    trainable = [(name, prm) for name, prm in model.named_parameters()
                 if prm.requires_grad]

    if print_all:
        print(rule)
        for name, prm in trainable:
            print(f'{name}:')
            print(f'     params:{prm.numel():,},  {prm.shape}')

    total = sum(prm.numel() for _, prm in trainable)
    print(f'Total params: {total:,}')

    if print_all:
        print(rule)

In [None]:
def train_val(net, trainloader, valloader, bsz):
    '''Run one training epoch followed by one validation pass.

    Relies on module-level globals that must exist before the call:
    ``optimizer``, ``lr_scheduler``, ``history`` (dict of lists that is
    appended to) and ``i_epoch`` (0-based epoch counter used for logging).

    Args:
        net: Model to train; it is updated in place.
        trainloader: Iterable yielding training batches of
            (id, strg, dlt, imu, site, pos, mask, xy).
        valloader: Iterable yielding validation batches of the same layout.
        bsz: Unused.

    Returns:
        Tuple ``(net, last validation metric)``.
    '''
    # ---- train ----
    loss_sum, metric_sum = 0., 0.
    net.train()
    optimizer.zero_grad()
    # note: "prefix" and "descpost" are the variable just for tqdm
    with tqdm(trainloader, ncols=80) as pbar:
        prefix = f"epoch {i_epoch+1} train"
        pbar.set_description(prefix)
        descpost = None
        for i, (tr_id, tr_strg, tr_dlt, tr_imu, tr_site, tr_pos,
                tr_mask, tr_xy) in enumerate(pbar):

            output = net(tr_id, tr_strg, tr_dlt, tr_imu, tr_site, tr_pos, tr_mask)
            # Zero the predictions at padded positions before the loss.
            filter_ = tr_mask.logical_not()
            output *= filter_.unsqueeze(-1)

            loss, metric = ILN_loss(output, filter_, tr_dlt, tr_xy)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Running per-batch averages shown in the progress bar.
            loss_sum += loss.item()
            metric_sum += metric.item()
            descpost = f'{loss_sum/(i+1):.2f}/{metric_sum/(i+1):.2f}'
            pbar.set_postfix({"l/m":descpost})
        # (i+1) is the number of batches seen; requires a non-empty loader.
        history['epoch'].append(i_epoch+1)
        history['train_loss'].append(loss_sum/(i+1))
        history['train_metric'].append(metric_sum/(i+1))

    ''' validation '''
    with torch.no_grad():
        loss_sum, metric_sum = 0., 0.
        net.eval()
        with tqdm(valloader, ncols=80) as pbar:
            # Right-align the label under the training description.
            prefix = ' '*(len(prefix)-10)+'validation'
            pbar.set_description(prefix)
            descpost = None
            for i, (vl_id, vl_strg, vl_dlt, vl_imu, vl_site, vl_pos, vl_mask, vl_xy)\
              in enumerate(pbar):

                output = net(vl_id, vl_strg, vl_dlt, vl_imu, vl_site, vl_pos, vl_mask)
                filter_ = vl_mask.logical_not()
                output *= filter_.unsqueeze(-1)

                loss, metric = ILN_loss(output, filter_, vl_dlt, vl_xy)
                loss_sum += loss.item()
                metric_sum += metric.item()
                descpost = f'{loss_sum/(i+1):.2f}/{metric_sum/(i+1):.2f}'
                pbar.set_postfix({"l/m":descpost})
    history['val_loss'].append(loss_sum/(i+1))
    history['val_metric'].append(metric_sum/(i+1))
    # The learning-rate schedule advances once per epoch.
    lr_scheduler.step()

    return net, history['val_metric'][-1]

In [None]:
# Cross-validated training: one fresh model per fold, trained for n_epoch
# epochs, with the per-epoch history dumped to JSON and the final weights
# saved to "<model_ver>_fold<k>_epoch<n>.pth". Skipped in inference-only mode.
if not inference_only:
    for i_fold in range(N_SPLITS):
        print('='*20 + f' FOLD {i_fold} ' + '='*20)

        ''' model comopile '''
        net = ILNnet(NUM_FEATS, len(ID_vocab)+1, 128, 1280, 24, len(IMU_FEATS),
                     ninp=256, nhead=2, nhid=512, nlayers=2, dropout=0.1)
        net = net.to(device)
    #     param_count(net, print_all=True)

        # fold_<k> is True for the validation sequences of this fold.
        mask = df_stra[f'fold_{i_fold}'].values
    #     trn_idx = np.arange(df_stra.shape[0]) # train with all data
        trn_idx = np.where(~mask)[0]
        val_idx = np.where(mask)[0]

        trainvaldata = ILNdatasets(tr_id, tr_strg, tr_dlt, tr_imu, tr_site,
                                   tr_pos, tr_mask, tr_xy)
        traindata = torch.utils.data.dataset.Subset(trainvaldata,
                                                    trn_idx)
        valdata = torch.utils.data.dataset.Subset(trainvaldata,
                                                  val_idx)
        trainloader = torch.utils.data.DataLoader(traindata, 
                                                  # pin_memory=True,
                                                  shuffle=True,
                                                  batch_size=batch_size)
        valloader = torch.utils.data.DataLoader(valdata, 
                                                # pin_memory=True,
                                                shuffle=True,
                                                batch_size=batch_size)

        # Adam at 5e-4; the multiplier drops from 1.0 to 0.1 once the epoch
        # index reaches int(0.75*n_epoch).
        optimizer = optim.Adam(net.parameters(), lr=5e-4)
        lr_scheduler_func = LearningRateScheduler([1.0, 0.1],
                                                  [int(0.75*n_epoch)])
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                           lr_lambda=lr_scheduler_func)

        # Read and appended to by train_val() through the global namespace.
        history = {'epoch':[], 'train_loss': [], 'val_loss': [],
                   'train_metric': [], 'val_metric': []}

        ''' train and validation '''
        time.sleep(1)
        early_stop = 0
        min_metric = 1000.
        for i_epoch in range(n_epoch):
            net, es_metric = train_val(net, trainloader, valloader, batch_size)

            ''' history json dump '''
            with open(f'_history_{model_ver}_fold{i_fold}_latest.json', 'w') as f:
                json.dump(history, f, indent=4)

    #         ''' early stopping '''
    #         es_metric = round(es_metric,2)
    #         if es_metric<min_metric:
    #             min_metric=es_metric
    #             early_stop=0
    #         else:
    #             early_stop+=1 
    #         if early_stop>10 and i_epoch>int(n_epoch*0.5):
    #             print('early stopping.')
    #             break

        # Convert the 0-based index of the last epoch run into a 1-based count;
        # it matches history['epoch'][-1], which the inference cell uses to
        # rebuild the checkpoint file name.
        i_epoch += 1

        ''' model output '''
        model_path = f'{model_ver}_fold{i_fold}_epoch{i_epoch}.pth'
        torch.save(net.state_dict(), model_path)

In [None]:
''' create tensor for test data '''
# Same construction as for the training tensors, without the (x, y) targets.
te_id, te_strg, te_dlt, te_imu, te_site, te_pos, te_mask = \
    [],[],[],[],[],[],[]
for gid, grp in tqdm(test_data.groupby('seq_label'), ncols=60):
    te_id.append(torch.from_numpy(grp[ID_FEATS].values).long())
    te_strg.append(torch.from_numpy(grp[STRG_FEATS].values).float())
    te_dlt.append(torch.from_numpy(grp[DELTA_FEATS].values).float())
    te_imu.append(torch.from_numpy(grp[IMU_FEATS].values).float())
    te_site.append(torch.from_numpy(grp['site'].values).long())
    te_pos.append(torch.from_numpy(grp['len_pos'].values).float())
    ''' te_mask: the tensor for padding '''
    # True for rows whose ID tokens are all 0 (no signal at all).
    te_mask.append(te_id[-1].sum(dim=-1)==0)
# Zero-pad every feature to the longest sequence: (n_seq, max_len, ...).
te_id = rnn.pad_sequence(te_id, batch_first=True).to(device)
te_strg = rnn.pad_sequence(te_strg, batch_first=True).to(device)
te_dlt = rnn.pad_sequence(te_dlt, batch_first=True).to(device)
te_imu = rnn.pad_sequence(te_imu, batch_first=True).to(device)
te_site = rnn.pad_sequence(te_site, batch_first=True).to(device)
te_pos = rnn.pad_sequence(te_pos, batch_first=True).to(device)
# Padded time steps are marked True in the mask.
te_mask = rnn.pad_sequence(te_mask, batch_first=True,
                           padding_value=True).to(device)

In [None]:
# Index the test rows by the submission key. Guarded so that re-running this
# cell does not fail once the column has already been moved into the index.
if test_data.index.name != 'site_path_timestamp':
    test_data.set_index('site_path_timestamp', inplace=True)

# ---- model compile ----
net = ILNnet(NUM_FEATS, len(ID_vocab)+1, 128, 1280, 24, len(IMU_FEATS),
             ninp=256, nhead=2, nhid=512, nlayers=2, dropout=0.1)
net = net.to(device)

# Fold-independent inputs: read once instead of once per fold.
# `subm` provides the required row keys/columns; its values are blanked so
# that they act as "to be interpolated" placeholders.
subm = pd.read_csv(subm_file, index_col=0)
subm.loc[:,:]=np.nan
# Floor predictions taken from another model's submission file.
simple_accurate_99 = pd.read_csv('../input/simple-99-accurate-floor-model/submission.csv')

# ---- prediction ----
xy_all = []
for i_fold in range(N_SPLITS):
    print('='*20 + f' FOLD {i_fold} ' + '='*20)
    time.sleep(1)

    # ---- model load ----
    # The last recorded epoch number is part of the checkpoint file name.
    with open(f'{data_dir}_history_{model_ver}_fold{i_fold}_latest.json','r') as f:
        history = json.load(f)
    i_epoch = history['epoch'][-1]
    model_path = f'{data_dir}{model_ver}_fold{i_fold}_epoch{i_epoch}.pth'
    # map_location lets GPU-saved weights load on whatever device is in use
    # (e.g. a CPU-only session).
    net.load_state_dict(torch.load(model_path, map_location=device))

    net.eval()
    pred = []
    with torch.no_grad():
        for te_id_ch, te_strg_ch, te_dlt_ch, te_imu_ch, \
            te_site_ch, te_pos_ch, te_mask_ch in \
            (zip(tqdm(te_id.split(batch_size),ncols=60),
                 te_strg.split(batch_size), te_dlt.split(batch_size), 
                 te_imu.split(batch_size), te_site.split(batch_size), 
                 te_pos.split(batch_size), te_mask.split(batch_size))):
            output = net(te_id_ch, te_strg_ch, te_dlt_ch, te_imu_ch, te_site_ch,
                         te_pos_ch, te_mask_ch)
            # Flatten (batch, seq) and keep only the non-padded steps.
            output  = rearrange(output, 'bn seq d -> (bn seq) d')
            filter_ = rearrange(te_mask_ch.logical_not(),
                                'bn seq -> (bn seq)')
            pred.append(output[filter_])
    # Positional write-back.
    # NOTE(review): this assumes the rows of test_data are stored in the same
    # order in which groupby('seq_label') produced the test tensors - confirm.
    test_data[['x','y']] = torch.cat(pred, dim=0).to('cpu').numpy()
    test_data['floor']=0

    # Stack the predicted rows and the (blank) rows required by the submission.
    all_preds = pd.concat([test_data[subm.columns].copy() ,subm])
    all_preds.sort_index(inplace=True)
    df_tmp = pd.Series(all_preds.index).str.split('_', expand=True)
    df_tmp.index = all_preds.index
    df_tmp.columns = ['site','path','timestamp']
    all_preds = pd.concat([all_preds, df_tmp],axis=1)
    del df_tmp; gc.collect()
    # 13-digit millisecond timestamps need 64 bits; plain 'int' can be 32-bit
    # on some platforms.
    all_preds['timestamp'] = all_preds['timestamp'].astype('int64')

    # Per path: interpolate x/y linearly over the timestamp so that the blank
    # submission rows are filled from the neighbouring predictions.
    tmp = []
    for gid, gdf in tqdm(all_preds.groupby('path'),ncols=60):
        gdf = (gdf.reset_index(drop=False)
                  .set_index('timestamp')
                  .sort_index())
        gdf[['x','y']] = gdf[['x','y']].interpolate(limit_direction='both',
                                                    method='index')
        gdf = gdf.set_index('site_path_timestamp')
        tmp.append(gdf[['floor','x','y']])

    # Average rows sharing a key, then keep exactly the submission rows/order.
    all_preds = pd.concat(tmp).groupby(level=0).mean().reindex(subm.index)

    # Positional assignment: relies on the floor file listing its rows in the
    # same order as the sample submission.
    all_preds['floor'] = simple_accurate_99['floor'].values

    all_preds.to_csv(f'{model_ver}_fold{i_fold}_submission.csv')
    xy_all.append(all_preds[['x','y']].values)

# Final submission: mean of the per-fold x/y predictions (floor is unchanged).
all_preds[['x','y']] = np.stack(xy_all, axis=-1).mean(axis=-1)
all_preds.to_csv(f'./{model_ver}_fold_all_submission.csv')