In [None]:
# https://github.com/location-competition/indoor-location-competition-20
# https://www.kaggle.com/c/indoor-location-navigation/data
# https://www.kaggle.com/titericz/eda-loading-data-and-visualizing-paths
# https://www.kaggle.com/npa02012/time-to-complete-trace-eda
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

# https://www.reddit.com/r/MachineLearning/comments/4dzxs3/best_way_to_deal_with_time_series_data
# https://arxiv.org/pdf/1907.03907.pdf
# https://github.com/YuliaRubanova/latent_ode

import glob
import os
import pickle
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm

# Notebook-wide configuration shared by the preprocessing, dataset and model cells.
settings = {
    # Number of beacon rows kept per trace (shorter traces are zero-padded later).
    'beacon_seq_len': 50,
    # Width of every embedding table in the model.
    'beacon_embed_dim': 256,
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Floor label -> integer level. Both spellings ('F1' and '1F') map to the same
    # level; basements are negative.
    'floor_mapping': {
        **{'B%d' % level: -level for level in range(1, 4)},
        **{'F%d' % level: level - 1 for level in range(1, 11)},
        **{'%dF' % level: level - 1 for level in range(1, 10)},
    },
    # floor_mapping values span -3..9, i.e. 13 distinct levels.
    'n_floor': 13,
    # Distances above this value are clipped during preprocessing.
    'max_beacon_distance': 200,
    'path_train': '../input/indoor-location-navigation/train/*/*/*',
    'path_test': '../input/indoor-location-navigation/test/*',
    'path_sample': '../input/indoor-location-navigation/sample_submission.csv',
}

In [None]:
# Load the pre-extracted beacon data; later cells read the table from train['df'].
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted file.
with open('../input/location-data/beacon_train.pkl', 'rb') as pkl_file:
    train = pickle.load(pkl_file)

In [None]:
# This cell turns train['df'] from one flat DataFrame into a dict of per-trace
# DataFrames, so it can only run once per load. Fail with a clear message instead
# of an opaque KeyError when it is re-run.
if not isinstance(train['df'], pd.DataFrame):
    raise RuntimeError("train['df'] has already been preprocessed; "
                       "re-run the cell that loads beacon_train.pkl first")

beacons = train['df']

# Delete rows with 'unusual' floors (labels without an entry in floor_mapping).
# A boolean filter is used rather than drop(index) so that rows are selected by
# position even if index labels repeat.
beacons = beacons[beacons['floor'].isin(settings['floor_mapping'].keys())]

# Only keep initial rows of a trace
beacons = (beacons.groupby('trace_id')
                  .head(settings['beacon_seq_len'])
                  .reset_index(drop=True))

# Convert distance column to int: clip to [0, max_beacon_distance], truncate, then
# shift by one so that 0 stays free for the padding added by the dataset.
max_distance = settings['max_beacon_distance']
beacons['distance'] = (beacons['distance']
                       .clip(lower=0, upper=max_distance)
                       .astype(int) + 1)

# Map columns: identifiers become 1-based integer codes (0 is reserved for padding).
for column in ('site_id', 'UUID', 'MinorID', 'MajorID', 'MAC_Address'):
    beacons[column] = beacons[column].astype('category').cat.codes + 1

# Every remaining floor label has a mapping entry (see the filter above), so map()
# yields integers directly; the explicit cast pins the dtype.
beacons['floor'] = beacons['floor'].map(settings['floor_mapping']).astype(np.int64)

# Record settings: vocabulary sizes for the embedding tables (largest code + 1).
# int() keeps these as plain Python ints rather than narrow numpy scalars.
settings['n_uuids'] = int(beacons['UUID'].max()) + 1
settings['n_minor_ids'] = int(beacons['MinorID'].max()) + 1
settings['n_major_ids'] = int(beacons['MajorID'].max()) + 1
settings['n_macs'] = int(beacons['MAC_Address'].max()) + 1
settings['n_sites'] = int(beacons['site_id'].max()) + 1

# Convert to dictionary
train['df'] = {trace_id: table
               for trace_id, table in beacons.groupby("trace_id")} # slower, but easier

# Show one trace as a sanity check; fall back to the first trace if the usual
# example is not present in this data.
example_trace_id = '5d09b22fcfb49b00085466a0'
if example_trace_id not in train['df']:
    example_trace_id = next(iter(train['df']), None)
train['df'].get(example_trace_id)

In [None]:
# Hold out a fixed (seeded) 20% of the traces: each selected trace is moved from
# train['df'] into valid['df'].
np.random.seed(1)
all_trace_ids = list(train['df'].keys())
val_idx = np.random.choice(all_trace_ids, int(.2 * len(all_trace_ids)), replace=False)

valid = {'df' : {}}
for trace_id in val_idx:
    valid['df'][trace_id] = train['df'].pop(trace_id).copy()

In [None]:
class beacon_dataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one fixed-length beacon sequence per trace.

    Args:
        group: dict mapping trace id -> DataFrame holding the columns 'UUID',
            'distance', 'MinorID', 'MajorID', 'MAC_Address', 'site_id' and 'floor'.
        settings: dict providing 'beacon_seq_len' and 'n_floor'. If it also holds
            'floor_mapping', the label offset is derived from its lowest value.

    Each item is a dict with the scalar class label under 'floor' and six int64
    arrays of length `beacon_seq_len` ('uuids', 'distances', 'minor_ids',
    'major_ids', 'macs', 'sites'). Short traces are right-padded with 0 and long
    traces are truncated.
    """

    # (output key, DataFrame column) pairs, in the order they appear in an item.
    _SEQUENCE_FIELDS = (
        ('uuids', 'UUID'),
        ('distances', 'distance'),
        ('minor_ids', 'MinorID'),
        ('major_ids', 'MajorID'),
        ('macs', 'MAC_Address'),
        ('sites', 'site_id'),
    )

    def __init__(self, group, settings):
        super(beacon_dataset, self).__init__()
        self.beacon_seq_len = settings['beacon_seq_len']
        self.n_floor = settings['n_floor']
        # Shift that turns the lowest mapped floor into class 0. With the notebook's
        # mapping (lowest value -3) this is the 3 that used to be hard-coded, which
        # is also the fallback when no mapping is supplied.
        floor_levels = list(settings.get('floor_mapping', {}).values())
        self.floor_offset = -min(floor_levels) if floor_levels else 3
        self.group = group
        self.trace_ids = list(group.keys())

    def __len__(self):
        return(len(self.trace_ids))

    def _pad_or_truncate(self, values):
        """Return `values` as an int64 array of exactly `beacon_seq_len` entries."""
        # Casting here gives every sample the same dtype whether or not it needed
        # padding (column dtypes such as cat.codes can be narrower than int64).
        clipped = np.asarray(values).astype(np.int64)[:self.beacon_seq_len]
        fixed = np.zeros(self.beacon_seq_len, dtype=np.int64)
        fixed[:len(clipped)] = clipped
        return fixed

    def __getitem__(self, index):
        # Get the relevant trace
        sample = self.group[self.trace_ids[index]]

        # Read the label from the column itself rather than from a whole row,
        # which would mix the dtypes of every column.
        item = {'floor' : int(sample['floor'].iloc[0]) + self.floor_offset}
        for key, column in self._SEQUENCE_FIELDS:
            item[key] = self._pad_or_truncate(sample[column].values)

        # Return
        return(item)
    

# Training data: shuffled batches of 128, with the incomplete final batch dropped.
train_dataset = beacon_dataset(train['df'], settings)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    drop_last=True,
)

# Validation data: small batches in dataset order, nothing dropped.
valid_dataset = beacon_dataset(valid['df'], settings)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=10,
    num_workers=4,
)

# Display one item as a sanity check.
valid_dataset[1]

In [None]:
class floor_model(torch.nn.Module):
    """Floor classifier over a fixed-length sequence of beacon readings.

    The per-position embeddings of the beacon fields are summed, passed through one
    multi-head self-attention layer with a residual connection, reduced to a single
    score per position by a small feed-forward stack, and the vector of
    `beacon_seq_len` scores is projected to `n_floor` logits.

    Args:
        settings: dict providing 'beacon_embed_dim', 'beacon_seq_len', 'device',
            'n_minor_ids', 'n_major_ids', 'n_uuids', 'n_macs', 'n_sites',
            'max_beacon_distance' and 'n_floor'.
    """

    def __init__(self, settings):
        super(floor_model, self).__init__()
        self.embed_dim = settings['beacon_embed_dim']
        self.seq_len = settings['beacon_seq_len']
        self.device = settings['device']

        self.minor_id_embedding = torch.nn.Embedding(settings['n_minor_ids']
                                                    ,self.embed_dim)
        self.major_id_embedding = torch.nn.Embedding(settings['n_major_ids']
                                                    ,self.embed_dim)
        # Not used by forward() (the UUID term is disabled there); the layer is kept
        # so the module's parameters stay the same.
        self.uuid_embedding = torch.nn.Embedding(settings['n_uuids']
                                                 ,self.embed_dim)
        self.mac_embedding = torch.nn.Embedding(settings['n_macs']
                                                 ,self.embed_dim)
        # Site embedding doesn't make too much sense (same site always)
        self.site_embedding = torch.nn.Embedding(settings['n_sites']
                                                ,self.embed_dim)
        # +2 leaves room for indices 0..max_beacon_distance+1.
        self.distance_embedding = torch.nn.Embedding(settings['max_beacon_distance']+2
                                                     ,self.embed_dim)
        self.pos_embedding = torch.nn.Embedding(self.seq_len, self.embed_dim)
        self.multi_att = torch.nn.MultiheadAttention(embed_dim = self.embed_dim
                                                     ,num_heads = 4
                                                     ,dropout = 0.2)

        self.lin_1 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(self.embed_dim, 1)
        self.dropout = torch.nn.Dropout(0.2)

        # Input width is seq_len, so forward() needs sequences of exactly that length.
        self.pred = torch.nn.Linear(self.seq_len, settings['n_floor'])

        # Not read anywhere in this class; kept so the attribute stays available.
        self.tmp = True

    def forward(self, batch):
        """Compute floor logits for a batch.

        Args:
            batch: dict of integer tensors shaped [bs, s_len] under the keys
                'minor_ids', 'macs', 'sites', 'major_ids' and 'distances'.

        Returns:
            Tensor of shape [bs, n_floor] holding unnormalised class scores.
        """
        # Minor id embedding
        x = self.minor_id_embedding(batch['minor_ids'].long())

        # MAC Address embedding
        x = x + self.mac_embedding(batch['macs'].long())

        # Site embedding
        x = x + self.site_embedding(batch['sites'].long())

        # Major Id embedding
        x = x + self.major_id_embedding(batch['major_ids'].long())

        # UUID embedding
        #x = x + self.uuid_embedding(batch['uuids'].long())

        # Distance embedding (cast like every other index tensor)
        x = x + self.distance_embedding(batch['distances'].long())

        # Position embedding. The index tensor is created on the input's device so
        # the model keeps working after being moved away from settings['device'].
        pos_id = torch.arange(x.shape[1], device=x.device)[None, :]
        x = x + self.pos_embedding(pos_id)

        # Permute
        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]

        # MultiHead Attention (residual connection) and permute back
        attn_output, _ = self.multi_att(x, x, x)
        x = x + attn_output
        x = x.permute(1, 0, 2)

        # Feed forward: [bs, s_len, embed] => [bs, s_len, 1]
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        x = self.dropout(x)

        # Predict: drop the size-1 last axis, then map the s_len scores to n_floor logits
        x = x[:, :, -1]
        x = self.pred(x)

        # Return
        return(x)
        

# Build the network and its Adam optimizer.
model = floor_model(settings)
optimizer = torch.optim.Adam(model.parameters(), lr=.002)

# Multi-class objective over the floor logits.
criterion = torch.nn.CrossEntropyLoss()

# Metric histories; the training cell appends to all_loss.
all_auc = []

# Move model and criterion to the configured device.
model.to(settings['device'])
criterion.to(settings['device'])
all_loss = []


In [None]:
# Number of full passes over train_dataloader.
N_EPOCHS = 25

# Make sure dropout is active even if an earlier cell left the model in eval mode.
model.train()
for _ in range(N_EPOCHS):
    tbar = tqdm.tqdm(train_dataloader)
    for batch in tbar:
        # Move every tensor of the batch to the training device.
        batch = {key: value.to(settings['device']) for key, value in batch.items()}
        optimizer.zero_grad()
        pred = model(batch)
        loss = criterion(pred, batch['floor'].long())
        loss.backward()
        optimizer.step()

        # Record metrics
        all_loss.append(loss.item())

# Mean loss over the most recent 200 recorded batches.
print(np.array(all_loss[-200:]).mean())

# Importing `matplotlib` alone does not guarantee that the pyplot submodule is
# loaded, so the plot goes through the explicit `plt` import.
fig, ax = plt.subplots()
ax.plot(all_loss)
ax.set(title='Training loss', xlabel='batch', ylabel='cross-entropy loss')
plt.show()

In [None]:
def get_acc(dataset):
    """Print classification metrics of the global `model` on one data split.

    Args:
        dataset: 'valid' evaluates on `valid_dataloader`; any other value
            evaluates on `train_dataloader`.

    Prints the split name, the fraction of exactly matching predictions, and the
    mean of 15 * |predicted class - true class| (the factor 15 is presumably the
    competition's per-floor penalty -- TODO confirm). Returns None.
    """
    dl = valid_dataloader if dataset == 'valid' else train_dataloader

    # Collect per-batch results and join once at the end instead of re-allocating
    # a growing array on every batch.
    batch_preds = []
    batch_labels = []

    model.eval()
    try:
        # Gradients are not needed for scoring, so no autograd graph is built.
        with torch.no_grad():
            for batch in dl:
                batch = {k: v.to(settings['device']) for k, v in batch.items()}

                # Get predictions: index of the largest logit per sample
                logits = model(batch).to('cpu').numpy()
                batch_preds.append(np.argmax(logits, axis = 1))

                # Label
                batch_labels.append(batch['floor'].to('cpu').numpy())
    finally:
        # Put the model back in training mode even if evaluation fails part-way.
        model.train()

    empty = np.empty(0, dtype=np.int64)
    preds = np.concatenate(batch_preds) if batch_preds else empty
    labels = np.concatenate(batch_labels) if batch_labels else empty

    print(dataset)
    print(np.sum(preds == labels)/preds.shape[0])
    print((15 * abs(preds - labels)).mean())
    
    
# Report metrics on the held-out traces first, then on the training traces.
for split_name in ('valid', 'train'):
    get_acc(split_name)

In [None]:
# .729