In [None]:
#
# Continuous embedding:
#  https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/210171
#  https://github.com/dkletran/riiid-challenge-4th-place/blob/main/modeling_training/modeling.py
#  https://arxiv.org/pdf/2010.12042.pdf
#

import glob
import os
import pickle
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm

# Notebook-wide configuration shared by the data pipeline and the model.
settings = {
    # Number of beacon readings kept per trace
    'beacon_seq_len': 50,
    # Width of every embedding table in the model
    'beacon_embed_dim': 256,
    # Use the GPU when one is available
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Floor label -> integer level; both 'F<n>' and '<n>F' spellings are accepted
    'floor_mapping': {
        'B1': -1, 'B2': -2, 'B3': -3,
        'F1': 0, 'F2': 1, 'F3': 2, 'F4': 3, 'F5': 4,
        'F6': 5, 'F7': 6, 'F8': 7, 'F9': 8, 'F10': 9,
        '1F': 0, '2F': 1, '3F': 2, '4F': 3, '5F': 4,
        '6F': 5, '7F': 6, '8F': 7, '9F': 8,
    },
    'n_floor': 13,
    # Beacon distances above this value are clamped to it
    'max_beacon_distance': 200,
    # Glob patterns / paths of the competition input files
    'path_train': '../input/indoor-location-navigation/train/*/*/*',
    'path_test': '../input/indoor-location-navigation/test/*',
    'path_sample': '../input/indoor-location-navigation/sample_submission.csv',
}

In [None]:
# Read the pickled waypoint and beacon containers for the training data.
with open('../input/location-data/wp_train.pkl', 'rb') as handle:
    train = pickle.load(handle)

with open('../input/location-data/beacon_train.pkl', 'rb') as handle:
    tmp = pickle.load(handle)

# Both pickles keep their table under 'df'; file the two tables under `train`
# as 'wp' and 'beacon', with waypoint coordinates cast to float.
# (Handle this in other script eventually.)
train['beacon'] = tmp.pop('df')
train['wp'] = train.pop('df').astype({'x': 'float', 'y': 'float'})

In [None]:
# Load the raw tables from disk. Everything below rebinds `train`, so reading
# the files here keeps this cell re-runnable on its own.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../input/location-data/wp_train.pkl', 'rb') as handle:
    train = pickle.load(handle)

with open('../input/location-data/beacon_train.pkl', 'rb') as handle:
    beacon_raw = pickle.load(handle)

# Handle this in other script eventually
train['wp'] = train.pop('df').astype({'x': 'float', 'y': 'float'})
train['beacon'] = beacon_raw.pop('df')

#
# Delete trace_ids not in beacon
# ***** Delete this eventually
#
has_beacon = train['wp'].trace_id.isin(train['beacon'].trace_id)
train['wp'] = train['wp'].loc[has_beacon]

# Record settings for embedding. The +1 leaves index 0 unused by real values,
# which is the value the dataset pads short sequences with.
settings['n_uuids'] = train['beacon']['UUID'].nunique() + 1
settings['n_minor_ids'] = train['beacon']['MinorID'].nunique() + 1
settings['n_major_ids'] = train['beacon']['MajorID'].nunique() + 1
settings['n_macs'] = train['beacon']['MAC_Address'].nunique() + 1
settings['n_sites'] = train['beacon']['site_id'].nunique() + 1

#
# Make validation set
#

# Hold out 20% of the traces (whole traces, not individual rows)
np.random.seed(1)
trace_ids = train['wp'].trace_id.unique()
valid_ids = np.random.choice(trace_ids, int(.2 * trace_ids.shape[0]), replace=False)

# Compute each membership mask once and reuse it for both sides of the split
beacon_in_valid = train['beacon'].trace_id.isin(valid_ids)
wp_in_valid = train['wp'].trace_id.isin(valid_ids)

# Explicit copies: these frames are modified later, and a boolean-mask slice
# would otherwise leave pandas unsure whether it is a view of the full table.
valid = {
    'beacon': train['beacon'].loc[beacon_in_valid].copy(),
    'wp': train['wp'].loc[wp_in_valid].copy(),
}

# Delete validation set from train
train['beacon'] = train['beacon'].loc[~beacon_in_valid].copy()
train['wp'] = train['wp'].loc[~wp_in_valid].copy()

In [None]:
def clean_data(data, settings):
    """Filter, encode and index one data split for the dataset class.

    Steps:
      * drop waypoint and beacon rows whose floor is not a key of
        ``settings['floor_mapping']``;
      * keep only the first ``settings['beacon_seq_len']`` beacon rows of
        every trace;
      * clamp ``distance`` to ``[0, settings['max_beacon_distance']]``,
        truncate to int and add 1;
      * integer-encode ``site_id``, ``UUID``, ``MinorID``, ``MajorID`` and
        ``MAC_Address`` as ``1..n`` and map ``floor`` through the floor mapping.

    The category levels are fitted on the first call and cached in
    ``settings['category_levels']``; later calls reuse them so that every
    split shares one value -> code mapping (needed for the embedding indices
    to mean the same thing in train and validation). Values absent from the
    cached levels, and missing values, are encoded as 0. Delete
    ``settings['category_levels']`` to force a refit.

    Args:
        data: dict with DataFrames under 'wp' and 'beacon'.
        settings: configuration dict; gains the 'category_levels' key.

    Returns:
        The same ``data`` dict, with 'beacon' replaced by a
        ``{trace_id: DataFrame}`` dict and 'wp' by a ``{row_number: row_dict}``
        dict. The input DataFrames themselves are not modified.
    """
    floor_mapping = settings['floor_mapping']
    max_distance = settings['max_beacon_distance']

    # Delete rows with 'non-testing' floors
    wp = data['wp']
    wp = wp.loc[wp['floor'].isin(floor_mapping.keys())].reset_index(drop=True)
    beacon = data['beacon']
    beacon = beacon.loc[beacon['floor'].isin(floor_mapping.keys())]

    # Only keep initial rows of a trace
    beacon = (
        beacon.groupby('trace_id')
        .head(settings['beacon_seq_len'])
        .reset_index(drop=True)
    )

    # Convert distance column to int; the +1 shift keeps 0 out of the value range
    distance = (
        beacon['distance'].clip(lower=0, upper=max_distance).astype(np.int64) + 1
    )

    # Map columns: fit the levels once, then encode every split against them
    levels = settings.setdefault('category_levels', {})
    encoded = {}
    for column in ('site_id', 'UUID', 'MinorID', 'MajorID', 'MAC_Address'):
        if column not in levels:
            levels[column] = pd.Categorical(beacon[column]).categories
        codes = pd.Categorical(beacon[column], categories=levels[column]).codes
        # Unknown / missing values have code -1, which becomes 0 after the shift
        encoded[column] = codes.astype(np.int64) + 1

    beacon = beacon.assign(
        distance=distance,
        floor=beacon['floor'].map(floor_mapping),
        **encoded,
    )

    # Convert to dictionary
    data['beacon'] = {trace_id: rows for trace_id, rows in beacon.groupby('trace_id')}
    data['wp'] = wp.to_dict(orient='index')

    # Return
    return data

# Clean the training split first, then the validation split
train = clean_data(train, settings)
valid = clean_data(valid, settings)

# Sanity check: the first cleaned waypoint and the head of one validation trace
print(train['wp'][0])
valid['beacon'][next(iter(valid['beacon']))].head()

In [None]:
class location_dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one waypoint with its trace's beacon readings.

    ``data['wp']`` is indexed by integer position and each entry provides
    'trace_id', 'x' and 'y'; ``data['beacon']`` is indexed by trace id and each
    entry provides the beacon columns listed in ``_SEQUENCE_COLUMNS``.
    """

    # Key in the returned sample -> beacon column it is read from
    _SEQUENCE_COLUMNS = (
        ('uuids', 'UUID'),
        ('distances', 'distance'),
        ('minor_ids', 'MinorID'),
        ('major_ids', 'MajorID'),
        ('macs', 'MAC_Address'),
        ('sites', 'site_id'),
    )

    def __init__(self, data, settings):
        super().__init__()
        self.beacon_seq_len = settings['beacon_seq_len']
        self.n_floor = settings['n_floor']
        self.data = data

    def __len__(self):
        """Number of waypoints."""
        return len(self.data['wp'])

    def _fixed_length(self, values):
        """Return ``values`` as an int64 array of exactly ``beacon_seq_len`` items.

        Longer inputs are truncated, shorter ones are right-padded with 0.
        Casting unconditionally keeps the dtype identical for padded and
        unpadded samples, so a batch never mixes integer widths.
        """
        values = np.asarray(values, dtype=np.int64)[:self.beacon_seq_len]
        n_pad = self.beacon_seq_len - len(values)
        if n_pad > 0:
            values = np.concatenate((values, np.zeros(n_pad, dtype=np.int64)))
        return values

    def __getitem__(self, index):
        """Return the sample dict for waypoint ``index``.

        Keys: 'x', 'y' (waypoint coordinates) followed by one fixed-length
        int64 array per entry of ``_SEQUENCE_COLUMNS``.
        """
        # Get the relevant user data
        wp = self.data['wp'][index]
        beacon = self.data['beacon'][wp['trace_id']]

        # TODO (from the original author's notes): use the waypoint timestamp
        # to locate the waypoint within the beacon sequence.

        sample = {'x': wp['x'], 'y': wp['y']}
        for key, column in self._SEQUENCE_COLUMNS:
            sample[key] = self._fixed_length(beacon[column].values)
        return sample
    

# Training loader: shuffled batches of 256, dropping the incomplete last batch
train_dataset = location_dataset(data=train, settings=settings)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Validation loader: sequential batches of 1000
valid_dataset = location_dataset(data=valid, settings=settings)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=1000,
    num_workers=4,
)

# Peek at one validation sample
valid_dataset[1]

In [None]:
class xy_model(torch.nn.Module):
    """Predict a waypoint's (x, y) from a fixed-length sequence of beacon readings.

    Each reading is the sum of its minor-id, MAC, site, major-id, distance and
    position embeddings. The sequence goes through one residual multi-head
    self-attention layer and a per-position feed-forward net that reduces each
    position to a scalar; a final linear layer maps those ``beacon_seq_len``
    scalars to two outputs.
    """

    def __init__(self, settings):
        super().__init__()
        self.embed_dim = settings['beacon_embed_dim']
        self.seq_len = settings['beacon_seq_len']
        self.device = settings['device']

        self.minor_id_embedding = torch.nn.Embedding(settings['n_minor_ids'],
                                                     self.embed_dim)
        self.major_id_embedding = torch.nn.Embedding(settings['n_major_ids'],
                                                     self.embed_dim)
        # Registered but currently not used by forward()
        self.uuid_embedding = torch.nn.Embedding(settings['n_uuids'],
                                                 self.embed_dim)
        self.mac_embedding = torch.nn.Embedding(settings['n_macs'],
                                                self.embed_dim)
        # Site embedding doesn't make too much sense (same site always)
        self.site_embedding = torch.nn.Embedding(settings['n_sites'],
                                                 self.embed_dim)
        # +2: one slot for index 0 and one because distances are shifted by 1
        self.distance_embedding = torch.nn.Embedding(settings['max_beacon_distance'] + 2,
                                                     self.embed_dim)
        self.pos_embedding = torch.nn.Embedding(self.seq_len, self.embed_dim)
        self.multi_att = torch.nn.MultiheadAttention(embed_dim=self.embed_dim,
                                                     num_heads=4,
                                                     dropout=0.2)

        self.lin_1 = torch.nn.Linear(self.embed_dim, self.embed_dim)
        self.relu = torch.nn.ReLU()
        self.lin_2 = torch.nn.Linear(self.embed_dim, 1)
        self.dropout = torch.nn.Dropout(0.2)

        self.pred = torch.nn.Linear(self.seq_len, 2)

    def forward(self, batch):
        """Run the model on one batch.

        Args:
            batch: dict of integer index tensors of shape [bs, seq_len] under
                'minor_ids', 'macs', 'sites', 'major_ids' and 'distances'.
                Any integer dtype is accepted; all are cast to int64.

        Returns:
            Float tensor of shape [bs, 2].
        """
        # Sum of the per-reading embeddings (the UUID embedding is left out)
        x = self.minor_id_embedding(batch['minor_ids'].long())
        x = x + self.mac_embedding(batch['macs'].long())
        x = x + self.site_embedding(batch['sites'].long())
        x = x + self.major_id_embedding(batch['major_ids'].long())
        x = x + self.distance_embedding(batch['distances'].long())

        # Position embedding; built on the input's device so it always matches
        pos_id = torch.arange(x.shape[1], device=x.device)[None, :]
        x = x + self.pos_embedding(pos_id)

        # MultiheadAttention expects [s_len, bs, embed]
        x = x.permute(1, 0, 2)  # x: [bs, s_len, embed] => [s_len, bs, embed]
        attn_output, _ = self.multi_att(x, x, x)
        x = x + attn_output
        x = x.permute(1, 0, 2)

        # Feed forward: one scalar per sequence position
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        x = self.dropout(x)

        # Predict: [bs, s_len, 1] => [bs, s_len] => [bs, 2]
        x = x.squeeze(-1)
        x = self.pred(x)

        # Return
        return x
        

# Build the model and the loss on the target device, then the optimizer
model = xy_model(settings).to(settings['device'])
criterion = torch.nn.MSELoss().to(settings['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=.002)

# Per-batch training losses, appended to by the training loop
all_loss = []


In [None]:
# Number of passes over the training data
n_epochs = 1

# Make sure dropout is active, whatever mode the model was left in
model.train()
for _ in range(n_epochs):
    tbar = tqdm.tqdm(train_dataloader)
    for batch in tbar:
        batch = {key: value.to(settings['device']) for key, value in batch.items()}
        optimizer.zero_grad()
        pred = model(batch)
        # Targets as [bs, 2] float32, matching the model output
        targ = torch.stack((batch['x'], batch['y']), dim=1).float()
        loss = criterion(pred, targ)
        loss.backward()
        optimizer.step()

        # Record metrics
        all_loss.append(loss.item())

# Mean loss over the last 200 batches, then the full loss curve
print(np.array(all_loss[-200:]).mean())
fig, ax = plt.subplots()
ax.plot(all_loss)
ax.set(title='Training loss', xlabel='batch', ylabel='MSE loss')
plt.show()

In [None]:
def score(dataset, settings):
    """Print and return the mean Euclidean (x, y) error of ``model`` on a split.

    Uses the notebook-level ``model``, ``valid_dataloader`` and
    ``train_dataset``.

    Args:
        dataset: 'valid' to score the validation loader; any other value
            scores the training dataset (unshuffled, batches of 1000).
        settings: configuration dict; only 'device' is read.

    Returns:
        The mean distance between predicted and true positions as a float,
        or NaN if the loader yields no batches.
    """
    if dataset == 'valid':
        dl = valid_dataloader
    else:
        dl = torch.utils.data.DataLoader(train_dataset,
                                         batch_size=1000,
                                         num_workers=4)

    preds = []
    targets = []

    model.eval()
    try:
        # No gradients are needed for scoring
        with torch.no_grad():
            for batch in dl:
                batch = {key: value.to(settings['device'])
                         for key, value in batch.items()}
                preds.append(model(batch).cpu().numpy())
                targets.append(
                    torch.stack((batch['x'], batch['y']), dim=1).cpu().numpy()
                )
    finally:
        # Back to training mode even if scoring failed part-way
        model.train()

    if not preds:
        mean_error = float('nan')
    else:
        pred = np.concatenate(preds).astype(float)
        targ = np.concatenate(targets).astype(float)
        mean_error = float(np.mean(np.hypot(pred[:, 0] - targ[:, 0],
                                            pred[:, 1] - targ[:, 1])))
    print(mean_error)
    return mean_error
    
    
# Mean positioning error on the validation split, then on the training split
for split_name in ('valid', 'train'):
    score(split_name, settings)