This notebook improves on the [previous score using leak](https://www.kaggle.com/tomooinubushi/postprocessing-based-on-leakage) (LB: 4.718 -> 4.683).

This notebook aims to improve the score by taking into account the device ID and the time differences between paths.
The basic idea is described in [Chris's great discussion](https://www.kaggle.com/c/indoor-location-navigation/discussion/234543).

> 3. Device leakage
>
> This is the data leak that I don't think has been talked about on this forum before.
>
> If you look at the uncalibrated sensor information, you might see a row like:
>
> 0000000000276 TYPE_MAGNETIC_FIELD_UNCALIBRATED -106.762695 35.142517 -355.44434 -83.41217 16.18042 -325.82092 3
>
> Those last 3 float values (before the accuracy) are device specific. Since the same devices are used for the training and test paths, it SEEMS like this is a huge leak - because combining with leak #1, I can almost exactly specify what device was recording which path at what time. (and so exactly specify the test floors)

Thanks to his discussion, I found a good postprocessing step based on the leakage that takes the device ID into account.

Out of respect for Chris's attitude of disclosing leaks openly, I am also publishing this notebook.

In this notebook, each device ID is defined as the sum of the magnetometer biases (x, y, and z), which are recorded in the sensor data (TYPE_MAGNETIC_FIELD_UNCALIBRATED).

In some paths, however, the magnetometer bias is not recorded.

In such cases, we calculate the difference between the uncalibrated and calibrated magnetometer values and define it as the device ID.

Although the bias is not exactly the same along a path, the difference is not large.

In [None]:
import os
import glob
import numpy as np
import pandas as pd 
import math
import sys

from dataclasses import dataclass

import warnings # Supress warnings 
warnings.filterwarnings('ignore')
from matplotlib import pyplot as plt

from tqdm.notebook import tqdm

import json
from datetime import datetime
import glob

In [None]:
@dataclass
class ReadData:
    """Container for the parsed contents of one path file.

    Instances are built by ``read_data_file``, which passes the fields
    positionally in the order declared here, so the field order is part of
    the interface.
    """
    # Sensor streams: one row per record, first column is the record's leading
    # integer field followed by three float readings.
    acce: np.ndarray
    acce_uncali: np.ndarray
    gyro: np.ndarray
    gyro_uncali: np.ndarray
    magn: np.ndarray
    # Same layout as above plus three extra columns (the recorded bias values,
    # or zeros when the record does not carry them).
    magn_uncali: np.ndarray
    ahrs: np.ndarray
    # String rows: [timestamp, ssid, bssid, rssi, last seen timestamp].
    wifi: np.ndarray
    # String rows: [timestamp, "uuid_major_minor", rssi, distance, last seen timestamp].
    ibeacon: np.ndarray
    # Rows of [timestamp, x, y].
    waypoint: np.ndarray
    # Values parsed from the file's start/end header lines; -1 when absent.
    starttime: int
    endtime: int

def read_data_file(data_filename):
    """Parse one tab-separated path file into a ``ReadData`` record.

    Header lines carrying the start/end time are read into ``starttime`` and
    ``endtime`` (both stay -1 when the header is missing); every other line
    starting with ``#`` is ignored. Data lines are routed by their second
    field (the record type) into the matching list, and each list is turned
    into a numpy array before being returned.

    Args:
        data_filename: path of the UTF-8 text file to read.

    Returns:
        ReadData with one array per record type plus the two header times.
    """
    # Record types that all share the layout "<int> <type> <float> <float> <float>".
    three_axis_rows = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    magn_uncali_rows = []
    wifi_rows = []
    ibeacon_rows = []
    waypoint_rows = []
    starttime = endtime = -1

    def header_time(text):
        # The value follows a ':' when there is one, otherwise it is the
        # third tab-separated field.
        pieces = text.split(':')
        if len(pieces) > 1:
            return int(pieces[1])
        return int(text.split('\t')[2])

    with open(data_filename, 'r', encoding='utf-8') as handle:
        raw_lines = list(handle)

    for raw in raw_lines:
        text = raw.strip()
        if not text:
            continue

        # Only sufficiently long lines are inspected for the time headers.
        if len(text) > 20:
            if text[2:7] == 'start':
                starttime = header_time(text)
                continue
            if text[2:5] == 'end':
                endtime = header_time(text)
                continue

        if text[0] == '#':
            continue

        fields = text.split('\t')
        kind = fields[1]

        if kind == 'TYPE_WAYPOINT':
            waypoint_rows.append([int(fields[0]), float(fields[2]), float(fields[3])])
        elif kind in three_axis_rows:
            three_axis_rows[kind].append(
                [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]
            )
        elif kind == 'TYPE_MAGNETIC_FIELD_UNCALIBRATED':
            row = [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]
            if len(fields) > 7:
                # Three additional values are present on this record.
                row += [float(fields[5]), float(fields[6]), float(fields[7])]
            else:
                # Pad with zeros so every row has the same width.
                row += [0, 0, 0]
            magn_uncali_rows.append(row)
        elif kind == 'TYPE_WIFI':
            # Kept as strings: timestamp, ssid, bssid, rssi, last seen timestamp.
            wifi_rows.append([fields[0], fields[2], fields[3], fields[4], fields[6]])
        elif kind == 'TYPE_BEACON':
            beacon_key = '_'.join([fields[2], fields[3], fields[4]])
            ibeacon_rows.append([fields[0], beacon_key, fields[6], fields[7], fields[9]])

    return ReadData(
        np.array(three_axis_rows['TYPE_ACCELEROMETER']),
        np.array(three_axis_rows['TYPE_ACCELEROMETER_UNCALIBRATED']),
        np.array(three_axis_rows['TYPE_GYROSCOPE']),
        np.array(three_axis_rows['TYPE_GYROSCOPE_UNCALIBRATED']),
        np.array(three_axis_rows['TYPE_MAGNETIC_FIELD']),
        np.array(magn_uncali_rows),
        np.array(three_axis_rows['TYPE_ROTATION_VECTOR']),
        np.array(wifi_rows),
        np.array(ibeacon_rows),
        np.array(waypoint_rows),
        starttime,
        endtime,
    )

In [None]:
# great public notebook
df_sample = pd.read_csv('../input/3-3-g6-indoor-navigation-snap-to-grid/submission_snap_to_grid.csv')
# Each key is split on '_' once; the first three pieces become the
# site id, the path id and the (integer) timestamp.
key_parts = df_sample["site_path_timestamp"].apply(lambda key: key.split('_'))
df_sample["site_id"] = key_parts.apply(lambda parts: parts[0])
df_sample["path_id"] = key_parts.apply(lambda parts: parts[1])
df_sample["timestamp"] = key_parts.apply(lambda parts: parts[2]).astype(int)
list_site = df_sample["site_id"].unique()

In [None]:
# make train data for each site
LEAK_COLUMNS = ["path_id", "start_time", "end_time",
                "start_waypoint_x", "start_waypoint_y",
                "end_waypoint_x", "end_waypoint_y", "device"]


def _path_id_from_filename(filename):
    """Return the file name without its directory and extension."""
    return os.path.splitext(os.path.basename(filename))[0]


def _device_id(df_all):
    """Return the device id of a parsed path, rounded to two decimals.

    The id is the sum of the three bias columns of the first uncalibrated
    magnetometer row. When that sum is zero (bias not recorded), the sum of
    the differences between the first uncalibrated and the first calibrated
    magnetometer readings is used instead.
    """
    d = sum(df_all.magn_uncali[0, 4:7])
    if d == 0:
        d = sum(df_all.magn_uncali[0, 1:4] - df_all.magn[0, 1:4])
    return round(d, 2)


def _time_offset(df_all):
    """Return the offset between last-seen timestamps and row timestamps.

    Uses the first beacon row when beacons exist, otherwise the largest
    difference over all WiFi rows. Returns NaN when the path has neither, so
    that no train path can match it later.
    """
    if len(df_all.ibeacon) > 0:
        first_beacon = df_all.ibeacon[0]
        return float(first_beacon[4]) - float(first_beacon[0])
    if len(df_all.wifi) > 0:
        wifi_ts = df_all.wifi[:, 0].astype(float)
        wifi_last_seen = df_all.wifi[:, 4].astype(float)
        return (wifi_last_seen - wifi_ts).max()
    return np.nan


site_leak_frames = []
for site_id in tqdm(list_site):
    print(site_id)
    ## train
    list_train_files = glob.glob(f"../input/indoor-location-navigation/train/{site_id}/**/*.txt", recursive = True)
    train_records = []
    for filename in tqdm(list_train_files):
        path_id = _path_id_from_filename(filename)
        try:
            df_all = read_data_file(filename)
        except Exception as err:
            # Best effort: an unparsable train file is skipped, but visibly.
            print(f"skip {filename}: {err!r}")
            continue
        if len(df_all.waypoint) == 0 or len(df_all.magn_uncali) == 0:
            # Without waypoints or magnetometer rows the path cannot be used.
            print(f"skip {filename}: no waypoint or magnetometer data")
            continue

        # start and end waypoints: rows with the smallest / largest timestamp
        # (first occurrence on ties)
        waypoint_ts = df_all.waypoint[:, 0]
        first_wp = df_all.waypoint[waypoint_ts.argmin()]
        last_wp = df_all.waypoint[waypoint_ts.argmax()]

        train_records.append({
            "path_id": path_id,
            "start_time": df_all.starttime, "end_time": df_all.endtime,
            "start_waypoint_x": first_wp[1], "start_waypoint_y": first_wp[2],
            "end_waypoint_x": last_wp[1], "end_waypoint_y": last_wp[2],
            # device id based on magn bias
            "device": _device_id(df_all),
        })
    df_mart_train = pd.DataFrame(train_records, columns=LEAK_COLUMNS)

    ## test
    df_sample_site = df_sample.query("site_id == @site_id")
    list_path = df_sample_site["path_id"].unique()
    test_records = []
    for path_id in tqdm(list_path):
        filename = f"../input/indoor-location-navigation/test/{path_id}.txt"
        df_all = read_data_file(filename)

        # retrieve raw timestamp
        time_diff = _time_offset(df_all)

        # device id based on magn bias
        d = _device_id(df_all)

        sts = df_all.starttime + time_diff
        ets = df_all.endtime + time_diff
        swpx = np.nan; swpy = np.nan; ewpx = np.nan; ewpy = np.nan
        # x and y: a train path of the same device that starts right after this
        # path ends gives the end point; one that ends right before this path
        # starts gives the start point.
        df_start = df_mart_train.query("device == @d and start_time > @ets - 2000 and start_time < @ets + 10000").sort_values("start_time").reset_index()
        df_end = df_mart_train.query("device == @d and end_time < @sts + 2000 and end_time > @sts - 10000").sort_values("end_time",ascending=False).reset_index()
        if len(df_start) > 0:
            ewpx = df_start.iloc[0]["start_waypoint_x"]
            ewpy = df_start.iloc[0]["start_waypoint_y"]
        if len(df_end) > 0:
            swpx = df_end.iloc[0]["end_waypoint_x"]
            swpy = df_end.iloc[0]["end_waypoint_y"]

        # Store the per-path shifted times (previously the last file's raw
        # times were broadcast to every row).
        test_records.append({
            "path_id": path_id,
            "start_time": sts, "end_time": ets,
            "start_waypoint_x": swpx, "start_waypoint_y": swpy,
            "end_waypoint_x": ewpx, "end_waypoint_y": ewpy,
            "device": d,
        })

    df_mart_test = pd.DataFrame(test_records, columns=LEAK_COLUMNS)
    site_leak_frames.append(df_mart_test)

    # calculate time difference and waypoint difference to the previous path
    # of the same device
    df_tr = df_mart_train.sort_values(["device","start_time"]).reset_index(drop=True)
    prev_path = df_tr.groupby("device")[["end_time", "end_waypoint_x", "end_waypoint_y"]].shift(1)
    df_tr["time_diff"] = df_tr["start_time"] - prev_path["end_time"]
    df_tr["x_diff"] = df_tr["start_waypoint_x"] - prev_path["end_waypoint_x"]
    df_tr["y_diff"] = df_tr["start_waypoint_y"] - prev_path["end_waypoint_y"]

    # visualize relationship between time and waypoint differences
    df_near = df_tr.query("time_diff > -2000 and time_diff < 1000*60")
    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(12, 6))
    _ = df_near.plot.scatter(x="time_diff",y="x_diff",ax=axes[0])
    _ = df_near.plot.scatter(x="time_diff",y="y_diff",ax=axes[1])

    display(fig)
    # Release the figure so one open figure per site does not pile up.
    plt.close(fig)

# Concatenate once at the end; DataFrame.append no longer exists in pandas 2.x.
df_leak = pd.concat(site_leak_frames) if site_leak_frames else pd.DataFrame(columns=LEAK_COLUMNS)

As shown above, the greater the time difference, the greater the difference in waypoints.

In [None]:
# apply leakage waypoint
# For every path, overwrite the first and last submission rows with the leaked
# start/end waypoints when they are available (non-NaN).
df_sub = df_sample.copy()
list_path = df_sub["path_id"].unique()
# Index the leak table by path once instead of re-querying it for every path;
# the first row wins if a path appears more than once.
leak_by_path = df_leak.drop_duplicates("path_id").set_index("path_id")
# Row labels of each path in df_sub, collected in a single pass.
rows_by_path = df_sub.groupby("path_id", sort=False).groups
for path_id in tqdm(list_path):
    if path_id not in leak_by_path.index:
        # No leak information for this path: keep the predictions as they are.
        continue
    path_rows = rows_by_path[path_id]
    start_idx = path_rows.min()
    end_idx = path_rows.max()
    start_x_leak = leak_by_path.at[path_id, "start_waypoint_x"]
    end_x_leak = leak_by_path.at[path_id, "end_waypoint_x"]
    if pd.notna(start_x_leak):
        df_sub.at[start_idx,"x"] = start_x_leak
        df_sub.at[start_idx,"y"] = leak_by_path.at[path_id, "start_waypoint_y"]
    if pd.notna(end_x_leak):
        df_sub.at[end_idx,"x"] = end_x_leak
        df_sub.at[end_idx,"y"] = leak_by_path.at[path_id, "end_waypoint_y"]

In [None]:
# Write the submission without the helper columns; df_sub itself keeps them
# and is shown as the cell output.
submission = df_sub.drop(columns=["site_id", "path_id", "timestamp"])
submission.to_csv("submission.csv", index=False)
df_sub