# Wifi feature as another source of leakage
Previously, I have shown that [the paths in train and test set are divided from a single measurement](https://www.kaggle.com/tomooinubushi/postprocessing-based-on-leakage).
When I examined the data in detail, I found much clearer evidence of leakage in the wifi data.

I will show two things in this notebook.

* The same wifi records are shared between the train and test datasets.
* From this, I could partially recover user ID, which could be a predictor of the waypoints.


I use some code and ideas from the following notebooks. Thank you very much.

* https://www.kaggle.com/kenmatsu4/feature-store-for-indoor-location-navigation
* https://www.kaggle.com/jiweiliu/fix-the-timestamps-of-test-data-using-dask

In [None]:
import json
import re
import gc
import pickle
import itertools
import pandas as pd
import numpy as np
from glob import glob
from datetime import datetime as dt
from pathlib import Path
from tqdm import tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # ut(10 digit) -> date

# pandas settings -----------------------------------------
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = '{:,.5f}'.format

# Graph drawing -------------------------------------------
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
from matplotlib_venn import venn2, venn2_circles
from matplotlib import animation as ani
from IPython.display import Image
from pylab import imread

plt.rcParams["patch.force_edgecolor"] = True
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

%matplotlib inline
%config InlineBackend.figure_format='retina'

# ML -------------------------------------------
from sklearn.preprocessing import LabelEncoder


import dill
from collections import defaultdict, OrderedDict
from scipy.spatial import distance

In [None]:
def unpickle(filename):
    """Load and return the object pickled in the file at ``filename``.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

def to_pickle(filename, obj):
    """Pickle ``obj`` into the file at ``filename`` with the highest protocol."""
    with open(filename, mode='wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)



class FeatureStore():
    """Loader for one training path file and the metadata of its floor.

    After ``load_df`` each sensor type listed in ``df_types`` is available as
    an attribute holding a DataFrame (for example ``store.wifi``), and the
    ``#`` header fields are available as ``store.meta_info_df``.
    """

    # Floor label -> integer floor number.
    # NOTE: the original author marked this table as needing a re-check.
    floor_convert = {
        '1F': 0, '2F': 1, '3F': 2, '4F': 3, '5F': 4,
        '6F': 5, '7F': 6, '8F': 7, '9F': 8,
        'B': -1, 'B1': -1, 'B2': -2, 'B3': -3,
        'BF': -1, 'BM': -1,
        'F1': 0, 'F2': 1, 'F3': 2, 'F4': 3, 'F5': 4,
        'F6': 5, 'F7': 6, 'F8': 7, 'F9': 8, 'F10': 9,
        'L1': 0, 'L2': 1, 'L3': 2, 'L4': 3, 'L5': 4,
        'L6': 5, 'L7': 6, 'L8': 7, 'L9': 8, 'L10': 9,
        'L11': 10,
        'G': 0, 'LG1': 0, 'LG2': 1, 'LM': 0, 'M': 0,
        'P1': 0, 'P2': 1,
    }

    # Record types extracted from a path file; each one is matched against
    # the "TYPE_<NAME>" tag in the second field of a record.
    df_types = [
        'accelerometer',
        'accelerometer_uncalibrated',
        'beacon',
        'gyroscope',
        'gyroscope_uncalibrated',
        'magnetic_field',
        'magnetic_field_uncalibrated',
        'rotation_vector',
        'waypoint',
        'wifi',
    ]

    # Column names per record type.
    # https://github.com/location-competition/indoor-location-competition-20
    df_type_cols = {
        'accelerometer': ["timestamp", "x", "y", "z", "accuracy"],
        'accelerometer_uncalibrated': ["timestamp", "x", "y", "z",
                                       "x2", "y2", "z2", "accuracy"],
        'beacon': ["timestamp", "uuid", "major_id", "minor_id", "tx_power",
                   "rssi", "distance", "mac_addr", "timestamp2"],
        'gyroscope': ["timestamp", "x", "y", "z", "accuracy"],
        'gyroscope_uncalibrated': ["timestamp", "x", "y", "z",
                                   "x2", "y2", "z2", "accuracy"],
        'magnetic_field': ["timestamp", "x", "y", "z", "accuracy"],
        'magnetic_field_uncalibrated': ["timestamp", "x", "y", "z",
                                        "x2", "y2", "z2", "accuracy"],
        'rotation_vector': ["timestamp", "x", "y", "z", "accuracy"],
        'waypoint': ["timestamp", "x", "y"],
        'wifi': ["timestamp", "ssid", "bssid", "rssi", "frequency",
                 "last_seen_timestamp"],
    }

    # Target Python type per column, applied with ``astype`` in ``load_df``.
    dtype_dict = {
        "accelerometer": {"timestamp": int, "x": float, "y": float,
                          "z": float, "accuracy": int},
        "accelerometer_uncalibrated": {"timestamp": int, "x": float,
                                       "y": float, "z": float, "x2": float,
                                       "y2": float, "z2": float,
                                       "accuracy": int},
        "beacon": {"timestamp": int, "uuid": str, "major_id": str,
                   "minor_id": str, "tx_power": int, "rssi": int,
                   "distance": float, "mac_addr": str, "timestamp2": int},
        "gyroscope": {"timestamp": int, "x": float, "y": float,
                      "z": float, "accuracy": int},
        "gyroscope_uncalibrated": {"timestamp": int, "x": float,
                                   "y": float, "z": float, "x2": float,
                                   "y2": float, "z2": float,
                                   "accuracy": int},
        "magnetic_field": {"timestamp": int, "x": float, "y": float,
                           "z": float, "accuracy": int},
        "magnetic_field_uncalibrated": {"timestamp": int, "x": float,
                                        "y": float, "z": float, "x2": float,
                                        "y2": float, "z2": float,
                                        "accuracy": int},
        "rotation_vector": {"timestamp": int, "x": float, "y": float,
                            "z": float, "accuracy": int},
        "waypoint": {"timestamp": int, "x": float, "y": float, "z": float},
        "wifi": {"timestamp": int, "ssid": str, "bssid": str,
                 "rssi": int, "frequency": int, "last_seen_timestamp": int},
    }

    def __init__(self, site_id, floor, path_id,
                 input_path="../input/indoor-location-navigation/",
                 save_path="../mid"):
        """Store the identifiers, check the input path and create ``save_path``.

        ``site_id``, ``floor`` and ``path_id`` are stripped of surrounding
        whitespace. ``floor`` must be a key of ``floor_convert``.
        """
        self.site_id = site_id.strip()
        self.floor = floor.strip()
        self.n_floor = self.floor_convert[self.floor]
        self.path_id = path_id.strip()

        self.input_path = input_path
        assert Path(input_path).exists(), f"input_path do not exist: {input_path}"

        self.save_path = save_path
        Path(save_path).mkdir(parents=True, exist_ok=True)

        self.site_info = SiteInfo(site_id=self.site_id,
                                  floor=self.floor,
                                  input_path=self.input_path)

    def _flatten(self, l):
        """Concatenate the items of every iterable in ``l`` into one list."""
        flat = []
        for chunk in l:
            flat.extend(chunk)
        return flat

    def multi_line_spliter(self, s):
        """Split a line that contains several records into one string each.

        Every "TYPE_" occurrence after the first starts a new piece 14
        characters before the tag.
        """
        cut_points = [0]
        for order, found in enumerate(re.finditer("TYPE_", s)):
            if order > 0:
                cut_points.append(found.start() - 14)
        cut_points.append(len(s))
        return [s[begin:end] for begin, end in zip(cut_points[:-1], cut_points[1:])]

    def load_df(self, ):
        """Parse the path file into ``meta_info_df`` and one DataFrame per type."""
        path = str(Path(self.input_path)/f"train/{self.site_id}/{self.floor}/{self.path_id}.txt")
        with open(path) as f:
            raw_lines = f.readlines()

        # Lines that carry more than one record are split into single records.
        records = []
        for raw in raw_lines:
            if raw.count("TYPE_") > 1:
                records.extend(self.multi_line_spliter(raw))
            else:
                records.append(raw)
        del raw_lines

        # Header lines start with "#"; their tab-separated fields are
        # "key:value" pairs.
        header_lines = [rec for rec in records if rec[0] == "#"]
        self.meta_info_len = len(header_lines)
        header_fields = self._flatten([head.split("\t") for head in header_lines])
        self.meta_info_df = pd.DataFrame(
            [field.replace("\n", "").split(":")
             for field in header_fields if field != "#"])

        data_df = pd.DataFrame(
            [rec.replace("\n", "").split("\t") for rec in records if rec[0] != "#"])
        for df_type in self.df_types:
            expected_cols = self.df_type_cols[df_type]
            # Column 1 of the raw frame holds the "TYPE_..." tag.
            subset = data_df[data_df[1] == f"TYPE_{df_type.upper()}"]
            if len(subset) == 0:
                setattr(self, df_type, pd.DataFrame(columns=expected_cols))
                continue

            # Drop the tag column and the columns that are empty for this type.
            all_missing = subset.isna().sum(axis=0) == len(subset)
            kept = [col for col in all_missing[all_missing == False].index if col != 1]
            subset = subset[kept].reset_index(drop=True)

            # Name as many columns as the file provides.
            subset.columns = expected_cols[:len(subset.columns)]

            for col in subset.columns:
                subset[col] = subset[col].astype(self.dtype_dict[df_type][col])

            setattr(self, df_type, subset)

    def get_site_info(self, keep_raw=False):
        """Load the floor metadata through the attached ``SiteInfo``."""
        self.site_info.get_site_info(keep_raw=keep_raw)

    def load_all_data(self, keep_raw=False):
        """Load the path file and then the floor metadata."""
        self.load_df()
        self.get_site_info(keep_raw=keep_raw)

    def __getitem__(self, item):
        """Return the DataFrame for record type ``item``, or None if unknown."""
        return getattr(self, item) if item in self.df_types else None

    def save(self, ):
        """Placeholder: saving is not implemented yet."""
        # to be implemented
        pass
    
    
class SiteInfo():
    """Map size and GeoJSON polygons of one floor of one site."""

    def __init__(self, site_id, floor, input_path="../input/indoor-location-navigation/"):
        """Remember the identifiers and check that ``input_path`` exists."""
        self.site_id = site_id
        self.floor = floor
        self.input_path = input_path
        assert Path(input_path).exists(), f"input_path do not exist: {input_path}"

    def get_site_info(self, keep_raw=False):
        """Read ``floor_info.json`` and ``geojson_map.json`` for this floor.

        Sets ``site_height``, ``site_width``, ``map_type``, ``features``,
        ``floor_coordinates`` (geometry of the first feature) and
        ``store_coordinates`` (geometries of the remaining features). The
        parsed JSON documents are kept as ``floor_info`` / ``geojson_map``
        only when ``keep_raw`` is true.
        """
        floor_info_path = f"{self.input_path}/metadata/{self.site_id}/{self.floor}/floor_info.json"
        with open(floor_info_path, "r") as f:
            self.floor_info = json.load(f)
        map_info = self.floor_info["map_info"]
        self.site_height = map_info["height"]
        self.site_width = map_info["width"]
        if not keep_raw:
            del self.floor_info

        geojson_map_path = f"{self.input_path}/metadata/{self.site_id}/{self.floor}/geojson_map.json"
        with open(geojson_map_path, "r") as f:
            self.geojson_map = json.load(f)
        self.map_type = self.geojson_map["type"]
        self.features = self.geojson_map["features"]

        self.floor_coordinates = self.features[0]["geometry"]["coordinates"]
        self.store_coordinates = [feat["geometry"]["coordinates"]
                                  for feat in self.features[1:]]

        if not keep_raw:
            del self.geojson_map

    def show_site_image(self):
        """Show the floor image scaled to the site width and height."""
        path = f"{self.input_path}/metadata/{self.site_id}/{self.floor}/floor_image.png"
        image = imread(path)
        plt.imshow(image, extent=[0, self.site_width, 0, self.site_height])

    def draw_polygon(self, size=8, only_floor=False):
        """Draw the floor outline (and the stores unless ``only_floor``).

        The figure is ``size`` wide and its height follows the aspect ratio
        of the floor outline. Returns the matplotlib axes.
        """
        fig = plt.figure()
        ax = plt.subplot(111)

        xmax, xmin, ymax, ymin = self._draw(self.floor_coordinates, ax, calc_minmax=True)
        if not only_floor:
            self._draw(self.store_coordinates, ax, fill=True)
        plt.legend([])

        ratio = (ymax - ymin) / (xmax - xmin)

        self.x_size = size
        self.y_size = size*ratio

        fig.set_figwidth(size)
        fig.set_figheight(size*ratio)
        # plt.show()
        return ax

    def _draw(self, coordinates, ax, fill=False, calc_minmax=False):
        """Plot every polygon in ``coordinates`` on ``ax``.

        Entries must be 2-D (one ring) or 3-D (a list of rings). Rings are
        filled when ``fill`` is true and drawn as lines otherwise. Returns
        ``(xmax, xmin, ymax, ymin)`` when ``calc_minmax`` is true, else None.
        """
        # Running bounds in the order xmax, xmin, ymax, ymin.
        bounds = [-np.inf, np.inf, -np.inf, np.inf]

        def render(points, alpha):
            ring = pd.DataFrame(points)
            if fill:
                ax.fill(ring[0], ring[1], alpha=alpha)
            else:
                ring.plot.line(x=0, y=1, style="-", ax=ax)

            if calc_minmax:
                bounds[0] = max(bounds[0], ring[0].max())
                bounds[1] = min(bounds[1], ring[0].min())

                bounds[2] = max(bounds[2], ring[1].max())
                bounds[3] = min(bounds[3], ring[1].min())

        for entry in coordinates:
            ndim = np.ndim(entry)
            if ndim == 2:
                render(entry, 0.7)
            elif ndim == 3:
                for ring_points in entry:
                    render(ring_points, 0.6)
            else:
                assert False, f"ndim of coordinates should be 2 or 3: {ndim}"

        return tuple(bounds) if calc_minmax else None

In [None]:
def pickle_dump_dill(obj, path):
    """Serialize ``obj`` with dill into the file at ``path``."""
    with open(path, mode='wb') as handle:
        dill.dump(obj, handle)


def pickle_load_dill(path):
    """Load and return the object that dill serialized to ``path``."""
    with open(path, mode='rb') as handle:
        return dill.load(handle)

In [None]:
def _split_concatenated_records(s):
    """Split a line holding several sensor records into one string per record.

    Every "TYPE_" tag after the first marks a new record that starts 14
    characters before the tag (the same offset FeatureStore.multi_line_spliter
    uses; presumably a 13-digit timestamp plus a tab -- TODO confirm).
    """
    tag_positions = [m.start() for m in re.finditer("TYPE_", s)]
    cut_points = [0] + [pos - 14 for pos in tag_positions[1:]] + [len(s)]
    return [s[begin:end] for begin, end in zip(cut_points, cut_points[1:])]


def read_txt(file):
    """Read a path file and return its records, one string per record.

    Lines are returned as read (including their trailing newline). A line
    that contains more than one "TYPE_" tag is split into its individual
    records. The original code called a module-level ``multi_line_spliter``
    that does not exist (it is only a FeatureStore method), so such lines
    raised NameError; the splitting is now done by a local helper.
    """
    with open(file) as f:
        txt = f.readlines()

    modified_data = []
    for s in txt:
        if s.count("TYPE_") > 1:
            modified_data.extend(_split_concatenated_records(s))
        else:
            modified_data.append(s)
    return modified_data


def _flatten(l):
    return list(itertools.chain.from_iterable(l))

# Split each "site_path_timestamp" key on "_" into its three parts.
# The vectorised str.split(expand=True) yields the same three columns as the
# former per-row apply(lambda x: pd.Series(...)) without building one Series
# per row.
sample_sub = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')
sample_sub = sample_sub["site_path_timestamp"].str.split("_", expand=True)
sample_sub.columns = ["site_id", "path_id", "timestamp"]

def get_feature_test(site_id, path_id, sample_sub, input_path="../input/indoor-location-navigation/"):
    """Parse one test path file into per-sensor DataFrames.

    Parameters
    ----------
    site_id : str
        Stored unchanged on the result as ``site_id``.
    path_id : str
        Name (without ``.txt``) of the file under ``{input_path}/test``.
    sample_sub : pandas.DataFrame
        Needs ``path_id`` and ``timestamp`` columns; the timestamps of the
        rows matching ``path_id`` become the waypoint timestamps.
    input_path : str
        Root folder of the competition data.

    Returns
    -------
    collections.OrderedDict
        The parsed data is attached as *attributes* (not dict items), so it
        is read the same way as a FeatureStore: one DataFrame per entry of
        ``FeatureStore.df_types`` (``.wifi``, ``.beacon``, ...),
        ``.meta_info_df``, ``.waypoint`` (x and y set to 0), ``.n_floor``
        (always 0), ``.site_id`` and ``.gap``.

        ``.gap`` is ``timestamp2 - timestamp`` of the first beacon record,
        otherwise the largest ``last_seen_timestamp - timestamp`` of the wifi
        records plus a constant, otherwise None. The original code computed
        it with ``+`` instead of ``-`` and then discarded it.
    """
    file = f"{input_path}/test/{path_id}.txt"
    content = read_txt(file)
    data_df = pd.DataFrame([d.replace("\n", "").split("\t")
                            for d in content if d[0] != "#"])
    data_dict = OrderedDict()
    for df_type in FeatureStore.df_types:
        expected_cols = FeatureStore.df_type_cols[df_type]
        # select data type (column 1 of the raw frame holds the TYPE_ tag)
        df_s = data_df[data_df[1] == f"TYPE_{df_type.upper()}"]
        if len(df_s) == 0:
            setattr(data_dict, df_type, pd.DataFrame(columns=expected_cols))
            continue

        # remove the tag column and the columns that are empty for this type
        na_info = df_s.isna().sum(axis=0) == len(df_s)
        df_s = df_s[[i for i in na_info[na_info == False].index
                     if i != 1]].reset_index(drop=True)

        # name as many columns as the file provides
        df_s.columns = expected_cols[:len(df_s.columns)]

        # set dtype
        for c in df_s.columns:
            df_s[c] = df_s[c].astype(FeatureStore.dtype_dict[df_type][c])
        setattr(data_dict, df_type, df_s)

    meta_info_df = pd.DataFrame([m.replace("\n", "").split(":")
                                 for m in _flatten([d.split("\t")
                                                    for d in content if d[0] == "#"]) if m != "#"])
    # The value of startTime / endTime is taken from column 0 of the row that
    # follows the key. Rows are addressed by label, which equals the position
    # because the frame has a default RangeIndex.
    if 0 in meta_info_df.columns:
        for key in ('startTime', 'endTime'):
            hits = np.flatnonzero((meta_info_df[0] == key).to_numpy())
            if len(hits) == 0:
                # Key absent from the header: nothing to patch.
                continue
            key_row = int(hits[0])
            if key_row + 1 < len(meta_info_df):
                meta_info_df.loc[key_row, 1] = meta_info_df.loc[key_row + 1, 0]
    data_dict.meta_info_df = meta_info_df

    data_dict.waypoint['timestamp'] = sample_sub[sample_sub.path_id ==
                                                 path_id].timestamp.values.astype(int)
    data_dict.waypoint['x'] = 0
    data_dict.waypoint['y'] = 0
    data_dict.n_floor = 0
    data_dict.site_id = site_id

    beacon = data_dict.beacon
    wifi = data_dict.wifi
    if len(beacon) > 0 and 'timestamp2' in beacon.columns:
        gap = beacon.loc[0, 'timestamp2'] - beacon.loc[0, 'timestamp']
    elif len(wifi) > 0 and 'last_seen_timestamp' in wifi.columns:
        gap = (wifi.last_seen_timestamp.values -
               wifi.timestamp.values).max()+210.14426803816337  # from mean gap
    else:
        # Neither beacon nor wifi records: no estimate is possible.
        gap = None
    data_dict.gap = gap
    return data_dict

# The common wifi records in the training dataset.

First, please look at these two paths in train set.

In [None]:
# Two train paths recorded on the same floor of the same site.
site_id = '5a0546857ecc773753327266'
floor = 'B1'
path_id_in_train1 = '5e15bdabf4c3420006d52333'
path_id_in_train2 = '5e15bda91506f2000638feb7'

# Load the wifi and waypoint tables of each path in turn.
loaded = []
for train_path_id in (path_id_in_train1, path_id_in_train2):
    feature = FeatureStore(site_id=site_id, floor=floor, path_id=train_path_id)
    feature.load_all_data()
    loaded.append((feature.wifi, feature.waypoint))

(wifi1, waypoint1), (wifi2, waypoint2) = loaded

In [None]:
# Waypoints of the first train path (path_id_in_train1).
waypoint1

In [None]:
# Waypoints of the second train path (path_id_in_train2).
waypoint2

The last waypoint of path 1 is the same as the first waypoint of path 2. I guess that path 1 and path 2 were split from a single measurement.

In the wifi data of these paths you can find the records with the same BSSID.

In [None]:
# BSSID used below to filter the wifi records of both train paths.
common_wifi_bssid = '9ad1d8c3a29b04ff542c90d2f6e05eaeddc42a97'

In [None]:
# Wifi records of path 1 for the selected BSSID.
wifi1[wifi1.bssid==common_wifi_bssid]

In [None]:
# Wifi records of path 2 for the selected BSSID.
wifi2[wifi2.bssid==common_wifi_bssid]

You can see there is a common wifi record with the same SSID, BSSID, RSSI, frequency, and last seen timestamps.

I am not familiar with wifi data, but @franoisboyer in [this discussion section](https://www.kaggle.com/c/indoor-location-navigation/discussion/224491) explained how the wifi data are collected.

* The first timestamp is the time when the scanning device calls the scan function.
* When the device calls the scan function, there are three possible outcomes for each access point:


1. The wifi access point has never been scanned => nothing is returned, and nothing appears in the dataset for this timestamp.
2. The wifi access point is in range => the last-seen timestamp is updated with the current timestamp, and fresh values are returned.
3. The wifi access point is no longer in range => the last-seen timestamp is not updated, and the previously seen values are returned.

I suspect that these common wifi records are an indicator of the leakage.


# The common wifi records between the training and test dataset.
Surprisingly, path 1 also shares the same wifi records with a path in the test set.

In [None]:
# Load one test path of the same site and keep its wifi table.
path_id_in_test = 'd592885af4e6e380c376dc55'
feature = get_feature_test(site_id=site_id,
                           path_id=path_id_in_test,
                           sample_sub=sample_sub)
wifi_test = feature.wifi

In [None]:
# BSSID used below to filter the wifi records of train path 1 and the test path.
common_wifi_bssid2 = '914cb2b0c63064164d4b8fd821bbde4a164a2a6a'

In [None]:
# Wifi records of train path 1 for the second selected BSSID.
wifi1[wifi1.bssid==common_wifi_bssid2]

In [None]:
# Wifi records of the test path for the second selected BSSID.
wifi_test[wifi_test.bssid==common_wifi_bssid2]

You can see there are common wifi records with the same SSID, BSSID, RSSI, frequency, and last seen timestamps.
I could not believe my eyes when I first found it.

# Get wifi data and start/end waypoints/time in the dataset
Now I partially recovered the user ID with these leaked common wifi records.

In [None]:
# train_meta_data
# One row per train file. The glob pattern is <root>/train/<site>/<floor>/<file>,
# so splitting the path on "/" puts site, floor and file name in parts 4, 5
# and 6 of the relative path used here.
train_meta = glob("../input/indoor-location-navigation/train/*/*/*")
train_meta_org = pd.DataFrame(train_meta)
train_meta = train_meta_org[0].str.split("/", expand=True)[[4, 5, 6]]
train_meta.columns = ["site_id", "floor", "path_id"]
# regex=False: remove the literal ".txt". With the regex default of older
# pandas the "." would match any character.
train_meta["path_id"] = train_meta["path_id"].str.replace(".txt", "", regex=False)
train_meta["path"] = train_meta_org[0]
#train_meta.head()

In [None]:
# Split each "site_path_timestamp" key on "_" into its three parts.
# The vectorised str.split(expand=True) yields the same three columns as the
# former per-row apply(lambda x: pd.Series(...)) without building one Series
# per row.
sample_sub = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')
sample_sub = sample_sub["site_path_timestamp"].str.split("_", expand=True)
sample_sub.columns = ["site_id", "path_id", "timestamp"]

# One row per (site_id, path_id) of the test set.
test_meta = sample_sub.drop('timestamp', axis=1)
test_meta = test_meta.drop_duplicates(subset=["site_id", "path_id"]).reset_index(drop=True)

In [None]:
def _meta_value(meta_info_df, key):
    """Return the header value stored for ``key``, or None if absent or blank."""
    if 0 not in meta_info_df.columns or 1 not in meta_info_df.columns:
        return None
    values = meta_info_df.loc[meta_info_df[0] == key, 1]
    if len(values) == 0:
        return None
    value = values.iloc[0]
    if value is None or value == "" or pd.isna(value):
        return None
    return value


def _path_time_range(feature):
    """Return ``(start_time, end_time)`` of a loaded path as ints.

    The header values startTime / endTime are used when present; otherwise
    the earliest / latest timestamp over the accelerometer, wifi and beacon
    records is used.
    """
    sensors = (feature.accelerometer, feature.wifi, feature.beacon)
    start = _meta_value(feature.meta_info_df, 'startTime')
    if start is None:
        start = np.nanmin([sensor.timestamp.min() for sensor in sensors])
    end = _meta_value(feature.meta_info_df, 'endTime')
    if end is None:
        end = np.nanmax([sensor.timestamp.max() for sensor in sensors])
    return int(start), int(end)


# Per-path summary columns, in the order they are added to train_meta.
stat_cols = ['start_time', 'end_time', 'start_wp_time', 'start_wp_x',
             'start_wp_y', 'end_wp_time', 'end_wp_x', 'end_wp_y', 'n_floor']

# path_id -> distinct (bssid, last_seen_timestamp) pairs of that path.
wifi_dict = defaultdict(lambda: pd.DataFrame())

# The summaries are collected first and added as whole columns afterwards, so
# each column gets its natural dtype (the former row-by-row writes put floats
# into columns initialised with the integer 0).
records = []
for t in tqdm(train_meta.itertuples(index=False), total=len(train_meta)):
    feature = FeatureStore(
        site_id=t.site_id, floor=t.floor, path_id=t.path_id)
    feature.load_all_data()
    start_time, end_time = _path_time_range(feature)
    first_wp = feature.waypoint.iloc[0]
    last_wp = feature.waypoint.iloc[-1]
    records.append({
        'start_time': start_time,
        'end_time': end_time,
        'start_wp_time': int(first_wp['timestamp']),
        'start_wp_x': float(first_wp['x']),
        'start_wp_y': float(first_wp['y']),
        'end_wp_time': int(last_wp['timestamp']),
        'end_wp_x': float(last_wp['x']),
        'end_wp_y': float(last_wp['y']),
        'n_floor': feature.n_floor,
    })
    wifi_dict[t.path_id] = feature.wifi[['bssid', 'last_seen_timestamp']].drop_duplicates()

train_stats = pd.DataFrame(records, columns=stat_cols, index=train_meta.index)
for col in stat_cols:
    train_meta[col] = train_stats[col]
train_meta = train_meta.sort_values(
    ['site_id', 'start_time']).reset_index(drop=True)

def _meta_value(meta_info_df, key):
    """Return the header value stored for ``key``, or None if absent or blank."""
    if 0 not in meta_info_df.columns or 1 not in meta_info_df.columns:
        return None
    values = meta_info_df.loc[meta_info_df[0] == key, 1]
    if len(values) == 0:
        return None
    value = values.iloc[0]
    if value is None or value == "" or pd.isna(value):
        return None
    return value


def _path_time_range(feature):
    """Return ``(start_time, end_time)`` of a loaded path as ints.

    The header values startTime / endTime are used when present; otherwise
    the earliest / latest timestamp over the accelerometer, wifi and beacon
    records is used.
    """
    sensors = (feature.accelerometer, feature.wifi, feature.beacon)
    start = _meta_value(feature.meta_info_df, 'startTime')
    if start is None:
        start = np.nanmin([sensor.timestamp.min() for sensor in sensors])
    end = _meta_value(feature.meta_info_df, 'endTime')
    if end is None:
        end = np.nanmax([sensor.timestamp.max() for sensor in sensors])
    return int(start), int(end)


# Per-path summary columns, in the order they are added to test_meta.
stat_cols = ['start_time', 'end_time', 'start_wp_time', 'start_wp_x',
             'start_wp_y', 'end_wp_time', 'end_wp_x', 'end_wp_y', 'n_floor']

# The summaries are collected first and added as whole columns afterwards:
# start_time + gap can be a float, which the former row-by-row writes put
# into columns initialised with the integer 0.
records = []
for t in tqdm(test_meta.itertuples(index=False), total=len(test_meta)):
    feature = get_feature_test(
        site_id=t.site_id, path_id=t.path_id, sample_sub=sample_sub)
    start_time, end_time = _path_time_range(feature)
    # Offset added to the start and end time of the test path: taken from the
    # first beacon record when there is one, otherwise estimated from the
    # wifi records.
    if len(feature.beacon) > 0:
        gap = feature.beacon.loc[0, 'timestamp2'] - \
            feature.beacon.loc[0, 'timestamp']
    else:
        gap = (feature.wifi.last_seen_timestamp.values -
               feature.wifi.timestamp.values).max()+210.14426803816337  # from mean gap
    first_wp = feature.waypoint.iloc[0]
    last_wp = feature.waypoint.iloc[-1]
    records.append({
        'start_time': start_time+gap,
        'end_time': end_time+gap,
        'start_wp_time': first_wp['timestamp'],
        'start_wp_x': first_wp['x'],
        'start_wp_y': first_wp['y'],
        'end_wp_time': last_wp['timestamp'],
        'end_wp_x': last_wp['x'],
        'end_wp_y': last_wp['y'],
        'n_floor': feature.n_floor,
    })
    wifi_dict[t.path_id] = feature.wifi[['bssid', 'last_seen_timestamp']].drop_duplicates()

test_stats = pd.DataFrame(records, columns=stat_cols, index=test_meta.index)
for col in stat_cols:
    test_meta[col] = test_stats[col]

# Stack train and test paths into one table ordered by site and start time.
df = pd.merge(train_meta, test_meta, how='outer')
df = df.sort_values(['site_id', 'start_time']).reset_index(drop=True)

# Get user ID based on leaked common records
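I allow for small noise in the last-seen timestamps, as shown above: two records with the same BSSID count as the same record when their last-seen timestamps differ by less than 5.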

In [None]:
# Give every path a user ID. Paths are visited in the order of df. A path
# inherits the ID of an earlier path of the same site when the two share a
# wifi record (same bssid, last-seen timestamps less than 5 apart);
# otherwise it gets the next unused ID. 'counter' is the number of earlier
# paths that matched.
df['user_id'] = 0
df['counter'] = 0
n = 0
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    current_wifi = wifi_dict[row.path_id]
    min_last_seen_timestamp = current_wifi.last_seen_timestamp.min()

    # Candidates: paths of the same site that ended before this one started
    # but after the oldest last-seen timestamp of this path.
    is_candidate = ((df.site_id == row.site_id)
                    & (df.end_time < row.start_time)
                    & (min_last_seen_timestamp < df.end_time))
    df_site = df[is_candidate]

    matched_user_ids = []
    for prev_path_id, prev_user_id in zip(df_site.path_id, df_site.user_id):
        common_wifi = pd.merge(
            current_wifi, wifi_dict[prev_path_id], how='inner', on=['bssid'])
        diff_time = (common_wifi.last_seen_timestamp_x
                     - common_wifi.last_seen_timestamp_y).abs()
        if (diff_time < 5).any():
            # If there is a leak
            matched_user_ids.append(prev_user_id)

    counter = len(matched_user_ids)
    if counter == 0:
        df.loc[i, 'user_id'] = n
        n += 1
    else:
        # With several matches the ID of the last matching path is kept.
        df.loc[i, 'user_id'] = matched_user_ids[-1]
    df.loc[i, 'counter'] = counter

The three paths shown above have the same user ID.

In [None]:
# Rows that were assigned user ID 161.
df[df.user_id==161]

In [None]:
# First 30 rows of the combined train/test table.
df[:30]

In [None]:
# Write the combined table, including user_id and counter, to df.csv without the row index.
df.to_csv('df.csv', index=False)

Strictly speaking, my model is contaminated by the leakage as long as I use wifi features, even if I do not use the last-seen timestamps.

What should I do?

Any comments are welcome.