Many people have noticed that the test data has *fake* timestamps, as discussed in this [thread](https://www.kaggle.com/c/indoor-location-navigation/discussion/218074). Depending on how the timestamp is used, this could be a big deal for your model. In my case, my RNN model's LB score improved by 0.4 just from fixing the test data's timestamps.

In this notebook, I will:
* modify the `read_data_file` function from the host's GitHub repository so that it also reads the last timestamp of each `ibeacon` record
* calculate the `gap` between the real timestamp and the `fake` timestamp from the `ibeacon` data
* use that `gap` to recover the real timestamps of the test data, running in parallel with `dask`

In [None]:
! ls ../input/indoor-location-navigation

In [None]:
from glob import glob
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from glob import glob
from dask.distributed import wait

# Names of the seven sensor streams that share the ACOLS column layout.
SENSORS = [
    'acce', 'acce_uncali',
    'gyro', 'gyro_uncali',
    'magn', 'magn_uncali',
    'ahrs',
]

# Per-stream feature counts.
# NOTE(review): not referenced by the visible code -- confirm where it is used.
NFEAS = {
    **dict.fromkeys(SENSORS, 3),
    'wifi': 1,
    'ibeacon': 1,
    'waypoint': 3,
}

# Column layout shared by every stream listed in SENSORS.
ACOLS = ['timestamp', 'x', 'y', 'z']

# Column names used by to_frame() for each stream; all sensor streams point
# at the same ACOLS list object.
FIELDS = {
    **dict.fromkeys(SENSORS, ACOLS),
    'wifi': ['timestamp', 'ssid', 'bssid', 'rssi', 'last_timestamp'],
    'ibeacon': ['timestamp', 'code', 'rssi', 'last_timestamp'],
    'waypoint': ['timestamp', 'x', 'y'],
}

def to_frame(data, col):
    """Wrap one parsed stream in a DataFrame with int64 timestamp columns.

    Parameters
    ----------
    data : object with a ``shape`` attribute (e.g. a numpy array)
        Rows of a single stream.
    col : str
        Stream name; selects the column names from ``FIELDS``.

    Returns
    -------
    tuple
        ``(frame, is_dummy)``. ``is_dummy`` is True when ``data`` had no rows
        and the one-row placeholder from ``create_dummy_df`` was used instead.
    """
    names = FIELDS[col]
    has_rows = data.shape[0] > 0
    frame = pd.DataFrame(data, columns=names) if has_rows else create_dummy_df(names)

    # Every column whose name contains 'timestamp' is cast to int64.
    timestamp_columns = [name for name in frame.columns if 'timestamp' in name]
    for name in timestamp_columns:
        frame[name] = frame[name].astype('int64')

    return frame, not has_rows

def create_dummy_df(cols):
    """Build a one-row placeholder frame with the given columns.

    Every column holds the integer 0, except ``ssid`` and ``bssid`` which
    hold the string ``'0'``.
    """
    text_columns = ('ssid', 'bssid')
    placeholder = {
        name: ['0'] if name in text_columns else [0]
        for name in cols
    }
    return pd.DataFrame(placeholder)

In [None]:
from dataclasses import dataclass

import numpy as np


@dataclass
class ReadData:
    """Per-stream arrays parsed from one path file by ``read_data_file``.

    Each attribute holds one row per matching record; a stream with no
    records is an empty array.
    """
    # TYPE_ACCELEROMETER rows: [timestamp, x, y, z]
    acce: np.ndarray
    # TYPE_ACCELEROMETER_UNCALIBRATED rows: [timestamp, x, y, z]
    acce_uncali: np.ndarray
    # TYPE_GYROSCOPE rows: [timestamp, x, y, z]
    gyro: np.ndarray
    # TYPE_GYROSCOPE_UNCALIBRATED rows: [timestamp, x, y, z]
    gyro_uncali: np.ndarray
    # TYPE_MAGNETIC_FIELD rows: [timestamp, x, y, z]
    magn: np.ndarray
    # TYPE_MAGNETIC_FIELD_UNCALIBRATED rows: [timestamp, x, y, z]
    magn_uncali: np.ndarray
    # TYPE_ROTATION_VECTOR rows: [timestamp, x, y, z]
    ahrs: np.ndarray
    # TYPE_WIFI rows, kept as strings: [timestamp, ssid, bssid, rssi, last-seen timestamp]
    wifi: np.ndarray
    # TYPE_BEACON rows, kept as strings: [timestamp, 'uuid_major_minor', rssi, last field of the record]
    ibeacon: np.ndarray
    # TYPE_WAYPOINT rows: [timestamp, x, y]
    waypoint: np.ndarray


def read_data_file(data_filename):
    """Parse one tab-separated path file into a ``ReadData`` of numpy arrays.

    Blank lines and lines starting with ``#`` are skipped. The second field
    of every other line selects the stream the record belongs to; records of
    an unrecognised type are ignored.

    Parameters
    ----------
    data_filename : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    ReadData
        One array per stream. Sensor and waypoint rows are numeric; wifi and
        ibeacon rows keep their fields as strings. The ibeacon row ends with
        the last field of the record (its last timestamp).
    """
    # The seven sensor streams share one row layout: [timestamp, x, y, z].
    sensor_rows = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    wifi_rows = []
    beacon_rows = []
    waypoint_rows = []

    with open(data_filename, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    for raw in raw_lines:
        text = raw.strip()
        if not text or text[0] == '#':
            continue

        fields = text.split('\t')
        record_type = fields[1]

        if record_type in sensor_rows:
            # Only rotation-vector records are length-checked; short ones are dropped.
            if record_type == 'TYPE_ROTATION_VECTOR' and len(fields) < 5:
                continue
            sensor_rows[record_type].append(
                [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]
            )
        elif record_type == 'TYPE_WIFI':
            # [system timestamp, ssid, bssid, rssi, last-seen timestamp]
            wifi_rows.append([fields[0], fields[2], fields[3], fields[4], fields[6]])
        elif record_type == 'TYPE_BEACON':
            # uuid, major and minor are joined into a single beacon code.
            beacon_code = '_'.join([fields[2], fields[3], fields[4]])
            beacon_rows.append([fields[0], beacon_code, fields[6], fields[-1]])
        elif record_type == 'TYPE_WAYPOINT':
            waypoint_rows.append([int(fields[0]), float(fields[2]), float(fields[3])])

    return ReadData(
        acce=np.array(sensor_rows['TYPE_ACCELEROMETER']),
        acce_uncali=np.array(sensor_rows['TYPE_ACCELEROMETER_UNCALIBRATED']),
        gyro=np.array(sensor_rows['TYPE_GYROSCOPE']),
        gyro_uncali=np.array(sensor_rows['TYPE_GYROSCOPE_UNCALIBRATED']),
        magn=np.array(sensor_rows['TYPE_MAGNETIC_FIELD']),
        magn_uncali=np.array(sensor_rows['TYPE_MAGNETIC_FIELD_UNCALIBRATED']),
        ahrs=np.array(sensor_rows['TYPE_ROTATION_VECTOR']),
        wifi=np.array(wifi_rows),
        ibeacon=np.array(beacon_rows),
        waypoint=np.array(waypoint_rows),
    )

The main changes made are these two lines:
```
lastts = line_data[-1] # last timestamp
ibeacon_data = [ts, '_'.join([uuid, major, minor]), rssi, lastts]
```

In [None]:
def get_test_dfs(PATH, test_files):
    """Map each test file to its waypoint rows from the sample submission.

    Parameters
    ----------
    PATH : str
        Directory that contains ``sample_submission.csv``.
    test_files : iterable of str
        Paths of the test files. The file name up to its first ``.`` is
        matched against the ``path`` column built by ``get_test_df``.

    Returns
    -------
    dict
        ``{fname: DataFrame}`` where each frame is an independent copy with
        columns ``timestamp, x, y, floor, building, site_path_timestamp`` and
        a fresh 0-based index. A file with no matching rows maps to an empty
        frame.
    """
    dtest = get_test_df(PATH)
    keep = ['timestamp', 'x', 'y', 'floor', 'building', 'site_path_timestamp']
    dws = {}
    for fname in tqdm(test_files):
        # basename() also understands the platform separator, so paths that
        # glob returns with backslashes on Windows resolve to the right stem.
        path = os.path.basename(fname).split('.')[0]
        mask = dtest['path'] == path
        dws[fname] = dtest.loc[mask, keep].copy().reset_index(drop=True)
    return dws

def get_test_df(PATH):
    """Load ``sample_submission.csv`` and split its key into separate columns.

    ``site_path_timestamp`` is split on ``_``; the first three pieces become
    the ``building``, ``path`` and ``timestamp`` (int64) columns. The result
    is sorted by ``path`` then ``timestamp`` and re-indexed from 0.
    """
    submission = pd.read_csv(f'{PATH}/sample_submission.csv')

    # Split every key once and reuse the pieces for all three columns.
    pieces = [key.split('_') for key in submission['site_path_timestamp']]
    submission['building'] = [parts[0] for parts in pieces]
    submission['path'] = [parts[1] for parts in pieces]
    submission['timestamp'] = [parts[2] for parts in pieces]
    submission['timestamp'] = submission['timestamp'].astype('int64')

    ordered = submission.sort_values(['path', 'timestamp'])
    return ordered.reset_index(drop=True)

def get_time_gap(name):
    """Return the constant ibeacon timestamp offset of one path file.

    Parameters
    ----------
    name : str
        Path of the file to parse with ``read_data_file``.

    Returns
    -------
    tuple
        ``(gap, no_ibeacon)``. ``gap`` is ``last_timestamp - timestamp`` of
        the first ibeacon row. ``no_ibeacon`` is the placeholder flag from
        ``to_frame``: True when the file had no ibeacon rows, in which case
        the gap comes from the all-zero placeholder row.

    Raises
    ------
    AssertionError
        If the offset is not the same for every ibeacon row.
    """
    data = read_data_file(name)
    db, no_ibeacon = to_frame(data.ibeacon, 'ibeacon')
    gap = db['last_timestamp'] - db['timestamp']
    n_distinct = gap.unique().shape[0]
    if n_distinct != 1:
        # Raised explicitly (same exception type as the former bare assert) so
        # the check still runs under `python -O` and reports the offending file.
        raise AssertionError(
            f'{name}: expected a single constant ibeacon gap, found {n_distinct} distinct values'
        )
    return gap.values[0], no_ibeacon

def fix_timestamp_test(df, gap):
    """Return a copy of ``df`` with a ``real_timestamp`` column added.

    ``real_timestamp`` is ``df['timestamp'] + gap``. The input frame is left
    untouched, so the result is the same whether the function is called
    locally or on a copy shipped to a worker.
    """
    return df.assign(real_timestamp=df['timestamp'] + gap)

In [None]:
import dask
from dask.distributed import Client, wait, LocalCluster

In [None]:
# set n_workers to number of cores
client = Client(n_workers=2, 
                threads_per_worker=1)
client

### Read data

In [None]:
# Root directory of the competition data.
PATH = '../input/indoor-location-navigation'
# Alternative kept for reference: collect the training files of every site.
#train_files = glob(f'{PATH}/train/*/*/*.txt')
dtest = get_test_df(PATH)
# Only the sites that appear in the sample submission are needed.
test_sites = dtest['building'].unique()
train_files = []
for i in test_sites:
    train_files.extend(glob(f'{PATH}/train/{i}/*/*.txt'))
test_files = glob(f'{PATH}/test/*.txt')
# Display how many training and test files were found.
len(train_files),len(test_files)

In [None]:
# Build the {test file path: waypoint DataFrame} mapping from the sample submission.
test_dfs = get_test_dfs(PATH, test_files)

`test_dfs` is a dictionary which maps the file path to its waypoint dataframe.

### How to recover the real timestamp

In the [webinar](https://youtu.be/xt3OzMC-XMU?t=690), the host mentioned that for `ibeacon`, the `timestamp` and the `last_timestamp` are the same timestamps. We can verify this claim by checking the training ibeacon data. 

In [None]:
# Parse one training file and preview its ibeacon rows.
fname = train_files[4]
data = read_data_file(fname)
db,no_ibeacon = to_frame(data.ibeacon,'ibeacon')
db.head()

In [None]:
# True only if timestamp equals last_timestamp on every ibeacon row.
(db['timestamp']==db['last_timestamp']).all()

I also checked all the other train files, and the claim holds for every one of them. Next, let's look at the ibeacon data of one test file. 

In [None]:
# Parse one test file and preview its ibeacon rows.
fname = test_files[0]
data = read_data_file(fname)
db,no_ibeacon = to_frame(data.ibeacon,'ibeacon')
db.head()

The `timestamp` and the `last_timestamp` are obviously different. But if we look closely, the gap between them is actually constant.

In [None]:
# Per-row offset between last_timestamp and timestamp; unique() lists its distinct values.
db['gap'] = db['last_timestamp'] - db['timestamp']
db['gap'].unique()

Hence, an intuitive guess is that this `gap` was artificially introduced when the test data was prepared, and that we can use it to fix the timestamps of `waypoints`, `wifi`, etc.

#### Fix one test waypoint

In [None]:
# Compute the gap for one test file and apply it to that file's waypoint frame.
fname = test_files[0]
gap,no_ibeacon = get_time_gap(fname)
df = fix_timestamp_test(test_dfs[fname], gap)
# Show the original and the shifted timestamps side by side.
df[['timestamp','real_timestamp','site_path_timestamp']]

### Fix all test waypoints using DASK

In [None]:
%%time
# Stage 1: submit one get_time_gap task per test file to the dask client.
futures = []
for fname in tqdm(test_files, total=len(test_files)):
    f = client.submit(get_time_gap,fname)
    futures.append(f)

# Stage 2: collect each gap in file order, record the no-ibeacon flag, and
# submit the timestamp fix for the matching waypoint frame.
futures2 = []
no_ibeacon_list = []
for f,fname in tqdm(zip(futures, test_files), total=len(test_files)):
    gap,no_ibeacon = f.result()
    no_ibeacon_list.append(no_ibeacon)
    f = client.submit(fix_timestamp_test, test_dfs[fname], gap)
    futures2.append(f)
    
# Stage 3: gather the fixed frames, keyed by test file path.
fixed_test_dfs = {}
for f,fname in tqdm(zip(futures2, test_files), total=len(test_files)):
    fixed_test_dfs[fname] = f.result()
    
# One row per test file with the flag for missing ibeacon data.
fix_summary = pd.DataFrame({'file':test_files, 'no_ibeacon':no_ibeacon_list})
fix_summary.head()

In [None]:
# Fraction of test files flagged as having no ibeacon rows (mean of the boolean column).
fix_summary['no_ibeacon'].mean()

**About 5% of the test files have no ibeacon data, so these files still have incorrect timestamps. How to fix them is the next question. Hopefully the host can respond to this issue.**

**Before fix**

In [None]:
# Waypoint rows of one test file as built from the sample submission.
fname = test_files[1]
test_dfs[fname].head()[['timestamp','site_path_timestamp']]

**After fix**

In [None]:
# The same file's rows from fixed_test_dfs, including the real_timestamp column.
fixed_test_dfs[fname].head()[['timestamp','real_timestamp','site_path_timestamp']]

You can use the same method to fix the timestamps in the test data's `wifi` dataframes.