In [1]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool, Manager

import pickle
import math

pd.set_option("display.max_columns", 100)

In [2]:
# Settings and altering components for GCP

# Path settings: every data location below is derived from root_path.
root_path = "../input/indoor-location-navigation/"
# root_path = "../jupyter/input/"
# Trace files are matched three directory levels below train/; pick_example
# reads the two directory levels above each file as site and floor.
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

# function imports using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./
from io_f import read_data_file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# import for gcp settings
# import compute_f
# import io_f
# import visualize_f
# import main
# from io_f import read_data_file
# from compute_f import compute_step_positions, compute_steps, \
# compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# Time-window setting (milliseconds): extract_data keeps only timestamps closer than this to the nearest waypoint
time_stamp_cut = 250

# train number setting
train_num = len(train_paths)
# train_num = round(len(train_paths) / 2)
# train_num = 1000

In [3]:
# Preprocess: report how many paths each glob pattern from the settings cell matched.
# The three strings are separate print() arguments, so each line ends with the
# default " " separator before the newline.
print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

# Reading in 1 file
def pick_example(max_range, paths):
    """Pick a random trace file and load the size of the floor it belongs to.

    Args:
        max_range: Upper bound for the random index. It is clamped to the last
            valid index of ``paths``, so passing ``len(paths)`` (as the
            notebook does) can no longer produce an out-of-range index.
        paths: Sequence of trace paths ending in ``<site>/<floor>/<file>``.

    Returns:
        Tuple ``(path, site, floorNo, floor_plan_filename, json_plan_filename,
        width_meter, height_meter)``; the last two come from the ``map_info``
        entry of the floor's ``floor_info.json``.

    Raises:
        ValueError: If ``paths`` is empty.
    """
    if not paths:
        raise ValueError("pick_example needs at least one path")
    # random.randint includes both end points, hence the clamp to len - 1.
    ex = random.randint(0, min(max_range, len(paths) - 1))
    path = f"{paths[ex]}"
    # Index from the end so the result does not depend on the depth of root_path.
    path_parts = path.split("/")
    site = path_parts[-3]
    floorNo = path_parts[-2]
    floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
    json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]
    return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

# Pick one random training trace and print where its floor metadata lives.
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
print("example path: ", path)
print("site: ", site)
print("floorNo: ", floorNo)
print("floor_plan_filename: ", floor_plan_filename)
print("json_plan_filename: ", json_plan_filename)
print("width: {}, height: {} ".format(width_meter, height_meter))

# Keep the raw lines of the example trace in `lines` for manual inspection.
with open(path) as p:
    lines = p.readlines()
print("No. Lines in 1 example: {:,}". format(len(lines)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204
example path:  ../input/indoor-location-navigation/train/5da1382d4db8ce0c98bbe92e/F4/5da833e2ae6cfc0006ca81dd.txt
site:  5da1382d4db8ce0c98bbe92e
floorNo:  F4
floor_plan_filename:  ../input/indoor-location-navigation/metadata/5da1382d4db8ce0c98bbe92e/F4/floor_image.png
json_plan_filename:  ../input/indoor-location-navigation/metadata/5da1382d4db8ce0c98bbe92e/F4/floor_info.json
width: 188.07749775811797, height: 163.30639801879926 
No. Lines in 1 example: 7,621


In [5]:
# for line in lines:
#     print(line)

#	startTime:1571302029807

#	SiteID:5da1382d4db8ce0c98bbe92e	SiteName:杭州嘉里中心	FloorId:5da138304db8ce0c98bbef3f	FloorName:F4

#	Brand:OPPO	Model:PBCM10	AndroidName:8.1.0	APILevel:27	

#	type:1	name:BMI160 Accelerometer	version:2062600	vendor:BOSCH	resolution:0.0023956299	power:0.18	maximumRange:39.22661

#	type:4	name:BMI160 Gyroscope	version:2062600	vendor:BOSCH	resolution:0.0010681152	power:0.9	maximumRange:34.906586

#	type:2	name:AK09911 Magnetometer	version:1	vendor:AKM	resolution:0.5996704	power:2.4	maximumRange:4911.9995

#	type:35	name:BMI160 Accelerometer Uncalibrated	version:2062600	vendor:BOSCH	resolution:0.0023956299	power:0.18	maximumRange:39.22661

#	type:16	name:BMI160 Gyroscope Uncalibrated	version:2062600	vendor:BOSCH	resolution:0.0010681152	power:0.9	maximumRange:34.906586

#	type:14	name:AK09911 Magnetometer Uncalibrated	version:1	vendor:AKM	resolution:0.5996704	power:2.4	maximumRange:4911.9995

#	VersionName:v20190925-nightly-15-gf5429c0	VersionCode:376	

157130202981

1571302034343	TYPE_WIFI	5ec944cb0e043327906acce8d1b61f6e32d8db1c	cf3204ad4693ee259059e5ec00bb56898ffd9bec	-86	5300	1571302010811

1571302034343	TYPE_WIFI	da39a3ee5e6b4b0d3255bfef95601890afd80709	b46bdaa98be44f2d9c65f779109acd08c77bda39	-86	5300	1571302033785

1571302034343	TYPE_WIFI	da39a3ee5e6b4b0d3255bfef95601890afd80709	de3c4dc15682673d584903b1bcfb0635caf97998	-86	5765	1571302019066

1571302034343	TYPE_WIFI	5ec944cb0e043327906acce8d1b61f6e32d8db1c	af098a41f1a7bc5ec6919e33ee34b6c73c27abcf	-86	5240	1571302030879

1571302034343	TYPE_WIFI	0b72e3ecd8f9071d311fd78bc78b4696fbbd35e5	29b82df6b8ba04c1f9064828a833ca94ee5e6c82	-86	5805	1571302033368

1571302034343	TYPE_WIFI	0fa36db77af713808449ff54868815dc26f88e45	e64da3ac16c88b33e21a5fdb93d7efe782eb5e8a	-86	5805	1571302033369

1571302034343	TYPE_WIFI	5ec944cb0e043327906acce8d1b61f6e32d8db1c	5a2d908ea8461e3f29656f175793af7a6f19ed6b	-86	5220	1571302012211

1571302034343	TYPE_WIFI	0b72e3ecd8f9071d311fd78bc78b4696fbbd35e5	785de6299cc176ab48531f94f


1571302038128	TYPE_GYROSCOPE_UNCALIBRATED	-0.44667053	-0.5662689	-0.44334412	-3.6621094E-4	-7.324219E-4	4.272461E-4	3

1571302038128	TYPE_ACCELEROMETER_UNCALIBRATED	-0.36697388	2.9969482	11.525696	0.0	0.0	0.0	3

1571302038149	TYPE_ACCELEROMETER	-0.22691345	2.5725708	12.454636	2

1571302038149	TYPE_MAGNETIC_FIELD	14.3585205	14.263916	-31.544495	3

1571302038149	TYPE_GYROSCOPE	-0.7035675	-0.22732544	-0.4001007	3

1571302038149	TYPE_ROTATION_VECTOR	0.095734246	0.07060358	0.38156357	3

1571302038149	TYPE_MAGNETIC_FIELD_UNCALIBRATED	41.48407	-58.747864	-333.97522	27.12555	-73.01178	-302.43073	3

1571302038149	TYPE_GYROSCOPE_UNCALIBRATED	-0.5883484	-0.325531	-0.42736816	-3.6621094E-4	-7.324219E-4	4.272461E-4	3

1571302038149	TYPE_ACCELEROMETER_UNCALIBRATED	-0.70277405	2.9424744	11.990753	0.0	0.0	0.0	3

1571302038169	TYPE_ACCELEROMETER	1.1180267	1.2737274	14.921265	2

1571302038169	TYPE_MAGNETIC_FIELD	12.249756	15.678406	-34.965515	3

1571302038169	TYPE_GYROSCOPE	-1.0332642	-0.39509583	-0.59

1571302041239	TYPE_WIFI	0fa36db77af713808449ff54868815dc26f88e45	ada9c718a8bec744a03ea021aadd57ce07be4e46	-79	5785	1571302040262

1571302041239	TYPE_WIFI	da39a3ee5e6b4b0d3255bfef95601890afd80709	2a0973e9001c7d88e8c6c7f15e9390c9fd5765dd	-79	5785	1571302040252

1571302041239	TYPE_WIFI	0b72e3ecd8f9071d311fd78bc78b4696fbbd35e5	c82ec661353a25eb5f27ab85586f8b0d96402a9b	-79	5260	1571302040516

1571302041239	TYPE_WIFI	5ec944cb0e043327906acce8d1b61f6e32d8db1c	04d2e644d2d158093b0916d825447d9100691e9b	-79	5260	1571302040517

1571302041239	TYPE_WIFI	da39a3ee5e6b4b0d3255bfef95601890afd80709	5265376c4b73e5a68e2e54c80285d7c26b2a89e2	-79	5260	1571302040517

1571302041239	TYPE_WIFI	1365d7dd87c2ef94fcc25004fcebbb68b6a78a10	99130fb8e9dbd8b2646880e45744f3528386e541	-79	5745	1571302023500

1571302041239	TYPE_WIFI	da39a3ee5e6b4b0d3255bfef95601890afd80709	aa91cecd10ce7c4145a63a3f595f669fd00aeb54	-79	5765	1571302037904

1571302041239	TYPE_WIFI	7b651470a02545abd0f2802990d57f9ae962833b	60fd29e50846d0571390c9fb9

In [51]:
# Redefine the data extraction class

from dataclasses import dataclass

@dataclass
class ReadData:
    """Container for the per-record-type arrays parsed from one trace file.

    Instances are built by ``read_data_file_ed``, which wraps each list of
    parsed rows in ``np.array``; a record type that never occurs in the file
    therefore ends up as an empty array.
    """
    # Rows of [timestamp, x, y, z] from TYPE_ACCELEROMETER records.
    acce: np.ndarray
    # Rows of [timestamp, x, y, z] from TYPE_ACCELEROMETER_UNCALIBRATED records.
    acce_uncali: np.ndarray
    # Rows of [timestamp, x, y, z] from TYPE_GYROSCOPE records.
    gyro: np.ndarray
    # Rows of [timestamp, x, y, z] from TYPE_GYROSCOPE_UNCALIBRATED records.
    gyro_uncali: np.ndarray
    # Rows of [timestamp, x, y, z] from TYPE_MAGNETIC_FIELD records.
    magn: np.ndarray
    # Rows of [timestamp, x, y, z] from TYPE_MAGNETIC_FIELD_UNCALIBRATED records.
    magn_uncali: np.ndarray
    # Rows of [timestamp, x, y, z] from TYPE_ROTATION_VECTOR records.
    ahrs: np.ndarray
    # String rows of [sys_ts, ssid, bssid, ssid_bssid, rssi, lastseen_ts] from TYPE_WIFI records.
    wifi: np.ndarray
    # String rows of [ts, uuid_major_minor, txpower, rssi, distance, mac_address, beacon_ts]
    # from TYPE_BEACON records.
    ibeacon: np.ndarray
    # Rows of [timestamp, x, y] from TYPE_WAYPOINT records.
    waypoint: np.ndarray


def read_data_file_ed(data_filename):
    """Parse one tab-separated trace file into a ``ReadData`` of numpy arrays.

    Blank lines and lines starting with ``#`` are skipped. Every other line is
    split on tabs and routed by its second field (the record type); record
    types not handled below are ignored.

    Args:
        data_filename: Path of the trace text file (read as UTF-8).

    Returns:
        ReadData whose fields hold one array per record type.
    """
    # Record types that share the "timestamp, x, y, z" layout, keyed by type tag.
    xyz_rows = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    wifi_rows = []
    beacon_rows = []
    waypoint_rows = []

    with open(data_filename, 'r', encoding='utf-8') as file:
        raw_lines = file.readlines()

    for raw_line in raw_lines:
        stripped = raw_line.strip()
        if not stripped or stripped[0] == '#':
            continue

        fields = stripped.split('\t')
        record_type = fields[1]

        if record_type in xyz_rows:
            xyz_rows[record_type].append(
                [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]
            )
        elif record_type == 'TYPE_WIFI':
            sys_ts = fields[0]
            ssid = fields[2]
            bssid = fields[3]
            rssi = fields[4]
            lastseen_ts = fields[6]
            # Wifi values stay strings; the joined ssid_bssid is an extra identifier column.
            wifi_rows.append([sys_ts, ssid, bssid, '_'.join([ssid, bssid]), rssi, lastseen_ts])
        elif record_type == 'TYPE_BEACON':
            ts = fields[0]
            uuid = fields[2]
            major = fields[3]
            minor = fields[4]
            txpower = fields[5]
            rssi = fields[6]
            distance = fields[7]
            # The last two fields are addressed from the end of the record.
            mac_address = fields[-2]
            beacon_ts = fields[-1]
            beacon_rows.append(
                [ts, '_'.join([uuid, major, minor]), txpower, rssi, distance, mac_address, beacon_ts]
            )
        elif record_type == 'TYPE_WAYPOINT':
            waypoint_rows.append([int(fields[0]), float(fields[2]), float(fields[3])])

    return ReadData(
        np.array(xyz_rows['TYPE_ACCELEROMETER']),
        np.array(xyz_rows['TYPE_ACCELEROMETER_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_GYROSCOPE']),
        np.array(xyz_rows['TYPE_GYROSCOPE_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_MAGNETIC_FIELD']),
        np.array(xyz_rows['TYPE_MAGNETIC_FIELD_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_ROTATION_VECTOR']),
        np.array(wifi_rows),
        np.array(beacon_rows),
        np.array(waypoint_rows),
    )

In [48]:
# Scratch arithmetic on hard-coded timestamps copied from one trace
# (presumably epoch milliseconds - TODO confirm); prints two signed differences.
# beacon 1571302029900, 1571302029925, 1571302030475
a = 1571302029813 - 1571302029894
print(a)

# wifi -> has multiple records for 1 timestamp, and in a descending order of rssi
b = 1571302029813 - 1571302032104
print(b)

-81
-2291


In [None]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [49]:
# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    """Chain the compute_f helpers to turn accelerometer and rotation data into relative positions.

    Args:
        acce_datas: Accelerometer rows passed to ``compute_steps``.
        ahrs_datas: Rotation-vector rows passed to ``compute_headings``.

    Returns:
        Whatever ``compute_rel_positions`` returns for the detected steps
        (presumably one row per step with the timestamp in column 0 - TODO confirm).
    """
    step_ts, _step_idx, acce_extrema = compute_steps(acce_datas)
    heading_series = compute_headings(ahrs_datas)
    stride_per_step = compute_stride_length(acce_extrema)
    heading_per_step = compute_step_heading(step_ts, heading_series)
    return compute_rel_positions(stride_per_step, heading_per_step)

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    """Return the Euclidean norm of the three values at positions 2, 3 and 4.

    Args:
        magn_datas: Sequence whose elements 2..4 hold the x, y and z components.

    Returns:
        The square root of the summed squares; NaN when a component is NaN.
    """
    xyz = np.array(magn_datas[2:5])
    squared_sum = np.sum(xyz ** 2, axis=0)
    return np.mean(np.sqrt(squared_sum))

In [60]:
# %%timeit

# Pick a fresh random trace; this overwrites the example globals set by earlier cells.
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# read_data_file -> 170 ms per loop
# read_data_file_ed -> 173 ms per loop
path_datas = read_data_file_ed(path)
acce_datas = path_datas.acce
magn_datas = path_datas.magn
ahrs_datas = path_datas.ahrs
wifi_datas = path_datas.wifi
ibeacon_datas = path_datas.ibeacon
wps = path_datas.waypoint # not to be used

# acce and ahrs data translation
rel_positions = calc_rel_positions(acce_datas, ahrs_datas)
# print(acce_datas.shape)
# print(wifi_datas.shape)
# print(ibeacon_datas.shape)
# print(rel_positions.shape)

# Wifi column layout from read_data_file_ed:
# 0 = sys_ts, 1 = ssid, 2 = bssid, 3 = ssid_bssid, -1 = lastseen_ts.
print("wps: ", wps.shape)
print("wps ts: ", wps[:, 0].astype(int))
print("wifi: ", wifi_datas.shape)
print("wifi unique ts len: ", len(set(wifi_datas[:, 0])))
print("wifi unique last_seen ts len: ", len(set(wifi_datas[:, -1])))
print("wifi unique ssid: ", len(set(wifi_datas[:, 1])))
# print("wifi unique ssid: ", set(wifi_datas[:, 1]))
print("wifi unique bssid: ", len(set(wifi_datas[:, 2])))
print("wifi unique ssid x bssid: ", len(set(wifi_datas[:, 3])))

# Unique integer timestamps used for the wifi-vs-waypoint comparison below;
# going through set() means the resulting lists are not sorted.
wifi_ts = list(set(wifi_datas[:, 0].astype(int)))
wifi_last_seen_ts = list(set(wifi_datas[:, -1].astype(int)))
wps_ts = wps[:, 0].astype(int)

# wifi_ts_diff = []
# for t in wifi_ts:
#     for w_t in wps_ts:
#         wifi_ts_diff.append(t - w_t)

# wifi_last_seen_ts_diff = []
# for t in wifi_last_seen_ts:
#     for w_t in wps_ts:
#         wifi_last_seen_ts_diff.append(t - w_t)

# plot the distribution differences between wifi ts and wifi last_seen ts
# print(len(wifi_ts_diff))
# print(len(wifi_last_seen_ts_diff))
# sns.distplot(wifi_ts_diff, hist=False, color="red")
# sns.distplot(wifi_last_seen_ts_diff, hist=False)

# get the wifi_ts
# For every wifi scan timestamp, find the waypoint whose timestamp is closest.
# The original recomputed argmin on every inner iteration and threw the looked-up
# waypoint away; it is now computed once per wifi timestamp and kept.
nearest_wp_by_wifi_ts = {}
for t in wifi_ts:
    wifi_ts_diff = [abs(t - w_t) for w_t in wps_ts]
    if not wifi_ts_diff:
        # No waypoints in this trace: nothing to match against.
        continue
    diff_index = np.argmin(wifi_ts_diff)
    nearest_wp_by_wifi_ts[t] = wps[diff_index]
# Shows the distances of the last wifi timestamp only, as before.
print(wifi_ts_diff)

wps:  (11, 3)
wps ts:  [1573720359495 1573720361965 1573720367931 1573720370651 1573720378362
 1573720382583 1573720393210 1573720398101 1573720407796 1573720413731
 1573720420907]
wifi:  (6154, 6)
wifi unique ts len:  31
wifi unique last_seen ts len:  2835
wifi unique ssid:  83
wifi unique bssid:  256
wifi unique ssid x bssid:  256
[3968, 1188, 3372, 1784, 2607, 2549, 1392, 3764, 4634, 522, 700, 4456, 1981, 3175]


In [None]:
# Methods for preprocessing train data: Timestamp handling
def find_diff_ts(ts, data):
    """Return the signed offset ``int(data[0]) - int(ts)`` of a record from ``ts``.

    Args:
        ts: Reference timestamp (anything ``int()`` accepts).
        data: Record whose first element is its timestamp.
    """
    return int(data[0]) - int(ts)

def find_start_ts(path):
    """Return the text following ``startTime`` (plus one separator character) in a trace file.

    The file is scanned line by line and the first matching line wins. The
    original dereferenced the match object before checking it, so any line
    without ``startTime`` ahead of the match raised ``AttributeError``.

    Args:
        path: Path of the trace text file (read as UTF-8).

    Returns:
        The captured value as a string, or ``None`` when no line matches.
    """
    with open(path, 'r', encoding='utf-8') as file:
        for line_data in file:
            m = re.search(r"(?<=startTime.)(.*)", line_data.strip())
            if m:
                return m.group(1)
    return None

def find_smallest_diff(t, data):
    """Return the row of ``data`` whose first-column timestamp is closest to ``t``.

    Args:
        t: Reference timestamp (anything ``int()`` accepts).
        data: 2-D array with the timestamp in column 0; numeric or digit-string
            columns both work.

    Returns:
        The closest row, or an empty array when ``data`` is empty. On ties the
        first such row is returned (``np.argmin`` semantics).
    """
    if data.size == 0:
        return np.array([])
    # Vectorised replacement for the per-row int() loop: astype(np.int64)
    # truncates floats and parses digit strings the same way, without calling
    # int() on 1-element arrays (deprecated in recent NumPy releases).
    record_ts = data[:, 0].astype(np.int64)
    closest_index = np.argmin(np.abs(record_ts - int(t)))
    return data[closest_index]

In [None]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    """Flatten one x/y/z sensor row into ``[ts, ts - start_ts, x, y, z, accuracy]``.

    Args:
        data: One sensor row (timestamp first), or an empty array.
        start_ts: Trace start timestamp (anything ``int()`` accepts).

    Returns:
        A 6-element list. All entries are NaN for empty input; ``accuracy`` is
        NaN when the row has no fifth element.
    """
    if data.size == 0:
        # print("no axis data")
        return [np.nan] * 6

    record_ts = data[0]
    offset_from_start = int(record_ts) - int(start_ts)
    x_val, y_val, z_val = data[1], data[2], data[3]
    accuracy = data[4] if len(data) > 4 else np.nan
    return [record_ts, offset_from_start, x_val, y_val, z_val, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    """Flatten one wifi row into ``[ts, ts - start_ts, ssid, bssid, rssi, freq, last_seen_ts]``.

    Args:
        data: One wifi row (timestamp first), or an empty array.
        start_ts: Trace start timestamp (anything ``int()`` accepts).

    Returns:
        A 7-element list, all NaN for empty input. Rows with more than five
        elements supply ``freq`` from index 4 and ``last_seen_ts`` from index 5;
        shorter rows get ``freq = NaN`` and take ``last_seen_ts`` from the end.
    """
    if data.size == 0:
        # print("no wifi data")
        return [np.nan] * 7

    record_ts = data[0]
    offset_from_start = int(record_ts) - int(start_ts)
    ssid, bssid, rssi = data[1], data[2], data[3]
    has_freq = len(data) > 5
    freq = data[4] if has_freq else np.nan
    last_seen_ts = data[5] if has_freq else data[-1]
    return [record_ts, offset_from_start, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    """Flatten one iBeacon row into ``[ts, ts - start_ts, ssid, rssi]``.

    Args:
        data: One beacon row with the timestamp at index 0 and the values
            reported as ssid and rssi at indices 1 and 2, or an empty array.
        start_ts: Trace start timestamp (anything ``int()`` accepts).

    Returns:
        A 4-element list, all NaN for empty input.
    """
    if data.size == 0:
        # print("no beacon data")
        return [np.nan] * 4

    record_ts = data[0]
    offset_from_start = int(record_ts) - int(start_ts)
    return [record_ts, offset_from_start, data[1], data[2]]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    """Flatten one relative-position row into ``[ts, ts - start_ts, x, y]``.

    Args:
        data: One row with the timestamp at index 0 and x, y at indices 1 and 2,
            or an empty array.
        start_ts: Trace start timestamp (anything ``int()`` accepts).

    Returns:
        A 4-element list, all NaN for empty input.
    """
    if data.size == 0:
        # print("no rel_pos data")
        return [np.nan] * 4

    record_ts = data[0]
    offset_from_start = int(record_ts) - int(start_ts)
    return [record_ts, offset_from_start, data[1], data[2]]

In [None]:
# Extract path and other data
def extract_path(path, floor_map):
    """Derive ``[site_id, file_id, floor_number, floor]`` from a trace path.

    The path is expected to end in ``<site_id>/<floor>/<file_id>.<ext>``. The
    components are taken from the end of the path, so the result no longer
    depends on how many directories precede them.

    Args:
        path: Trace file path using ``/`` as separator.
        floor_map: Mapping from floor label (e.g. ``"F4"``) to a floor number.

    Returns:
        The 4-element list, or ``None`` (after printing a message) when the
        path is too short or the floor label is missing from ``floor_map``.
    """
    try:
        parts = f"{path}".split("/")
        site_id, floor, file_name = parts[-3], parts[-2], parts[-1]
        floor_number = floor_map[floor]
        file_id = file_name.split(".")[0]
        return [site_id, file_id, floor_number, floor]
    except Exception as exc:
        # Best-effort as before, but no longer swallows KeyboardInterrupt and
        # now reports which path failed and why.
        print("extract_path error", path, repr(exc))
        return None

# Definitely needs to be refactored
def extract_data(path):
    """Build one feature row per accelerometer timestamp that lies close to a waypoint.

    For every unique accelerometer timestamp the nearest waypoint is looked up.
    If it is closer than the module-level ``time_stamp_cut``, the nearest
    record of every sensor stream is flattened into one row.

    Args:
        path: Path of the trace text file.

    Returns:
        List of rows. Each row starts with ``[ts, start_ts, diff_start_ts, x, y,
        closest_wp_ts, diff_start_wp_ts, diff_ts_wp_ts, within_100ms,
        within_200ms]``, followed by the flattened acce, ahrs, magn (plus
        magnetic strength), gyro, the three uncalibrated streams, wifi, beacon
        and relative-position values. The list is empty when the trace has no
        accelerometer data or no waypoints.
    """
    start_ts = find_start_ts(path)
    path_datas = read_data_file(path)
    acce = path_datas.acce
    ahrs = path_datas.ahrs
    magn = path_datas.magn
    gyro = path_datas.gyro
    acce_uncali = path_datas.acce_uncali
    magn_uncali = path_datas.magn_uncali
    gyro_uncali = path_datas.gyro_uncali
    wifi = path_datas.wifi
    wps = path_datas.waypoint
    ibeacon = path_datas.ibeacon
    rel_positions = calc_rel_positions(acce, ahrs)

    # Event timestamps come from the uncalibrated accelerometer when present
    # (the calibrated stream sometimes has less data), else the calibrated one.
    # `.size` tests emptiness; the previous `.any()` tested for non-zero values.
    if acce_uncali.size:
        ts = np.unique(acce_uncali[:, [0]])
    elif acce.size:
        ts = np.unique(acce[:, [0]])
    else:
        # `ts` used to stay unbound here, so the loop below raised NameError.
        print("no acce or acce_uncali")
        return []

    if wps.size == 0:
        # Without waypoints every timestamp used to fail inside the loop and be
        # silently dropped; return the same empty result without the work.
        return []

    res = []
    for t in ts:
        try:
            wp_closest = find_smallest_diff(t, wps)
            closest_wp_ts = wp_closest[0]
            diff_ts_wp_ts = abs(int(t) - int(closest_wp_ts))
            # Only records closer than time_stamp_cut (ms) to a waypoint are kept.
            if diff_ts_wp_ts >= time_stamp_cut:
                continue

            # Flags indicating how close the data point is to the waypoint.
            within_100ms = diff_ts_wp_ts <= 100
            within_200ms = diff_ts_wp_ts <= 200
            x = wp_closest[1]
            y = wp_closest[2]
            diff_start_ts = int(t) - int(start_ts)
            diff_start_wp_ts = int(closest_wp_ts) - int(start_ts)

            acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
            ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
            magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
            # Magnetic strength is appended for the calibrated magn data only.
            magn_closest.append(extract_one_magn_strength(magn_closest))
            gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)

            acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
            magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
            gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)

            wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
            if len(ibeacon) > 0:
                beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
            else:
                beacon_closest = [np.nan, np.nan, np.nan, np.nan]
            rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)

            res.append(
                [int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts),
                 diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms]
                + acce_closest + ahrs_closest + magn_closest + gyro_closest
                + acce_u_closest + magn_u_closest + gyro_u_closest
                + wifi_closest + beacon_closest + rel_pos
            )
        except Exception:
            # Deliberate best-effort: a timestamp whose records cannot be
            # parsed is dropped instead of failing the whole trace.
            continue
    return res

In [None]:
# %%timeit

# 5.55 ms ± 1.76 ms per loop
# Pick another random training trace to exercise the row-building functions below.
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# for fixing floor expression: maps the floor directory name to an integer floor number.
# NOTE(review): the numbering is not uniform across naming schemes - "F1"/"1F" map to 0
# while "L1" maps to 1, and "B"/"BF"/"BM" map to 0/1/2 although "B1" is -1. Confirm intended.
floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

def one_trace_to_rows(path, floor_map):
    """Turn one trace file into feature rows prefixed with its path information.

    Args:
        path: Trace file path.
        floor_map: Mapping from floor label to floor number (see ``extract_path``).

    Returns:
        List of rows, each being ``extract_path(...) + <row from extract_data>``,
        or ``None`` (after printing the path and the cause) when anything fails.
        Callers are expected to filter out ``None`` results.
    """
    try:
        path_info = extract_path(path, floor_map)
        if path_info is None:
            # Skip the expensive parse when site/floor cannot be derived.
            raise ValueError("could not derive site/floor/file from path")
        data = extract_data(path)
        return [path_info + d for d in data]
    except Exception as exc:
        # `except Exception` keeps the best-effort behaviour but lets
        # KeyboardInterrupt through; the cause is reported with the path.
        print("one_trace_to_rows error at: ", path, repr(exc))
        return None

# path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# time w/ for loop with 1 train_path -> 11.6
# time w/ itertools.chain for 1 train_path -> 11.8
# Time the full row-building pipeline on the single example trace picked above.
start = time.time()
path_info = extract_path(path, floor_map)
print("path: ", path_info)
rows = one_trace_to_rows(path, floor_map)
print("time to process one train_path", time.time() - start)
#print("col count: ", len(rows[0]))
# `rows` is reused by the DataFrame check cell below; it is None if the trace failed.
print("rows: ", rows)

In [None]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # Data below are the time it took to create the old version of training data (only waypoints)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [None]:
# Check if we can make df

# column names
# Column order must match the rows built by one_trace_to_rows: the 4 values from
# extract_path first, then the values appended per timestamp in extract_data.
col_names = ["site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_100ms", "within_200ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

print(len(col_names))

# Sanity check on the single-trace `rows` from the timing cell above.
df = pd.DataFrame(rows, columns=col_names)
print("df len: ", len(df))
print("site_id nunique: ", df["site_id"].nunique())
print("file_id nunique: ", df["file_id"].nunique())
print("x value_counts: ", df["x"].value_counts())
print("y value_counts: ", df["y"].value_counts())
print("event ts nunique: ", df["ts"].nunique())
print("start ts nunique: ", df["start_ts"].nunique()) # should be one
print("diff_ts_wp_ts value_counts: ", df["diff_ts_wp_ts"].value_counts())
print("diff_ts_wp_ts nunique: ", df["diff_ts_wp_ts"].nunique())
print("within_100ms value_counts: ", df["within_100ms"].value_counts())
print("within_100ms nunique: ", df["within_100ms"].nunique())
print("within_100ms count: ", df["within_100ms"].count())
print("within_200ms value_counts: ", df["within_200ms"].value_counts())
print("within_200ms nunique: ", df["within_200ms"].nunique())
print("within_200ms count: ", df["within_200ms"].count())
display(df.head())

In [None]:
# # Set pool
# num_cores = multiprocessing.cpu_count()
# print(f"num_cores={num_cores}")
# # args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list]
# pool = Pool(num_cores)

# start = time.time()
# # w/ 250ms settings, 3 random samples from each site_id
# # 2 paths -> 18.7 sec
# # 10 paths -> 315 sec (df len is 1994)
# # 100 paths -> 708 sec (df len is 7183)
# # all ~ 600 paths -> 

# # errors
# # grouped_paths_list -> 100 paths -> site_id: 8 errors, 27 correct
# # grouped_paths_list -> 100 paths -> file_id: 23 errors, 77 correct

# # all in one go -> xxx sec
# # array_split -> 5891.8 sec

# # all in one go
# # res = pool.starmap(one_trace_to_rows, args)

# # split the args
# res = []
# for arg in tqdm(np.array_split(args, 50)):
#     res.extend(pool.starmap(one_trace_to_rows, arg))

In [None]:
# train_path filtering
def extract_path_for_grouplist(path):
    """Return ``[path, site_id, file_id]`` for a trace path.

    The path is expected to end in ``<site_id>/<floor>/<file_id>.<ext>``. The
    components are taken from the end of the path, so the result does not
    depend on how many directories precede them.

    Args:
        path: Trace file path using ``/`` as separator.

    Raises:
        IndexError: If the path has fewer than three components.
    """
    parts = f"{path}".split("/")
    site_id = parts[-3]
    file_id = parts[-1].split(".")[0]
    return [path, site_id, file_id]

# create pathlist to be used by 2 types of paths list
# One row per training trace: its path plus the site and file ids parsed from it.
path_list = [extract_path_for_grouplist(item) for item in train_paths]
df_paths = pd.DataFrame(path_list, columns=["path", "site_id", "file_id"])
site_id_path_list = df_paths["site_id"].unique()

# grouped_paths_list -> 3 randomly sampled traces from every site_id
# NOTE(review): the sample is unseeded, so it differs between runs, and pandas
# refuses to sample 3 rows without replacement from a site with fewer traces.
grouped_paths_df = df_paths.groupby("site_id").sample(n=3)
grouped_paths_list = list(grouped_paths_df["path"].unique())
print("grouped_paths_list len: ", len(grouped_paths_list))
print("grouped_paths_list examples: ", grouped_paths_list[:5])

# Filter train_paths down to the sites that appear in the sample submission.
# The file is read relative to root_path, like every other input in this notebook,
# instead of through a hard-coded absolute Kaggle path.
sub_df = pd.read_csv(root_path + "sample_submission.csv")
# site_path_timestamp is "<site>_<file>_<timestamp>"; split it in one vectorised call.
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].str.split("_", expand=True)
sub_df = sub_df.drop(columns=["floor", "x", "y"])
sub_df_site_list = sub_df["site"].unique()
df_24_sites = df_paths[df_paths["site_id"].isin(sub_df_site_list)]
df_24_site_list = df_24_sites["site_id"].unique()
sub_site_paths_list = list(df_24_sites["path"].unique())
print(set(sub_df_site_list) == set(df_24_site_list)) # Check if df_24_site has only the sites from sub_df
print(len(df_24_sites))
print(len(sub_site_paths_list))
print(len(df_24_site_list))

In [None]:
# Set pool
# Process the selected traces in parallel, one worker process per CPU core.
num_cores = multiprocessing.cpu_count()
print(f"num_cores={num_cores}")
pool = Pool(num_cores)
start = time.time()

# make new train_path_list
# args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list[:100]]
args = [(p, floor_map) for p in sub_site_paths_list[:100]]

# res holds one entry per trace (a list of rows, or None on failure);
# res_dict keeps the same results grouped by batch index.
res_dict = {}
res = []
try:
    for i, e in enumerate(tqdm(np.array_split(args, 10))):
        rows = pool.starmap(one_trace_to_rows, e)
        res_dict[i] = rows
        res.extend(rows)
finally:
    # Always release the worker processes, even when a batch raises.
    pool.close()
    pool.join()
print("time to process", time.time() - start)

In [None]:
# Persist the raw per-trace results so the slow pool run does not have to be repeated.
res_name = "indoor_train_res_4.pkl"

with open(res_name, "wb") as file:
    pickle.dump(res, file)

In [None]:
# Reload the cached results written by the previous cell.
# pickle.load executes code embedded in the file, so only load files created by this notebook.
with open(res_name, "rb") as file:
    res = pickle.load(file)

In [None]:
############################## KEEP THIS CELL FOR LATER REF ##############################

# Error in ~20% of the train paths -> caused by not having acce_uncali to create the event timestamps

# error files
# /5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bbc852a70008c01607.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bda50dc300099d34cc.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085df529994a0008202661.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt
# /5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt
# /5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt
# /5cd56b5ae2acfd2d33b5854a/B1/5d073ba64a19c000086c559b.txt
# /5cd56b5ae2acfd2d33b5854a/F1/5d07603e4cae4f000a2db525.txt
# /5cd56b63e2acfd2d33b591c2/F2/5d0b0668912a980009fe91f2.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt
# /5cd56b64e2acfd2d33b592b3/F2/5d0c9321c99c56000836df18.txt
# /5cd56b64e2acfd2d33b592b3/F3/5d0c9952ea565d0008e34e8b.txt
# /5cd56b64e2acfd2d33b592b3/F4/5d0c9d65ea565d0008e34ea2.txt
# /5cd56b5ae2acfd2d33b58549/5F/5d0613514a19c000086c432a.txt
# /5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b015e2f8a26000805ba44.txt
# /5cd56b79e2acfd2d33b5b74e/F1/5d0af3452f8a26000805b830.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt
# /5cd56b6be2acfd2d33b59d1f/F3/5d0896415125450008037c76.txt

# base_path = "../input/indoor-location-navigation/train"
# error_files = [
#     "/5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt",
#     "/5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt",
#     "/5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt",
#     "/5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt",
#     "/5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt",
#     "/5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt"
# ]

# working_path = "../input/indoor-location-navigation/train/5d2709c303f801723c3299ee/1F/5dad7d6daa1d300006faa80c.txt"
# error_paths = [base_path + e for e in error_files]
# rows = one_trace_to_rows(error_paths[1], floor_map)
# print(rows)

In [None]:
# pd for loop version -> time to process 3.84

start = time.time()

# Traces that failed in one_trace_to_rows come back as None; report their positions.
print(len(res))
none_list = [i for i, v in enumerate(res) if v is None]
print(none_list)
# Rebuild the list in place instead of popping by index: every pop shifted the
# remaining indices, so later pops removed the wrong entries or ran past the end.
res[:] = [v for v in res if v is not None]
print(len(res))

# https://stackoverflow.com/questions/45910827/how-can-i-convert-a-3d-list-into-a-2d-list-in-python
# Flatten the per-trace lists of rows into one list of rows, then build the DataFrame.
res_unpack = [e for row in tqdm(res) for e in row]
df_train = pd.DataFrame(res_unpack, columns=col_names)

print("time to process", time.time() - start)
print("length of df made", len(df_train))
display(df_train.head())

In [None]:
# start = time.time()

# df_train = pd.DataFrame(res[0], columns=col_names)
# for r in res[1:]:
#     df = pd.DataFrame(r, columns=col_names)
#     df_train = df_train.append(df, ignore_index=True)

# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))

In [None]:
# def list_to_df(row_list):
#     df_train = pd.DataFrame(row_list[0], columns=col_names)
#     for r in row_list[1:]:
#         df = pd.DataFrame(r, columns=col_names)
#         df_train = df_train.append(df)
#     return df_train

# start = time.time()
# pool = Pool(num_cores)

# df_train = pool.map(list_to_df, tqdm(res))

# # print("train_path count", len(train_paths[:train_num]))
# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))
# pool.close()

In [None]:
# Quick sanity summary of the assembled training frame: row count, id cardinalities
# and the distribution of waypoint coordinates and timestamp offsets.
print("df len: ", len(df_train), "\n")
print("site_id nunique: ", df_train["site_id"].nunique(), "\n")
print("site_id value_counts: ", df_train["site_id"].value_counts(), "\n")
print("file_id nunique: ", df_train["file_id"].nunique(), "\n")
print("file_id value_counts: ", df_train["file_id"].value_counts(), "\n")
print("floor value_counts: ", df_train["floor"].value_counts(), "\n")
print("x value_counts: ", df_train["x"].value_counts(), "\n")
print("y value_counts: ", df_train["y"].value_counts(), "\n")
print("event ts nunique: ", df_train["ts"].nunique(), "\n")
print("start ts nunique: ", df_train["start_ts"].nunique(), "\n") # should be one
print("diff_ts_wp_ts value_counts: ", df_train["diff_ts_wp_ts"].value_counts(), "\n")
print("diff_ts_wp_ts nunique: ", df_train["diff_ts_wp_ts"].nunique(), "\n")
display(df_train.head())

In [None]:
# Visualizing timestamp distribution

# Explore
# print(df_train["ts"].dtype)
# print(df_test["ts"].dtype)

# Label-encode categorical columns into companion "<col>_le" columns
def col_encode(df, cols):
    """Label-encode each column named in ``cols``, writing to ``df`` in place.

    A fresh ``LabelEncoder`` is fitted per column and its integer codes are
    stored in a new column called ``"<col>_le"``; the source column is kept.

    The fitted encoders are returned so the same mapping can later be applied
    to another frame or inverted back to the original labels. Callers that
    ignore the return value see the same in-place effect as before.

    Args:
        df: Frame to modify in place; must contain every column in ``cols``.
        cols: Iterable of column names (a tqdm-wrapped list works too).

    Returns:
        dict mapping each column name to its fitted ``LabelEncoder``.
    """
    encoders = {}
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le" % col] = le.fit_transform(df[col])
        # Keep the fitted encoder: without it the code -> label mapping is lost.
        encoders[col] = le
    return encoders

# Columns handed to col_encode for label encoding.
col_enc = [
    "site_id",
    "file_id",
    "floor",
    "wifi_ssid",
    "wifi_bssid",
    "beacon_ssid",
]
# The list is wrapped in tqdm so a progress bar advances once per column.
col_encode(df_train, tqdm(col_enc))

# Cast a set of columns to one common dtype
def convert_dtypes(df, col_list, dtype):
    """Cast every column named in ``col_list`` to ``dtype``, in place.

    Args:
        df: Frame whose columns are overwritten with their cast versions.
        col_list: Iterable of column names; consumed once, in order.
        dtype: Target dtype passed straight to ``Series.astype``.

    Returns:
        None. ``df`` is modified directly.
    """
    for column_name in col_list:
        casted = df[column_name].astype(dtype)
        df[column_name] = casted

# Cast the columns listed below to float on df_train. The list is wrapped in
# tqdm so progress is shown per column. Entries are grouped one source per
# line; the order matches the order in which the columns are converted.
convert_dtypes(
    df_train,
    tqdm([
        "floor_converted",
        "ts", "start_ts", "diff_start_ts",
        "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",
        "acce_ts", "diff_acce_ts",
        "ahrs_ts", "diff_ahrs_ts",
        "magn_ts", "diff_magn_ts",
        "gyro_ts", "diff_gyro_ts",
        "acce_u_ts", "diff_acce_u_ts",
        "magn_u_ts", "diff_magn_u_ts",
        "gyro_u_ts", "diff_gyro_u_ts",
        "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts",
        "beacon_ts", "diff_beacon_ts", "beacon_rssi",
        "rel_ts", "diff_rel_ts",
    ]),
    float,
)

# Convert ts and wifi_last_seen_ts (epoch milliseconds) to datetimes, and add
# copies floored to the day, hour and minute.
# All three truncations go through `.dt.floor` so the derived columns are built
# the same way and share one dtype. Casting `.values` to "<M8[m]" bypasses
# pandas, which then re-coerces the minute-resolution array to a unit it
# supports, possibly different from the sibling columns.
for df in [df_train]:
    for col in tqdm(["ts", "wifi_last_seen_ts"]):
        df["%s_date" % col] = pd.to_datetime(df[col], unit="ms")
        df["%s_day" % col] = df["%s_date" % col].dt.floor("D")
        df["%s_hour" % col] = df["%s_date" % col].dt.floor("h")
        df["%s_minute" % col] = df["%s_date" % col].dt.floor("min")

# Check
display(df_train.head())

In [None]:
# Calculate moving averages
# Differencing with respect to time (as the timesteps are unevenly spaced)

In [None]:
# Save the processed training frame as a pickle file.
# References:
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Output file name, relative to the working directory.
train_file_name = "indoor_train_4.pkl"

# The context manager closes the handle even if pickling raises.
with Path(train_file_name).open("wb") as file:
    pickle.dump(df_train, file)

# Alternative CSV export, kept disabled:
# df_train.to_csv('df_train_2.csv',index=False)
# df_test.to_csv('df_test.csv',index=False)

In [None]:
# Load the pickled training frame back in from `train_file_name`.
# NOTE: pickle.load can execute arbitrary code, so only point this at a file
# you wrote yourself, never at an untrusted download.
with Path(train_file_name).open("rb") as file:
    df_train = pickle.load(file)

In [None]:
# Summarise the current df_train: row count, number of distinct files and
# sites, rows per site, then a preview.
print("df len: ", len(df_train), "\n")

# Distinct-value counts for the two identifier columns, in print order.
for id_label, id_column in (
    ("file_id unique: ", "file_id"),
    ("site_id unique: ", "site_id"),
):
    print(id_label, df_train[id_column].nunique(), "\n")

print("site_id value_counts: ", df_train["site_id"].value_counts())
display(df_train.head())

In [None]:
# # Get submission file
# sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
# sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
# sub_df = sub_df.drop(columns=["floor", "x", "y"])
# # grouped_df = sub_df.groupby("file").sample(n=2)
# # all_file_id = grouped_df["file"].unique()
# # print(len(grouped_df))
# # print(len(all_file_id))
# # display(grouped_df.head())
# display(sub_df.head())

# test_site_id = sub_df["site"].unique()
# train_site_id = df_train["site_id"].unique()
# print(test_site_id, "\n")
# print(train_site_id, "\n")
# a = list(set(test_site_id) & set(train_site_id))
# print(a)