In [2]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool

pd.set_option("display.max_columns", 100)

In [3]:
# Check out each file. Content, images

root_path = "../input/indoor-location-navigation/"
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204


In [4]:
# # Preprocess

# # Reading in 1 file
# def pick_example(max_range, paths):
#     ex = random.randint(0, max_range)
#     example_path = paths[ex]
#     path = f"{example_path}"
#     paths = path.split("/")
#     site = paths[4]
#     floorNo = paths[5]
#     floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
#     json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
#     with open(json_plan_filename) as json_file:
#         json_data = json.load(json_file)
#     width_meter = json_data["map_info"]["width"]
#     height_meter = json_data["map_info"]["height"]
#     return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# print("example path: ", path)
# print("site: ", site)
# print("floorNo: ", floorNo)
# print("floor_plan_filename: ", floor_plan_filename)
# print("json_plan_filename: ", json_plan_filename)
# print("width: {}, height: {} ".format(width_meter, height_meter))

# with open(path) as p:
#     lines = p.readlines()
# print("No. Lines in 1 example: {:,}". format(len(lines)))

In [15]:
# Get submission file
sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
sub_df = sub_df.drop(columns=["floor", "x", "y"])
display(sub_df.head())

Unnamed: 0,site_path_timestamp,site,file,timestamp
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,18763
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,22328


In [29]:
# 200 train paths come out with ~1000 examples, so multiply train examples by 5 to extract similar no. of examples
# test_num = train_num * 5
test_num = 30
# test_num = len(sub_df) - 1
# test_num = round(len(sub_df) / 2)

# set timestamp cut
time_stamp_cut = 2000

print(test_num)

30


In [7]:
# for line in lines[:200]:
#     print(line)

In [8]:
# using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./

In [9]:
# Import custom function from the repository
from io_f import read_data_file

# More viz on accelerometers, wifi etc in one go
from visualize_f import visualize_trajectory, visualize_heatmap
from main import extract_wifi_rssi, extract_wifi_count
from main import calibrate_magnetic_wifi_ibeacon_to_position
from main import extract_magnetic_strength
from main import extract_ibeacon_rssi

In [None]:
# # Read in 1 random example
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# sample_file = read_data_file(path)

# You can access the information for each variable:
# Each data is split for time
# Metadata is expressed with "#"

# for i in sample_file.acce[:, [0]]:
#     print(i)
#     print(int(i))

# print("~~~ Example ~~~")
# print("acce: {}".format(sample_file.acce), "\n" +
#       "acce shape: {}".format(sample_file.acce.shape), "\n" +
#       "acacce_uncalice: {}".format(sample_file.acce_uncali), "\n" +
#       "acacce_uncalice shape: {}".format(sample_file.acce_uncali.shape), "\n" +
#       "ahrs: {}".format(sample_file.ahrs), "\n" +
#       "ahrs shape: {}".format(sample_file.ahrs.shape), "\n" +
#       "gyro: {}".format(sample_file.gyro), "\n" +
#       "gyro shape: {}".format(sample_file.gyro.shape), "\n" +
#       "gyro_uncali: {}".format(sample_file.gyro_uncali), "\n" +
#       "gyro_uncali shape: {}".format(sample_file.gyro_uncali.shape), "\n" +
#       "ibeacon: {}".format(sample_file.ibeacon), "\n" +
#       "ibeacon shape: {}".format(sample_file.ibeacon.shape), "\n" +
#       "magn: {}".format(sample_file.magn), "\n" +
#       "magn shape: {}".format(sample_file.magn.shape), "\n" +
#       "magn_uncali: {}".format(sample_file.magn_uncali), "\n" +
#       "magn_uncali shape: {}".format(sample_file.magn_uncali.shape), "\n" +
#       "waypoint: {}".format(sample_file.waypoint), "\n" +
#       "waypoint shape: {}".format(sample_file.waypoint.shape), "\n" +
#       "wifi: {}".format(sample_file.wifi), "\n" +
#       "wifi shape: {}".format(sample_file.wifi.shape))

In [None]:
# def show_site_png(root_path, site):
#     floor_paths = glob.glob(root_path + "metadata/" + site + "/*/floor_image.png")
#     n = len(floor_paths)
#     print("No. of floor paths: ", n)

#     # Create the custom number of rows & columns
#     ncols = [ceil(n / 3) if n > 4 else 4][0]
#     nrows = [ceil(n / ncols) if n > 4 else 1][0]

#     plt.figure(figsize=(16, 10))
#     plt.suptitle(f"Site no. '{site}'", fontsize=18)

#     # Plot image for each floor
#     for k, floor in enumerate(floor_paths):
#         # plt.subplot(nrows, ncols, k+1)
#         plt.subplot(ncols, nrows, k+1)
#         plt.rcParams["figure.facecolor"] = "white"

#         image = Image.open(floor)

#         plt.imshow(image)
#         plt.axis("off")
#         title = floor.split("/")[5]
#         plt.title(title, fontsize=15)

In [None]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [None]:
# # Checking the floor number distribution

# all_floors = glob.glob("../input/indoor-location-navigation/metadata/*/*")
# all_sites = glob.glob("../input/indoor-location-navigation/metadata/*")
# floor_no = []
# floor_counts = []

# # Floor count
# for site in all_sites:
#     floor_count = len([name for name in os.listdir(site)])
#     floor_counts.append(floor_count)

# floor_counts_df = pd.DataFrame(floor_counts, columns=["F_Count"])
# floor_counts_df = floor_counts_df["F_Count"].value_counts().reset_index()
# floor_counts_df = floor_counts_df.sort_values("index", ascending=True)

# # Extract only the floor number
# for floor in all_floors:
#     no = floor.split("/")[5]
#     floor_no.append(no)
    
# floor_no = pd.DataFrame(floor_no, columns=["No"])
# floor_no = floor_no["No"].value_counts().reset_index()
# floor_no = floor_no.sort_values("No", ascending=False)

# # ToDo: Floor expressions need to be fixed
# # 1F -> F1, L1 -> F1, G -> F1 etc

# # Plot
# # display(floor_counts_df.head(10))

# fig, axes = plt.subplots(ncols=2, figsize=(16, 10))
# axes[0] = sns.barplot(data=floor_counts_df, x="index", y="F_Count", palette="viridis", saturation=0.4, ax=axes[0])
# axes[0].set_title("Floor Count Distribution", size = 26, weight="bold")
# axes[0].set_xlabel("")
# axes[0].set_ylabel("Floor Count", size = 18, weight="bold")

# axes[1] = sns.barplot(data=floor_no, x="No", y="index", palette="viridis", saturation=0.4, ax=axes[1])
# axes[1].set_title("Frequency of Floors", size = 26, weight="bold")
# axes[1].set_xlabel("")
# axes[1].set_ylabel("Floor No.", size = 18, weight="bold")

# plt.xticks([])
# plt.yticks(fontsize=11)
# sns.despine(left=True, bottom=True);

In [None]:
# # Metadata checking (GeoJSON)
# # This is a vector representation of floor map
# geojson_paths = glob.glob("../input/indoor-location-navigation/metadata/*/*/geojson_map.json")
# print("No. of geojson file: {}".format(len(geojson_paths)))

# # Print one example
# ex = random.randint(0, len(geojson_paths))
# geojson_file_name = geojson_paths[ex]
# with open(geojson_file_name) as json_file:
#     paths = geojson_file_name.split("/")
#     site_id = paths[4]
#     floor = paths[5]
#     json_data = json.load(json_file)
#     json_properties = json_data["features"][0]["properties"]
#     print("File path: {}".format(geojson_file_name))
#     print("SiteID: {}".format(site_id))
#     print("Floor: {}".format(floor))
#     print("Floor info: {}".format(json_properties))

# # create id and floor number matching file
# site_ids = []
# floor_no = []
# floor_no_json = []

# for i in range(0, len(geojson_paths)):
#     with open(geojson_paths[i]) as f:
#         paths = geojson_paths[i].split("/")
#         site_id = paths[4]
#         floor = paths[5]
#         site_ids.append(site_id)
#         floor_no.append(floor)
#         d = json.load(f)
#         try:
#             floor_no_json.append(d["features"][0]["properties"]["floor_num"])
#         except:
#             floor_no_json.append(np.nan)

# floor_num_df = pd.DataFrame(
#     {"site_id": site_ids,
#      "floor_no": floor_no,
#      "floor_no_json": floor_no_json,
#     })

# display("floor_num_df length: {}".format(len(floor_num_df)))
# display(floor_num_df.head())

# # Get floormap dict to be used later
# floor_map_pairs = list(zip(floor_num_df["floor_no"], floor_num_df["floor_no_json"]))
# floor_map_pairs = np.unique(floor_map_pairs, axis=0) # get unique pair
# # print(floor_map_pairs) # to be used as floor_map later

# # Plot distribution
# floor_num_count_df = floor_num_df["floor_no_json"].value_counts().reset_index()
# floor_num_count_df = floor_num_count_df.sort_values("floor_no_json", ascending=False)
# # display(floor_num_count_df)
# # print(len(floor_num_count_df["floor_no_json"] == np.nan))

# fig = plt.figure()
# ax = plt.subplots(figsize=(16, 10))
# sns.barplot(data=floor_num_count_df, x="index", y="floor_no_json", palette="viridis", saturation=0.4)
# fig.show()

# # Just in case: Need for altitude info in geoJSON
# # from pyproj import Proj, transform
# # print(transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), -0.1285907, 51.50809))  # longitude first, latitude second.
# # output (meters east of 0, meters north of 0): (-14314.651244750548, 6711665.883938471)

In [None]:
# # Visualizing magnetic strength
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# # extract mag, wifi, beacon of one example
# mwi_datas = calibrate_magnetic_wifi_ibeacon_to_position([path])
# magnetic_strength = extract_magnetic_strength(mwi_datas)
# wifi_rssi = extract_wifi_rssi(mwi_datas)
# wifi_counts = extract_wifi_count(mwi_datas)
# ibeacon_rssi = extract_ibeacon_rssi(mwi_datas)
# ibeacon_ummids = list(ibeacon_rssi.keys())
# target_ibeacon = ibeacon_ummids[0]

# # positions for heatmap
# heat_positions = np.array(list(magnetic_strength.keys()))
# heat_values = np.array(list(magnetic_strength.values()))
# heat_positions_wifi = np.array(list(wifi_counts.keys()))
# heat_values_wifi = np.array(list(wifi_counts.values()))
# heat_positions_bc = np.array(list(ibeacon_rssi[target_ibeacon].keys()))
# heat_values_bc = np.array(list(ibeacon_rssi[target_ibeacon].values()))[:, 0]

# # filter out positions that no wifi detected
# mask = heat_values_wifi != 0
# heat_positions_wifi = heat_positions_wifi[mask]
# heat_values_wifi = heat_values_wifi[mask]

# # get trajectory
# example = read_data_file(path)
# trajectory = example.waypoint # Returns timestamp, x, y values
# print(f"Waypoints: {trajectory}")
# trajectory = trajectory[:, 1:3] # Removes timestamp (we only need the coordinates)

# # Plot trajectory
# visualize_trajectory(trajectory = trajectory,
#                      floor_plan_filename = floor_plan_filename,
#                      width_meter = width_meter,
#                      height_meter = height_meter,
#                      title = "Example of Waypoint",)

In [18]:
# Try working out step_positions for 1 trace file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce_datas)
    headings = compute_headings(ahrs_datas)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, headings)
    rel_positions = compute_rel_positions(stride_lengths, step_headings)
    # only use del if we don't need timestamps
    # rel_positions_del = np.delete(rel_positions, 0, 1)
    return rel_positions

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    d = np.array(magn_datas[2:5])
    return np.mean(np.sqrt(np.sum(d ** 2, axis=0)))

In [19]:
# Methods for preprocessing train data: Timestamp handling
def find_diff_ts(ts, data):
    data_ts = data[0]
    diff_ts = int(data_ts) - int(ts)
    return diff_ts

def find_start_ts(path):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line_data in lines:
        line_data = line_data.strip()
        m = re.search(r"(?<=startTime.)(.*)", line_data)
        start_ts = m.groups(0)
        if m:
            return (start_ts[0])

def find_smallest_diff(t, data):
    data_ts = data[:, [0]]
    diff = []
    for ts in data_ts:
        diff.append(abs(int(t) - int(ts)))
    closest_index = np.argmin(diff) # if multiple records have the same value..?
    return data[closest_index]

In [20]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    z_axis = data[3]
    try:
        accuracy = data[4]
    except IndexError:
        accuracy = np.nan
    return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    bssid = data[2]
    rssi = data[3]
    if len(data) > 5:
        freq = data[4]
        last_seen_ts = data[5]
    else:
        freq = np.nan
        last_seen_ts = data[-1]
    return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    rssi = data[2]
    return [data_ts, diff_ts, ssid, rssi]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    return [data_ts, diff_ts, x_axis, y_axis]

In [21]:
floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

In [None]:
# def extract_test_data(df):
#     test_rows = []
#     for index, row in df.iterrows():
#         test_site = row["site"]
#         file_name = row["file"]
#         test_ts = row["timestamp"]

#         test_path = root_path + "test/" + file_name + ".txt" # get test_path from file name
#         start_ts = find_start_ts(test_path)
#         diff_start_ts = int(test_ts) - int(start_ts)
#         path_datas = read_data_file(test_path)
#         acce = path_datas.acce
#         ahrs = path_datas.ahrs
#         magn = path_datas.magn
#         wifi = path_datas.wifi

#         # extract data for each timestamp of waypoints
#         res = []
#         acce_closest = split_axis(find_smallest_diff(test_ts, acce))
#         ahrs_closest = split_axis(find_smallest_diff(test_ts, ahrs))
#         magn_closest = split_axis(find_smallest_diff(test_ts, magn))
#         wifi_closest = split_wifi(find_smallest_diff(test_ts, wifi))
#         test_rows.append([test_site, file_name, np.nan, np.nan, test_ts, np.nan, np.nan, start_ts, diff_start_ts] + \
#                           acce_closest + ahrs_closest + magn_closest + wifi_closest + \
#                          [acce_closest[0], ahrs_closest[0], magn_closest[0], wifi_closest[0]])
#     return test_rows

In [39]:
# Generate test data
def extract_test_data(df):
    try:
        test_rows = []
        for index, row in df.iterrows():
            row_id = row["site_path_timestamp"]
            test_site = row["site"]
            file_name = row["file"]
            test_ts = row["timestamp"]
            test_path = root_path + "test/" + file_name + ".txt" # get test_path from file name

            # extract data for each timestamp of waypoints
            start_ts = find_start_ts(test_path)
            path_datas = read_data_file(test_path)
            acce = path_datas.acce
            ahrs = path_datas.ahrs
            magn = path_datas.magn
            gyro = path_datas.gyro
            acce_uncali = path_datas.acce_uncali
            magn_uncali = path_datas.magn_uncali
            gyro_uncali = path_datas.gyro_uncali
            wifi = path_datas.wifi
            wps = path_datas.waypoint
            ibeacon = path_datas.ibeacon
            rel_positions = calc_rel_positions(acce, ahrs)

            # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
            # ts = np.unique(wps[:, [0]])
            ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data

            # extract data for each timestamp of waypoints
            for i, t in enumerate(ts):
                # wp_closest = find_smallest_diff(t, wps)
                # closest_wp_ts = wp_closest[0]
                diff_ts_wp_ts = int(t) - int(test_ts)
                # time_stamp_cut = 2000, only the records within 2 sec of test timestamp
                if abs(diff_ts_wp_ts) < time_stamp_cut:
                    # flag to indicate how close the data point is to the wps
                    within_500ms = True if abs(diff_ts_wp_ts) <= 500 else False
                    within_1000ms = True if abs(diff_ts_wp_ts) <= 1000 else False
                    # x = wp_closest[1]
                    # y = wp_closest[2]
                    diff_start_ts = int(t) - int(start_ts)
                    diff_start_wp_ts = int(test_ts)
                    acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
                    ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
                    magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
                    magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
                    gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
                    acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
                    magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
                    gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
                    wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
                    beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
                    rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)

                    # train append example
                    # res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_500ms, within_1000ms] + \
                    #            acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                    #            acce_u_closest + magn_u_closest + gyro_u_closest + \
                    #            wifi_closest + beacon_closest + rel_pos
                    #           )
                    test_rows.append([row_id, test_site, file_name, np.nan, np.nan, \
                                      t, start_ts, diff_start_ts, np.nan, np.nan, test_ts, diff_start_wp_ts, diff_ts_wp_ts, within_500ms, within_1000ms] + \
                                      acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                                      acce_u_closest + magn_u_closest + gyro_u_closest + \
                                      wifi_closest + beacon_closest + rel_pos
                                    )
                else:
                    continue

        return test_rows
    except:
        print("extract_test_data error")

In [40]:
# can read_data_file method read test data
print(test_paths[0])
test_path = test_paths[0]
read_data_file(test_path)
find_start_ts(test_path)

../input/indoor-location-navigation/test/52ad8c760ff9978d0949deed.txt


'0000000000000'

In [41]:
# try generating test data
start = time.time()
test_rows = extract_test_data(sub_df.iloc[:5, :])
print(f"time to process: ", time.time() - start)

time to process:  46.772876501083374


In [42]:
test_df = pd.DataFrame(test_rows)
display(test_df.head())
print(len(test_rows[0]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,136.0,0,136,,,9,9,127,True,True,136.0,136,0.798813,4.30072,7.810059,,136.0,136,0.247101,0.104201,0.474897,,136.0,136,30.561829,-1.228333,-38.301086,,49.015379,136.0,136,-0.039139,-0.507996,-0.148392,,136.0,136,0.578552,4.353989,8.195526,,136.0,136,34.687805,6.938171,-377.32544,,136.0,136,-0.077835,-0.334671,-0.166565,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,110,110,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91,1144.0,1144,-0.425353,0.24869
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,156.0,0,156,,,9,9,147,True,True,156.0,156,0.026688,4.911835,7.244446,,156.0,156,0.247101,0.104201,0.474897,,156.0,156,29.173279,-1.922607,-36.950684,,47.118252,156.0,156,-0.084946,-0.4478,-0.20752,,156.0,156,0.524673,4.493454,7.43837,,156.0,156,33.299255,6.243896,-375.97504,,156.0,156,-0.049072,-0.504059,-0.15538,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,110,110,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91,1144.0,1144,-0.425353,0.24869
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,176.0,0,176,,,9,9,167,True,True,176.0,176,-0.41864,5.354172,7.295914,,176.0,176,0.254368,0.11151,0.452041,,176.0,176,27.786255,-0.535583,-37.625122,,46.776197,176.0,176,-0.110519,-0.327423,-0.163315,,176.0,176,-0.269592,5.217697,7.327042,,176.0,176,31.912231,7.63092,-376.64948,,176.0,176,-0.094879,-0.443863,-0.214508,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,196.0,0,196,,,9,9,187,True,True,196.0,196,-0.540146,5.456512,7.113358,,196.0,196,0.254368,0.11151,0.452041,,196.0,196,27.786255,-0.535583,-36.950684,,46.235439,196.0,196,-0.267639,-0.278961,-0.078094,,196.0,196,-0.55571,5.455322,7.103775,,196.0,196,31.912231,7.63092,-375.97504,,196.0,196,-0.120453,-0.323486,-0.170303,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,216.0,0,216,,,9,9,207,True,True,216.0,216,-0.078064,5.03154,7.386291,,216.0,216,0.256831,0.115168,0.441099,,216.0,216,27.786255,-0.535583,-38.975525,,47.869138,216.0,216,-0.461517,-0.306656,-0.005112,,216.0,216,-0.302521,5.271561,7.274963,,216.0,216,31.912231,7.63092,-377.99988,,216.0,216,-0.277573,-0.275024,-0.085083,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869


73


In [43]:
# Pool for test data

def apply_pool_to_df(df, f, pool, num_cores):
    result = pool.map(f, [d for d in np.array_split(df, num_cores)])
    pool.close()
    return result

num_cores = multiprocessing.cpu_count()
pool = Pool(num_cores)

start = time.time()
res = apply_pool_to_df(sub_df.iloc[:test_num, :], extract_test_data, pool, num_cores)
print(f"time to process {test_num} examples of sub_df", time.time() - start)

time to process 30 examples of sub_df 144.77688002586365


In [44]:
col_names = ["site_path_timestamp", "site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_500ms", "within_1000ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

df_test = pd.DataFrame(res[0], columns=col_names)
for r in res[1:]:
    df = pd.DataFrame(r, columns=col_names)
    df_test = df_test.append(df)
df_test = df_test.set_index("site_path_timestamp")

# process 1000 records -> 173.9 sec -> all test records are ~10,000 -> 1740 sec (~29min)
print("test_path count", len(test_paths[:test_num]))
print("length of df made", len(df_test))
display(df_test.head(10))

test_path count 30
time to process 145.02227973937988
length of df made 5634


Unnamed: 0_level_0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_500ms,within_1000ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
site_path_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,136.0,0,136,,,9,9,127,True,True,136.0,136,0.798813,4.30072,7.810059,,136.0,136,0.247101,0.104201,0.474897,,136.0,136,30.561829,-1.228333,-38.301086,,49.015379,136.0,136,-0.039139,-0.507996,-0.148392,,136.0,136,0.578552,4.353989,8.195526,,136.0,136,34.687805,6.938171,-377.32544,,136.0,136,-0.077835,-0.334671,-0.166565,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,110,110,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,156.0,0,156,,,9,9,147,True,True,156.0,156,0.026688,4.911835,7.244446,,156.0,156,0.247101,0.104201,0.474897,,156.0,156,29.173279,-1.922607,-36.950684,,47.118252,156.0,156,-0.084946,-0.4478,-0.20752,,156.0,156,0.524673,4.493454,7.43837,,156.0,156,33.299255,6.243896,-375.97504,,156.0,156,-0.049072,-0.504059,-0.15538,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,110,110,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,176.0,0,176,,,9,9,167,True,True,176.0,176,-0.41864,5.354172,7.295914,,176.0,176,0.254368,0.11151,0.452041,,176.0,176,27.786255,-0.535583,-37.625122,,46.776197,176.0,176,-0.110519,-0.327423,-0.163315,,176.0,176,-0.269592,5.217697,7.327042,,176.0,176,31.912231,7.63092,-376.64948,,176.0,176,-0.094879,-0.443863,-0.214508,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,196.0,0,196,,,9,9,187,True,True,196.0,196,-0.540146,5.456512,7.113358,,196.0,196,0.254368,0.11151,0.452041,,196.0,196,27.786255,-0.535583,-36.950684,,46.235439,196.0,196,-0.267639,-0.278961,-0.078094,,196.0,196,-0.55571,5.455322,7.103775,,196.0,196,31.912231,7.63092,-375.97504,,196.0,196,-0.120453,-0.323486,-0.170303,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,216.0,0,216,,,9,9,207,True,True,216.0,216,-0.078064,5.03154,7.386291,,216.0,216,0.256831,0.115168,0.441099,,216.0,216,27.786255,-0.535583,-38.975525,,47.869138,216.0,216,-0.461517,-0.306656,-0.005112,,216.0,216,-0.302521,5.271561,7.274963,,216.0,216,31.912231,7.63092,-377.99988,,216.0,216,-0.277573,-0.275024,-0.085083,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,237.0,0,237,,,9,9,228,True,True,237.0,237,0.186493,4.829239,7.793304,,237.0,237,0.255339,0.113015,0.437278,,237.0,237,26.399231,-1.228333,-36.950684,,45.428859,237.0,237,-0.604263,-0.352997,-0.032806,,237.0,237,0.05542,4.89209,7.538925,,237.0,237,30.525208,6.938171,-375.97504,,237.0,237,-0.471451,-0.302719,-0.0121,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,244,244,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-86,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,257.0,0,257,,,9,9,248,True,True,257.0,257,0.383423,4.802292,8.195541,,257.0,257,0.251375,0.108101,0.43634,,257.0,257,27.786255,1.545715,-38.301086,,47.343832,257.0,257,-0.605331,-0.345535,-0.086075,,257.0,257,0.319382,4.796906,8.047699,,257.0,257,31.912231,9.712219,-377.32544,,257.0,257,-0.614197,-0.34906,-0.039795,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,264,264,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-77,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,277.0,0,277,,,9,9,268,True,True,277.0,277,0.313995,4.899261,8.138077,,277.0,277,0.247039,0.10247,0.435846,,277.0,277,27.093506,0.852966,-36.950684,,45.82727,277.0,277,-0.481232,-0.286423,-0.11644,,277.0,277,0.374451,4.845398,8.23085,,277.0,277,31.219482,9.01947,-375.97504,,277.0,277,-0.615265,-0.341599,-0.093063,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,264,264,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-77,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,297.0,0,297,,,9,9,288,True,True,297.0,297,0.307404,4.945953,7.83461,,297.0,297,0.244292,0.098703,0.434144,,297.0,297,27.093506,1.545715,-36.950684,,45.845396,297.0,297,-0.367249,-0.209717,-0.094604,,297.0,297,0.269104,4.938766,7.962707,,297.0,297,31.219482,9.712219,-375.97504,,297.0,297,-0.491165,-0.282486,-0.123428,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,315,315,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-79,1144.0,1144,-0.425353,0.24869
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,317.0,0,317,,,9,9,308,True,True,317.0,317,0.528275,4.838806,7.586807,,317.0,317,0.243227,0.096333,0.431872,,317.0,317,26.399231,2.23999,-39.649963,,47.687069,317.0,317,-0.31398,-0.208115,-0.067444,,317.0,317,0.406174,4.91124,7.737045,,317.0,317,30.525208,10.406494,-378.67432,,317.0,317,-0.377182,-0.20578,-0.101593,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,315,315,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-79,1144.0,1144,-0.425353,0.24869


In [None]:
# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le"%col] = le.fit_transform(df[col])

col_enc = ["site_id", "file_id", "floor", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
col_encode(df_test, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    for col in col_list:
        df[col] = df[col].astype(dtype)

convert_dtypes(df_test, ["ts", "start_ts", "diff_start_ts", \
                         "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",\
                         "acce_ts", "diff_acce_ts", "ahrs_ts", "diff_ahrs_ts", \
                         "magn_ts", "diff_magn_ts", "gyro_ts", "diff_gyro_ts", \
                         "acce_u_ts", "diff_acce_u_ts", "magn_u_ts", "diff_magn_u_ts", \
                         "gyro_u_ts", "diff_gyro_u_ts", \
                         "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
                         "beacon_ts", "diff_beacon_ts", "beacon_rssi", "rel_ts", "diff_rel_ts"
                        ], float)

# convert ts and wifi_last_see_ts to dates
for df in [df_test]:
    for col in ["ts", "wifi_last_seen_ts"]:
        df["%s_date"%col] = pd.to_datetime(df[col],unit="ms")
        df["%s_day"%col] = df["%s_date"%col].dt.floor("d")
        df["%s_hour"%col] = df["%s_date"%col].dt.floor("h")
        df["%s_minute"%col] = df["%s_date"%col].values.astype("<M8[m]")

# Check
display(df_test.head())

In [None]:
# Save the file in parquet
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models
import pickle

# Saving train data
test_file_name = "indoor_test.pkl"
with open(test_file_name, "wb") as file:
    pickle.dump(df_test, file)

# Save them to output
df_test.to_csv('df_test.csv',index=False)

In [None]:
# # %%timeit

# # 5.55 ms ± 1.76 ms per loop
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# # for fixing floor expression
# # print(floor_map_pairs) # to be used as floor_map later
# # assign 1F to 1 rather than zero, just in case we want to use this as integer

# # Original floor map
# # floor_map = {
# #     '1F': 1, '2F': 2, '3F': 3, '4F': 4, '5F': 5, '6F': 6, '7F': 7,
# #     '8F': 8, '9F': 9, 'B': -1, 'B1': -1, 'B2': -2, 'B2': -3, 'B3': -3,
# #     'BF': -1, 'BM': -1, 'F1': 1, 'F2': 2, 'F3': 3, 'F4': 4, 'F5': 5,
# #     'F6': 6, 'F7': 7, 'F8': 8, 'F9': 9, 'F10': 10, 'G': -1, 'L1': 1, 'L2': 2,
# #     'L3': 3, 'L3': 4, 'L4': 4, 'L4': 6, 'L5': 5, 'L6': 6, 'L7': 7, 'L8': 8,
# #     'L9': 9, 'L10': 10, 'L11': 11, 'LG1': -1, 'LG2': -2,
# #     'LM': np.nan, 'M': np.nan, 'P1': np.nan, 'P2': np.nan}

# floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
#              "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
#              "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
#              "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

# def one_trace_to_rows(path, floor_map):
#     try:
#         path_info = extract_path(path, floor_map)
#         data = extract_data(path)
#         # rows = list(itertools.chain(path_info, *data))
#         rows = []
#         for d in data:
#             row = path_info + d
#             rows.append(row)
#             # print("row: ", row)
#         return rows
#     except:
#         print("one_trace_to_rows error")

# # path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# # time w/ for loop with 1 train_path -> 11.642422199249268
# # time w/ itertools.chain for 1 train_path -> 11.862319946289062
# start = time.time()
# print("path: ", path)
# path_info = extract_path(path, floor_map)
# rows = one_trace_to_rows(path, floor_map)
# print("time to process one train_path", time.time() - start)
# print("col count: ", len(rows[0]))
# print("rows: ", rows)

In [None]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # Data below are the time it took to create the old version of training data (only waypoints)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [None]:
# # Set pool
# num_cores = multiprocessing.cpu_count()
# print(f"num_cores={num_cores}")
# args = [(p, floor_map) for p in train_paths[:train_num]]
# pool = Pool(num_cores)

# start = time.time()
# res = pool.starmap(one_trace_to_rows, args)
# df_train = pd.DataFrame(res[0], columns=col_names)
# for r in res[1:]:
#     df = pd.DataFrame(r, columns=col_names)
#     df_train = df_train.append(df)

# # Memo
# # time becomes approx. half is we halve the cut off timestamp
# # 10 paths -> 52~58 secs (CPU, num_cores=4)
# # 10 paths -> 55 secs (TPU, num_cores=4)
# # 100 paths -> 
# # 1000 paths ->
# print("train_path count", len(train_paths[:train_num]))
# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))