In [1]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool, Manager

import pickle
import math

pd.set_option("display.max_columns", 100)

In [2]:
# Settings and altering components for GCP

# path settings
# Root of the competition dataset. The globs below and the metadata lookups in
# pick_example are built by plain string concatenation, so it must end with "/".
root_path = "../input/indoor-location-navigation/"
# root_path = "../jupyter/input/"
# Train trace files, three directory levels below train/ (the path parsing in
# this notebook reads those levels as <site>/<floor>/<file>).
# NOTE(review): glob.glob does not guarantee an ordering; sort the lists if a
# stable file order matters downstream.
train_paths = glob.glob(root_path + "train" + "/*/*/*")
# Entries directly under test/.
test_paths = glob.glob(root_path + "test" + "/*")
# Entries directly under metadata/.
metafiles = glob.glob(root_path + "metadata" + "/*")

# function imports using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./
from io_f import read_data_file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# import for gcp settings
# import compute_f
# import io_f
# import visualize_f
# import main
# from io_f import read_data_file
# from compute_f import compute_step_positions, compute_steps, \
# compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# filter millisecond setting
# Cut-off in milliseconds. Within this part of the notebook it is only read by
# the commented-out variant of extract_data, which keeps a sample when it lies
# closer than this to its nearest waypoint.
time_stamp_cut = 250

# train number setting
# Number of train files to use; the active setting takes every file found by
# the train glob, the commented alternatives shrink it for quicker runs.
train_num = len(train_paths)
# train_num = round(len(train_paths) / 2)
# train_num = 1000

In [3]:
# Preprocess
# Report how many train, test and metadata paths the globs found, one count
# per output line, formatted with thousands separators.
print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

# Reading in 1 file
def pick_example(max_range, paths):
    """Pick one random trace file and load the size of the floor it was recorded on.

    Args:
        max_range: Largest index that may be drawn (inclusive). Callers in this
            notebook pass ``len(paths)``, which is one past the last valid
            index, so the value is clamped to ``len(paths) - 1``.
        paths: List of trace paths shaped like ``.../<site>/<floor>/<file>``.

    Returns:
        Tuple ``(path, site, floorNo, floor_plan_filename, json_plan_filename,
        width_meter, height_meter)``. The two filenames point into
        ``root_path + "metadata/<site>/<floor>/"``; width and height are read
        from the ``map_info`` entry of the floor's ``floor_info.json``.

    Raises:
        IndexError: if ``paths`` is empty.
        ValueError: if ``max_range`` is negative (raised by ``random.randint``).
    """
    if len(paths) == 0:
        raise IndexError("pick_example needs at least one path to choose from")

    # random.randint includes both end points, so an unclamped upper bound of
    # len(paths) would occasionally index one element past the end of the list.
    ex = random.randint(0, min(max_range, len(paths) - 1))
    path = f"{paths[ex]}"

    # Site and floor are the two directories directly above the trace file;
    # counting from the end keeps this independent of the depth of root_path.
    parts = path.split("/")
    site = parts[-3]
    floorNo = parts[-2]

    floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
    json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]
    return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

# Draw one random train trace together with its floor metadata and print what
# was picked. The first argument is pick_example's upper bound for the random
# index, the second the list to pick from.
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
print("example path: ", path)
print("site: ", site)
print("floorNo: ", floorNo)
print("floor_plan_filename: ", floor_plan_filename)
print("json_plan_filename: ", json_plan_filename)
print("width: {}, height: {} ".format(width_meter, height_meter))

# Read the raw text of the example trace to see how many lines one file holds.
with open(path) as p:
    lines = p.readlines()
print("No. Lines in 1 example: {:,}". format(len(lines)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204
example path:  ../input/indoor-location-navigation/train/5cd56b86e2acfd2d33b5cf97/F2/5d0b05792f8a26000805baa4.txt
site:  5cd56b86e2acfd2d33b5cf97
floorNo:  F2
floor_plan_filename:  ../input/indoor-location-navigation/metadata/5cd56b86e2acfd2d33b5cf97/F2/floor_image.png
json_plan_filename:  ../input/indoor-location-navigation/metadata/5cd56b86e2acfd2d33b5cf97/F2/floor_info.json
width: 143.35628215499145, height: 93.61971713520958 
No. Lines in 1 example: 9,065


In [4]:
# train_path filtering
def extract_path_group(path):
    """Split a train path into the pieces used to group files by site.

    The path is cut on "/"; segment 4 is taken as the site id and segment 6,
    up to its first ".", as the file id.

    Returns:
        ``[path, site_id, file_id]`` with ``path`` passed through unchanged.
    """
    segments = str(path).split("/")
    file_stem = segments[6].partition(".")[0]
    return [path, segments[4], file_stem]

# One [path, site_id, file_id] record per train file, collected in a frame so
# the files can be grouped by site.
path_list = [extract_path_group(item) for item in train_paths]
df_paths = pd.DataFrame(path_list, columns=["path", "site_id", "file_id"])
site_id_path_list = df_paths["site_id"].unique()
print(len(train_paths))
print(len(site_id_path_list))
# sample_num = math.ceil(train_num / len(site_id_path_list))
# Draw 3 files per site.
# NOTE(review): no random_state is passed, so the chosen subset differs between
# runs; sampling without replacement is also expected to fail for a site with
# fewer than 3 files — confirm every site has at least 3.
grouped_paths_df = df_paths.groupby("site_id").sample(n=3)
display(grouped_paths_df.head())
grouped_paths_list = list(grouped_paths_df["path"].unique())
print(len(grouped_paths_list))
print(grouped_paths_list[:5])

# groupby

26925
204


Unnamed: 0,path,site_id,file_id
10964,../input/indoor-location-navigation/train/5a05...,5a0546857ecc773753327266,5d8f094cb6e29d0006fb8c05
11011,../input/indoor-location-navigation/train/5a05...,5a0546857ecc773753327266,5dccfd12757dea0006080571
10607,../input/indoor-location-navigation/train/5a05...,5a0546857ecc773753327266,5e1581bf1506f2000638fc72
25488,../input/indoor-location-navigation/train/5c3c...,5c3c44b80379370013e0fd2b,5d077df90e86b6000803625e
25609,../input/indoor-location-navigation/train/5c3c...,5c3c44b80379370013e0fd2b,5d073bb91a69370008bc5d1b


612
['../input/indoor-location-navigation/train/5a0546857ecc773753327266/F2/5d8f094cb6e29d0006fb8c05.txt', '../input/indoor-location-navigation/train/5a0546857ecc773753327266/F2/5dccfd12757dea0006080571.txt', '../input/indoor-location-navigation/train/5a0546857ecc773753327266/B1/5e1581bf1506f2000638fc72.txt', '../input/indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F1/5d077df90e86b6000803625e.txt', '../input/indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F3/5d073bb91a69370008bc5d1b.txt']


In [5]:
# for line in lines[:200]:
#     print(line)

In [6]:
# # Read in 1 random example
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# sample_file = read_data_file(path)

# # You can access the information for each variable:
# # Each data is split for time
# # Metadata is expressed with "#"

# # for i in sample_file.acce[:, [0]]:
# #     print(i)
# #     print(int(i))

# print("~~~ Example ~~~")
# print("acce: {}".format(sample_file.acce), "\n" +
#       "acce shape: {}".format(sample_file.acce.shape), "\n" +
# #       "acacce_uncalice: {}".format(sample_file.acce_uncali), "\n" +
#       "acacce_uncalice shape: {}".format(sample_file.acce_uncali.shape), "\n" +
# #       "ahrs: {}".format(sample_file.ahrs), "\n" +
#       "ahrs shape: {}".format(sample_file.ahrs.shape), "\n" +
# #       "gyro: {}".format(sample_file.gyro), "\n" +
#       "gyro shape: {}".format(sample_file.gyro.shape), "\n" +
# #       "gyro_uncali: {}".format(sample_file.gyro_uncali), "\n" +
#       "gyro_uncali shape: {}".format(sample_file.gyro_uncali.shape), "\n" +
# #       "ibeacon: {}".format(sample_file.ibeacon), "\n" +
#       "ibeacon shape: {}".format(sample_file.ibeacon.shape), "\n" +
# #       "magn: {}".format(sample_file.magn), "\n" +
#       "magn shape: {}".format(sample_file.magn.shape), "\n" +
# #       "magn_uncali: {}".format(sample_file.magn_uncali), "\n" +
#       "magn_uncali shape: {}".format(sample_file.magn_uncali.shape), "\n" +
# #       "waypoint: {}".format(sample_file.waypoint), "\n" +
#       "waypoint shape: {}".format(sample_file.waypoint.shape), "\n" +
# #       "wifi: {}".format(sample_file.wifi), "\n" +
#       "wifi shape: {}".format(sample_file.wifi.shape))

In [7]:
# def show_site_png(root_path, site):
#     floor_paths = glob.glob(root_path + "metadata/" + site + "/*/floor_image.png")
#     n = len(floor_paths)
#     print("No. of floor paths: ", n)

#     # Create the custom number of rows & columns
#     ncols = [ceil(n / 3) if n > 4 else 4][0]
#     nrows = [ceil(n / ncols) if n > 4 else 1][0]

#     plt.figure(figsize=(16, 10))
#     plt.suptitle(f"Site no. '{site}'", fontsize=18)

#     # Plot image for each floor
#     for k, floor in enumerate(floor_paths):
#         # plt.subplot(nrows, ncols, k+1)
#         plt.subplot(ncols, nrows, k+1)
#         plt.rcParams["figure.facecolor"] = "white"

#         image = Image.open(floor)

#         plt.imshow(image)
#         plt.axis("off")
#         title = floor.split("/")[5]
#         plt.title(title, fontsize=15)

In [8]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [9]:
# # Checking the floor number distribution

# all_floors = glob.glob("../input/indoor-location-navigation/metadata/*/*")
# all_sites = glob.glob("../input/indoor-location-navigation/metadata/*")
# floor_no = []
# floor_counts = []

# # Floor count
# for site in all_sites:
#     floor_count = len([name for name in os.listdir(site)])
#     floor_counts.append(floor_count)

# floor_counts_df = pd.DataFrame(floor_counts, columns=["F_Count"])
# floor_counts_df = floor_counts_df["F_Count"].value_counts().reset_index()
# floor_counts_df = floor_counts_df.sort_values("index", ascending=True)

# # Extract only the floor number
# for floor in all_floors:
#     no = floor.split("/")[5]
#     floor_no.append(no)
    
# floor_no = pd.DataFrame(floor_no, columns=["No"])
# floor_no = floor_no["No"].value_counts().reset_index()
# floor_no = floor_no.sort_values("No", ascending=False)

# # ToDo: Floor expressions need to be fixed
# # 1F -> F1, L1 -> F1, G -> F1 etc

# # Plot
# # display(floor_counts_df.head(10))

# fig, axes = plt.subplots(ncols=2, figsize=(16, 10))
# axes[0] = sns.barplot(data=floor_counts_df, x="index", y="F_Count", palette="viridis", saturation=0.4, ax=axes[0])
# axes[0].set_title("Floor Count Distribution", size = 26, weight="bold")
# axes[0].set_xlabel("")
# axes[0].set_ylabel("Floor Count", size = 18, weight="bold")

# axes[1] = sns.barplot(data=floor_no, x="No", y="index", palette="viridis", saturation=0.4, ax=axes[1])
# axes[1].set_title("Frequency of Floors", size = 26, weight="bold")
# axes[1].set_xlabel("")
# axes[1].set_ylabel("Floor No.", size = 18, weight="bold")

# plt.xticks([])
# plt.yticks(fontsize=11)
# sns.despine(left=True, bottom=True);

In [10]:
# # Metadata checking (GeoJSON)
# # This is a vector representation of floor map
# geojson_paths = glob.glob("../input/indoor-location-navigation/metadata/*/*/geojson_map.json")
# print("No. of geojson file: {}".format(len(geojson_paths)))

# # Print one example
# ex = random.randint(0, len(geojson_paths))
# geojson_file_name = geojson_paths[ex]
# with open(geojson_file_name) as json_file:
#     paths = geojson_file_name.split("/")
#     site_id = paths[4]
#     floor = paths[5]
#     json_data = json.load(json_file)
#     json_properties = json_data["features"][0]["properties"]
#     print("File path: {}".format(geojson_file_name))
#     print("SiteID: {}".format(site_id))
#     print("Floor: {}".format(floor))
#     print("Floor info: {}".format(json_properties))

# # create id and floor number matching file
# site_ids = []
# floor_no = []
# floor_no_json = []

# for i in range(0, len(geojson_paths)):
#     with open(geojson_paths[i]) as f:
#         paths = geojson_paths[i].split("/")
#         site_id = paths[4]
#         floor = paths[5]
#         site_ids.append(site_id)
#         floor_no.append(floor)
#         d = json.load(f)
#         try:
#             floor_no_json.append(d["features"][0]["properties"]["floor_num"])
#         except:
#             floor_no_json.append(np.nan)

# floor_num_df = pd.DataFrame(
#     {"site_id": site_ids,
#      "floor_no": floor_no,
#      "floor_no_json": floor_no_json,
#     })

# display("floor_num_df length: {}".format(len(floor_num_df)))
# display(floor_num_df.head())

# # Get floormap dict to be used later
# floor_map_pairs = list(zip(floor_num_df["floor_no"], floor_num_df["floor_no_json"]))
# floor_map_pairs = np.unique(floor_map_pairs, axis=0) # get unique pair
# # print(floor_map_pairs) # to be used as floor_map later

# # Plot distribution
# floor_num_count_df = floor_num_df["floor_no_json"].value_counts().reset_index()
# floor_num_count_df = floor_num_count_df.sort_values("floor_no_json", ascending=False)
# # display(floor_num_count_df)
# # print(len(floor_num_count_df["floor_no_json"] == np.nan))

# fig = plt.figure()
# ax = plt.subplots(figsize=(16, 10))
# sns.barplot(data=floor_num_count_df, x="index", y="floor_no_json", palette="viridis", saturation=0.4)
# fig.show()

# # Just in case: Need for altitude info in geoJSON
# # from pyproj import Proj, transform
# # print(transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), -0.1285907, 51.50809))  # longitude first, latitude second.
# # output (meters east of 0, meters north of 0): (-14314.651244750548, 6711665.883938471)

In [11]:
# # More viz on accelerometers, wifi etc in one go
# from visualize_f import visualize_trajectory, visualize_heatmap
# from main import extract_wifi_rssi, extract_wifi_count
# from main import calibrate_magnetic_wifi_ibeacon_to_position
# from main import extract_magnetic_strength
# from main import extract_ibeacon_rssi

# # Visualizing magnetic strength
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# # extract mag, wifi, beacon of one example
# mwi_datas = calibrate_magnetic_wifi_ibeacon_to_position([path])
# magnetic_strength = extract_magnetic_strength(mwi_datas)
# wifi_rssi = extract_wifi_rssi(mwi_datas)
# wifi_counts = extract_wifi_count(mwi_datas)
# ibeacon_rssi = extract_ibeacon_rssi(mwi_datas)
# ibeacon_ummids = list(ibeacon_rssi.keys())
# target_ibeacon = ibeacon_ummids[0]

# # positions for heatmap
# heat_positions = np.array(list(magnetic_strength.keys()))
# heat_values = np.array(list(magnetic_strength.values()))
# heat_positions_wifi = np.array(list(wifi_counts.keys()))
# heat_values_wifi = np.array(list(wifi_counts.values()))
# heat_positions_bc = np.array(list(ibeacon_rssi[target_ibeacon].keys()))
# heat_values_bc = np.array(list(ibeacon_rssi[target_ibeacon].values()))[:, 0]

# # filter out positions that no wifi detected
# mask = heat_values_wifi != 0
# heat_positions_wifi = heat_positions_wifi[mask]
# heat_values_wifi = heat_values_wifi[mask]

# # get trajectory
# example = read_data_file(path)
# trajectory = example.waypoint # Returns timestamp, x, y values
# print(f"Waypoints: {trajectory}")
# trajectory = trajectory[:, 1:3] # Removes timestamp (we only need the coordinates)

# # Plot trajectory
# visualize_trajectory(trajectory = trajectory,
#                      floor_plan_filename = floor_plan_filename,
#                      width_meter = width_meter,
#                      height_meter = height_meter,
#                      title = "Example of Waypoint",)

In [12]:
# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    """Derive relative positions from accelerometer and AHRS readings.

    Chains the compute_f helpers: steps are detected from ``acce_datas``,
    headings are computed from ``ahrs_datas``, each step gets a stride length
    and a heading, and the two are combined by ``compute_rel_positions``.

    Returns:
        Whatever ``compute_rel_positions`` returns, unmodified.
        NOTE(review): the rest of this notebook treats column 0 of the result
        as a timestamp — confirm against compute_f.
    """
    # compute_steps also yields the step indices, which are not needed here.
    step_ts, _, acce_extrema = compute_steps(acce_datas)
    all_headings = compute_headings(ahrs_datas)
    strides = compute_stride_length(acce_extrema)
    per_step_heading = compute_step_heading(step_ts, all_headings)
    return compute_rel_positions(strides, per_step_heading)

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    """Magnitude of a single magnetometer reading.

    Elements 2, 3 and 4 of ``magn_datas`` are used as the components: they are
    squared, summed along axis 0, square-rooted, and the mean of the result is
    returned (for one reading that is simply the vector length).
    """
    components = np.array(magn_datas[2:5])
    squared_sum = (components ** 2).sum(axis=0)
    return np.mean(np.sqrt(squared_sum))

In [13]:
# path_datas = read_data_file(path)
# acce_datas = path_datas.acce
# magn_datas = path_datas.magn
# ahrs_datas = path_datas.ahrs
# wifi_datas = path_datas.wifi
# ibeacon_datas = path_datas.ibeacon
# posi_datas = path_datas.waypoint # not to be used

# # acce and ahrs data translation
# rel_positions = calc_rel_positions(acce_datas, ahrs_datas)
# print(acce_datas.shape)
# print(acce_datas[0])
# print(ahrs_datas[0])
# print(rel_positions.shape)

# # magn data translation
# print(magn_datas.shape)
# print(magn_datas[0])
# # print(extract_magnetic_strength(magn_datas))

In [14]:
# Methods for preprocessing train data: Timestamp handling
def find_diff_ts(ts, data):
    """Return how far the timestamp in ``data[0]`` lies after ``ts``.

    Both values are converted with ``int`` first, so numeric strings work too.
    """
    return int(data[0]) - int(ts)

def find_start_ts(path):
    """Return the recording start timestamp found in a trace file.

    The file is scanned line by line for the first line containing
    ``startTime`` followed by one separator character; the remainder of that
    (stripped) line is returned.

    Args:
        path: Path of the trace text file (read as UTF-8).

    Returns:
        The text after ``startTime`` and its separator as a ``str``, or
        ``None`` when no line of the file matches.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # Stream the file: the match normally sits in the first few lines, so
        # there is no need to load every sensor record first.
        for line_data in file:
            m = re.search(r"(?<=startTime.)(.*)", line_data.strip())
            # Test the match before using it; lines without "startTime" give
            # None here and must simply be skipped.
            if m:
                return m.group(1)
    return None

def find_smallest_diff(t, data):
    """Return the row of ``data`` whose timestamp (column 0) is closest to ``t``.

    Args:
        t: Reference timestamp; converted with ``int``.
        data: 2-D array with the timestamp in column 0. Numeric and string
            columns are both accepted; values are converted to integers.

    Returns:
        The matching row ``data[i]``. When several rows are equally close the
        first one wins. An empty ``np.array([])`` is returned when ``data``
        has no elements.
    """
    if data.size == 0:
        return np.array([])

    # Convert the whole timestamp column at once instead of calling int() on
    # one-element arrays in a Python loop: that conversion is deprecated in
    # recent NumPy releases and the loop is slow on long sensor streams.
    timestamps = data[:, 0].astype(np.int64)
    closest_index = np.argmin(np.abs(timestamps - int(t)))
    return data[closest_index]

In [15]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    """Flatten one three-axis sensor record into a fixed six-element list.

    Args:
        data: One record ``[timestamp, x, y, z]`` with an optional fifth
            accuracy element, or an empty array when nothing was found.
        start_ts: Start timestamp the record time is measured against.

    Returns:
        ``[timestamp, timestamp - start_ts, x, y, z, accuracy]``. ``accuracy``
        is NaN when the record has no fifth element; all six entries are NaN
        for an empty record.
    """
    if data.size == 0:
        # print("no axis data")
        return [np.nan] * 6

    record_ts = data[0]
    elapsed = int(record_ts) - int(start_ts)
    x_axis, y_axis, z_axis = data[1], data[2], data[3]
    accuracy = data[4] if len(data) > 4 else np.nan
    return [record_ts, elapsed, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    """Flatten one wifi record into a fixed seven-element list.

    Args:
        data: One record ``[timestamp, ssid, bssid, rssi, ...]``. Records with
            more than five elements carry the frequency at index 4 and the
            last-seen timestamp at index 5; shorter records have no frequency
            and keep the last-seen timestamp in their final element.
        start_ts: Start timestamp the record time is measured against.

    Returns:
        ``[timestamp, timestamp - start_ts, ssid, bssid, rssi, freq,
        last_seen_ts]``, or seven NaNs for an empty record.
    """
    if data.size == 0:
        # print("no wifi data")
        return [np.nan] * 7

    scan_ts = data[0]
    elapsed = int(scan_ts) - int(start_ts)
    ssid, bssid, rssi = data[1], data[2], data[3]
    has_frequency = len(data) > 5
    freq = data[4] if has_frequency else np.nan
    last_seen_ts = data[5] if has_frequency else data[-1]
    return [scan_ts, elapsed, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    """Flatten one iBeacon record into a fixed four-element list.

    Args:
        data: One record ``[timestamp, identifier, rssi]`` (the identifier is
            called ``ssid`` elsewhere in this notebook), or an empty array.
        start_ts: Start timestamp the record time is measured against.

    Returns:
        ``[timestamp, timestamp - start_ts, identifier, rssi]``, or four NaNs
        for an empty record.
    """
    if data.size == 0:
        # print("no beacon data")
        return [np.nan] * 4

    beacon_ts = data[0]
    return [beacon_ts, int(beacon_ts) - int(start_ts), data[1], data[2]]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    """Flatten one relative-position record into a fixed four-element list.

    Args:
        data: One record ``[timestamp, x, y]``, or an empty array.
        start_ts: Start timestamp the record time is measured against.

    Returns:
        ``[timestamp, timestamp - start_ts, x, y]``, or four NaNs for an empty
        record.
    """
    if data.size == 0:
        # print("no rel_pos data")
        return [np.nan] * 4

    step_ts = data[0]
    elapsed = int(step_ts) - int(start_ts)
    return [step_ts, elapsed, data[1], data[2]]

In [16]:
# Extract path and other data
def extract_path(path, floor_map):
    """Split a train path into site id, file id and floor information.

    The path is cut on "/"; segment 4 is the site id, segment 5 the floor
    name and segment 6 (up to its first ".") the file id.

    Args:
        path: Trace path shaped like the entries of ``train_paths``.
        floor_map: Mapping from floor name to its integer floor value.

    Returns:
        ``[site_id, file_id, floor_value, floor_name]``, or ``None`` (after
        printing a message) when the path has too few segments or the floor
        name is missing from ``floor_map``.
    """
    # split path
    try:
        parts = f"{path}".split("/")
        site_id = parts[4]
        floor = parts[5]
        f = floor_map[floor]
        file_id = parts[6].split(".")[0]
    except (IndexError, KeyError):
        # Only the expected lookup failures are treated as "bad path"; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        print("extract_path error")
        return None
    return [site_id, file_id, f, floor]

# Definitely needs to be refactored
def extract_data(path):
    """Build one feature row per unique waypoint timestamp of a trace file.

    For every waypoint timestamp the closest-in-time record of each sensor
    stream (acce, ahrs, magn, gyro, their uncalibrated variants, wifi and
    ibeacon) and of the computed relative positions is looked up with
    ``find_smallest_diff`` and flattened with the ``split_*`` helpers.

    Args:
        path: Path of the trace file, passed to ``find_start_ts`` and
            ``read_data_file``.

    Returns:
        List of rows in ascending timestamp order. Each row starts with
        ``[int(t), start_ts, t - start_ts, x, y]`` followed by five NaN
        placeholders (closest_wp_ts, diff_start_wp_ts, diff_ts_wp_ts,
        within_100ms, within_200ms), then the flattened acce, ahrs, magn
        (plus its field strength), gyro, acce_uncali, magn_uncali,
        gyro_uncali, wifi, beacon and relative-position records.
        An empty list is returned when the trace has no waypoints.
    """
    start_ts = find_start_ts(path)
    path_datas = read_data_file(path)
    acce = path_datas.acce
    ahrs = path_datas.ahrs
    magn = path_datas.magn
    gyro = path_datas.gyro
    acce_uncali = path_datas.acce_uncali
    magn_uncali = path_datas.magn_uncali
    gyro_uncali = path_datas.gyro_uncali
    wifi = path_datas.wifi
    wps = path_datas.waypoint
    ibeacon = path_datas.ibeacon
    rel_positions = calc_rel_positions(acce, ahrs)

    # Nothing to label without waypoints; indexing the timestamp column of an
    # empty array would fail.
    # NOTE(review): assumes read_data_file returns numpy arrays — confirm.
    if wps.size == 0:
        return []

    # np.unique sorts and de-duplicates the timestamps, so a loop counter does
    # not line up with the rows of wps when waypoints are unsorted or share a
    # timestamp. Keep the index of the first row of each unique timestamp and
    # read x/y from exactly that row.
    ts, first_rows = np.unique(wps[:, 0], return_index=True)

    # Placeholders for the columns that only the per-sample variant of this
    # function fills in; kept so both variants produce the same row layout.
    closest_wp_ts = np.nan
    diff_start_wp_ts = np.nan
    diff_ts_wp_ts = np.nan
    within_100ms = np.nan
    within_200ms = np.nan

    # extract data for each timestamp of waypoints
    res = []
    for t, wp_row in zip(ts, first_rows):
        try:
            wp = wps[wp_row]
            x = wp[1]
            y = wp[2]
            diff_start_ts = int(t) - int(start_ts)
            acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
            ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
            magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
            # append magnetic strength only for the magn data
            magn_closest.append(extract_one_magn_strength(magn_closest))
            gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
            acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
            magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
            gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
            wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
            beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
            rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)
            res.append([int(t), start_ts, diff_start_ts, x, y, closest_wp_ts,
                        diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms] +
                       acce_closest + ahrs_closest + magn_closest + gyro_closest +
                       acce_u_closest + magn_u_closest + gyro_u_closest +
                       wifi_closest + beacon_closest + rel_pos
                      )
        except Exception:
            # Deliberate best-effort: a waypoint whose sensor records cannot
            # be parsed is skipped so the rest of the trace is still used.
            continue
    return res

In [17]:
# # Extract path and other data
# def extract_path(path, floor_map):
#     # split path
#     try:
#         ex_path = f"{path}"
#         ex_paths = ex_path.split("/")
#         site_id = ex_paths[4]
#         floor = ex_paths[5]
#         f = floor_map[floor]
#         file_id = ex_paths[6].split(".")[0]
#         return [site_id, file_id, f, floor]
#     except:
#         print("extract_path error")

# # Definitely needs to be refactored
# def extract_data(path):
#     start_ts = find_start_ts(path)
#     path_datas = read_data_file(path)
#     acce = path_datas.acce
#     ahrs = path_datas.ahrs
#     magn = path_datas.magn
#     gyro = path_datas.gyro
#     acce_uncali = path_datas.acce_uncali
#     magn_uncali = path_datas.magn_uncali
#     gyro_uncali = path_datas.gyro_uncali
#     wifi = path_datas.wifi
#     wps = path_datas.waypoint
#     ibeacon = path_datas.ibeacon
#     rel_positions = calc_rel_positions(acce, ahrs)

#     # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
#     # ts = np.unique(wps[:, [0]])
#     if acce_uncali.any():
#         # print("acce_uncali")
#         ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data
#     elif acce.any():
#         # print("acce")
#         ts = np.unique(acce[:, [0]])
#     else:
#         print("no acce or acce_uncali")

#     # extract data for each timestamp of waypoints
#     res = []
#     for t in ts:
#         try:
#             wp_closest = find_smallest_diff(t, wps)
#             closest_wp_ts = wp_closest[0]
#             diff_ts_wp_ts = abs(int(t) - int(closest_wp_ts))
#             # time_stamp_cut = 2000, only the records within 2 sec of waypoint are kept
#             if diff_ts_wp_ts < time_stamp_cut:
#                 # flag to indicate how close the data point is to the wps
#                 # print("diff_ts_wp_ts", diff_ts_wp_ts)
#                 within_100ms = True if abs(diff_ts_wp_ts) <= 100 else False
#                 within_200ms = True if abs(diff_ts_wp_ts) <= 200 else False
#                 x = wp_closest[1]
#                 y = wp_closest[2]
#                 # print("x, y: ", x, y)
#                 diff_start_ts = int(t) - int(start_ts)
#                 diff_start_wp_ts = int(closest_wp_ts) - int(start_ts)
#                 # print("diff_start_ts, diff_start_wp_ts: ", diff_start_ts, diff_start_wp_ts)
#                 acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
#                 ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
#                 magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
#                 magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
#                 gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
#                 # print("acce: ", acce_closest)
#                 # print("ahrs: ", ahrs_closest)
#                 # print("magn: ", magn_closest)
#                 # print("gyro: ", gyro_closest)
#                 acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
#                 magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
#                 gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
#                 # print("acce_u_closest: ", acce_u_closest)
#                 # print("magn_u_closest: ", magn_u_closest)
#                 # print("gyro_u_closest: ", gyro_u_closest)
#                 wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
#                 if len(ibeacon) > 0:
#                     beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
#                 else:
#                     beacon_closest = [np.nan, np.nan, np.nan, np.nan]
#                 rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)
#                 # print([t, x, y, int(closest_wp_ts), acce_closest, acce_u_closest])
#                 res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms] + \
#                            acce_closest + ahrs_closest + magn_closest + gyro_closest + \
#                            acce_u_closest + magn_u_closest + gyro_u_closest + \
#                            wifi_closest + beacon_closest + rel_pos
#                           )
#             else:
#                 # print("no wp made it through timestamp cut")
#                 continue
#         except Exception as exc:
#             pass
#             # print("Error message: ", exc)
#             # print("extract_test_data error")
#     return res

In [18]:
# %%timeit

# 5.55 ms ± 1.76 ms per loop
# Pick a fresh random train trace (with its floor metadata) to exercise the
# row-building functions in this cell.
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# for fixing floor expression
# print(floor_map_pairs) # to be used as floor_map later
# assign 1F to 1 rather than zero, just in case we want to use this as integer

# Original floor map
# floor_map = {
#     '1F': 1, '2F': 2, '3F': 3, '4F': 4, '5F': 5, '6F': 6, '7F': 7,
#     '8F': 8, '9F': 9, 'B': -1, 'B1': -1, 'B2': -2, 'B2': -3, 'B3': -3,
#     'BF': -1, 'BM': -1, 'F1': 1, 'F2': 2, 'F3': 3, 'F4': 4, 'F5': 5,
#     'F6': 6, 'F7': 7, 'F8': 8, 'F9': 9, 'F10': 10, 'G': -1, 'L1': 1, 'L2': 2,
#     'L3': 3, 'L3': 4, 'L4': 4, 'L4': 6, 'L5': 5, 'L6': 6, 'L7': 7, 'L8': 8,
#     'L9': 9, 'L10': 10, 'L11': 11, 'LG1': -1, 'LG2': -2,
#     'LM': np.nan, 'M': np.nan, 'P1': np.nan, 'P2': np.nan}

# Maps the floor directory name taken from a path to an integer floor value
# (extract_path looks it up as floor_map[floor]).
# NOTE(review): the "F*" / "*F" names are 0-based ("F1" -> 0) while the "L*"
# names are 1-based ("L1" -> 1), and "B"/"BF"/"BM" map to 0/1/2 although
# "B1".."B3" are negative — confirm this numbering is intended before the
# value is used as a label or feature.
floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

def one_trace_to_rows(path, floor_map):
    """Turn one trace file into a list of flat rows for the training frame.

    Each row is the path information from ``extract_path`` followed by one of
    the feature rows produced by ``extract_data``.

    Args:
        path: Path of the trace file.
        floor_map: Mapping from floor name to integer floor value, forwarded
            to ``extract_path``.

    Returns:
        List of rows (empty when ``extract_data`` yields nothing), or ``None``
        after printing the offending path when processing the trace fails.
    """
    try:
        path_info = extract_path(path, floor_map)
        # extract_path returns None for an unparsable path; the concatenation
        # below then raises TypeError and the trace is reported like any other
        # failure.
        return [path_info + d for d in extract_data(path)]
    except Exception:
        # Report and skip a broken trace, but let KeyboardInterrupt and
        # SystemExit through (a bare except would swallow them as well).
        print("one_trace_to_rows error at: ", path)
        return None

# path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# time w/ for loop with 1 train_path -> 11.6
# time w/ itertools.chain for 1 train_path -> 11.8
# Time the conversion of the example trace picked above. extract_path is
# called here only to print its result; one_trace_to_rows calls it again
# internally.
start = time.time()
path_info = extract_path(path, floor_map)
print("path: ", path_info)
rows = one_trace_to_rows(path, floor_map)
print("time to process one train_path", time.time() - start)
#print("col count: ", len(rows[0]))
# Prints every row of the trace in full, which produces a very long output.
print("rows: ", rows)

path:  ['5d2709c303f801723c3299ee', '5daecceae415cd0006629416', 8, '9F']
time to process one train_path 2.060051679611206
rows:  [['5d2709c303f801723c3299ee', '5daecceae415cd0006629416', 8, '9F', 1571736127198, '1571736127183', 15, 41.92366, 54.011845, nan, nan, nan, nan, nan, 1571736127326.0, 143, -1.0124512, -0.095184326, 10.62738, nan, 1571736127326.0, 143, -0.017615361, 0.039660674, 0.11309541, nan, 1571736127326.0, 143, 6.6589355, 26.052856, -21.86737, nan, 34.659408512939116, 1571736127326.0, 143, -0.20941162, -0.030639648, -0.14700317, nan, 1571736127326.0, 143, -1.0016785, -0.14007568, 10.526825, nan, 1571736127326.0, 143, -42.318726, -50.64392, -355.93567, nan, 1571736127326.0, 143, -0.20709229, -0.03074646, -0.14717102, nan, '1571736129099', 1916, 'c9f5b338cabea457b4696fafbcfeaa13558e6325', 'b39edee978b6375599ae859d045b6583a5750f60', '-42', nan, '1571736127591', '1571736128063', 880, '07efd69e3167537492f0ead89fb2779633b04949_b6589fc6ab0dc82cf12099d1c2d40ab994e8410c_e2a8a10b3a

In [19]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # Data below are the time it took to create the old version of training data (only waypoints)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [20]:
# Check if we can make df

# column names
# Column names for the row lists, grouped by where the values come from.
col_names = [
    # path-level info
    "site_id", "file_id", "floor_converted", "floor",
    # event timestamp and waypoint position
    "ts", "start_ts", "diff_start_ts", "x", "y",
    # closest waypoint
    "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_100ms", "within_200ms",
    # sensor readings
    "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc",
    "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc",
    "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",
    "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc",
    # "_u" sensor readings
    "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc",
    "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc",
    "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc",
    # wifi and beacon
    "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts",
    "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi",
    # relative position
    "rel_ts", "diff_rel_ts", "rel_x", "rel_y",
]

# Load the single-trace rows into a DataFrame and print a few sanity checks.
print(len(col_names))

df = pd.DataFrame(rows, columns=col_names)
print("df len: ", len(df))
for id_col in ("site_id", "file_id"):
    print(f"{id_col} nunique: ", df[id_col].nunique())
for coord in ("x", "y"):
    print(f"{coord} value_counts: ", df[coord].value_counts())
print("event ts nunique: ", df["ts"].nunique())
print("start ts nunique: ", df["start_ts"].nunique())  # should be one
wp_gap = df["diff_ts_wp_ts"]
print("diff_ts_wp_ts value_counts: ", wp_gap.value_counts())
print("diff_ts_wp_ts nunique: ", wp_gap.nunique())
for flag_col in ("within_100ms", "within_200ms"):
    flags = df[flag_col]
    print(f"{flag_col} value_counts: ", flags.value_counts())
    print(f"{flag_col} nunique: ", flags.nunique())
    print(f"{flag_col} count: ", flags.count())
display(df.head())

72
df len:  15
site_id nunique:  1
file_id nunique:  1
x value_counts:  27.997662    1
19.713085    1
32.297585    1
21.671806    1
24.763794    1
31.220552    1
22.098574    1
31.539568    1
24.121689    1
30.218525    1
16.977514    1
41.923660    1
23.016280    1
41.760895    1
21.424997    1
Name: x, dtype: int64
y value_counts:  54.011845    1
61.179607    1
60.095978    1
40.363930    1
3.618611     1
3.289189     1
3.733026     1
60.690334    1
27.753407    1
19.187958    1
54.001633    1
31.257586    1
21.531437    1
11.257348    1
47.680150    1
Name: y, dtype: int64
event ts nunique:  15
start ts nunique:  1
diff_ts_wp_ts value_counts:  Series([], Name: diff_ts_wp_ts, dtype: int64)
diff_ts_wp_ts nunique:  0
within_100ms value_counts:  Series([], Name: within_100ms, dtype: int64)
within_100ms nunique:  0
within_100ms count:  0
within_200ms value_counts:  Series([], Name: within_200ms, dtype: int64)
within_200ms nunique:  0
within_200ms count:  0


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5d2709c303f801723c3299ee,5daecceae415cd0006629416,8,9F,1571736127198,1571736127183,15,41.92366,54.011845,,,,,,1571736000000.0,143,-1.012451,-0.095184,10.62738,,1571736000000.0,143,-0.017615,0.039661,0.113095,,1571736000000.0,143,6.658936,26.052856,-21.86737,,34.659409,1571736000000.0,143,-0.209412,-0.03064,-0.147003,,1571736000000.0,143,-1.001678,-0.140076,10.526825,,1571736000000.0,143,-42.318726,-50.64392,-355.93567,,1571736000000.0,143,-0.207092,-0.030746,-0.147171,,1571736129099,1916,c9f5b338cabea457b4696fafbcfeaa13558e6325,b39edee978b6375599ae859d045b6583a5750f60,-42,,1571736127591,1571736128063,880,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-83,1571736000000.0,779,-0.048288,0.397075
1,5d2709c303f801723c3299ee,5daecceae415cd0006629416,8,9F,1571736132742,1571736127183,5559,41.760895,61.179607,,,,,,1571736000000.0,5565,-0.17627,0.726624,10.927246,,1571736000000.0,5565,0.008987,0.038233,0.090438,,1571736000000.0,5565,7.35321,35.765076,-25.219727,,44.376176,1571736000000.0,5565,-0.337769,-0.225586,0.183228,,1571736000000.0,5565,-0.412094,0.744583,11.342636,,1571736000000.0,5565,-41.62445,-40.9317,-359.28802,,1571736000000.0,5565,-0.335449,-0.225693,0.18306,,1571736132948,5765,c9f5b338cabea457b4696fafbcfeaa13558e6325,76e94c277eaf24663e1d177cd847b8bb29d254af,-36,,1571736131390,1571736132975,5792,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-80,1571736000000.0,5287,-0.039629,0.718185
2,5d2709c303f801723c3299ee,5daecceae415cd0006629416,8,9F,1571736140051,1571736127183,12868,32.297585,60.690334,,,,,,1571736000000.0,12873,0.549179,-0.697937,13.305283,,1571736000000.0,12873,-0.043725,0.017086,0.703873,,1571736000000.0,12873,30.245972,-1.002502,-25.219727,,39.393635,1571736000000.0,12873,0.05957,-0.555283,-0.369125,,1571736000000.0,12873,0.541397,-0.351975,12.877914,,1571736000000.0,12873,-18.73169,-77.69928,-359.28802,,1571736000000.0,12873,0.06189,-0.555389,-0.369293,,1571736140740,13557,da39a3ee5e6b4b0d3255bfef95601890afd80709,961416bf40753832a3c4c1581bee5e5539d77d85,-26,,1571736138847,1571736139652,12469,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-93,1571736000000.0,12913,-0.879713,0.022029
3,5d2709c303f801723c3299ee,5daecceae415cd0006629416,8,9F,1571736146950,1571736127183,19767,24.121689,60.095978,,,,,,1571736000000.0,19764,0.266663,-1.489227,9.806168,,1571736000000.0,19764,-0.014458,0.029122,0.725276,,1571736000000.0,19764,32.32727,-3.083801,-26.559448,,41.951955,1571736000000.0,19764,-0.209946,-0.357147,-0.349945,,1571736000000.0,19764,0.342682,-1.497604,9.858826,,1571736000000.0,19764,-15.956116,-81.16913,-359.9579,,1571736000000.0,19764,-0.207626,-0.357254,-0.350113,,1571736146628,19445,da39a3ee5e6b4b0d3255bfef95601890afd80709,961416bf40753832a3c4c1581bee5e5539d77d85,-30,,1571736140791,1571736146439,19256,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-87,1571736000000.0,20043,-0.681378,-0.031513
4,5d2709c303f801723c3299ee,5daecceae415cd0006629416,8,9F,1571736153496,1571736127183,26313,23.01628,54.001633,,,,,,1571736000000.0,26318,-0.289398,-2.96286,13.205933,,1571736000000.0,26318,-0.040761,-0.02488,0.998066,,1571736000000.0,26318,3.190613,-27.365112,-29.240417,,40.175009,1571736000000.0,26318,-0.131638,-0.803497,-0.475113,,1571736000000.0,26318,-0.440231,-3.286667,15.402618,,1571736000000.0,26318,-45.78705,-104.06189,-363.30872,,1571736000000.0,26318,-0.129318,-0.803604,-0.475281,,1571736152718,25535,da39a3ee5e6b4b0d3255bfef95601890afd80709,47c09a9d610d6b00d875bf44fd5945d6fd728025,-41,,1571736142764,1571736154005,26822,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-88,1571736000000.0,26179,-0.053073,-0.945681


In [21]:
# # Set pool
# num_cores = multiprocessing.cpu_count()
# print(f"num_cores={num_cores}")
# # args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list]
# pool = Pool(num_cores)

# start = time.time()
# # w/ 250ms settings, 3 random samples from each site_id
# # 2 paths -> 18.7 sec
# # 10 paths -> 315 sec (df len is 1994)
# # 100 paths -> 708 sec (df len is 7183)
# # all ~ 600 paths -> 

# # errors
# # grouped_paths_list -> 100 paths -> site_id: 8 errors, 27 correct
# # grouped_paths_list -> 100 paths -> file_id: 23 errors, 77 correct

# # all in one go -> xxx sec
# # array_split -> 5891.8 sec

# # all in one go
# # res = pool.starmap(one_trace_to_rows, args)

# # split the args
# res = []
# for arg in tqdm(np.array_split(args, 50)):
#     res.extend(pool.starmap(one_trace_to_rows, arg))

In [22]:
# Set pool
num_cores = multiprocessing.cpu_count()
print(f"num_cores={num_cores}")

args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list]

# Split by index (same 30 chunk boundaries as np.array_split(args, 30)) so the
# (path, floor_map) tuples stay plain Python objects instead of being coerced
# into a NumPy array before they are handed to the workers.
index_chunks = np.array_split(np.arange(len(args)), 30)

res_dict = {}  # chunk number -> list of per-trace results for that chunk
res = []       # per-trace results for all chunks, in input order

# The context manager tears the worker pool down even when a chunk raises,
# so a failed or interrupted run does not leave worker processes behind.
with Pool(num_cores) as pool:
    start = time.time()
    for i, chunk_idx in enumerate(tqdm(index_chunks)):
        # print("ith iteration: ", i)
        chunk_args = [args[j] for j in chunk_idx]
        rows = pool.starmap(one_trace_to_rows, chunk_args)
        res_dict[i] = rows
        res.extend(rows)

print("time to process", time.time() - start)

num_cores=4


 47%|████▋     | 14/30 [1:47:50<2:03:15, 462.21s/it]


IndexError: list index out of range

In [None]:
# Persist the per-trace results so the extraction does not have to be re-run.
res_name = "indoor_train_res.pkl"

with open(res_name, "wb") as res_file:
    pickle.dump(res, res_file)

In [None]:
# Reload the per-trace results saved under res_name.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook wrote itself.
with open(res_name, "rb") as res_file:
    res = pickle.load(res_file)

In [None]:
############################## KEEP THIS CELL FOR LATER REF ##############################

# Error in ~20% of the train paths -> caused by not having acce_uncali data to create the event timestamps

# error files
# /5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bbc852a70008c01607.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bda50dc300099d34cc.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085df529994a0008202661.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt
# /5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt
# /5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt
# /5cd56b5ae2acfd2d33b5854a/B1/5d073ba64a19c000086c559b.txt
# /5cd56b5ae2acfd2d33b5854a/F1/5d07603e4cae4f000a2db525.txt
# /5cd56b63e2acfd2d33b591c2/F2/5d0b0668912a980009fe91f2.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt
# /5cd56b64e2acfd2d33b592b3/F2/5d0c9321c99c56000836df18.txt
# /5cd56b64e2acfd2d33b592b3/F3/5d0c9952ea565d0008e34e8b.txt
# /5cd56b64e2acfd2d33b592b3/F4/5d0c9d65ea565d0008e34ea2.txt
# /5cd56b5ae2acfd2d33b58549/5F/5d0613514a19c000086c432a.txt
# /5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b015e2f8a26000805ba44.txt
# /5cd56b79e2acfd2d33b5b74e/F1/5d0af3452f8a26000805b830.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt
# /5cd56b6be2acfd2d33b59d1f/F3/5d0896415125450008037c76.txt

# base_path = "../input/indoor-location-navigation/train"
# error_files = [
#     "/5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt",
#     "/5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt",
#     "/5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt",
#     "/5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt",
#     "/5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt",
#     "/5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt"
# ]

# working_path = "../input/indoor-location-navigation/train/5d2709c303f801723c3299ee/1F/5dad7d6daa1d300006faa80c.txt"
# error_paths = [base_path + e for e in error_files]
# rows = one_trace_to_rows(error_paths[1], floor_map)
# print(rows)

In [None]:
# Inspect the nesting of res: number of traces, rows in the first trace,
# and values in that trace's first row.
trace_count = len(res)
print(trace_count)
first_trace = res[0]
print(len(first_trace))
first_row = first_trace[0]
print(len(first_row))

In [None]:
start = time.time()

# Build one frame per trace and concatenate once. Calling DataFrame.append in
# a loop re-copies the accumulated frame on every iteration, and the method
# was removed in pandas 2.0.
# Traces that failed (None) or produced no rows contribute nothing, so skip them.
trace_frames = [pd.DataFrame(r, columns=col_names) for r in res if r]
if trace_frames:
    df_train = pd.concat(trace_frames, ignore_index=True)
else:
    # No usable trace at all: keep the expected schema with zero rows.
    df_train = pd.DataFrame(columns=col_names)

print("time to process", time.time() - start)
print("length of df made", len(df_train))
display(df_train.head(10))

In [None]:
# def list_to_df(row_list):
#     df_train = pd.DataFrame(row_list[0], columns=col_names)
#     for r in row_list[1:]:
#         df = pd.DataFrame(r, columns=col_names)
#         df_train = df_train.append(df)
#     return df_train

# start = time.time()
# pool = Pool(num_cores)

# df_train = pool.map(list_to_df, tqdm(res))

# # print("train_path count", len(train_paths[:train_num]))
# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))
# pool.close()

In [None]:
# Print summary statistics of the assembled training frame.
print("df len: ", len(df_train), "\n")
for id_col in ("site_id", "file_id"):
    id_values = df_train[id_col]
    print(f"{id_col} nunique: ", id_values.nunique(), "\n")
    print(f"{id_col} value_counts: ", id_values.value_counts(), "\n")
for counted_col in ("floor", "x", "y"):
    print(f"{counted_col} value_counts: ", df_train[counted_col].value_counts(), "\n")
print("event ts nunique: ", df_train["ts"].nunique(), "\n")
print("start ts nunique: ", df_train["start_ts"].nunique(), "\n")  # should be one
wp_gap = df_train["diff_ts_wp_ts"]
print("diff_ts_wp_ts value_counts: ", wp_gap.value_counts(), "\n")
print("diff_ts_wp_ts nunique: ", wp_gap.nunique(), "\n")
display(df_train.head())

In [None]:
# Visualizing timestamp distribution

# Explore
# print(df_train["ts"].dtype)
# print(df_test["ts"].dtype)

# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    """Add a label-encoded ``<col>_le`` column to ``df`` for each name in ``cols``.

    A fresh ``LabelEncoder`` is fitted per column. ``df`` is modified in place
    and nothing is returned.
    """
    for name in cols:
        encoder = preprocessing.LabelEncoder()
        encoded = encoder.fit_transform(df[name])
        df["%s_le" % name] = encoded

# Columns that get a label-encoded "<col>_le" companion column.
col_enc = [
    "site_id",
    "file_id",
    "floor",
    "wifi_ssid",
    "wifi_bssid",
    "beacon_ssid",
]
col_encode(df_train, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    """Cast every column named in ``col_list`` to ``dtype``, in place on ``df``.

    Nothing is returned; columns not listed are left untouched.
    """
    for name in col_list:
        column = df[name]
        df[name] = column.astype(dtype)

# Columns of df_train that are cast to float.
float_cols = [
    "floor_converted", "ts", "start_ts", "diff_start_ts",
    "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",
    "acce_ts", "diff_acce_ts",
    "ahrs_ts", "diff_ahrs_ts",
    "magn_ts", "diff_magn_ts",
    "gyro_ts", "diff_gyro_ts",
    "acce_u_ts", "diff_acce_u_ts",
    "magn_u_ts", "diff_magn_u_ts",
    "gyro_u_ts", "diff_gyro_u_ts",
    "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts",
    "beacon_ts", "diff_beacon_ts", "beacon_rssi",
    "rel_ts", "diff_rel_ts",
]
convert_dtypes(df_train, float_cols, float)

# Derive datetime columns from the millisecond columns ts and wifi_last_seen_ts:
# the full timestamp plus versions truncated to day, hour and minute.
for df in [df_train]:
    for col in ["ts", "wifi_last_seen_ts"]:
        date_col = "%s_date" % col
        df[date_col] = pd.to_datetime(df[col], unit="ms")
        df["%s_day" % col] = df[date_col].dt.floor("d")
        df["%s_hour" % col] = df[date_col].dt.floor("h")
        # Casting the raw values to minute resolution drops seconds and below.
        df["%s_minute" % col] = df[date_col].values.astype("<M8[m]")

# Check
display(df_train.head())

In [None]:
# Calculate moving averages
# Differencing with respect to time (as the timesteps are unevenly spaced)

In [None]:
# Save the file (currently written as a pickle; parquet and other formats are compared in the first link below)
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Saving train data: pickle the whole df_train frame to train_file_name.
train_file_name = "indoor_train_3.pkl"

with open(train_file_name, "wb") as train_file:
    pickle.dump(df_train, train_file)

# Save them to output
# df_train.to_csv('df_train_2.csv',index=False)
# df_test.to_csv('df_test.csv',index=False)

In [None]:
# Load the pickled training frame back in.
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook wrote itself.
with open(train_file_name, "rb") as train_file:
    df_train = pickle.load(train_file)

In [None]:
# Quick check of the reloaded training frame.
print("df len: ", len(df_train), "\n")
for id_col in ("file_id", "site_id"):
    print(f"{id_col} unique: ", (df_train[id_col].nunique()), "\n")
site_counts = df_train["site_id"].value_counts()
print("site_id value_counts: ", (site_counts))
display(df_train.head())

In [None]:
# Get submission file
# Read it from the configured root_path (same location the train/test globs use)
# instead of a hard-coded absolute path, so the cell follows the path settings.
sub_df = pd.read_csv(os.path.join(root_path, "sample_submission.csv"))

# Split "site_path_timestamp" on "_" into its parts in one vectorised call
# rather than building a pd.Series per row with apply.
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].str.split("_", expand=True)
sub_df = sub_df.drop(columns=["floor", "x", "y"])
# grouped_df = sub_df.groupby("file").sample(n=2)
# all_file_id = grouped_df["file"].unique()
# print(len(grouped_df))
# print(len(all_file_id))
# display(grouped_df.head())
display(sub_df.head())

# Compare the sites in the submission file with the sites present in df_train.
test_site_id = sub_df["site"].unique()
train_site_id = df_train["site_id"].unique()
print(test_site_id, "\n")
print(train_site_id, "\n")
# Sites that appear in both the submission file and the training frame.
a = list(set(test_site_id) & set(train_site_id))
print(a)