In [28]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool

pd.set_option("display.max_columns", 100)

In [29]:
# milisecond setting 
time_stamp_cut = 1000

# train number setting
# train_num = len(train_paths) - 1
# train_num = round(len(train_paths) / 2)
train_num = 1000

# 200 train paths come out with ~1000 examples, so multiply train examples by 5 to extract similar no. of examples
# test_num = train_num * 5
# test_num = len(sub_df) - 1
print(train_num)

10


In [30]:
# Preprocess

# Check out each file. Content, images
root_path = "../input/indoor-location-navigation/"
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

# Reading in 1 file
def pick_example(max_range, paths):
    ex = random.randint(0, max_range)
    example_path = paths[ex]
    path = f"{example_path}"
    paths = path.split("/")
    site = paths[4]
    floorNo = paths[5]
    floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
    json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]
    return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
print("example path: ", path)
print("site: ", site)
print("floorNo: ", floorNo)
print("floor_plan_filename: ", floor_plan_filename)
print("json_plan_filename: ", json_plan_filename)
print("width: {}, height: {} ".format(width_meter, height_meter))

with open(path) as p:
    lines = p.readlines()
print("No. Lines in 1 example: {:,}". format(len(lines)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204
example path:  ../input/indoor-location-navigation/train/5cd969ea39e2fc0b4afeaef3/F2/5d0c489922c7770008a75852.txt
site:  5cd969ea39e2fc0b4afeaef3
floorNo:  F2
floor_plan_filename:  ../input/indoor-location-navigation/metadata/5cd969ea39e2fc0b4afeaef3/F2/floor_image.png
json_plan_filename:  ../input/indoor-location-navigation/metadata/5cd969ea39e2fc0b4afeaef3/F2/floor_info.json
width: 279.9016182762572, height: 168.76029648949492 
No. Lines in 1 example: 10,876


In [31]:
# Get submission file
sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
sub_df = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
sub_df.columns = ["site", "file", "timestamp"]
display(sub_df.head())

Unnamed: 0,site,file,timestamp
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,18763
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,22328


In [32]:
for line in lines[:200]:
    print(line)

#	startTime:1561085017159

#	SiteID:5cd969ea39e2fc0b4afeaef3	SiteName:三湘商业广场	FloorId:5cd969f039e2fc0b4afeb5ec	FloorName:F2

#	Brand:vivo	Model:vivo X20	AndroidName:8.1.0	APILevel:27	

#	type:1	name:BMI160 Accelerometer	version:2061700	vendor:BOSCH	resolution:0.0023956299	power:0.18	maximumRange:78.4532

#	type:4	name:BMI160 Gyroscope	version:2061700	vendor:BOSCH	resolution:0.0010681152	power:0.9	maximumRange:34.906586

#	type:2	name:AK09911 Magnetometer	version:1	vendor:AKM	resolution:0.1	power:0.24	maximumRange:4900.0

#	VersionName:v20190612-nightly	VersionCode:263	

1561085017159	TYPE_WAYPOINT	197.02727	115.13132

1561085017352	TYPE_ACCELEROMETER	-0.1149292	0.44293213	9.844955

1561085017352	TYPE_MAGNETIC_FIELD	-25.26	-19.98	-31.14

1561085017352	TYPE_GYROSCOPE	-0.13017273	0.10491943	0.02029419

1561085017352	TYPE_ROTATION_VECTOR	0.018641658	-0.024995841	-0.8971284

1561085017352	TYPE_MAGNETIC_FIELD_UNCALIBRATED	-4.92	-17.64	-100.979996

1561085017352	TYPE_GYROSCOPE_UNCALIBRATED	-0.

In [33]:
# using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./

In [34]:
# Import custom function from the repository
from io_f import read_data_file

# Read in 1 random example
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
sample_file = read_data_file(path)

# You can access the information for each variable:
# Each data is split for time
# Metadata is expressed with "#"

# for i in sample_file.acce[:, [0]]:
#     print(i)
#     print(int(i))

print("~~~ Example ~~~")
print("acce: {}".format(sample_file.acce), "\n" +
      "acce shape: {}".format(sample_file.acce.shape), "\n" +
#       "acacce_uncalice: {}".format(sample_file.acce_uncali), "\n" +
      "acacce_uncalice shape: {}".format(sample_file.acce_uncali.shape), "\n" +
#       "ahrs: {}".format(sample_file.ahrs), "\n" +
      "ahrs shape: {}".format(sample_file.ahrs.shape), "\n" +
#       "gyro: {}".format(sample_file.gyro), "\n" +
      "gyro shape: {}".format(sample_file.gyro.shape), "\n" +
#       "gyro_uncali: {}".format(sample_file.gyro_uncali), "\n" +
      "gyro_uncali shape: {}".format(sample_file.gyro_uncali.shape), "\n" +
#       "ibeacon: {}".format(sample_file.ibeacon), "\n" +
      "ibeacon shape: {}".format(sample_file.ibeacon.shape), "\n" +
#       "magn: {}".format(sample_file.magn), "\n" +
      "magn shape: {}".format(sample_file.magn.shape), "\n" +
#       "magn_uncali: {}".format(sample_file.magn_uncali), "\n" +
      "magn_uncali shape: {}".format(sample_file.magn_uncali.shape), "\n" +
#       "waypoint: {}".format(sample_file.waypoint), "\n" +
      "waypoint shape: {}".format(sample_file.waypoint.shape), "\n" +
#       "wifi: {}".format(sample_file.wifi), "\n" +
      "wifi shape: {}".format(sample_file.wifi.shape))

~~~ Example ~~~
acce: [[ 1.57657286e+12 -2.12945560e+00 -3.34091200e-01  5.88655100e+00]
 [ 1.57657286e+12 -2.18690500e+00 -3.38272100e-01  4.83909600e+00]
 [ 1.57657286e+12 -9.64675900e-01 -8.85955800e-01  5.00907900e+00]
 ...
 [ 1.57657297e+12 -2.11987300e+00  5.22430400e-01  9.71009800e+00]
 [ 1.57657297e+12 -2.07139590e+00  6.11022950e-01  9.69572450e+00]
 [ 1.57657297e+12 -1.83377080e+00  3.50646970e-01  1.03218230e+01]] 
acce shape: (5299, 4) 
acacce_uncalice shape: (5299, 4) 
ahrs shape: (5299, 4) 
gyro shape: (5299, 4) 
gyro_uncali shape: (5299, 4) 
ibeacon shape: (250, 3) 
magn shape: (5299, 4) 
magn_uncali shape: (5299, 4) 
waypoint shape: (19, 3) 
wifi shape: (2403, 5)


In [35]:
# def show_site_png(root_path, site):
#     floor_paths = glob.glob(root_path + "metadata/" + site + "/*/floor_image.png")
#     n = len(floor_paths)
#     print("No. of floor paths: ", n)

#     # Create the custom number of rows & columns
#     ncols = [ceil(n / 3) if n > 4 else 4][0]
#     nrows = [ceil(n / ncols) if n > 4 else 1][0]

#     plt.figure(figsize=(16, 10))
#     plt.suptitle(f"Site no. '{site}'", fontsize=18)

#     # Plot image for each floor
#     for k, floor in enumerate(floor_paths):
#         # plt.subplot(nrows, ncols, k+1)
#         plt.subplot(ncols, nrows, k+1)
#         plt.rcParams["figure.facecolor"] = "white"

#         image = Image.open(floor)

#         plt.imshow(image)
#         plt.axis("off")
#         title = floor.split("/")[5]
#         plt.title(title, fontsize=15)

In [36]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [37]:
# # Checking the floor number distribution

# all_floors = glob.glob("../input/indoor-location-navigation/metadata/*/*")
# all_sites = glob.glob("../input/indoor-location-navigation/metadata/*")
# floor_no = []
# floor_counts = []

# # Floor count
# for site in all_sites:
#     floor_count = len([name for name in os.listdir(site)])
#     floor_counts.append(floor_count)

# floor_counts_df = pd.DataFrame(floor_counts, columns=["F_Count"])
# floor_counts_df = floor_counts_df["F_Count"].value_counts().reset_index()
# floor_counts_df = floor_counts_df.sort_values("index", ascending=True)

# # Extract only the floor number
# for floor in all_floors:
#     no = floor.split("/")[5]
#     floor_no.append(no)
    
# floor_no = pd.DataFrame(floor_no, columns=["No"])
# floor_no = floor_no["No"].value_counts().reset_index()
# floor_no = floor_no.sort_values("No", ascending=False)

# # ToDo: Floor expressions need to be fixed
# # 1F -> F1, L1 -> F1, G -> F1 etc

# # Plot
# # display(floor_counts_df.head(10))

# fig, axes = plt.subplots(ncols=2, figsize=(16, 10))
# axes[0] = sns.barplot(data=floor_counts_df, x="index", y="F_Count", palette="viridis", saturation=0.4, ax=axes[0])
# axes[0].set_title("Floor Count Distribution", size = 26, weight="bold")
# axes[0].set_xlabel("")
# axes[0].set_ylabel("Floor Count", size = 18, weight="bold")

# axes[1] = sns.barplot(data=floor_no, x="No", y="index", palette="viridis", saturation=0.4, ax=axes[1])
# axes[1].set_title("Frequency of Floors", size = 26, weight="bold")
# axes[1].set_xlabel("")
# axes[1].set_ylabel("Floor No.", size = 18, weight="bold")

# plt.xticks([])
# plt.yticks(fontsize=11)
# sns.despine(left=True, bottom=True);

In [38]:
# # Metadata checking (GeoJSON)
# # This is a vector representation of floor map
# geojson_paths = glob.glob("../input/indoor-location-navigation/metadata/*/*/geojson_map.json")
# print("No. of geojson file: {}".format(len(geojson_paths)))

# # Print one example
# ex = random.randint(0, len(geojson_paths))
# geojson_file_name = geojson_paths[ex]
# with open(geojson_file_name) as json_file:
#     paths = geojson_file_name.split("/")
#     site_id = paths[4]
#     floor = paths[5]
#     json_data = json.load(json_file)
#     json_properties = json_data["features"][0]["properties"]
#     print("File path: {}".format(geojson_file_name))
#     print("SiteID: {}".format(site_id))
#     print("Floor: {}".format(floor))
#     print("Floor info: {}".format(json_properties))

# # create id and floor number matching file
# site_ids = []
# floor_no = []
# floor_no_json = []

# for i in range(0, len(geojson_paths)):
#     with open(geojson_paths[i]) as f:
#         paths = geojson_paths[i].split("/")
#         site_id = paths[4]
#         floor = paths[5]
#         site_ids.append(site_id)
#         floor_no.append(floor)
#         d = json.load(f)
#         try:
#             floor_no_json.append(d["features"][0]["properties"]["floor_num"])
#         except:
#             floor_no_json.append(np.nan)

# floor_num_df = pd.DataFrame(
#     {"site_id": site_ids,
#      "floor_no": floor_no,
#      "floor_no_json": floor_no_json,
#     })

# display("floor_num_df length: {}".format(len(floor_num_df)))
# display(floor_num_df.head())

# # Get floormap dict to be used later
# floor_map_pairs = list(zip(floor_num_df["floor_no"], floor_num_df["floor_no_json"]))
# floor_map_pairs = np.unique(floor_map_pairs, axis=0) # get unique pair
# # print(floor_map_pairs) # to be used as floor_map later

# # Plot distribution
# floor_num_count_df = floor_num_df["floor_no_json"].value_counts().reset_index()
# floor_num_count_df = floor_num_count_df.sort_values("floor_no_json", ascending=False)
# # display(floor_num_count_df)
# # print(len(floor_num_count_df["floor_no_json"] == np.nan))

# fig = plt.figure()
# ax = plt.subplots(figsize=(16, 10))
# sns.barplot(data=floor_num_count_df, x="index", y="floor_no_json", palette="viridis", saturation=0.4)
# fig.show()

# # Just in case: Need for altitude info in geoJSON
# # from pyproj import Proj, transform
# # print(transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), -0.1285907, 51.50809))  # longitude first, latitude second.
# # output (meters east of 0, meters north of 0): (-14314.651244750548, 6711665.883938471)

In [39]:
# # More viz on accelerometers, wifi etc in one go
# from visualize_f import visualize_trajectory, visualize_heatmap
# from main import extract_wifi_rssi, extract_wifi_count
# from main import calibrate_magnetic_wifi_ibeacon_to_position
# from main import extract_magnetic_strength
# from main import extract_ibeacon_rssi

# # Visualizing magnetic strength
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# # extract mag, wifi, beacon of one example
# mwi_datas = calibrate_magnetic_wifi_ibeacon_to_position([path])
# magnetic_strength = extract_magnetic_strength(mwi_datas)
# wifi_rssi = extract_wifi_rssi(mwi_datas)
# wifi_counts = extract_wifi_count(mwi_datas)
# ibeacon_rssi = extract_ibeacon_rssi(mwi_datas)
# ibeacon_ummids = list(ibeacon_rssi.keys())
# target_ibeacon = ibeacon_ummids[0]

# # positions for heatmap
# heat_positions = np.array(list(magnetic_strength.keys()))
# heat_values = np.array(list(magnetic_strength.values()))
# heat_positions_wifi = np.array(list(wifi_counts.keys()))
# heat_values_wifi = np.array(list(wifi_counts.values()))
# heat_positions_bc = np.array(list(ibeacon_rssi[target_ibeacon].keys()))
# heat_values_bc = np.array(list(ibeacon_rssi[target_ibeacon].values()))[:, 0]

# # filter out positions that no wifi detected
# mask = heat_values_wifi != 0
# heat_positions_wifi = heat_positions_wifi[mask]
# heat_values_wifi = heat_values_wifi[mask]

# # get trajectory
# example = read_data_file(path)
# trajectory = example.waypoint # Returns timestamp, x, y values
# print(f"Waypoints: {trajectory}")
# trajectory = trajectory[:, 1:3] # Removes timestamp (we only need the coordinates)

# # Plot trajectory
# visualize_trajectory(trajectory = trajectory,
#                      floor_plan_filename = floor_plan_filename,
#                      width_meter = width_meter,
#                      height_meter = height_meter,
#                      title = "Example of Waypoint",)

In [40]:
# Try working out step_positions for 1 trace file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce_datas)
    headings = compute_headings(ahrs_datas)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, headings)
    rel_positions = compute_rel_positions(stride_lengths, step_headings)
    # only use del if we don't need timestamps
    # rel_positions_del = np.delete(rel_positions, 0, 1)
    return rel_positions

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    d = np.array(magn_datas[2:5])
    return np.mean(np.sqrt(np.sum(d ** 2, axis=0)))

In [41]:
# path_datas = read_data_file(path)
# acce_datas = path_datas.acce
# magn_datas = path_datas.magn
# ahrs_datas = path_datas.ahrs
# wifi_datas = path_datas.wifi
# ibeacon_datas = path_datas.ibeacon
# posi_datas = path_datas.waypoint # not to be used

# # acce and ahrs data translation
# rel_positions = calc_rel_positions(acce_datas, ahrs_datas)
# print(acce_datas.shape)
# print(acce_datas[0])
# print(ahrs_datas[0])
# print(rel_positions.shape)

# # magn data translation
# print(magn_datas.shape)
# print(magn_datas[0])
# # print(extract_magnetic_strength(magn_datas))

In [60]:
# Methods for preprocessing train data: Timestamp handling

def find_diff_ts(ts, data):
    data_ts = data[0]
    diff_ts = int(data_ts) - int(ts)
    return diff_ts

def find_start_ts(path):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line_data in lines:
        line_data = line_data.strip()
        m = re.search(r"(?<=startTime.)(.*)", line_data)
        start_ts = m.groups(0)
        if m:
            return (start_ts[0])

def find_smallest_diff(t, data):
    data_ts = data[:, [0]]
    diff = []
    for ts in data_ts:
        diff.append(abs(int(t) - int(ts)))
    closest_index = np.argmin(diff) # if multiple records have the same value..?
    return data[closest_index]

In [61]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    z_axis = data[3]
    try:
        accuracy = data[4]
    except IndexError:
        accuracy = np.nan
    return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    bssid = data[2]
    rssi = data[3]
    if len(data) > 5:
        freq = data[4]
        last_seen_ts = data[5]
    else:
        freq = np.nan
        last_seen_ts = data[-1]
    return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    rssi = data[2]
    return [data_ts, diff_ts, ssid, rssi]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    return [data_ts, diff_ts, x_axis, y_axis]

In [62]:
# Extract path and other data

def extract_path(path, floor_map):
    # split path
    try:
        ex_path = f"{path}"
        ex_paths = ex_path.split("/")
        site_id = ex_paths[4]
        floor = ex_paths[5]
        f = floor_map[floor]
        file_id = ex_paths[6].split(".")[0]
        return [site_id, file_id, f, floor]
    except:
        print("extract_path error")

# Definitely needs to be refactored
def extract_data(path):
    try:
        start_ts = find_start_ts(path)
        path_datas = read_data_file(path)
        acce = path_datas.acce
        ahrs = path_datas.ahrs
        magn = path_datas.magn
        gyro = path_datas.gyro
        acce_uncali = path_datas.acce_uncali
        magn_uncali = path_datas.magn_uncali
        gyro_uncali = path_datas.gyro_uncali
        wifi = path_datas.wifi
        wps = path_datas.waypoint
        ibeacon = path_datas.ibeacon
        rel_positions = calc_rel_positions(acce, ahrs)

        # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
        # ts = np.unique(wps[:, [0]])
        ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data

        # extract data for each timestamp of waypoints
        res = []
        for i, t in enumerate(ts):
            wp_closest = find_smallest_diff(t, wps)
            closest_wp_ts = wp_closest[0]
            diff_ts_wp_ts = int(t) - int(closest_wp_ts)
            # time_stamp_cut = 2000, only the records within 2 sec of waypoint are kept
            if abs(diff_ts_wp_ts) < time_stamp_cut:
            # flag to indicate how close the data point is to the wps
                within_500ms = True if abs(diff_ts_wp_ts) <= 500 else False
                within_1000ms = True if abs(diff_ts_wp_ts) <= 1000 else False
                x = wp_closest[1]
                y = wp_closest[2]
                diff_start_ts = int(t) - int(start_ts)
                diff_start_wp_ts = int(closest_wp_ts) - int(start_ts)
                acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
                ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
                magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
                magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
                gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
                acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
                magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
                gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
                wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
                beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
                rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)
                res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_500ms, within_1000ms] + \
                           acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                           acce_u_closest + magn_u_closest + gyro_u_closest + \
                           wifi_closest + beacon_closest + rel_pos
                          )
            else:
                continue
        return res
    except:
        print("extract_data error")

In [63]:
# %%timeit

# 5.55 ms ± 1.76 ms per loop
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# for fixing floor expression
# print(floor_map_pairs) # to be used as floor_map later
# assign 1F to 1 rather than zero, just in case we want to use this as integer

# Original floor map
# floor_map = {
#     '1F': 1, '2F': 2, '3F': 3, '4F': 4, '5F': 5, '6F': 6, '7F': 7,
#     '8F': 8, '9F': 9, 'B': -1, 'B1': -1, 'B2': -2, 'B2': -3, 'B3': -3,
#     'BF': -1, 'BM': -1, 'F1': 1, 'F2': 2, 'F3': 3, 'F4': 4, 'F5': 5,
#     'F6': 6, 'F7': 7, 'F8': 8, 'F9': 9, 'F10': 10, 'G': -1, 'L1': 1, 'L2': 2,
#     'L3': 3, 'L3': 4, 'L4': 4, 'L4': 6, 'L5': 5, 'L6': 6, 'L7': 7, 'L8': 8,
#     'L9': 9, 'L10': 10, 'L11': 11, 'LG1': -1, 'LG2': -2,
#     'LM': np.nan, 'M': np.nan, 'P1': np.nan, 'P2': np.nan}

floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

def one_trace_to_rows(path, floor_map):
    try:
        path_info = extract_path(path, floor_map)
        data = extract_data(path)
        # rows = list(itertools.chain(path_info, *data))
        rows = []
        for d in data:
            row = path_info + d
            rows.append(row)
            # print("row: ", row)
        return rows
    except:
        print("one_trace_to_rows error")

# path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# time w/ for loop with 1 train_path -> 11.642422199249268
# time w/ itertools.chain for 1 train_path -> 11.862319946289062
start = time.time()
print("path: ", path)
path_info = extract_path(path, floor_map)
rows = one_trace_to_rows(path, floor_map)
print("time to process one train_path", time.time() - start)
print("col count: ", len(rows[0]))
print("rows: ", rows)

path:  ../input/indoor-location-navigation/train/5cd56c0ee2acfd2d33b6b000/L1/5cf4dd74bf5b110008c9225c.txt
time to process one train_path 3.6772398948669434
col count:  72
rows:  [['5cd56c0ee2acfd2d33b6b000', '5cf4dd74bf5b110008c9225c', 1, 'L1', 1559549637113, '1559549636985', 128, 150.99194, 27.006023, 1559549636986, 1, 127, True, True, 1559549637113.0, 128, 0.29551697, 3.3308105, 8.822342, nan, 1559549637113.0, 128, 0.08314253, -0.15517128, -0.8215232, nan, 1559549637113.0, 128, -22.859999, -22.26, -37.14, nan, 48.96393319863102, 1559549637113.0, 128, 0.2498169, 0.17692566, 0.26531982, nan, 1559549637113.0, 128, 0.275177, 3.3092804, 8.854645, nan, 1559549637113.0, 128, -16.079998, -57.18, -162.0, nan, 1559549637113.0, 128, 0.3579712, 0.28588867, 0.30296326, nan, '1559549637837', 852, '90a13d9926e0f11cb22ac9bb4d245a64ed87411c', 'b28e6b446bb6d2e075ac5263c52f0b0f3569220e', '-82', nan, '1559549637100', '1559549637201', 216, '89cb11b04122cef23388b0da06bd426c1f48a9b5_ecf1c8b6048e66a1ef5a967

In [46]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # Data below are the time it took to create the old version of training data (only waypoints)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [47]:
# Check if we can make df

# column names
col_names = ["site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_500ms", "within_1000ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

print(len(col_names))

df = pd.DataFrame(rows, columns=col_names)
print("df len: ", len(df))
print("x value_counts: ", df["x"].value_counts())
print("y value_counts: ", df["y"].value_counts())
print("event ts nunique: ", df["ts"].nunique())
print("start ts nunique: ", df["start_ts"].nunique()) # should be one
print("diff_ts_wp_ts value_counts: ", df["diff_ts_wp_ts"].value_counts())
print("diff_ts_wp_ts nunique: ", df["diff_ts_wp_ts"].nunique())
print("within_500ms value_counts: ", df["within_500ms"].value_counts())
print("within_500ms nunique: ", df["within_500ms"].nunique())
print("within_500ms count: ", df["within_500ms"].count())
print("within_1000ms value_counts: ", df["within_1000ms"].value_counts())
print("within_1000ms nunique: ", df["within_1000ms"].nunique())
print("within_1000ms count: ", df["within_1000ms"].count())
display(df.head())

72
df len:  530
x value_counts:  85.778310    98
84.732254    98
86.678110    98
83.544500    98
86.276460    98
87.459360    40
Name: x, dtype: int64
y value_counts:  43.738575    98
53.027588    98
38.263794    98
65.347340    98
61.583290    98
32.742960    40
Name: y, dtype: int64
event ts nunique:  530
start ts nunique:  1
diff_ts_wp_ts value_counts:   869    2
 137    2
-880    2
 883    2
 259    2
       ..
 715    1
-350    1
-383    1
 342    1
-952    1
Name: diff_ts_wp_ts, Length: 456, dtype: int64
diff_ts_wp_ts nunique:  456
within_500ms value_counts:  False    268
True     262
Name: within_500ms, dtype: int64
within_500ms nunique:  2
within_500ms count:  530
within_1000ms value_counts:  True    530
Name: within_1000ms, dtype: int64
within_1000ms nunique:  1
within_1000ms count:  530


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_500ms,within_1000ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5da958dd46f8266d0737457b,5db00058e62491000652b275,5,F6,1571815326555,1571815326344,211,87.45936,32.74296,1571815326363,19,192,True,True,1571815000000.0,211,-0.522598,1.632858,9.819824,,1571815000000.0,211,0.093714,0.000691,-0.029479,,1571815000000.0,211,-1.693726,21.768188,-36.07483,,42.167714,1571815000000.0,211,0.015732,-0.009521,-0.072647,,1571815000000.0,211,-0.522598,1.632858,9.819824,,1571815000000.0,211,-49.92218,-62.287903,-330.55267,,1571815000000.0,211,-0.056259,0.116043,-0.117905,,1571815328376,2032,1eed8188f57edbff21b82f77bb92589a553f52cf,b3d8ac445b4613fada92b941469806a65c5e932b,-54,,1571815312252,1571815326480,136,89cb11b04122cef23388b0da06bd426c1f48a9b5_2c5fe...,-85,1571815000000.0,1004,-0.009006,0.46768
1,5da958dd46f8266d0737457b,5db00058e62491000652b275,5,F6,1571815326575,1571815326344,231,87.45936,32.74296,1571815326363,19,212,True,True,1571815000000.0,231,-1.019409,1.216263,9.385269,,1571815000000.0,231,0.092665,0.003888,-0.034768,,1571815000000.0,231,-0.990295,23.184204,-37.443542,,44.051184,1571815000000.0,231,0.152618,0.052277,0.022171,,1571815000000.0,231,-0.522598,1.632858,9.819824,,1571815000000.0,231,-49.21875,-60.871887,-331.9214,,1571815000000.0,231,0.011398,-0.006485,-0.07103,,1571815328376,2032,1eed8188f57edbff21b82f77bb92589a553f52cf,b3d8ac445b4613fada92b941469806a65c5e932b,-54,,1571815312252,1571815326480,136,89cb11b04122cef23388b0da06bd426c1f48a9b5_2c5fe...,-85,1571815000000.0,1004,-0.009006,0.46768
2,5da958dd46f8266d0737457b,5db00058e62491000652b275,5,F6,1571815326596,1571815326344,252,87.45936,32.74296,1571815326363,19,233,True,True,1571815000000.0,252,-1.073883,1.376083,8.722076,,1571815000000.0,252,0.090414,0.011469,-0.040967,,1571815000000.0,252,0.416565,21.768188,-36.75995,,42.723781,1571815000000.0,252,0.112137,0.234436,0.167587,,1571815000000.0,252,-1.019409,1.216263,9.385269,,1571815000000.0,252,-47.81189,-62.287903,-331.2378,,1571815000000.0,252,0.148285,0.055313,0.023788,,1571815328376,2032,1eed8188f57edbff21b82f77bb92589a553f52cf,b3d8ac445b4613fada92b941469806a65c5e932b,-54,,1571815312252,1571815326480,136,89cb11b04122cef23388b0da06bd426c1f48a9b5_2c5fe...,-85,1571815000000.0,1004,-0.009006,0.46768
3,5da958dd46f8266d0737457b,5db00058e62491000652b275,5,F6,1571815326616,1571815326344,272,87.45936,32.74296,1571815326363,19,253,True,True,1571815000000.0,272,-0.284973,1.109726,8.696945,,1571815000000.0,272,0.089781,0.018976,-0.044106,,1571815000000.0,272,0.416565,22.476196,-34.022522,,40.778486,1571815000000.0,272,-0.048187,0.384109,0.27037,,1571815000000.0,272,-1.073883,1.376083,8.722076,,1571815000000.0,272,-47.81189,-61.579895,-328.50037,,1571815000000.0,272,0.107803,0.237473,0.169205,,1571815328376,2032,1eed8188f57edbff21b82f77bb92589a553f52cf,b3d8ac445b4613fada92b941469806a65c5e932b,-54,,1571815312252,1571815326480,136,89cb11b04122cef23388b0da06bd426c1f48a9b5_2c5fe...,-85,1571815000000.0,1004,-0.009006,0.46768
4,5da958dd46f8266d0737457b,5db00058e62491000652b275,5,F6,1571815326636,1571815326344,292,87.45936,32.74296,1571815326363,19,273,True,True,1571815000000.0,292,0.374023,1.829178,9.127899,,1571815000000.0,292,0.086191,0.021967,-0.039744,,1571815000000.0,292,0.416565,22.476196,-36.75995,,43.088825,1571815000000.0,292,-0.152054,0.352676,0.194733,,1571815000000.0,292,-0.284973,1.109726,8.696945,,1571815000000.0,292,-47.81189,-61.579895,-331.2378,,1571815000000.0,292,-0.052521,0.387146,0.271988,,1571815328376,2032,1eed8188f57edbff21b82f77bb92589a553f52cf,b3d8ac445b4613fada92b941469806a65c5e932b,-54,,1571815312252,1571815326480,136,89cb11b04122cef23388b0da06bd426c1f48a9b5_2c5fe...,-85,1571815000000.0,1004,-0.009006,0.46768


In [48]:
# Set pool
num_cores = multiprocessing.cpu_count()
print(f"num_cores={num_cores}")
args = [(p, floor_map) for p in train_paths[:train_num]]
pool = Pool(num_cores)

start = time.time()
res = pool.starmap(one_trace_to_rows, args)
df_train = pd.DataFrame(res[0], columns=col_names)
for r in res[1:]:
    df = pd.DataFrame(r, columns=col_names)
    df_train = df_train.append(df)

# Memo
# time becomes approx. half is we halve the cut off timestamp
# 10 paths -> 52~58 secs (CPU, num_cores=4)
# 10 paths -> 55 secs (TPU, num_cores=4)
# 100 paths -> 
# 1000 paths ->
print("train_path count", len(train_paths[:train_num]))
print("time to process", time.time() - start)
print("length of df made", len(df_train))
display(df_train.head(10))

num_cores=4
extract_data error
extract_data error
one_trace_to_rows error
one_trace_to_rows error
extract_data error
one_trace_to_rows error
extract_data error
one_trace_to_rows error
extract_data error
one_trace_to_rows error
extract_data error
one_trace_to_rows error
extract_data error
one_trace_to_rows error
train_path count 10
time to process 27.874643564224243
length of df made 1737


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_500ms,within_1000ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179027,1560916178830,197,13.726707,22.423597,1560916178830,0,197,True,True,1560916000000.0,197,-0.215485,0.691925,9.414001,,1560916000000.0,197,0.019111,-0.033138,-0.981493,,1560916000000.0,197,-11.88,-31.619999,-27.9,,43.810601,1560916000000.0,197,-0.084213,0.018066,0.016937,,1560916000000.0,197,-0.215485,0.691925,9.414001,,1560916000000.0,197,16.380001,-32.94,-91.02,,1560916000000.0,197,-0.072433,0.039948,0.029831,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
1,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179047,1560916178830,217,13.726707,22.423597,1560916178830,0,217,True,True,1560916000000.0,217,-0.287308,0.67038,9.540894,,1560916000000.0,217,0.016971,-0.034242,-0.983516,,1560916000000.0,217,-11.639999,-31.619999,-27.9,,43.74613,1560916000000.0,217,-0.141205,0.004211,0.005219,,1560916000000.0,217,-0.215485,0.691925,9.414001,,1560916000000.0,217,16.62,-32.94,-91.02,,1560916000000.0,217,-0.098541,0.02771,0.019714,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
2,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179067,1560916178830,237,13.726707,22.423597,1560916178830,0,237,True,True,1560916000000.0,237,-0.265762,0.596161,9.758759,,1560916000000.0,237,0.020993,-0.031925,-0.983147,,1560916000000.0,237,-11.46,-31.56,-27.9,,43.655185,1560916000000.0,237,-0.175827,0.001541,0.008942,,1560916000000.0,237,-0.265762,0.596161,9.758759,,1560916000000.0,237,16.8,-32.88,-91.02,,1560916000000.0,237,-0.190155,0.011185,0.011719,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
3,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179087,1560916178830,257,13.726707,22.423597,1560916178830,0,257,True,True,1560916000000.0,257,-0.179565,0.550659,9.844955,,1560916000000.0,257,0.020695,-0.030332,-0.983186,,1560916000000.0,257,-11.46,-31.38,-27.66,,43.371783,1560916000000.0,257,-0.164642,0.031906,0.017471,,1560916000000.0,257,-0.265762,0.596161,9.758759,,1560916000000.0,257,16.8,-32.7,-90.78,,1560916000000.0,257,-0.190155,0.011185,0.011719,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
4,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179107,1560916178830,277,13.726707,22.423597,1560916178830,0,277,True,True,1560916000000.0,277,-0.131683,0.567429,9.727646,,1560916000000.0,277,0.020536,-0.02862,-0.98321,,1560916000000.0,277,-11.34,-31.5,-27.96,,43.618886,1560916000000.0,277,-0.144394,0.044159,0.019608,,1560916000000.0,277,-0.131683,0.567429,9.727646,,1560916000000.0,277,16.92,-32.82,-91.08,,1560916000000.0,277,-0.17897,0.04155,0.020248,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
5,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179126,1560916178830,296,13.726707,22.423597,1560916178830,0,296,True,True,1560916000000.0,296,-0.160416,0.588974,9.631866,,1560916000000.0,296,0.020648,-0.027042,-0.983207,,1560916000000.0,296,-11.46,-31.26,-28.199999,,43.632088,1560916000000.0,296,-0.149185,0.045761,-0.000107,,1560916000000.0,296,-0.160416,0.588974,9.631866,,1560916000000.0,296,16.8,-32.58,-91.32,,1560916000000.0,296,-0.158722,0.053802,0.022385,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
6,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179146,1560916178830,316,13.726707,22.423597,1560916178830,0,316,True,True,1560916000000.0,316,-0.208298,0.63446,9.715668,,1560916000000.0,316,0.020849,-0.02559,-0.983217,,1560916000000.0,316,-11.46,-31.08,-28.5,,43.698375,1560916000000.0,316,-0.186478,0.017517,-0.016632,,1560916000000.0,316,-0.208298,0.63446,9.715668,,1560916000000.0,316,16.8,-32.4,-91.619995,,1560916000000.0,316,-0.163513,0.055405,0.00267,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
7,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179166,1560916178830,336,13.726707,22.423597,1560916178830,0,336,True,True,1560916000000.0,336,-0.17717,0.65361,9.866501,,1560916000000.0,336,0.020905,-0.023918,-0.983264,,1560916000000.0,336,-11.639999,-30.9,-28.5,,43.618225,1560916000000.0,336,-0.205658,-0.018173,-0.020889,,1560916000000.0,336,-0.17717,0.65361,9.866501,,1560916000000.0,336,16.62,-32.22,-91.619995,,1560916000000.0,336,-0.219986,-0.00853,-0.018112,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
8,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179186,1560916178830,356,13.726707,22.423597,1560916178830,0,356,True,True,1560916000000.0,356,-0.110138,0.63446,9.835388,,1560916000000.0,356,0.020529,-0.02196,-0.983347,,1560916000000.0,356,-11.759999,-30.779999,-28.8,,43.76238,1560916000000.0,356,-0.177414,-0.010178,-0.014496,,1560916000000.0,356,-0.110138,0.63446,9.835388,,1560916000000.0,356,16.5,-32.1,-91.92,,1560916000000.0,356,-0.219986,-0.00853,-0.018112,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
9,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179206,1560916178830,376,13.726707,22.423597,1560916178830,0,376,True,True,1560916000000.0,376,-0.057465,0.600952,9.636658,,1560916000000.0,376,0.019983,-0.020083,-0.983425,,1560916000000.0,376,-12.0,-30.779999,-28.859999,,43.866934,1560916000000.0,376,-0.143341,0.009003,-0.011841,,1560916000000.0,376,-0.057465,0.600952,9.636658,,1560916000000.0,376,16.26,-32.1,-91.979996,,1560916000000.0,376,-0.191742,-0.000534,-0.011719,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686


In [49]:
print("df len: ", len(df_train), "\n")
print("site_id value_counts: ", df_train["site_id"].value_counts(), "\n")
print("file_id value_counts: ", df_train["file_id"].value_counts(), "\n")
print("floor value_counts: ", df_train["floor"].value_counts(), "\n")
print("x value_counts: ", df_train["x"].value_counts(), "\n")
print("y value_counts: ", df_train["y"].value_counts(), "\n")
print("event ts nunique: ", df_train["ts"].nunique(), "\n")
print("start ts nunique: ", df_train["start_ts"].nunique(), "\n") # should be one
print("diff_ts_wp_ts value_counts: ", df_train["diff_ts_wp_ts"].value_counts(), "\n")
print("diff_ts_wp_ts nunique: ", df_train["diff_ts_wp_ts"].nunique(), "\n")
print("within_500ms value_counts: ", df_train["within_500ms"].value_counts(), "\n")
print("within_500ms nunique: ", df_train["within_500ms"].nunique(), "\n")
print("within_500ms count: ", df_train["within_500ms"].count(), "\n")
print("within_1000ms value_counts: ", df_train["within_1000ms"].value_counts(), "\n")
print("within_1000ms nunique: ", df_train["within_1000ms"].nunique(), "\n")
print("within_1000ms count: ", df_train["within_1000ms"].count(), "\n")
display(df_train.head())

df len:  1737 

site_id value_counts:  5cd56c0ce2acfd2d33b6ab27    1737
Name: site_id, dtype: int64 

file_id value_counts:  5d09b23ccfb49b00085466a6    748
5d09b23bf03da60008286c47    545
5d09b23fcfb49b00085466aa    444
Name: file_id, dtype: int64 

floor value_counts:  F3    1737
Name: floor, dtype: int64 

x value_counts:  45.678673    303
56.869186    202
49.860214    202
50.512722    201
38.390095    143
13.726707    142
66.124870    141
15.908386    101
43.870420    101
19.775023    101
16.894894    100
Name: x, dtype: int64 

y value_counts:  32.961018    303
31.726166    202
31.729326    202
34.399567    201
29.042292    143
22.423597    142
30.970420    141
36.060284    101
20.307095    101
24.560215    101
18.667006    100
Name: y, dtype: int64 

event ts nunique:  1737 

start ts nunique:  3 

diff_ts_wp_ts value_counts:   555    3
-118    3
 199    3
-396    3
-534    3
       ..
 907    1
 913    1
 917    1
 918    1
 962    1
Name: diff_ts_wp_ts, Length: 1144, dtype: int

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_500ms,within_1000ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179027,1560916178830,197,13.726707,22.423597,1560916178830,0,197,True,True,1560916000000.0,197,-0.215485,0.691925,9.414001,,1560916000000.0,197,0.019111,-0.033138,-0.981493,,1560916000000.0,197,-11.88,-31.619999,-27.9,,43.810601,1560916000000.0,197,-0.084213,0.018066,0.016937,,1560916000000.0,197,-0.215485,0.691925,9.414001,,1560916000000.0,197,16.380001,-32.94,-91.02,,1560916000000.0,197,-0.072433,0.039948,0.029831,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
1,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179047,1560916178830,217,13.726707,22.423597,1560916178830,0,217,True,True,1560916000000.0,217,-0.287308,0.67038,9.540894,,1560916000000.0,217,0.016971,-0.034242,-0.983516,,1560916000000.0,217,-11.639999,-31.619999,-27.9,,43.74613,1560916000000.0,217,-0.141205,0.004211,0.005219,,1560916000000.0,217,-0.215485,0.691925,9.414001,,1560916000000.0,217,16.62,-32.94,-91.02,,1560916000000.0,217,-0.098541,0.02771,0.019714,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
2,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179067,1560916178830,237,13.726707,22.423597,1560916178830,0,237,True,True,1560916000000.0,237,-0.265762,0.596161,9.758759,,1560916000000.0,237,0.020993,-0.031925,-0.983147,,1560916000000.0,237,-11.46,-31.56,-27.9,,43.655185,1560916000000.0,237,-0.175827,0.001541,0.008942,,1560916000000.0,237,-0.265762,0.596161,9.758759,,1560916000000.0,237,16.8,-32.88,-91.02,,1560916000000.0,237,-0.190155,0.011185,0.011719,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
3,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179087,1560916178830,257,13.726707,22.423597,1560916178830,0,257,True,True,1560916000000.0,257,-0.179565,0.550659,9.844955,,1560916000000.0,257,0.020695,-0.030332,-0.983186,,1560916000000.0,257,-11.46,-31.38,-27.66,,43.371783,1560916000000.0,257,-0.164642,0.031906,0.017471,,1560916000000.0,257,-0.265762,0.596161,9.758759,,1560916000000.0,257,16.8,-32.7,-90.78,,1560916000000.0,257,-0.190155,0.011185,0.011719,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686
4,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916179107,1560916178830,277,13.726707,22.423597,1560916178830,0,277,True,True,1560916000000.0,277,-0.131683,0.567429,9.727646,,1560916000000.0,277,0.020536,-0.02862,-0.98321,,1560916000000.0,277,-11.34,-31.5,-27.96,,43.618886,1560916000000.0,277,-0.144394,0.044159,0.019608,,1560916000000.0,277,-0.131683,0.567429,9.727646,,1560916000000.0,277,16.92,-32.82,-91.08,,1560916000000.0,277,-0.17897,0.04155,0.020248,,1560916179432,602,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77,,1560916166728,1560916180806,1976,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97,1560916000000.0,1683,0.171185,-0.403686


In [50]:
# Visualizing timestamp distribution

# Explore
# print(df_train["ts"].dtype)
# print(df_test["ts"].dtype)

# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le"%col] = le.fit_transform(df[col])

col_enc = ["site_id", "file_id", "floor_converted", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
col_encode(df_train, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    for col in col_list:
        df[col] = df[col].astype(dtype)

convert_dtypes(df_train, ["ts", "start_ts", "diff_start_ts", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",\
             "acce_ts", "diff_acce_ts", \
             "ahrs_ts", "diff_ahrs_ts", \
             "magn_ts", "diff_magn_ts", \
             "gyro_ts", "diff_gyro_ts", \
             "acce_u_ts", "diff_acce_u_ts", \
             "magn_u_ts", "diff_magn_u_ts", \
             "gyro_u_ts", "diff_gyro_u_ts", \
             "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_rssi", \
             "rel_ts", "diff_rel_ts"
            ], float)

# convert ts and wifi_last_see_ts to dates
for df in [df_train]:
    for col in ["ts", "wifi_last_seen_ts"]:
        df["%s_date"%col] = pd.to_datetime(df[col],unit="ms")
        df["%s_day"%col] = df["%s_date"%col].dt.floor("d")
        df["%s_hour"%col] = df["%s_date"%col].dt.floor("h")
        df["%s_minute"%col] = df["%s_date"%col].values.astype("<M8[m]")

# Check
display(df_train.head())
# display(df_test.head())

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_500ms,within_1000ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_converted_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916000000.0,1560916000000.0,197.0,13.726707,22.423597,1560916000000.0,0.0,197.0,True,True,1560916000000.0,197.0,-0.215485,0.691925,9.414001,,1560916000000.0,197.0,0.019111,-0.033138,-0.981493,,1560916000000.0,197.0,-11.88,-31.619999,-27.9,,43.810601,1560916000000.0,197.0,-0.084213,0.018066,0.016937,,1560916000000.0,197.0,-0.215485,0.691925,9.414001,,1560916000000.0,197.0,16.380001,-32.94,-91.02,,1560916000000.0,197.0,-0.072433,0.039948,0.029831,,1560916000000.0,602.0,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77.0,,1560916000000.0,1560916000000.0,1976.0,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97.0,1560916000000.0,1683.0,0.171185,-0.403686,0,2,0,2,2,1,2019-06-19 03:49:39.027000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00,2019-06-19 03:49:26.728,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00
1,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916000000.0,1560916000000.0,217.0,13.726707,22.423597,1560916000000.0,0.0,217.0,True,True,1560916000000.0,217.0,-0.287308,0.67038,9.540894,,1560916000000.0,217.0,0.016971,-0.034242,-0.983516,,1560916000000.0,217.0,-11.639999,-31.619999,-27.9,,43.74613,1560916000000.0,217.0,-0.141205,0.004211,0.005219,,1560916000000.0,217.0,-0.215485,0.691925,9.414001,,1560916000000.0,217.0,16.62,-32.94,-91.02,,1560916000000.0,217.0,-0.098541,0.02771,0.019714,,1560916000000.0,602.0,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77.0,,1560916000000.0,1560916000000.0,1976.0,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97.0,1560916000000.0,1683.0,0.171185,-0.403686,0,2,0,2,2,1,2019-06-19 03:49:39.047000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00,2019-06-19 03:49:26.728,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00
2,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916000000.0,1560916000000.0,237.0,13.726707,22.423597,1560916000000.0,0.0,237.0,True,True,1560916000000.0,237.0,-0.265762,0.596161,9.758759,,1560916000000.0,237.0,0.020993,-0.031925,-0.983147,,1560916000000.0,237.0,-11.46,-31.56,-27.9,,43.655185,1560916000000.0,237.0,-0.175827,0.001541,0.008942,,1560916000000.0,237.0,-0.265762,0.596161,9.758759,,1560916000000.0,237.0,16.8,-32.88,-91.02,,1560916000000.0,237.0,-0.190155,0.011185,0.011719,,1560916000000.0,602.0,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77.0,,1560916000000.0,1560916000000.0,1976.0,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97.0,1560916000000.0,1683.0,0.171185,-0.403686,0,2,0,2,2,1,2019-06-19 03:49:39.067000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00,2019-06-19 03:49:26.728,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00
3,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916000000.0,1560916000000.0,257.0,13.726707,22.423597,1560916000000.0,0.0,257.0,True,True,1560916000000.0,257.0,-0.179565,0.550659,9.844955,,1560916000000.0,257.0,0.020695,-0.030332,-0.983186,,1560916000000.0,257.0,-11.46,-31.38,-27.66,,43.371783,1560916000000.0,257.0,-0.164642,0.031906,0.017471,,1560916000000.0,257.0,-0.265762,0.596161,9.758759,,1560916000000.0,257.0,16.8,-32.7,-90.78,,1560916000000.0,257.0,-0.190155,0.011185,0.011719,,1560916000000.0,602.0,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77.0,,1560916000000.0,1560916000000.0,1976.0,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97.0,1560916000000.0,1683.0,0.171185,-0.403686,0,2,0,2,2,1,2019-06-19 03:49:39.087000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00,2019-06-19 03:49:26.728,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00
4,5cd56c0ce2acfd2d33b6ab27,5d09b23fcfb49b00085466aa,2,F3,1560916000000.0,1560916000000.0,277.0,13.726707,22.423597,1560916000000.0,0.0,277.0,True,True,1560916000000.0,277.0,-0.131683,0.567429,9.727646,,1560916000000.0,277.0,0.020536,-0.02862,-0.98321,,1560916000000.0,277.0,-11.34,-31.5,-27.96,,43.618886,1560916000000.0,277.0,-0.144394,0.044159,0.019608,,1560916000000.0,277.0,-0.131683,0.567429,9.727646,,1560916000000.0,277.0,16.92,-32.82,-91.08,,1560916000000.0,277.0,-0.17897,0.04155,0.020248,,1560916000000.0,602.0,b1e32753c8cfd3624253d16d9bc944d917c451e4,8760dd3789b36258dea5d2b3687be70eb2163310,-77.0,,1560916000000.0,1560916000000.0,1976.0,bd1b5cf6d9f4f7bcb796b62cc831b6c81b1aa6ae_356a1...,-97.0,1560916000000.0,1683.0,0.171185,-0.403686,0,2,0,2,2,1,2019-06-19 03:49:39.107000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00,2019-06-19 03:49:26.728,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:49:00


In [51]:
# Calculate moving averages
# Differencing respect to time (as each timestep is unevenly spaced)

In [52]:
# Save the file in parquet
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models
import pickle

# Saving train data
train_file_name = "indoor_train_2.pkl"
# test_file_name = "indoor_test.pkl"

with open(train_file_name, "wb") as file:
    pickle.dump(df_train, file)

# with open(test_file_name, "wb") as file:
#     pickle.dump(df_test, file)

# Save them to output
df_train.to_csv('df_train_2.csv',index=False)
# df_test.to_csv('df_test.csv',index=False)