In [1]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool, Manager

import pickle
import math

pd.set_option("display.max_columns", 100)

In [2]:
# Settings and altering components for GCP

# path settings
root_path = "../input/indoor-location-navigation/"
# root_path = "../jupyter/input/"
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

# function imports using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./
from io_f import read_data_file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# import for gcp settings
# import compute_f
# import io_f
# import visualize_f
# import main
# from io_f import read_data_file
# from compute_f import compute_step_positions, compute_steps, \
# compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# filter milisecond setting 
time_stamp_cut = 250

# train number setting
train_num = len(train_paths)
# train_num = round(len(train_paths) / 2)
# train_num = 1000

In [3]:
# Preprocess
print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

# Reading in 1 file
def pick_example(max_range, paths):
    ex = random.randint(0, max_range)
    example_path = paths[ex]
    path = f"{example_path}"
    paths = path.split("/")
    site = paths[4]
    floorNo = paths[5]
    floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
    json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]
    return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
print("example path: ", path)
print("site: ", site)
print("floorNo: ", floorNo)
print("floor_plan_filename: ", floor_plan_filename)
print("json_plan_filename: ", json_plan_filename)
print("width: {}, height: {} ".format(width_meter, height_meter))

with open(path) as p:
    lines = p.readlines()
print("No. Lines in 1 example: {:,}". format(len(lines)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204
example path:  ../input/indoor-location-navigation/train/5cd56c29e2acfd2d33b6d915/L1/5d09a36f0e0fc900086ea744.txt
site:  5cd56c29e2acfd2d33b6d915
floorNo:  L1
floor_plan_filename:  ../input/indoor-location-navigation/metadata/5cd56c29e2acfd2d33b6d915/L1/floor_image.png
json_plan_filename:  ../input/indoor-location-navigation/metadata/5cd56c29e2acfd2d33b6d915/L1/floor_info.json
width: 220.3952814874388, height: 200.4864421618867 
No. Lines in 1 example: 8,008


In [4]:
# for line in lines[:200]:
#     print(line)

In [5]:
# # Read in 1 random example
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# sample_file = read_data_file(path)

# # You can access the information for each variable:
# # Each data is split for time
# # Metadata is expressed with "#"

# # for i in sample_file.acce[:, [0]]:
# #     print(i)
# #     print(int(i))

# print("~~~ Example ~~~")
# print("acce: {}".format(sample_file.acce), "\n" +
#       "acce shape: {}".format(sample_file.acce.shape), "\n" +
# #       "acacce_uncalice: {}".format(sample_file.acce_uncali), "\n" +
#       "acacce_uncalice shape: {}".format(sample_file.acce_uncali.shape), "\n" +
# #       "ahrs: {}".format(sample_file.ahrs), "\n" +
#       "ahrs shape: {}".format(sample_file.ahrs.shape), "\n" +
# #       "gyro: {}".format(sample_file.gyro), "\n" +
#       "gyro shape: {}".format(sample_file.gyro.shape), "\n" +
# #       "gyro_uncali: {}".format(sample_file.gyro_uncali), "\n" +
#       "gyro_uncali shape: {}".format(sample_file.gyro_uncali.shape), "\n" +
# #       "ibeacon: {}".format(sample_file.ibeacon), "\n" +
#       "ibeacon shape: {}".format(sample_file.ibeacon.shape), "\n" +
# #       "magn: {}".format(sample_file.magn), "\n" +
#       "magn shape: {}".format(sample_file.magn.shape), "\n" +
# #       "magn_uncali: {}".format(sample_file.magn_uncali), "\n" +
#       "magn_uncali shape: {}".format(sample_file.magn_uncali.shape), "\n" +
# #       "waypoint: {}".format(sample_file.waypoint), "\n" +
#       "waypoint shape: {}".format(sample_file.waypoint.shape), "\n" +
# #       "wifi: {}".format(sample_file.wifi), "\n" +
#       "wifi shape: {}".format(sample_file.wifi.shape))

In [6]:
# def show_site_png(root_path, site):
#     floor_paths = glob.glob(root_path + "metadata/" + site + "/*/floor_image.png")
#     n = len(floor_paths)
#     print("No. of floor paths: ", n)

#     # Create the custom number of rows & columns
#     ncols = [ceil(n / 3) if n > 4 else 4][0]
#     nrows = [ceil(n / ncols) if n > 4 else 1][0]

#     plt.figure(figsize=(16, 10))
#     plt.suptitle(f"Site no. '{site}'", fontsize=18)

#     # Plot image for each floor
#     for k, floor in enumerate(floor_paths):
#         # plt.subplot(nrows, ncols, k+1)
#         plt.subplot(ncols, nrows, k+1)
#         plt.rcParams["figure.facecolor"] = "white"

#         image = Image.open(floor)

#         plt.imshow(image)
#         plt.axis("off")
#         title = floor.split("/")[5]
#         plt.title(title, fontsize=15)

In [7]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [8]:
# # Checking the floor number distribution

# all_floors = glob.glob("../input/indoor-location-navigation/metadata/*/*")
# all_sites = glob.glob("../input/indoor-location-navigation/metadata/*")
# floor_no = []
# floor_counts = []

# # Floor count
# for site in all_sites:
#     floor_count = len([name for name in os.listdir(site)])
#     floor_counts.append(floor_count)

# floor_counts_df = pd.DataFrame(floor_counts, columns=["F_Count"])
# floor_counts_df = floor_counts_df["F_Count"].value_counts().reset_index()
# floor_counts_df = floor_counts_df.sort_values("index", ascending=True)

# # Extract only the floor number
# for floor in all_floors:
#     no = floor.split("/")[5]
#     floor_no.append(no)
    
# floor_no = pd.DataFrame(floor_no, columns=["No"])
# floor_no = floor_no["No"].value_counts().reset_index()
# floor_no = floor_no.sort_values("No", ascending=False)

# # ToDo: Floor expressions need to be fixed
# # 1F -> F1, L1 -> F1, G -> F1 etc

# # Plot
# # display(floor_counts_df.head(10))

# fig, axes = plt.subplots(ncols=2, figsize=(16, 10))
# axes[0] = sns.barplot(data=floor_counts_df, x="index", y="F_Count", palette="viridis", saturation=0.4, ax=axes[0])
# axes[0].set_title("Floor Count Distribution", size = 26, weight="bold")
# axes[0].set_xlabel("")
# axes[0].set_ylabel("Floor Count", size = 18, weight="bold")

# axes[1] = sns.barplot(data=floor_no, x="No", y="index", palette="viridis", saturation=0.4, ax=axes[1])
# axes[1].set_title("Frequency of Floors", size = 26, weight="bold")
# axes[1].set_xlabel("")
# axes[1].set_ylabel("Floor No.", size = 18, weight="bold")

# plt.xticks([])
# plt.yticks(fontsize=11)
# sns.despine(left=True, bottom=True);

In [9]:
# # Metadata checking (GeoJSON)
# # This is a vector representation of floor map
# geojson_paths = glob.glob("../input/indoor-location-navigation/metadata/*/*/geojson_map.json")
# print("No. of geojson file: {}".format(len(geojson_paths)))

# # Print one example
# ex = random.randint(0, len(geojson_paths))
# geojson_file_name = geojson_paths[ex]
# with open(geojson_file_name) as json_file:
#     paths = geojson_file_name.split("/")
#     site_id = paths[4]
#     floor = paths[5]
#     json_data = json.load(json_file)
#     json_properties = json_data["features"][0]["properties"]
#     print("File path: {}".format(geojson_file_name))
#     print("SiteID: {}".format(site_id))
#     print("Floor: {}".format(floor))
#     print("Floor info: {}".format(json_properties))

# # create id and floor number matching file
# site_ids = []
# floor_no = []
# floor_no_json = []

# for i in range(0, len(geojson_paths)):
#     with open(geojson_paths[i]) as f:
#         paths = geojson_paths[i].split("/")
#         site_id = paths[4]
#         floor = paths[5]
#         site_ids.append(site_id)
#         floor_no.append(floor)
#         d = json.load(f)
#         try:
#             floor_no_json.append(d["features"][0]["properties"]["floor_num"])
#         except:
#             floor_no_json.append(np.nan)

# floor_num_df = pd.DataFrame(
#     {"site_id": site_ids,
#      "floor_no": floor_no,
#      "floor_no_json": floor_no_json,
#     })

# display("floor_num_df length: {}".format(len(floor_num_df)))
# display(floor_num_df.head())

# # Get floormap dict to be used later
# floor_map_pairs = list(zip(floor_num_df["floor_no"], floor_num_df["floor_no_json"]))
# floor_map_pairs = np.unique(floor_map_pairs, axis=0) # get unique pair
# # print(floor_map_pairs) # to be used as floor_map later

# # Plot distribution
# floor_num_count_df = floor_num_df["floor_no_json"].value_counts().reset_index()
# floor_num_count_df = floor_num_count_df.sort_values("floor_no_json", ascending=False)
# # display(floor_num_count_df)
# # print(len(floor_num_count_df["floor_no_json"] == np.nan))

# fig = plt.figure()
# ax = plt.subplots(figsize=(16, 10))
# sns.barplot(data=floor_num_count_df, x="index", y="floor_no_json", palette="viridis", saturation=0.4)
# fig.show()

# # Just in case: Need for altitude info in geoJSON
# # from pyproj import Proj, transform
# # print(transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), -0.1285907, 51.50809))  # longitude first, latitude second.
# # output (meters east of 0, meters north of 0): (-14314.651244750548, 6711665.883938471)

In [10]:
# # More viz on accelerometers, wifi etc in one go
# from visualize_f import visualize_trajectory, visualize_heatmap
# from main import extract_wifi_rssi, extract_wifi_count
# from main import calibrate_magnetic_wifi_ibeacon_to_position
# from main import extract_magnetic_strength
# from main import extract_ibeacon_rssi

# # Visualizing magnetic strength
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# # extract mag, wifi, beacon of one example
# mwi_datas = calibrate_magnetic_wifi_ibeacon_to_position([path])
# magnetic_strength = extract_magnetic_strength(mwi_datas)
# wifi_rssi = extract_wifi_rssi(mwi_datas)
# wifi_counts = extract_wifi_count(mwi_datas)
# ibeacon_rssi = extract_ibeacon_rssi(mwi_datas)
# ibeacon_ummids = list(ibeacon_rssi.keys())
# target_ibeacon = ibeacon_ummids[0]

# # positions for heatmap
# heat_positions = np.array(list(magnetic_strength.keys()))
# heat_values = np.array(list(magnetic_strength.values()))
# heat_positions_wifi = np.array(list(wifi_counts.keys()))
# heat_values_wifi = np.array(list(wifi_counts.values()))
# heat_positions_bc = np.array(list(ibeacon_rssi[target_ibeacon].keys()))
# heat_values_bc = np.array(list(ibeacon_rssi[target_ibeacon].values()))[:, 0]

# # filter out positions that no wifi detected
# mask = heat_values_wifi != 0
# heat_positions_wifi = heat_positions_wifi[mask]
# heat_values_wifi = heat_values_wifi[mask]

# # get trajectory
# example = read_data_file(path)
# trajectory = example.waypoint # Returns timestamp, x, y values
# print(f"Waypoints: {trajectory}")
# trajectory = trajectory[:, 1:3] # Removes timestamp (we only need the coordinates)

# # Plot trajectory
# visualize_trajectory(trajectory = trajectory,
#                      floor_plan_filename = floor_plan_filename,
#                      width_meter = width_meter,
#                      height_meter = height_meter,
#                      title = "Example of Waypoint",)

In [11]:
# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce_datas)
    headings = compute_headings(ahrs_datas)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, headings)
    rel_positions = compute_rel_positions(stride_lengths, step_headings)
    # only use del if we don't need timestamps
    # rel_positions_del = np.delete(rel_positions, 0, 1)
    return rel_positions

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    d = np.array(magn_datas[2:5])
    return np.mean(np.sqrt(np.sum(d ** 2, axis=0)))

In [12]:
# path_datas = read_data_file(path)
# acce_datas = path_datas.acce
# magn_datas = path_datas.magn
# ahrs_datas = path_datas.ahrs
# wifi_datas = path_datas.wifi
# ibeacon_datas = path_datas.ibeacon
# posi_datas = path_datas.waypoint # not to be used

# # acce and ahrs data translation
# rel_positions = calc_rel_positions(acce_datas, ahrs_datas)
# print(acce_datas.shape)
# print(acce_datas[0])
# print(ahrs_datas[0])
# print(rel_positions.shape)

# # magn data translation
# print(magn_datas.shape)
# print(magn_datas[0])
# # print(extract_magnetic_strength(magn_datas))

In [13]:
# Methods for preprocessing train data: Timestamp handling
def find_diff_ts(ts, data):
    data_ts = data[0]
    diff_ts = int(data_ts) - int(ts)
    return diff_ts

def find_start_ts(path):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line_data in lines:
        line_data = line_data.strip()
        m = re.search(r"(?<=startTime.)(.*)", line_data)
        start_ts = m.groups(0)
        if m:
            return (start_ts[0])

def find_smallest_diff(t, data):
    if data.size == 0:
        return np.array([])
    else:
        data_ts = data[:, [0]]
        diff = []
        for ts in data_ts:
            diff.append(abs(int(t) - int(ts)))
        closest_index = np.argmin(diff) # if multiple records have the same value..?
        return data[closest_index]

In [14]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    if data.size == 0:
        # print("no axis data")
        return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        x_axis = data[1]
        y_axis = data[2]
        z_axis = data[3]
        try:
            accuracy = data[4]
        except IndexError:
            accuracy = np.nan
        return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    if data.size == 0:
        # print("no wifi data")
        return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        ssid = data[1]
        bssid = data[2]
        rssi = data[3]
        if len(data) > 5:
            freq = data[4]
            last_seen_ts = data[5]
        else:
            freq = np.nan
            last_seen_ts = data[-1]
        return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    if data.size == 0:
        # print("no beacon data")
        return [np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        ssid = data[1]
        rssi = data[2]
        return [data_ts, diff_ts, ssid, rssi]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    if data.size == 0:
        # print("no rel_pos data")
        return [np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        x_axis = data[1]
        y_axis = data[2]
        return [data_ts, diff_ts, x_axis, y_axis]

In [15]:
# Extract path and other data
def extract_path(path, floor_map):
    # split path
    try:
        ex_path = f"{path}"
        ex_paths = ex_path.split("/")
        site_id = ex_paths[4]
        floor = ex_paths[5]
        f = floor_map[floor]
        file_id = ex_paths[6].split(".")[0]
        return [site_id, file_id, f, floor]
    except:
        print("extract_path error")

# Definitely needs to be refactored
def extract_data(path):
    start_ts = find_start_ts(path)
    path_datas = read_data_file(path)
    acce = path_datas.acce
    ahrs = path_datas.ahrs
    magn = path_datas.magn
    gyro = path_datas.gyro
    acce_uncali = path_datas.acce_uncali
    magn_uncali = path_datas.magn_uncali
    gyro_uncali = path_datas.gyro_uncali
    wifi = path_datas.wifi
    wps = path_datas.waypoint
    ibeacon = path_datas.ibeacon
    rel_positions = calc_rel_positions(acce, ahrs)

    # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
    # ts = np.unique(wps[:, [0]])
    if acce_uncali.any():
        # print("acce_uncali")
        ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data
    elif acce.any():
        # print("acce")
        ts = np.unique(acce[:, [0]])
    else:
        print("no acce or acce_uncali")

    # extract data for each timestamp of waypoints
    res = []
    for t in ts:
        try:
            wp_closest = find_smallest_diff(t, wps)
            closest_wp_ts = wp_closest[0]
            diff_ts_wp_ts = abs(int(t) - int(closest_wp_ts))
            # time_stamp_cut = 2000, only the records within 2 sec of waypoint are kept
            if diff_ts_wp_ts < time_stamp_cut:
                # flag to indicate how close the data point is to the wps
                # print("diff_ts_wp_ts", diff_ts_wp_ts)
                within_100ms = True if abs(diff_ts_wp_ts) <= 100 else False
                within_200ms = True if abs(diff_ts_wp_ts) <= 200 else False
                x = wp_closest[1]
                y = wp_closest[2]
                # print("x, y: ", x, y)
                diff_start_ts = int(t) - int(start_ts)
                diff_start_wp_ts = int(closest_wp_ts) - int(start_ts)
                # print("diff_start_ts, diff_start_wp_ts: ", diff_start_ts, diff_start_wp_ts)
                acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
                ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
                magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
                magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
                gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
                # print("acce: ", acce_closest)
                # print("ahrs: ", ahrs_closest)
                # print("magn: ", magn_closest)
                # print("gyro: ", gyro_closest)
                acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
                magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
                gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
                # print("acce_u_closest: ", acce_u_closest)
                # print("magn_u_closest: ", magn_u_closest)
                # print("gyro_u_closest: ", gyro_u_closest)
                wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
                if len(ibeacon) > 0:
                    beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
                else:
                    beacon_closest = [np.nan, np.nan, np.nan, np.nan]
                rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)
                # print([t, x, y, int(closest_wp_ts), acce_closest, acce_u_closest])
                res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms] + \
                           acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                           acce_u_closest + magn_u_closest + gyro_u_closest + \
                           wifi_closest + beacon_closest + rel_pos
                          )
            else:
                # print("no wp made it through timestamp cut")
                continue
        except Exception as exc:
            pass
            # print("Error message: ", exc)
            # print("extract_test_data error")
    return res

In [16]:
# %%timeit

# 5.55 ms ± 1.76 ms per loop
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# for fixing floor expression
floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

def one_trace_to_rows(path, floor_map):
    try:
        path_info = extract_path(path, floor_map)
        data = extract_data(path)
        # rows = list(itertools.chain(path_info, *data))
        rows = []
        for d in data:
            row = path_info + d
            rows.append(row)
            # print("row: ", row)
        return rows
    except:
        print("one_trace_to_rows error at: ", path)

# path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# time w/ for loop with 1 train_path -> 11.6
# time w/ itertools.chain for 1 train_path -> 11.8
start = time.time()
path_info = extract_path(path, floor_map)
print("path: ", path_info)
rows = one_trace_to_rows(path, floor_map)
print("time to process one train_path", time.time() - start)
#print("col count: ", len(rows[0]))
print("rows: ", rows)

path:  ['5cd56c18e2acfd2d33b6c321', '5d03760ae99446000843c022', 4, 'L4']
time to process one train_path 0.23976683616638184
rows:  [['5cd56c18e2acfd2d33b6c321', '5d03760ae99446000843c022', 4, 'L4', 1560507828194, '1560507828058', 136, 141.61975, 235.83246, 1560507828059, 1, 135, False, True, 1560507828194.0, 136, -0.5539398, 1.2322998, 10.307083, nan, 1560507828194.0, 136, 0.05928463, -0.029569399, -0.8775706, nan, 1560507828194.0, 136, -26.939999, -19.8, -25.019999, nan, 41.7588780510205, 1560507828194.0, 136, 0.72039795, 0.14656067, 0.246521, nan, 1560507828194.0, 136, -0.6891327, 1.1844482, 10.711472, nan, 1560507828194.0, 136, -23.939999, -65.64, -168.54001, nan, 1560507828194.0, 136, 0.85276794, 0.118499756, 0.13192749, nan, '1560507828676', 618, '39db40987727b4404daf02cef84642c6678851d3', '99ff4164ac119fb305c8d805011372c51cb243e9', '-64', nan, '1560507828270', '1560507828944', 886, '0e570c3406b79266b7ada12e3b9314e7bb9dde3e_96d341ef29ed4ec84baec7ee1a283a53dfd05a61_ef7aa6e119b6b51c

In [17]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # Data below are the time it took to create the old version of training data (only waypoints)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [18]:
# Check if we can make df

# column names
col_names = ["site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_100ms", "within_200ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

print(len(col_names))

df = pd.DataFrame(rows, columns=col_names)
print("df len: ", len(df))
print("site_id nunique: ", df["site_id"].nunique())
print("file_id nunique: ", df["file_id"].nunique())
print("x value_counts: ", df["x"].value_counts())
print("y value_counts: ", df["y"].value_counts())
print("event ts nunique: ", df["ts"].nunique())
print("start ts nunique: ", df["start_ts"].nunique()) # should be one
print("diff_ts_wp_ts value_counts: ", df["diff_ts_wp_ts"].value_counts())
print("diff_ts_wp_ts nunique: ", df["diff_ts_wp_ts"].nunique())
print("within_100ms value_counts: ", df["within_100ms"].value_counts())
print("within_100ms nunique: ", df["within_100ms"].nunique())
print("within_100ms count: ", df["within_100ms"].count())
print("within_200ms value_counts: ", df["within_200ms"].value_counts())
print("within_200ms nunique: ", df["within_200ms"].nunique())
print("within_200ms count: ", df["within_200ms"].count())
display(df.head())

72
df len:  32
site_id nunique:  1
file_id nunique:  1
x value_counts:  149.91766    26
141.61975     6
Name: x, dtype: int64
y value_counts:  233.93027    26
235.83246     6
Name: y, dtype: int64
event ts nunique:  32
start ts nunique:  1
diff_ts_wp_ts value_counts:  194    1
67     1
189    1
185    1
248    1
130    1
244    1
51     1
48     1
175    1
110    1
107    1
234    1
169    1
166    1
228    1
225    1
31     1
28     1
155    1
90     1
87     1
214    1
149    1
146    1
71     1
208    1
205    1
12     1
8      1
135    1
126    1
Name: diff_ts_wp_ts, dtype: int64
diff_ts_wp_ts nunique:  32
within_100ms value_counts:  False    22
True     10
Name: within_100ms, dtype: int64
within_100ms nunique:  2
within_100ms count:  32
within_200ms value_counts:  True     24
False     8
Name: within_200ms, dtype: int64
within_200ms nunique:  2
within_200ms count:  32


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5cd56c18e2acfd2d33b6c321,5d03760ae99446000843c022,4,L4,1560507828194,1560507828058,136,141.61975,235.83246,1560507828059,1,135,False,True,1560508000000.0,136,-0.55394,1.2323,10.307083,,1560508000000.0,136,0.059285,-0.029569,-0.877571,,1560508000000.0,136,-26.939999,-19.8,-25.019999,,41.758878,1560508000000.0,136,0.720398,0.146561,0.246521,,1560508000000.0,136,-0.689133,1.184448,10.711472,,1560508000000.0,136,-23.939999,-65.64,-168.54001,,1560508000000.0,136,0.852768,0.1185,0.131927,,1560507828676,618,39db40987727b4404daf02cef84642c6678851d3,99ff4164ac119fb305c8d805011372c51cb243e9,-64,,1560507828270,1560507828944,886,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_96d34...,-80,1560508000000.0,1081,0.471353,-0.244995
1,5cd56c18e2acfd2d33b6c321,5d03760ae99446000843c022,4,L4,1560507828214,1560507828058,156,141.61975,235.83246,1560507828059,1,155,False,True,1560508000000.0,156,-0.42952,1.462021,9.464813,,1560508000000.0,156,0.050639,-0.03975,-0.8825,,1560508000000.0,156,-26.82,-20.039999,-24.779999,,41.652879,1560508000000.0,156,0.295837,0.332275,0.344879,,1560508000000.0,156,-0.689133,1.184448,10.711472,,1560508000000.0,156,-23.82,-65.88,-168.3,,1560508000000.0,156,0.451431,0.334747,0.247391,,1560507828676,618,39db40987727b4404daf02cef84642c6678851d3,99ff4164ac119fb305c8d805011372c51cb243e9,-64,,1560507828270,1560507828944,886,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_96d34...,-80,1560508000000.0,1081,0.471353,-0.244995
2,5cd56c18e2acfd2d33b6c321,5d03760ae99446000843c022,4,L4,1560507828234,1560507828058,176,141.61975,235.83246,1560507828059,1,175,False,True,1560508000000.0,176,-0.305084,1.677368,8.672791,,1560508000000.0,176,0.056884,-0.055713,-0.866894,,1560508000000.0,176,-26.82,-20.1,-24.48,,41.50413,1560508000000.0,176,0.168167,0.199097,0.280121,,1560508000000.0,176,-0.496521,1.334,9.897919,,1560508000000.0,176,-23.82,-65.94,-168.0,,1560508000000.0,176,0.240067,0.306046,0.235779,,1560507828676,618,39db40987727b4404daf02cef84642c6678851d3,99ff4164ac119fb305c8d805011372c51cb243e9,-64,,1560507828270,1560507828944,886,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_96d34...,-80,1560508000000.0,1081,0.471353,-0.244995
3,5cd56c18e2acfd2d33b6c321,5d03760ae99446000843c022,4,L4,1560507828253,1560507828058,195,141.61975,235.83246,1560507828059,1,194,False,True,1560508000000.0,195,-0.045471,1.928619,8.475388,,1560508000000.0,195,0.062694,-0.073035,-0.858486,,1560508000000.0,195,-26.82,-20.279999,-24.6,,41.662342,1560508000000.0,195,-0.031586,0.190552,0.234314,,1560508000000.0,195,-0.190231,1.800598,8.489746,,1560508000000.0,195,-23.82,-66.119995,-168.12001,,1560508000000.0,195,0.072083,0.222351,0.176529,,1560507828676,618,39db40987727b4404daf02cef84642c6678851d3,99ff4164ac119fb305c8d805011372c51cb243e9,-64,,1560507828270,1560507828944,886,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_96d34...,-80,1560508000000.0,1081,0.471353,-0.244995
4,5cd56c18e2acfd2d33b6c321,5d03760ae99446000843c022,4,L4,1560507828273,1560507828058,215,141.61975,235.83246,1560507828059,1,214,False,False,1560508000000.0,215,0.064606,2.059021,8.785248,,1560508000000.0,215,0.059113,-0.09457,-0.85379,,1560508000000.0,215,-26.699999,-20.34,-24.42,,41.508336,1560508000000.0,215,-0.205673,0.172226,0.18605,,1560508000000.0,215,-0.190231,1.800598,8.489746,,1560508000000.0,215,-23.699999,-66.18,-167.94,,1560508000000.0,215,-0.131943,0.243744,0.139267,,1560507828676,618,39db40987727b4404daf02cef84642c6678851d3,99ff4164ac119fb305c8d805011372c51cb243e9,-64,,1560507828270,1560507828944,886,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_96d34...,-80,1560508000000.0,1081,0.471353,-0.244995


In [19]:
# # Set pool
# num_cores = multiprocessing.cpu_count()
# print(f"num_cores={num_cores}")
# # args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list]
# pool = Pool(num_cores)

# start = time.time()
# # w/ 250ms settings, 3 random samples from each site_id
# # 2 paths -> 18.7 sec
# # 10 paths -> 315 sec (df len is 1994)
# # 100 paths -> 708 sec (df len is 7183)
# # all ~ 600 paths -> 

# # errors
# # grouped_paths_list -> 100 paths -> site_id: 8 errors, 27 correct
# # grouped_paths_list -> 100 paths -> file_id: 23 errors, 77 correct

# # all in one go -> xxx sec
# # array_split -> 5891.8 sec

# # all in one go
# # res = pool.starmap(one_trace_to_rows, args)

# # split the args
# res = []
# for arg in tqdm(np.array_split(args, 50)):
#     res.extend(pool.starmap(one_trace_to_rows, arg))

In [32]:
# train_path filtering
def extract_path_for_grouplist(path):
    ex_path = f"{path}"
    ex_paths = ex_path.split("/")
    site_id = ex_paths[4]
    file_id = ex_paths[6].split(".")[0]
    return [path, site_id, file_id]

# create pathlist to be used by 2 types of paths list
path_list = [extract_path_for_grouplist(item) for item in train_paths]
df_paths = pd.DataFrame(path_list, columns=["path", "site_id", "file_id"])
site_id_path_list = df_paths["site_id"].unique()

# grouped_paths_list -> It takes 3 records from every site_id
grouped_paths_df = df_paths.groupby("site_id").sample(n=3)
grouped_paths_list = list(grouped_paths_df["path"].unique())
print("grouped_paths_list len: ", len(grouped_paths_list))
print("grouped_paths_list examples: ", grouped_paths_list[:5])

# filter train_paths to only those with sites that are in the sub_df
# Get submission file
sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
sub_df = sub_df.drop(columns=["floor", "x", "y"])
sub_df_site_list = sub_df["site"].unique()
df_24_sites = df_paths[df_paths["site_id"].isin(sub_df_site_list)]
df_24_site_list = df_24_sites["site_id"].unique()
sub_site_paths_list = list(df_24_sites["path"].unique())
print(set(sub_df_site_list) == set(df_24_site_list)) # Check if df_24_site has only the sites from sub_df
print(len(df_24_sites))
print(len(sub_site_paths_list))
print(len(df_24_site_list))

grouped_paths_list len:  612
grouped_paths_list examples:  ['../input/indoor-location-navigation/train/5a0546857ecc773753327266/F4/5d11dc04ffe23f0008604f57.txt', '../input/indoor-location-navigation/train/5a0546857ecc773753327266/F3/5d11943dffe23f0008604e3a.txt', '../input/indoor-location-navigation/train/5a0546857ecc773753327266/F3/5d8f0954d5bae80006eb8db6.txt', '../input/indoor-location-navigation/train/5c3c44b80379370013e0fd2b/B1/5d0761f84cae4f000a2db586.txt', '../input/indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F1/5d075ecf4cae4f000a2db507.txt']


In [35]:
# Set pool
num_cores = multiprocessing.cpu_count()
print(f"num_cores={num_cores}")
pool = Pool(num_cores)
start = time.time()

# make new train_path_list
# args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list[:100]]
args = [(p, floor_map) for p in sub_site_paths_list[:100]]

res_dict = {}
res = []
for i, e in enumerate(tqdm(np.array_split(args, 10))):
    # print("ith iteration: ", i)
    rows = pool.starmap(one_trace_to_rows, e)
    res_dict[i] = rows
    res.extend(rows)
    
pool.close()
print("time to process", time.time() - start)

num_cores=4


100%|██████████| 10/10 [04:33<00:00, 27.37s/it]

time to process 273.7289819717407





In [36]:
res_name = "indoor_train_res_4.pkl"

with open(res_name, "wb") as file:
    pickle.dump(res, file)

In [37]:
with open(res_name, "rb") as file:
    res = pickle.load(file)

In [38]:
############################## KEEP THIS CELL FOR LATER REF ##############################

# Error in ~20% of the train paths -> caused by not having acces_uncali to create the event timestamps

# error files
# /5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bbc852a70008c01607.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bda50dc300099d34cc.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085df529994a0008202661.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt
# /5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt
# /5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt
# /5cd56b5ae2acfd2d33b5854a/B1/5d073ba64a19c000086c559b.txt
# /5cd56b5ae2acfd2d33b5854a/F1/5d07603e4cae4f000a2db525.txt
# /5cd56b63e2acfd2d33b591c2/F2/5d0b0668912a980009fe91f2.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt
# /5cd56b64e2acfd2d33b592b3/F2/5d0c9321c99c56000836df18.txt
# /5cd56b64e2acfd2d33b592b3/F3/5d0c9952ea565d0008e34e8b.txt
# /5cd56b64e2acfd2d33b592b3/F4/5d0c9d65ea565d0008e34ea2.txt
# /5cd56b5ae2acfd2d33b58549/5F/5d0613514a19c000086c432a.txt
# /5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b015e2f8a26000805ba44.txt
# /5cd56b79e2acfd2d33b5b74e/F1/5d0af3452f8a26000805b830.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt
# /5cd56b6be2acfd2d33b59d1f/F3/5d0896415125450008037c76.txt

# base_path = "../input/indoor-location-navigation/train"
# error_files = [
#     "/5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt",
#     "/5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt",
#     "/5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt",
#     "/5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt",
#     "/5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt",
#     "/5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt"
# ]

# working_path = "../input/indoor-location-navigation/train/5d2709c303f801723c3299ee/1F/5dad7d6daa1d300006faa80c.txt"
# error_paths = [base_path + e for e in error_files]
# rows = one_trace_to_rows(error_paths[1], floor_map)
# print(rows)

In [39]:
print(len(res))
print(len(res[0]))
print(len(res[0][0]))

100
81
72


In [40]:
start = time.time()

df_train = pd.DataFrame(res[0], columns=col_names)
for r in res[1:]:
    df = pd.DataFrame(r, columns=col_names)
    df_train = df_train.append(df, ignore_index=True)

print("time to process", time.time() - start)
print("length of df made", len(df_train))
display(df_train.head(10))

time to process 3.5163044929504395
length of df made 15439


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980650,1571541980530,120,83.14383,121.14074,1571541980538,8,112,False,True,1571542000000.0,120,0.130234,0.492737,9.300461,,1571542000000.0,120,-0.003185,-0.015755,-0.979973,,1571542000000.0,120,-9.545898,-19.74945,-26.855469,,34.675368,1571542000000.0,120,-0.307785,-0.083252,-0.431976,,1571542000000.0,120,0.159561,0.34549,9.432739,,1571542000000.0,120,2.775574,3.468323,-393.5257,,1571542000000.0,120,-0.342026,-0.115768,-0.414246,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980646,116.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-92,1571542000000.0,907,0.293324,-0.526319
1,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980670,1571541980530,140,83.14383,121.14074,1571541980538,8,132,False,True,1571542000000.0,140,-0.206757,0.833908,9.246597,,1571542000000.0,140,0.001035,-0.019007,-0.979065,,1571542000000.0,140,-11.627197,-21.136475,-26.179504,,35.59928,1571542000000.0,140,-0.243332,-0.013474,-0.433044,,1571542000000.0,140,0.130234,0.492737,9.300461,,1571542000000.0,140,0.694275,2.081299,-392.84973,,1571542000000.0,140,-0.304733,-0.08168,-0.430222,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
2,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980690,1571541980530,160,83.14383,121.14074,1571541980538,8,152,False,True,1571542000000.0,160,-0.444382,0.890182,9.306442,,1571542000000.0,160,0.006563,-0.021173,-0.978133,,1571542000000.0,160,-8.853149,-20.4422,-26.855469,,34.892377,1571542000000.0,160,-0.183151,0.041931,-0.39949,,1571542000000.0,160,-0.206757,0.833908,9.246597,,1571542000000.0,160,3.468323,2.775574,-393.5257,,1571542000000.0,160,-0.24028,-0.011902,-0.43129,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
3,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980710,1571541980530,180,83.14383,121.14074,1571541980538,8,172,False,True,1571542000000.0,180,-0.585632,0.699829,9.459076,,1571542000000.0,180,0.011298,-0.020693,-0.977529,,1571542000000.0,180,-9.545898,-21.83075,-26.855469,,35.90156,1571542000000.0,180,-0.184219,0.044052,-0.385117,,1571542000000.0,180,-0.444382,0.890182,9.306442,,1571542000000.0,180,2.775574,1.387024,-393.5257,,1571542000000.0,180,-0.180099,0.043503,-0.397736,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
4,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980730,1571541980530,200,83.14383,121.14074,1571541980538,8,192,False,True,1571542000000.0,200,-0.672424,0.718384,9.361511,,1571542000000.0,200,0.011298,-0.020693,-0.977529,,1571542000000.0,200,-9.545898,-21.136475,-28.204346,,36.515201,1571542000000.0,200,-0.204468,0.018494,-0.369125,,1571542000000.0,200,-0.585632,0.699829,9.459076,,1571542000000.0,200,2.775574,2.081299,-394.87457,,1571542000000.0,200,-0.181168,0.045624,-0.383362,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
5,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980751,1571541980530,221,83.14383,121.14074,1571541980538,8,213,False,False,1571542000000.0,221,-0.69397,0.783035,9.093353,,1571542000000.0,221,0.019216,-0.019146,-0.976703,,1571542000000.0,221,-6.771851,-21.136475,-26.179504,,34.32164,1571542000000.0,221,-0.201797,0.08934,-0.333435,,1571542000000.0,221,-0.672424,0.718384,9.361511,,1571542000000.0,221,5.549622,2.081299,-392.84973,,1571542000000.0,221,-0.201416,0.020065,-0.367371,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
6,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980771,1571541980530,241,83.14383,121.14074,1571541980538,8,233,False,False,1571542000000.0,241,-0.758621,0.751907,9.294464,,1571542000000.0,241,0.023733,-0.018007,-0.976272,,1571542000000.0,241,-8.158874,-21.83075,-26.855469,,35.557912,1571542000000.0,241,-0.24707,0.222488,-0.251953,,1571542000000.0,241,-0.69397,0.783035,9.093353,,1571542000000.0,241,4.162598,1.387024,-393.5257,,1571542000000.0,241,-0.198746,0.090912,-0.33168,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
7,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541984565,1571541980530,4035,84.63925,117.336205,1571541984807,4277,242,False,False,1571542000000.0,4035,-0.091827,0.651947,5.774994,,1571542000000.0,4035,0.064248,-0.030288,-0.888371,,1571542000000.0,4035,-22.033691,-23.912048,-22.129822,,39.331903,1571542000000.0,4035,-0.534164,0.061111,0.217834,,1571542000000.0,4035,-0.206146,0.590897,5.054947,,1571542000000.0,4035,-9.712219,-0.694275,-388.80005,,1571542000000.0,4035,-0.39476,-0.072083,0.233978,,1571541984340,3810,61bb65531115bd583c84ba851e7b57deb66022aa,daa4493cb1d45e8da1dbda348310fa20ee7992e8,-42,,1571541983697,1571541984348,3818.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-85,1571542000000.0,4197,0.947556,-0.673275
8,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541984585,1571541980530,4055,84.63925,117.336205,1571541984807,4277,222,False,False,1571542000000.0,4055,-0.018204,0.748917,6.90506,,1571542000000.0,4055,0.060078,-0.018609,-0.887119,,1571542000000.0,4055,-24.809265,-23.912048,-21.455383,,40.590875,1571542000000.0,4055,-0.712051,0.148987,0.160324,,1571542000000.0,4055,-0.091827,0.651947,5.774994,,1571542000000.0,4055,-12.487793,-0.694275,-388.1256,,1571542000000.0,4055,-0.531113,0.062683,0.219589,,1571541984340,3810,61bb65531115bd583c84ba851e7b57deb66022aa,daa4493cb1d45e8da1dbda348310fa20ee7992e8,-42,,1571541983697,1571541984348,3818.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-85,1571542000000.0,4197,0.947556,-0.673275
9,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541984606,1571541980530,4076,84.63925,117.336205,1571541984807,4277,201,False,False,1571542000000.0,4076,-0.405472,1.111649,8.655823,,1571542000000.0,4076,0.057382,-0.01013,-0.886737,,1571542000000.0,4076,-22.727966,-24.604797,-24.154663,,41.29654,1571542000000.0,4076,-0.889954,0.164963,0.12677,,1571542000000.0,4076,-0.018204,0.748917,6.90506,,1571542000000.0,4076,-10.406494,-1.387024,-390.8249,,1571542000000.0,4076,-0.709,0.150558,0.162079,,1571541984340,3810,61bb65531115bd583c84ba851e7b57deb66022aa,daa4493cb1d45e8da1dbda348310fa20ee7992e8,-42,,1571541983697,1571541984841,4311.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_b6589...,-75,1571542000000.0,4197,0.947556,-0.673275


In [41]:
# def list_to_df(row_list):
#     df_train = pd.DataFrame(row_list[0], columns=col_names)
#     for r in row_list[1:]:
#         df = pd.DataFrame(r, columns=col_names)
#         df_train = df_train.append(df)
#     return df_train

# start = time.time()
# pool = Pool(num_cores)

# df_train = pool.map(list_to_df, tqdm(res))

# # print("train_path count", len(train_paths[:train_num]))
# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))
# pool.close()

In [42]:
print("df len: ", len(df_train), "\n")
print("site_id nunique: ", df_train["site_id"].nunique(), "\n")
print("site_id value_counts: ", df_train["site_id"].value_counts(), "\n")
print("file_id nunique: ", df_train["file_id"].nunique(), "\n")
print("file_id value_counts: ", df_train["file_id"].value_counts(), "\n")
print("floor value_counts: ", df_train["floor"].value_counts(), "\n")
print("x value_counts: ", df_train["x"].value_counts(), "\n")
print("y value_counts: ", df_train["y"].value_counts(), "\n")
print("event ts nunique: ", df_train["ts"].nunique(), "\n")
print("start ts nunique: ", df_train["start_ts"].nunique(), "\n") # should be one
print("diff_ts_wp_ts value_counts: ", df_train["diff_ts_wp_ts"].value_counts(), "\n")
print("diff_ts_wp_ts nunique: ", df_train["diff_ts_wp_ts"].nunique(), "\n")
display(df_train.head())

df len:  15439 

site_id nunique:  1 

site_id value_counts:  5da138764db8ce0c98bcaa46    15439
Name: site_id, dtype: int64 

file_id nunique:  100 

file_id value_counts:  5dabcd5a18410e00067e6fb9    552
5dabdb6e18410e00067e6ff4    526
5dabdb6018410e00067e6fe4    452
5dabdb4c18410e00067e6fd0    429
5dabdb50df065a00069bef70    429
                           ... 
5dabdb66df065a00069bef88     32
5dad1749dc3e2c0006606c06     32
5dabdb6718410e00067e6fee     32
5dabcd58df065a00069bef4a     31
5dad174adc3e2c0006606c09     26
Name: file_id, Length: 100, dtype: int64 

floor value_counts:  B1    14536
F1      903
Name: floor, dtype: int64 

x value_counts:  38.972908     157
80.631550     155
82.384200     150
36.640835     149
81.600650     149
             ... 
101.022415     14
38.741318       7
17.752827       7
120.843636      7
16.599463       6
Name: x, Length: 248, dtype: int64 

y value_counts:  182.410220    157
44.533028     155
20.638855     150
104.280850    149
33.761030     149


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980650,1571541980530,120,83.14383,121.14074,1571541980538,8,112,False,True,1571542000000.0,120,0.130234,0.492737,9.300461,,1571542000000.0,120,-0.003185,-0.015755,-0.979973,,1571542000000.0,120,-9.545898,-19.74945,-26.855469,,34.675368,1571542000000.0,120,-0.307785,-0.083252,-0.431976,,1571542000000.0,120,0.159561,0.34549,9.432739,,1571542000000.0,120,2.775574,3.468323,-393.5257,,1571542000000.0,120,-0.342026,-0.115768,-0.414246,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980646,116.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-92,1571542000000.0,907,0.293324,-0.526319
1,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980670,1571541980530,140,83.14383,121.14074,1571541980538,8,132,False,True,1571542000000.0,140,-0.206757,0.833908,9.246597,,1571542000000.0,140,0.001035,-0.019007,-0.979065,,1571542000000.0,140,-11.627197,-21.136475,-26.179504,,35.59928,1571542000000.0,140,-0.243332,-0.013474,-0.433044,,1571542000000.0,140,0.130234,0.492737,9.300461,,1571542000000.0,140,0.694275,2.081299,-392.84973,,1571542000000.0,140,-0.304733,-0.08168,-0.430222,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
2,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980690,1571541980530,160,83.14383,121.14074,1571541980538,8,152,False,True,1571542000000.0,160,-0.444382,0.890182,9.306442,,1571542000000.0,160,0.006563,-0.021173,-0.978133,,1571542000000.0,160,-8.853149,-20.4422,-26.855469,,34.892377,1571542000000.0,160,-0.183151,0.041931,-0.39949,,1571542000000.0,160,-0.206757,0.833908,9.246597,,1571542000000.0,160,3.468323,2.775574,-393.5257,,1571542000000.0,160,-0.24028,-0.011902,-0.43129,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
3,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980710,1571541980530,180,83.14383,121.14074,1571541980538,8,172,False,True,1571542000000.0,180,-0.585632,0.699829,9.459076,,1571542000000.0,180,0.011298,-0.020693,-0.977529,,1571542000000.0,180,-9.545898,-21.83075,-26.855469,,35.90156,1571542000000.0,180,-0.184219,0.044052,-0.385117,,1571542000000.0,180,-0.444382,0.890182,9.306442,,1571542000000.0,180,2.775574,1.387024,-393.5257,,1571542000000.0,180,-0.180099,0.043503,-0.397736,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319
4,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1,B1,1571541980730,1571541980530,200,83.14383,121.14074,1571541980538,8,192,False,True,1571542000000.0,200,-0.672424,0.718384,9.361511,,1571542000000.0,200,0.011298,-0.020693,-0.977529,,1571542000000.0,200,-9.545898,-21.136475,-28.204346,,36.515201,1571542000000.0,200,-0.204468,0.018494,-0.369125,,1571542000000.0,200,-0.585632,0.699829,9.459076,,1571542000000.0,200,2.775574,2.081299,-394.87457,,1571542000000.0,200,-0.181168,0.045624,-0.383362,,1571541982444,1914,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43,,1571541980896,1571541980684,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80,1571542000000.0,907,0.293324,-0.526319


In [43]:
# Visualizing timestamp distribution

# Explore
# print(df_train["ts"].dtype)
# print(df_test["ts"].dtype)

# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le"%col] = le.fit_transform(df[col])

col_enc = ["site_id", "file_id", "floor", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
col_encode(df_train, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    for col in col_list:
        df[col] = df[col].astype(dtype)

convert_dtypes(df_train, ["floor_converted","ts", "start_ts", "diff_start_ts", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",\
             "acce_ts", "diff_acce_ts", \
             "ahrs_ts", "diff_ahrs_ts", \
             "magn_ts", "diff_magn_ts", \
             "gyro_ts", "diff_gyro_ts", \
             "acce_u_ts", "diff_acce_u_ts", \
             "magn_u_ts", "diff_magn_u_ts", \
             "gyro_u_ts", "diff_gyro_u_ts", \
             "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_rssi", \
             "rel_ts", "diff_rel_ts"
            ], float)

# convert ts and wifi_last_see_ts to dates
for df in [df_train]:
    for col in ["ts", "wifi_last_seen_ts"]:
        df["%s_date"%col] = pd.to_datetime(df[col],unit="ms")
        df["%s_day"%col] = df["%s_date"%col].dt.floor("d")
        df["%s_hour"%col] = df["%s_date"%col].dt.floor("h")
        df["%s_minute"%col] = df["%s_date"%col].values.astype("<M8[m]")

# Check
display(df_train.head())

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,120.0,83.14383,121.14074,1571542000000.0,8.0,112.0,False,True,1571542000000.0,120.0,0.130234,0.492737,9.300461,,1571542000000.0,120.0,-0.003185,-0.015755,-0.979973,,1571542000000.0,120.0,-9.545898,-19.74945,-26.855469,,34.675368,1571542000000.0,120.0,-0.307785,-0.083252,-0.431976,,1571542000000.0,120.0,0.159561,0.34549,9.432739,,1571542000000.0,120.0,2.775574,3.468323,-393.5257,,1571542000000.0,120.0,-0.342026,-0.115768,-0.414246,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,116.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-92.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,41,2019-10-20 03:26:20.649999872,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
1,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,140.0,83.14383,121.14074,1571542000000.0,8.0,132.0,False,True,1571542000000.0,140.0,-0.206757,0.833908,9.246597,,1571542000000.0,140.0,0.001035,-0.019007,-0.979065,,1571542000000.0,140.0,-11.627197,-21.136475,-26.179504,,35.59928,1571542000000.0,140.0,-0.243332,-0.013474,-0.433044,,1571542000000.0,140.0,0.130234,0.492737,9.300461,,1571542000000.0,140.0,0.694275,2.081299,-392.84973,,1571542000000.0,140.0,-0.304733,-0.08168,-0.430222,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.670000128,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
2,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,160.0,83.14383,121.14074,1571542000000.0,8.0,152.0,False,True,1571542000000.0,160.0,-0.444382,0.890182,9.306442,,1571542000000.0,160.0,0.006563,-0.021173,-0.978133,,1571542000000.0,160.0,-8.853149,-20.4422,-26.855469,,34.892377,1571542000000.0,160.0,-0.183151,0.041931,-0.39949,,1571542000000.0,160.0,-0.206757,0.833908,9.246597,,1571542000000.0,160.0,3.468323,2.775574,-393.5257,,1571542000000.0,160.0,-0.24028,-0.011902,-0.43129,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.689999872,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
3,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,180.0,83.14383,121.14074,1571542000000.0,8.0,172.0,False,True,1571542000000.0,180.0,-0.585632,0.699829,9.459076,,1571542000000.0,180.0,0.011298,-0.020693,-0.977529,,1571542000000.0,180.0,-9.545898,-21.83075,-26.855469,,35.90156,1571542000000.0,180.0,-0.184219,0.044052,-0.385117,,1571542000000.0,180.0,-0.444382,0.890182,9.306442,,1571542000000.0,180.0,2.775574,1.387024,-393.5257,,1571542000000.0,180.0,-0.180099,0.043503,-0.397736,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.710000128,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
4,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,200.0,83.14383,121.14074,1571542000000.0,8.0,192.0,False,True,1571542000000.0,200.0,-0.672424,0.718384,9.361511,,1571542000000.0,200.0,0.011298,-0.020693,-0.977529,,1571542000000.0,200.0,-9.545898,-21.136475,-28.204346,,36.515201,1571542000000.0,200.0,-0.204468,0.018494,-0.369125,,1571542000000.0,200.0,-0.585632,0.699829,9.459076,,1571542000000.0,200.0,2.775574,2.081299,-394.87457,,1571542000000.0,200.0,-0.181168,0.045624,-0.383362,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.729999872,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00


In [44]:
# Calculate moving averages
# Differencing respect to time (as each timestep is unevenly spaced)

In [45]:
# Save the file in parquet
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Saving train data
train_file_name = "indoor_train_4.pkl"

with open(train_file_name, "wb") as file:
    pickle.dump(df_train, file)

# Save them to output
# df_train.to_csv('df_train_2.csv',index=False)
# df_test.to_csv('df_test.csv',index=False)

In [46]:
# Load data it back in
with open(train_file_name, "rb") as file:
    df_train = pickle.load(file)

In [47]:
print("df len: ", len(df_train), "\n")
print("file_id unique: ", (df_train["file_id"].nunique()), "\n")
print("site_id unique: ", (df_train["site_id"].nunique()), "\n")
print("site_id value_counts: ", (df_train["site_id"].value_counts()))
display(df_train.head())

df len:  15439 

file_id unique:  100 

site_id unique:  1 

site_id value_counts:  5da138764db8ce0c98bcaa46    15439
Name: site_id, dtype: int64


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,120.0,83.14383,121.14074,1571542000000.0,8.0,112.0,False,True,1571542000000.0,120.0,0.130234,0.492737,9.300461,,1571542000000.0,120.0,-0.003185,-0.015755,-0.979973,,1571542000000.0,120.0,-9.545898,-19.74945,-26.855469,,34.675368,1571542000000.0,120.0,-0.307785,-0.083252,-0.431976,,1571542000000.0,120.0,0.159561,0.34549,9.432739,,1571542000000.0,120.0,2.775574,3.468323,-393.5257,,1571542000000.0,120.0,-0.342026,-0.115768,-0.414246,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,116.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-92.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,41,2019-10-20 03:26:20.649999872,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
1,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,140.0,83.14383,121.14074,1571542000000.0,8.0,132.0,False,True,1571542000000.0,140.0,-0.206757,0.833908,9.246597,,1571542000000.0,140.0,0.001035,-0.019007,-0.979065,,1571542000000.0,140.0,-11.627197,-21.136475,-26.179504,,35.59928,1571542000000.0,140.0,-0.243332,-0.013474,-0.433044,,1571542000000.0,140.0,0.130234,0.492737,9.300461,,1571542000000.0,140.0,0.694275,2.081299,-392.84973,,1571542000000.0,140.0,-0.304733,-0.08168,-0.430222,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.670000128,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
2,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,160.0,83.14383,121.14074,1571542000000.0,8.0,152.0,False,True,1571542000000.0,160.0,-0.444382,0.890182,9.306442,,1571542000000.0,160.0,0.006563,-0.021173,-0.978133,,1571542000000.0,160.0,-8.853149,-20.4422,-26.855469,,34.892377,1571542000000.0,160.0,-0.183151,0.041931,-0.39949,,1571542000000.0,160.0,-0.206757,0.833908,9.246597,,1571542000000.0,160.0,3.468323,2.775574,-393.5257,,1571542000000.0,160.0,-0.24028,-0.011902,-0.43129,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.689999872,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
3,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,180.0,83.14383,121.14074,1571542000000.0,8.0,172.0,False,True,1571542000000.0,180.0,-0.585632,0.699829,9.459076,,1571542000000.0,180.0,0.011298,-0.020693,-0.977529,,1571542000000.0,180.0,-9.545898,-21.83075,-26.855469,,35.90156,1571542000000.0,180.0,-0.184219,0.044052,-0.385117,,1571542000000.0,180.0,-0.444382,0.890182,9.306442,,1571542000000.0,180.0,2.775574,1.387024,-393.5257,,1571542000000.0,180.0,-0.180099,0.043503,-0.397736,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.710000128,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00
4,5da138764db8ce0c98bcaa46,5dabdb5e18410e00067e6fe2,-1.0,B1,1571542000000.0,1571542000000.0,200.0,83.14383,121.14074,1571542000000.0,8.0,192.0,False,True,1571542000000.0,200.0,-0.672424,0.718384,9.361511,,1571542000000.0,200.0,0.011298,-0.020693,-0.977529,,1571542000000.0,200.0,-9.545898,-21.136475,-28.204346,,36.515201,1571542000000.0,200.0,-0.204468,0.018494,-0.369125,,1571542000000.0,200.0,-0.585632,0.699829,9.459076,,1571542000000.0,200.0,2.775574,2.081299,-394.87457,,1571542000000.0,200.0,-0.181168,0.045624,-0.383362,,1571542000000.0,1914.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,9dc5b9225a87ae88f3c27f704c72c829689a7db4,-43.0,,1571542000000.0,1571542000000.0,154.0,07efd69e3167537492f0ead89fb2779633b04949_b6589...,-80.0,1571542000000.0,907.0,0.293324,-0.526319,0,40,0,71,83,37,2019-10-20 03:26:20.729999872,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00,2019-10-20 03:26:20.896,2019-10-20,2019-10-20 03:00:00,2019-10-20 03:26:00


In [None]:
# # Get submission file
# sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
# sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
# sub_df = sub_df.drop(columns=["floor", "x", "y"])
# # grouped_df = sub_df.groupby("file").sample(n=2)
# # all_file_id = grouped_df["file"].unique()
# # print(len(grouped_df))
# # print(len(all_file_id))
# # display(grouped_df.head())
# display(sub_df.head())

# test_site_id = sub_df["site"].unique()
# train_site_id = df_train["site_id"].unique()
# print(test_site_id, "\n")
# print(train_site_id, "\n")
# a = list(set(test_site_id) & set(train_site_id))
# print(a)