In [3]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool

import pickle
import math

pd.set_option("display.max_columns", 100)

In [4]:
# milisecond setting 
time_stamp_cut = 250

# train number setting
# train_num = len(train_paths) - 1
# train_num = round(len(train_paths) / 2)
train_num = 1000

# 200 train paths come out with ~1000 examples, so multiply train examples by 5 to extract similar no. of examples
# test_num = train_num * 5
# test_num = len(sub_df) - 1
print(train_num)

1000


In [5]:
# Preprocess

# Check out each file. Content, images
root_path = "../input/indoor-location-navigation/"
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

# Reading in 1 file
def pick_example(max_range, paths):
    ex = random.randint(0, max_range)
    example_path = paths[ex]
    path = f"{example_path}"
    paths = path.split("/")
    site = paths[4]
    floorNo = paths[5]
    floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
    json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]
    return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
print("example path: ", path)
print("site: ", site)
print("floorNo: ", floorNo)
print("floor_plan_filename: ", floor_plan_filename)
print("json_plan_filename: ", json_plan_filename)
print("width: {}, height: {} ".format(width_meter, height_meter))

with open(path) as p:
    lines = p.readlines()
print("No. Lines in 1 example: {:,}". format(len(lines)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204
example path:  ../input/indoor-location-navigation/train/5d2709b303f801723c327472/4F/5da815f1aec4b20006154e34.txt
site:  5d2709b303f801723c327472
floorNo:  4F
floor_plan_filename:  ../input/indoor-location-navigation/metadata/5d2709b303f801723c327472/4F/floor_image.png
json_plan_filename:  ../input/indoor-location-navigation/metadata/5d2709b303f801723c327472/4F/floor_info.json
width: 270.37783808125175, height: 140.81899552876942 
No. Lines in 1 example: 16,018


In [6]:
# train_path filtering
def extract_path(path):
    ex_path = f"{path}"
    ex_paths = ex_path.split("/")
    site_id = ex_paths[4]
    file_id = ex_paths[6].split(".")[0]
    return [path, site_id, file_id]

path_list = [extract_path(item) for item in train_paths]
df_paths = pd.DataFrame(path_list, columns=["path", "site_id", "file_id"])
site_id_path_list = df_paths["site_id"].unique()
print(len(train_paths))
print(len(site_id_path_list))
# sample_num = math.ceil(train_num / len(site_id_path_list))
grouped_paths_df = df_paths.groupby("site_id").sample(n=3)
display(grouped_paths_df.head())
grouped_paths_list = list(grouped_paths_df["path"].unique())
print(len(grouped_paths_list))
print(grouped_paths_list[:5])

# groupby

26925
204


Unnamed: 0,path,site_id,file_id
10956,../input/indoor-location-navigation/train/5a05...,5a0546857ecc773753327266,5dccfd0f757dea000608056d
10858,../input/indoor-location-navigation/train/5a05...,5a0546857ecc773753327266,5d11dbce9c50c70008fe8acd
10742,../input/indoor-location-navigation/train/5a05...,5a0546857ecc773753327266,5e15b393f4c3420006d522ed
25629,../input/indoor-location-navigation/train/5c3c...,5c3c44b80379370013e0fd2b,5d073bbc4a19c000086c55af
25672,../input/indoor-location-navigation/train/5c3c...,5c3c44b80379370013e0fd2b,5d07950e0e86b600080363b9


612
['../input/indoor-location-navigation/train/5a0546857ecc773753327266/F2/5dccfd0f757dea000608056d.txt', '../input/indoor-location-navigation/train/5a0546857ecc773753327266/F3/5d11dbce9c50c70008fe8acd.txt', '../input/indoor-location-navigation/train/5a0546857ecc773753327266/F1/5e15b393f4c3420006d522ed.txt', '../input/indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F3/5d073bbc4a19c000086c55af.txt', '../input/indoor-location-navigation/train/5c3c44b80379370013e0fd2b/F2/5d07950e0e86b600080363b9.txt']


In [7]:
# Get submission file
sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
sub_df = sub_df.drop(columns=["floor", "x", "y"])
# grouped_df = sub_df.groupby("file").sample(n=2)
# all_file_id = grouped_df["file"].unique()
# print(len(grouped_df))
# print(len(all_file_id))
# display(grouped_df.head())
# display(sub_df.head())

In [8]:
# for line in lines[:200]:
#     print(line)

In [11]:
# using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./

In [12]:
# Import custom function from the repository
from io_f import read_data_file

# Read in 1 random example
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
sample_file = read_data_file(path)

# You can access the information for each variable:
# Each data is split for time
# Metadata is expressed with "#"

# for i in sample_file.acce[:, [0]]:
#     print(i)
#     print(int(i))

print("~~~ Example ~~~")
print("acce: {}".format(sample_file.acce), "\n" +
      "acce shape: {}".format(sample_file.acce.shape), "\n" +
#       "acacce_uncalice: {}".format(sample_file.acce_uncali), "\n" +
      "acacce_uncalice shape: {}".format(sample_file.acce_uncali.shape), "\n" +
#       "ahrs: {}".format(sample_file.ahrs), "\n" +
      "ahrs shape: {}".format(sample_file.ahrs.shape), "\n" +
#       "gyro: {}".format(sample_file.gyro), "\n" +
      "gyro shape: {}".format(sample_file.gyro.shape), "\n" +
#       "gyro_uncali: {}".format(sample_file.gyro_uncali), "\n" +
      "gyro_uncali shape: {}".format(sample_file.gyro_uncali.shape), "\n" +
#       "ibeacon: {}".format(sample_file.ibeacon), "\n" +
      "ibeacon shape: {}".format(sample_file.ibeacon.shape), "\n" +
#       "magn: {}".format(sample_file.magn), "\n" +
      "magn shape: {}".format(sample_file.magn.shape), "\n" +
#       "magn_uncali: {}".format(sample_file.magn_uncali), "\n" +
      "magn_uncali shape: {}".format(sample_file.magn_uncali.shape), "\n" +
#       "waypoint: {}".format(sample_file.waypoint), "\n" +
      "waypoint shape: {}".format(sample_file.waypoint.shape), "\n" +
#       "wifi: {}".format(sample_file.wifi), "\n" +
      "wifi shape: {}".format(sample_file.wifi.shape))

~~~ Example ~~~
acce: [[ 1.57371520e+12 -1.39266970e-01 -4.28771970e-03  1.41914520e+01]
 [ 1.57371520e+12 -2.71545400e-01 -3.48205570e-02  1.28129880e+01]
 [ 1.57371520e+12  2.59368900e-01  3.60824580e-01  1.05977480e+01]
 ...
 [ 1.57371521e+12 -1.46804810e+00  3.61395260e+00  7.70315550e+00]
 [ 1.57371521e+12 -1.90080260e+00  2.83224490e+00  9.81065400e+00]
 [ 1.57371521e+12 -1.70927430e+00  2.29055790e+00  1.02811130e+01]] 
acce shape: (554, 4) 
acacce_uncalice shape: (554, 4) 
ahrs shape: (554, 4) 
gyro shape: (554, 4) 
gyro_uncali shape: (554, 4) 
ibeacon shape: (49, 3) 
magn shape: (554, 4) 
magn_uncali shape: (554, 4) 
waypoint shape: (2, 3) 
wifi shape: (1884, 5)


In [13]:
# def show_site_png(root_path, site):
#     floor_paths = glob.glob(root_path + "metadata/" + site + "/*/floor_image.png")
#     n = len(floor_paths)
#     print("No. of floor paths: ", n)

#     # Create the custom number of rows & columns
#     ncols = [ceil(n / 3) if n > 4 else 4][0]
#     nrows = [ceil(n / ncols) if n > 4 else 1][0]

#     plt.figure(figsize=(16, 10))
#     plt.suptitle(f"Site no. '{site}'", fontsize=18)

#     # Plot image for each floor
#     for k, floor in enumerate(floor_paths):
#         # plt.subplot(nrows, ncols, k+1)
#         plt.subplot(ncols, nrows, k+1)
#         plt.rcParams["figure.facecolor"] = "white"

#         image = Image.open(floor)

#         plt.imshow(image)
#         plt.axis("off")
#         title = floor.split("/")[5]
#         plt.title(title, fontsize=15)

In [14]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [15]:
# # Checking the floor number distribution

# all_floors = glob.glob("../input/indoor-location-navigation/metadata/*/*")
# all_sites = glob.glob("../input/indoor-location-navigation/metadata/*")
# floor_no = []
# floor_counts = []

# # Floor count
# for site in all_sites:
#     floor_count = len([name for name in os.listdir(site)])
#     floor_counts.append(floor_count)

# floor_counts_df = pd.DataFrame(floor_counts, columns=["F_Count"])
# floor_counts_df = floor_counts_df["F_Count"].value_counts().reset_index()
# floor_counts_df = floor_counts_df.sort_values("index", ascending=True)

# # Extract only the floor number
# for floor in all_floors:
#     no = floor.split("/")[5]
#     floor_no.append(no)
    
# floor_no = pd.DataFrame(floor_no, columns=["No"])
# floor_no = floor_no["No"].value_counts().reset_index()
# floor_no = floor_no.sort_values("No", ascending=False)

# # ToDo: Floor expressions need to be fixed
# # 1F -> F1, L1 -> F1, G -> F1 etc

# # Plot
# # display(floor_counts_df.head(10))

# fig, axes = plt.subplots(ncols=2, figsize=(16, 10))
# axes[0] = sns.barplot(data=floor_counts_df, x="index", y="F_Count", palette="viridis", saturation=0.4, ax=axes[0])
# axes[0].set_title("Floor Count Distribution", size = 26, weight="bold")
# axes[0].set_xlabel("")
# axes[0].set_ylabel("Floor Count", size = 18, weight="bold")

# axes[1] = sns.barplot(data=floor_no, x="No", y="index", palette="viridis", saturation=0.4, ax=axes[1])
# axes[1].set_title("Frequency of Floors", size = 26, weight="bold")
# axes[1].set_xlabel("")
# axes[1].set_ylabel("Floor No.", size = 18, weight="bold")

# plt.xticks([])
# plt.yticks(fontsize=11)
# sns.despine(left=True, bottom=True);

In [16]:
# # Metadata checking (GeoJSON)
# # This is a vector representation of floor map
# geojson_paths = glob.glob("../input/indoor-location-navigation/metadata/*/*/geojson_map.json")
# print("No. of geojson file: {}".format(len(geojson_paths)))

# # Print one example
# ex = random.randint(0, len(geojson_paths))
# geojson_file_name = geojson_paths[ex]
# with open(geojson_file_name) as json_file:
#     paths = geojson_file_name.split("/")
#     site_id = paths[4]
#     floor = paths[5]
#     json_data = json.load(json_file)
#     json_properties = json_data["features"][0]["properties"]
#     print("File path: {}".format(geojson_file_name))
#     print("SiteID: {}".format(site_id))
#     print("Floor: {}".format(floor))
#     print("Floor info: {}".format(json_properties))

# # create id and floor number matching file
# site_ids = []
# floor_no = []
# floor_no_json = []

# for i in range(0, len(geojson_paths)):
#     with open(geojson_paths[i]) as f:
#         paths = geojson_paths[i].split("/")
#         site_id = paths[4]
#         floor = paths[5]
#         site_ids.append(site_id)
#         floor_no.append(floor)
#         d = json.load(f)
#         try:
#             floor_no_json.append(d["features"][0]["properties"]["floor_num"])
#         except:
#             floor_no_json.append(np.nan)

# floor_num_df = pd.DataFrame(
#     {"site_id": site_ids,
#      "floor_no": floor_no,
#      "floor_no_json": floor_no_json,
#     })

# display("floor_num_df length: {}".format(len(floor_num_df)))
# display(floor_num_df.head())

# # Get floormap dict to be used later
# floor_map_pairs = list(zip(floor_num_df["floor_no"], floor_num_df["floor_no_json"]))
# floor_map_pairs = np.unique(floor_map_pairs, axis=0) # get unique pair
# # print(floor_map_pairs) # to be used as floor_map later

# # Plot distribution
# floor_num_count_df = floor_num_df["floor_no_json"].value_counts().reset_index()
# floor_num_count_df = floor_num_count_df.sort_values("floor_no_json", ascending=False)
# # display(floor_num_count_df)
# # print(len(floor_num_count_df["floor_no_json"] == np.nan))

# fig = plt.figure()
# ax = plt.subplots(figsize=(16, 10))
# sns.barplot(data=floor_num_count_df, x="index", y="floor_no_json", palette="viridis", saturation=0.4)
# fig.show()

# # Just in case: Need for altitude info in geoJSON
# # from pyproj import Proj, transform
# # print(transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), -0.1285907, 51.50809))  # longitude first, latitude second.
# # output (meters east of 0, meters north of 0): (-14314.651244750548, 6711665.883938471)

In [17]:
# # More viz on accelerometers, wifi etc in one go
# from visualize_f import visualize_trajectory, visualize_heatmap
# from main import extract_wifi_rssi, extract_wifi_count
# from main import calibrate_magnetic_wifi_ibeacon_to_position
# from main import extract_magnetic_strength
# from main import extract_ibeacon_rssi

# # Visualizing magnetic strength
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# # extract mag, wifi, beacon of one example
# mwi_datas = calibrate_magnetic_wifi_ibeacon_to_position([path])
# magnetic_strength = extract_magnetic_strength(mwi_datas)
# wifi_rssi = extract_wifi_rssi(mwi_datas)
# wifi_counts = extract_wifi_count(mwi_datas)
# ibeacon_rssi = extract_ibeacon_rssi(mwi_datas)
# ibeacon_ummids = list(ibeacon_rssi.keys())
# target_ibeacon = ibeacon_ummids[0]

# # positions for heatmap
# heat_positions = np.array(list(magnetic_strength.keys()))
# heat_values = np.array(list(magnetic_strength.values()))
# heat_positions_wifi = np.array(list(wifi_counts.keys()))
# heat_values_wifi = np.array(list(wifi_counts.values()))
# heat_positions_bc = np.array(list(ibeacon_rssi[target_ibeacon].keys()))
# heat_values_bc = np.array(list(ibeacon_rssi[target_ibeacon].values()))[:, 0]

# # filter out positions that no wifi detected
# mask = heat_values_wifi != 0
# heat_positions_wifi = heat_positions_wifi[mask]
# heat_values_wifi = heat_values_wifi[mask]

# # get trajectory
# example = read_data_file(path)
# trajectory = example.waypoint # Returns timestamp, x, y values
# print(f"Waypoints: {trajectory}")
# trajectory = trajectory[:, 1:3] # Removes timestamp (we only need the coordinates)

# # Plot trajectory
# visualize_trajectory(trajectory = trajectory,
#                      floor_plan_filename = floor_plan_filename,
#                      width_meter = width_meter,
#                      height_meter = height_meter,
#                      title = "Example of Waypoint",)

In [18]:
# Try working out step_positions for 1 trace file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce_datas)
    headings = compute_headings(ahrs_datas)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, headings)
    rel_positions = compute_rel_positions(stride_lengths, step_headings)
    # only use del if we don't need timestamps
    # rel_positions_del = np.delete(rel_positions, 0, 1)
    return rel_positions

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    d = np.array(magn_datas[2:5])
    return np.mean(np.sqrt(np.sum(d ** 2, axis=0)))

In [19]:
# path_datas = read_data_file(path)
# acce_datas = path_datas.acce
# magn_datas = path_datas.magn
# ahrs_datas = path_datas.ahrs
# wifi_datas = path_datas.wifi
# ibeacon_datas = path_datas.ibeacon
# posi_datas = path_datas.waypoint # not to be used

# # acce and ahrs data translation
# rel_positions = calc_rel_positions(acce_datas, ahrs_datas)
# print(acce_datas.shape)
# print(acce_datas[0])
# print(ahrs_datas[0])
# print(rel_positions.shape)

# # magn data translation
# print(magn_datas.shape)
# print(magn_datas[0])
# # print(extract_magnetic_strength(magn_datas))

In [86]:
# Methods for preprocessing train data: Timestamp handling

def find_diff_ts(ts, data):
    data_ts = data[0]
    diff_ts = int(data_ts) - int(ts)
    return diff_ts

def find_start_ts(path):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line_data in lines:
        line_data = line_data.strip()
        m = re.search(r"(?<=startTime.)(.*)", line_data)
        start_ts = m.groups(0)
        if m:
            return (start_ts[0])

def find_smallest_diff(t, data):
    if data.size == 0:
        return np.array([])
    else:
        data_ts = data[:, [0]]
        diff = []
        for ts in data_ts:
            diff.append(abs(int(t) - int(ts)))
        closest_index = np.argmin(diff) # if multiple records have the same value..?
        return data[closest_index]

In [87]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    if data.size == 0:
        # print("no axis data")
        return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        x_axis = data[1]
        y_axis = data[2]
        z_axis = data[3]
        try:
            accuracy = data[4]
        except IndexError:
            accuracy = np.nan
        return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    if data.size == 0:
        # print("no wifi data")
        return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        ssid = data[1]
        bssid = data[2]
        rssi = data[3]
        if len(data) > 5:
            freq = data[4]
            last_seen_ts = data[5]
        else:
            freq = np.nan
            last_seen_ts = data[-1]
        return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    if data.size == 0:
        # print("no beacon data")
        return [np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        ssid = data[1]
        rssi = data[2]
        return [data_ts, diff_ts, ssid, rssi]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    if data.size == 0:
        # print("no rel_pos data")
        return [np.nan, np.nan, np.nan, np.nan]
    else:
        data_ts = data[0]
        diff_ts = int(data[0]) - int(start_ts)
        x_axis = data[1]
        y_axis = data[2]
        return [data_ts, diff_ts, x_axis, y_axis]

In [103]:
# Extract path and other data
def extract_path(path, floor_map):
    # split path
    try:
        ex_path = f"{path}"
        ex_paths = ex_path.split("/")
        site_id = ex_paths[4]
        floor = ex_paths[5]
        f = floor_map[floor]
        file_id = ex_paths[6].split(".")[0]
        return [site_id, file_id, f, floor]
    except:
        print("extract_path error")

# Definitely needs to be refactored
def extract_data(path):
    start_ts = find_start_ts(path)
    path_datas = read_data_file(path)
    acce = path_datas.acce
    ahrs = path_datas.ahrs
    magn = path_datas.magn
    gyro = path_datas.gyro
    acce_uncali = path_datas.acce_uncali
    magn_uncali = path_datas.magn_uncali
    gyro_uncali = path_datas.gyro_uncali
    wifi = path_datas.wifi
    wps = path_datas.waypoint
    ibeacon = path_datas.ibeacon
    rel_positions = calc_rel_positions(acce, ahrs)

    # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
    # ts = np.unique(wps[:, [0]])
    if acce_uncali.any():
        # print("acce_uncali")
        ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data
    elif acce.any():
        # print("acce")
        ts = np.unique(acce[:, [0]])
    else:
        print("no acce or acce_uncali")

    # extract data for each timestamp of waypoints
    res = []
    for t in ts:
        try:
            wp_closest = find_smallest_diff(t, wps)
            closest_wp_ts = wp_closest[0]
            diff_ts_wp_ts = abs(int(t) - int(closest_wp_ts))
            # time_stamp_cut = 2000, only the records within 2 sec of waypoint are kept
            if diff_ts_wp_ts < time_stamp_cut:
                # flag to indicate how close the data point is to the wps
                # print("diff_ts_wp_ts", diff_ts_wp_ts)
                within_100ms = True if abs(diff_ts_wp_ts) <= 100 else False
                within_200ms = True if abs(diff_ts_wp_ts) <= 200 else False
                x = wp_closest[1]
                y = wp_closest[2]
                # print("x, y: ", x, y)
                diff_start_ts = int(t) - int(start_ts)
                diff_start_wp_ts = int(closest_wp_ts) - int(start_ts)
                # print("diff_start_ts, diff_start_wp_ts: ", diff_start_ts, diff_start_wp_ts)
                acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
                ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
                magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
                magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
                gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
                # print("acce: ", acce_closest)
                # print("ahrs: ", ahrs_closest)
                # print("magn: ", magn_closest)
                # print("gyro: ", gyro_closest)
                acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
                magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
                gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
                # print("acce_u_closest: ", acce_u_closest)
                # print("magn_u_closest: ", magn_u_closest)
                # print("gyro_u_closest: ", gyro_u_closest)
                wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
                if len(ibeacon) > 0:
                    beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
                else:
                    beacon_closest = [np.nan, np.nan, np.nan, np.nan]
                rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)
                # print([t, x, y, int(closest_wp_ts), acce_closest, acce_u_closest])
                res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms] + \
                           acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                           acce_u_closest + magn_u_closest + gyro_u_closest + \
                           wifi_closest + beacon_closest + rel_pos
                          )
            else:
                # print("no wp made it through timestamp cut")
                continue
        except Exception as exc:
            pass
            # print("Error message: ", exc)
            # print("extract_test_data error")
    return res

In [89]:
# %%timeit

# 5.55 ms ± 1.76 ms per loop
path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# for fixing floor expression
# print(floor_map_pairs) # to be used as floor_map later
# assign 1F to 1 rather than zero, just in case we want to use this as integer

# Original floor map
# floor_map = {
#     '1F': 1, '2F': 2, '3F': 3, '4F': 4, '5F': 5, '6F': 6, '7F': 7,
#     '8F': 8, '9F': 9, 'B': -1, 'B1': -1, 'B2': -2, 'B2': -3, 'B3': -3,
#     'BF': -1, 'BM': -1, 'F1': 1, 'F2': 2, 'F3': 3, 'F4': 4, 'F5': 5,
#     'F6': 6, 'F7': 7, 'F8': 8, 'F9': 9, 'F10': 10, 'G': -1, 'L1': 1, 'L2': 2,
#     'L3': 3, 'L3': 4, 'L4': 4, 'L4': 6, 'L5': 5, 'L6': 6, 'L7': 7, 'L8': 8,
#     'L9': 9, 'L10': 10, 'L11': 11, 'LG1': -1, 'LG2': -2,
#     'LM': np.nan, 'M': np.nan, 'P1': np.nan, 'P2': np.nan}

floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

def one_trace_to_rows(path, floor_map):
    try:
        path_info = extract_path(path, floor_map)
        data = extract_data(path)
        # rows = list(itertools.chain(path_info, *data))
        rows = []
        for d in data:
            row = path_info + d
            rows.append(row)
            # print("row: ", row)
        return rows
    except:
        print("one_trace_to_rows error at: ", path)

# path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# time w/ for loop with 1 train_path -> 11.6
# time w/ itertools.chain for 1 train_path -> 11.8
start = time.time()
path_info = extract_path(path, floor_map)
print("path: ", path_info)
rows = one_trace_to_rows(path, floor_map)
print("time to process one train_path", time.time() - start)
#print("col count: ", len(rows[0]))
print("rows: ", rows)

path:  ['5d2709e003f801723c32d896', '5ddbca16c5b77e0006b17a73', 3, 'F4']
acce_uncali
time to process one train_path 2.165910005569458
rows:  [['5d2709e003f801723c32d896', '5ddbca16c5b77e0006b17a73', 3, 'F4', 1574684711276, '1574684711159', 117, 61.49206, 23.005142, 1574684711166, 7, 110, False, True, 1574684711276.0, 117, -0.29989624, 0.3790741, 13.553207, nan, 1574684711276.0, 117, 0.033694934, -0.016102415, -0.56272477, nan, 1574684711276.0, 117, -24.511719, 7.659912, -21.937561, nan, 33.77506776916111, 1574684711276.0, 117, -0.6645508, 0.08508301, 0.079452515, nan, 1574684711276.0, 117, -0.24662781, 0.52871704, 11.8658905, nan, 1574684711276.0, 117, -84.375, 11.872864, -355.47638, nan, 1574684711276.0, 117, -0.83732605, -0.008483887, 0.03387451, nan, '1574684712771', 1612, '882bfb2c7bcc276bf82a25de7d1b6ea8ca569244', '5715aa2933d0474a074e637992e7ff1650a5bb36', '-46', nan, '1574684711501', '1574684713118', 1959, '89cb11b04122cef23388b0da06bd426c1f48a9b5_b1d5781111d84f7b3fe45a0852e5975

In [24]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # Data below are the time it took to create the old version of training data (only waypoints)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [25]:
# Check if we can make df

# column names
col_names = ["site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_100ms", "within_200ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

print(len(col_names))

df = pd.DataFrame(rows, columns=col_names)
print("df len: ", len(df))
print("site_id nunique: ", df["site_id"].nunique())
print("file_id nunique: ", df["file_id"].nunique())
print("x value_counts: ", df["x"].value_counts())
print("y value_counts: ", df["y"].value_counts())
print("event ts nunique: ", df["ts"].nunique())
print("start ts nunique: ", df["start_ts"].nunique()) # should be one
print("diff_ts_wp_ts value_counts: ", df["diff_ts_wp_ts"].value_counts())
print("diff_ts_wp_ts nunique: ", df["diff_ts_wp_ts"].nunique())
print("within_100ms value_counts: ", df["within_100ms"].value_counts())
print("within_100ms nunique: ", df["within_100ms"].nunique())
print("within_100ms count: ", df["within_100ms"].count())
print("within_200ms value_counts: ", df["within_200ms"].value_counts())
print("within_200ms nunique: ", df["within_200ms"].nunique())
print("within_200ms count: ", df["within_200ms"].count())
display(df.head())

72
df len:  34
site_id nunique:  1
file_id nunique:  1
x value_counts:  260.75455    26
262.54420     8
Name: x, dtype: int64
y value_counts:  88.66252    26
96.36995     8
Name: y, dtype: int64
event ts nunique:  34
start ts nunique:  1
diff_ts_wp_ts value_counts:  194    1
21     1
37     1
40     1
233    1
75     1
107    1
175    1
241    1
2      1
114    1
117    1
184    1
59     1
210    1
126    1
164    1
56     1
98     1
222    1
156    1
248    1
152    1
94     1
171    1
213    1
18     1
145    1
79     1
229    1
203    1
136    1
133    1
191    1
Name: diff_ts_wp_ts, dtype: int64
diff_ts_wp_ts nunique:  34
within_100ms value_counts:  False    23
True     11
Name: within_100ms, dtype: int64
within_100ms nunique:  2
within_100ms count:  34
within_200ms value_counts:  True     26
False     8
Name: within_200ms, dtype: int64
within_200ms nunique:  2
within_200ms count:  34


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5cd969c339e2fc0b4afe7778,5cf09d2473832800099a41b0,1,F2,1559272288030,1559272287922,108,262.5442,96.36995,1559272287923,1,107,False,True,1559272000000.0,108,-0.752548,1.891525,9.495926,,1559272000000.0,108,-0.029428,0.105313,0.977983,,1559272000000.0,108,9.24,-26.58,-25.019999,,37.654672,1559272000000.0,108,-0.012634,0.012833,-0.015015,,1559272000000.0,108,-0.839889,1.939392,9.621552,,1559272000000.0,108,-13.379999,-39.54,-104.03999,,1559272000000.0,108,-0.011002,0.011002,-0.017105,,1559272288675,753,3e1de51dd2233e2502b2b4e7e3e6566c2c430d83,1a19bee6438f403a1386209022cbaa65ef9943e2,-68,,1559272286193,1559272295927,8005,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-96,1559272000000.0,1628,-0.109002,-0.513015
1,5cd969c339e2fc0b4afe7778,5cf09d2473832800099a41b0,1,F2,1559272288049,1559272287922,127,262.5442,96.36995,1559272287923,1,126,False,True,1559272000000.0,127,-0.821945,1.840088,9.383453,,1559272000000.0,127,-0.024709,0.101833,0.984215,,1559272000000.0,127,9.179999,-26.58,-25.08,,37.679904,1559272000000.0,127,-0.028824,0.036652,-0.014709,,1559272000000.0,127,-0.839889,1.939392,9.621552,,1559272000000.0,127,-13.44,-39.54,-104.1,,1559272000000.0,127,-0.011002,0.011002,-0.017105,,1559272288675,753,3e1de51dd2233e2502b2b4e7e3e6566c2c430d83,1a19bee6438f403a1386209022cbaa65ef9943e2,-68,,1559272286193,1559272295927,8005,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-96,1559272000000.0,1628,-0.109002,-0.513015
2,5cd969c339e2fc0b4afe7778,5cf09d2473832800099a41b0,1,F2,1559272288068,1559272287922,146,262.5442,96.36995,1559272287923,1,145,False,True,1559272000000.0,146,-0.952347,1.891541,9.434906,,1559272000000.0,146,-0.024591,0.103028,0.977686,,1559272000000.0,146,9.42,-26.519999,-25.26,,37.816853,1559272000000.0,146,-0.068832,0.020477,-0.029984,,1559272000000.0,146,-0.898514,1.864014,9.399002,,1559272000000.0,146,-13.199999,-39.48,-104.28,,1559272000000.0,146,-0.027191,0.034821,-0.0168,,1559272288675,753,3e1de51dd2233e2502b2b4e7e3e6566c2c430d83,1a19bee6438f403a1386209022cbaa65ef9943e2,-68,,1559272286193,1559272295927,8005,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-96,1559272000000.0,1628,-0.109002,-0.513015
3,5cd969c339e2fc0b4afe7778,5cf09d2473832800099a41b0,1,F2,1559272288087,1559272287922,165,262.5442,96.36995,1559272287923,1,164,False,True,1559272000000.0,165,-0.994217,1.864014,9.655045,,1559272000000.0,165,-0.029274,0.105325,0.975328,,1559272000000.0,165,9.719999,-26.46,-25.38,,37.930916,1559272000000.0,165,-0.095398,-0.029312,-0.036392,,1559272000000.0,165,-0.977463,1.892731,9.510284,,1559272000000.0,165,-12.9,-39.42,-104.399994,,1559272000000.0,165,-0.0672,0.018646,-0.032074,,1559272288675,753,3e1de51dd2233e2502b2b4e7e3e6566c2c430d83,1a19bee6438f403a1386209022cbaa65ef9943e2,-68,,1559272286193,1559272295927,8005,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-96,1559272000000.0,1628,-0.109002,-0.513015
4,5cd969c339e2fc0b4afe7778,5cf09d2473832800099a41b0,1,F2,1559272288107,1559272287922,185,262.5442,96.36995,1559272287923,1,184,False,True,1559272000000.0,185,-0.977478,1.868805,9.808182,,1559272000000.0,185,-0.035722,0.100948,0.983699,,1559272000000.0,185,9.719999,-26.34,-25.5,,37.927879,1559272000000.0,185,-0.09816,-0.055283,-0.029068,,1559272000000.0,185,-1.006195,1.849655,9.792633,,1559272000000.0,185,-12.9,-39.3,-104.52,,1559272000000.0,185,-0.093765,-0.031143,-0.038483,,1559272288675,753,3e1de51dd2233e2502b2b4e7e3e6566c2c430d83,1a19bee6438f403a1386209022cbaa65ef9943e2,-68,,1559272286193,1559272295927,8005,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-96,1559272000000.0,1628,-0.109002,-0.513015


In [104]:
# Set pool
num_cores = multiprocessing.cpu_count()
print(f"num_cores={num_cores}")
# args = [(p, floor_map) for p in train_paths]
args = [(p, floor_map) for p in grouped_paths_list]

pool = Pool(num_cores)

start = time.time()

# w/ 250ms settings, 3 random samples from each site_id
# 2 paths -> 18.7 sec
# 10 paths -> 315 sec (df len is 1994)
# 100 paths -> 708 sec (df len is 7183)
# all ~ 600 paths -> 

# errors
# grouped_paths_list -> 100 paths -> site_id: 8 errors, 27 correct
# grouped_paths_list -> 100 paths -> file_id: 23 errors, 77 correct

# all in one go
# res = pool.starmap(one_trace_to_rows, args)

# split the args
res = []

for arg in tqdm(np.array_split(args, 50)):
    res.extend(pool.starmap(one_trace_to_rows, arg))

num_cores=4


100%|██████████| 50/50 [08:29<00:00, 10.19s/it]


In [101]:
############################## KEEP THIS CELL FOR LATER REF ##############################

# Error in ~20% of the train paths -> caused by not having acces_uncali to create the event timestamps

# error files
# /5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bbc852a70008c01607.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bda50dc300099d34cc.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085df529994a0008202661.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt
# /5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt
# /5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt
# /5cd56b5ae2acfd2d33b5854a/B1/5d073ba64a19c000086c559b.txt
# /5cd56b5ae2acfd2d33b5854a/F1/5d07603e4cae4f000a2db525.txt
# /5cd56b63e2acfd2d33b591c2/F2/5d0b0668912a980009fe91f2.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt
# /5cd56b64e2acfd2d33b592b3/F2/5d0c9321c99c56000836df18.txt
# /5cd56b64e2acfd2d33b592b3/F3/5d0c9952ea565d0008e34e8b.txt
# /5cd56b64e2acfd2d33b592b3/F4/5d0c9d65ea565d0008e34ea2.txt
# /5cd56b5ae2acfd2d33b58549/5F/5d0613514a19c000086c432a.txt
# /5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b015e2f8a26000805ba44.txt
# /5cd56b79e2acfd2d33b5b74e/F1/5d0af3452f8a26000805b830.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt
# /5cd56b6be2acfd2d33b59d1f/F3/5d0896415125450008037c76.txt

# base_path = "../input/indoor-location-navigation/train"
# error_files = [
#     "/5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt",
#     "/5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt",
#     "/5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt",
#     "/5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt",
#     "/5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt",
#     "/5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt"
# ]

# working_path = "../input/indoor-location-navigation/train/5d2709c303f801723c3299ee/1F/5dad7d6daa1d300006faa80c.txt"
# error_paths = [base_path + e for e in error_files]
# rows = one_trace_to_rows(error_paths[1], floor_map)
# print(rows)

acce_uncali
[['5d2709c303f801723c3299ee', '5dad7d6daa1d300006faa80c', 0, '1F', 1571649995528, '1571649995399', 129, 14.127717, 52.209435, 1571649995405, 6, 123, False, True, 1571649995528.0, 129, 0.072769165, -0.73249817, 8.868912, nan, 1571649995528.0, 129, 0.009623115, 0.031642374, -0.9993657, nan, 1571649995528.0, 129, 1.8981934, -28.825378, -13.98468, nan, 32.09481001235383, 1571649995528.0, 129, 0.5451813, -0.19050598, -0.32951355, nan, 1571649995528.0, 129, 0.20805359, -1.0192108, 9.7104645, nan, 1571649995528.0, 129, 15.956116, -10.406494, -390.15045, nan, 1571649995528.0, 129, 0.591156, -0.13813782, -0.35139465, nan, '1571649997401', 2002, 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'f7303e4e5338a0edbfeabeceb012c8c7b5ed63ba', '-34', nan, '1571649989157', '1571649995585', 186, '07efd69e3167537492f0ead89fb2779633b04949_356a192b7913b04c54574d18c28d46e6395428ab_7c33876368ece2e1b804a2d191df26be063e42d9', '-85', 1571649995972.0, 573, -0.10389824129582742, -0.5451969249452694], ['5d27

In [105]:
# print(len(grouped_paths_list))
print(len(res))
print(len(res[0]))
print(len(res[0][0]))

df_train = pd.DataFrame(res[0], columns=col_names)
for r in res[1:]:
    df = pd.DataFrame(r, columns=col_names)
    df_train = df_train.append(df)

# print("train_path count", len(train_paths[:train_num]))
print("time to process", time.time() - start)
print("length of df made", len(df_train))
display(df_train.head(10))

50
254
72
time to process 510.6515009403229
length of df made 7087


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897466,1573714897333,133,76.45839,27.93658,1573714897343,10,123,False,True,1573715000000.0,133,-0.361328,2.250458,7.886307,,1573715000000.0,133,-0.00276,0.106572,0.991325,,1573715000000.0,133,5.714416,-40.33203,-19.604492,,45.206894,1573715000000.0,133,0.482254,-0.032211,0.103027,,1573715000000.0,133.0,-0.275131,2.052933,7.673828,,1573715000000.0,133,-31.9458,-15.632629,-346.9925,,1573715000000.0,133.0,0.486206,-0.032272,0.103943,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897476,143.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-80,1573715000000.0,836,-0.131092,-0.444588
1,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897486,1573714897333,153,76.45839,27.93658,1573714897343,10,143,False,True,1573715000000.0,153,-0.442734,2.275589,7.873154,,1573715000000.0,153,-0.005194,0.116815,0.99039,,1573715000000.0,153,4.354858,-40.33203,-19.604492,,45.055228,1573715000000.0,153,0.199417,-0.056717,0.107285,,1573715000000.0,153.0,-0.471466,2.331253,7.944977,,1573715000000.0,153,-33.30536,-15.632629,-346.9925,,1573715000000.0,153.0,0.203369,-0.056778,0.1082,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897486,153.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69,1573715000000.0,836,-0.131092,-0.444588
2,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897506,1573714897333,173,76.45839,27.93658,1573714897343,10,163,False,True,1573715000000.0,173,-0.294891,2.194183,8.476486,,1573715000000.0,173,-0.005718,0.119736,0.990181,,1573715000000.0,173,4.354858,-40.33203,-20.925903,,45.64571,1573715000000.0,173,-0.189423,0.116394,0.168015,,1573715000000.0,173.0,-0.296677,2.19838,8.001831,,1573715000000.0,173,-33.30536,-15.632629,-348.3139,,1573715000000.0,173.0,-0.185471,0.116333,0.16893,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897486,153.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69,1573715000000.0,836,-0.131092,-0.444588
3,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897526,1573714897333,193,76.45839,27.93658,1573714897343,10,183,False,True,1573715000000.0,193,-0.50737,2.300736,9.575424,,1573715000000.0,193,-0.008824,0.117493,0.990647,,1573715000000.0,193,4.354858,-40.33203,-20.925903,,45.64571,1573715000000.0,193,-0.375305,0.19043,0.264954,,1573715000000.0,193.0,-0.441528,2.254044,9.113937,,1573715000000.0,193,-33.30536,-15.632629,-348.3139,,1573715000000.0,193.0,-0.371353,0.190369,0.265869,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897542,209.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-85,1573715000000.0,836,-0.131092,-0.444588
4,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897546,1573714897333,213,76.45839,27.93658,1573714897343,10,203,False,False,1573715000000.0,213,-0.206909,2.413254,9.927979,,1573715000000.0,213,-0.01083,0.114942,0.991193,,1573715000000.0,213,4.354858,-39.653015,-21.586609,,45.357558,1573715000000.0,213,-0.238419,0.10788,0.336853,,1573715000000.0,213.0,-0.431366,2.359985,9.81604,,1573715000000.0,213,-33.30536,-14.953613,-348.9746,,1573715000000.0,213.0,-0.234467,0.107819,0.337769,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897549,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-75,1573715000000.0,836,-0.131092,-0.444588
5,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897566,1573714897333,233,76.45839,27.93658,1573714897343,10,223,False,False,1573715000000.0,233,0.582581,2.302521,9.853149,,1573715000000.0,233,-0.007443,0.114021,0.991423,,1573715000000.0,233,3.675842,-41.012573,-20.925903,,46.189137,1573715000000.0,233,-0.095673,-0.041275,0.286255,,1573715000000.0,233.0,0.149826,2.380936,9.912415,,1573715000000.0,233,-33.984375,-16.313171,-348.3139,,1573715000000.0,233.0,-0.091721,-0.041336,0.28717,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897558,225.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-81,1573715000000.0,836,-0.131092,-0.444588
6,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897586,1573714897333,253,76.45839,27.93658,1573714897343,10,243,False,False,1573715000000.0,253,1.181732,2.057114,10.274536,,1573715000000.0,253,0.001232,0.112004,0.991525,,1573715000000.0,253,3.675842,-41.012573,-20.265198,,45.893586,1573715000000.0,253,-0.17662,-0.182419,0.09024,,1573715000000.0,253.0,0.987808,2.20018,9.963882,,1573715000000.0,253,-33.984375,-16.313171,-347.6532,,1573715000000.0,253.0,-0.172668,-0.18248,0.091156,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897588,255.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-75,1573715000000.0,836,-0.131092,-0.444588
7,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714914031,1573714897333,16698,78.48432,8.270371,1573714914267,16934,236,False,False,1573715000000.0,16698,1.05304,0.357239,9.65802,,1573715000000.0,16698,-0.026685,-0.103508,-0.993901,,1573715000000.0,16698,-5.160522,-46.44928,-14.978027,,49.076551,1573715000000.0,16698,0.433777,-0.072693,-0.037598,,1573715000000.0,16698.0,1.478012,0.533814,9.432373,,1573715000000.0,16698,-42.82074,-21.749878,-342.36603,,1573715000000.0,16698.0,0.437729,-0.072754,-0.036682,,1573714915011,17678,da39a3ee5e6b4b0d3255bfef95601890afd80709,e0569c95a2a6f68fe59775a63d071094cd98de65,-51,,1573714910824,1573714913950,16617.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-67,1573715000000.0,16497,-0.006829,-0.536111
8,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714914052,1573714897333,16719,78.48432,8.270371,1573714914267,16934,215,False,False,1573715000000.0,16719,0.300064,0.185455,9.586197,,1573715000000.0,16719,-0.02684,-0.107271,-0.993511,,1573715000000.0,16719,-6.520081,-47.808838,-15.638733,,50.722445,1573715000000.0,16719,0.433777,0.006683,-0.054108,,1573715000000.0,16719.0,0.626877,0.224365,9.707703,,1573715000000.0,16719,-44.180298,-23.109436,-343.02673,,1573715000000.0,16719.0,0.437729,0.006622,-0.053192,,1573714915011,17678,da39a3ee5e6b4b0d3255bfef95601890afd80709,e0569c95a2a6f68fe59775a63d071094cd98de65,-51,,1573714910824,1573714913950,16617.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-67,1573715000000.0,16497,-0.006829,-0.536111
9,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714914072,1573714897333,16739,78.48432,8.270371,1573714914267,16934,195,False,True,1573715000000.0,16739,0.1241,0.390152,8.715897,,1573715000000.0,16739,-0.026109,-0.111348,-0.993067,,1573715000000.0,16739,-7.199097,-47.12982,-15.638733,,50.17586,1573715000000.0,16739,0.455078,0.100952,0.091843,,1573715000000.0,16739.0,0.12709,0.241119,9.245026,,1573715000000.0,16739,-44.859314,-22.43042,-343.02673,,1573715000000.0,16739.0,0.437729,0.006622,-0.053192,,1573714915011,17678,da39a3ee5e6b4b0d3255bfef95601890afd80709,e0569c95a2a6f68fe59775a63d071094cd98de65,-51,,1573714910824,1573714913950,16617.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-67,1573715000000.0,16497,-0.006829,-0.536111


In [106]:
print("df len: ", len(df_train), "\n")
print("site_id nunique: ", df_train["site_id"].nunique(), "\n")
print("site_id value_counts: ", df_train["site_id"].value_counts(), "\n")
print("file_id nunique: ", df_train["file_id"].nunique(), "\n")
print("file_id value_counts: ", df_train["file_id"].value_counts(), "\n")
print("floor value_counts: ", df_train["floor"].value_counts(), "\n")
print("x value_counts: ", df_train["x"].value_counts(), "\n")
print("y value_counts: ", df_train["y"].value_counts(), "\n")
print("event ts nunique: ", df_train["ts"].nunique(), "\n")
print("start ts nunique: ", df_train["start_ts"].nunique(), "\n") # should be one
print("diff_ts_wp_ts value_counts: ", df_train["diff_ts_wp_ts"].value_counts(), "\n")
print("diff_ts_wp_ts nunique: ", df_train["diff_ts_wp_ts"].nunique(), "\n")
display(df_train.head())

df len:  7087 

site_id nunique:  17 

site_id value_counts:  5cd56b64e2acfd2d33b592b3    2154
5cd56b61e2acfd2d33b58d20     652
5a0546857ecc773753327266     492
5cd56b5ae2acfd2d33b58548     466
5cd56b6ae2acfd2d33b59ccb     399
5cd56b67e2acfd2d33b596bd     332
5cd56b5ae2acfd2d33b58546     320
5cd56b5ae2acfd2d33b58544     297
5cd56b64e2acfd2d33b5932f     275
5cd56b6ae2acfd2d33b59ccc     253
5cd56b64e2acfd2d33b59246     250
5c3c44b80379370013e0fd2b     248
5cd56b5ae2acfd2d33b58549     240
5cd56b5ae2acfd2d33b5854a     218
5cd56865eb294480de7167b6     187
5cd56b63e2acfd2d33b591c2     159
5cd56b6ae2acfd2d33b59c90     145
Name: site_id, dtype: int64 

file_id nunique:  50 

file_id value_counts:  5d0c930bc99c56000836df16    972
5d0c9318ea565d0008e34e34    786
5d0c9321c99c56000836df18    396
5d086c4ed85da00008644fd0    302
5d086564bb84450008f56991    271
5cf218dac852a70008c0161d    254
5dccfd0f757dea000608056d    254
5cf359e86a98eb00096035f4    210
5e15b393f4c3420006d522ed    206
5cf5f6fa9eda1

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897466,1573714897333,133,76.45839,27.93658,1573714897343,10,123,False,True,1573715000000.0,133,-0.361328,2.250458,7.886307,,1573715000000.0,133,-0.00276,0.106572,0.991325,,1573715000000.0,133,5.714416,-40.33203,-19.604492,,45.206894,1573715000000.0,133,0.482254,-0.032211,0.103027,,1573715000000.0,133.0,-0.275131,2.052933,7.673828,,1573715000000.0,133,-31.9458,-15.632629,-346.9925,,1573715000000.0,133.0,0.486206,-0.032272,0.103943,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897476,143.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-80,1573715000000.0,836,-0.131092,-0.444588
1,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897486,1573714897333,153,76.45839,27.93658,1573714897343,10,143,False,True,1573715000000.0,153,-0.442734,2.275589,7.873154,,1573715000000.0,153,-0.005194,0.116815,0.99039,,1573715000000.0,153,4.354858,-40.33203,-19.604492,,45.055228,1573715000000.0,153,0.199417,-0.056717,0.107285,,1573715000000.0,153.0,-0.471466,2.331253,7.944977,,1573715000000.0,153,-33.30536,-15.632629,-346.9925,,1573715000000.0,153.0,0.203369,-0.056778,0.1082,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897486,153.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69,1573715000000.0,836,-0.131092,-0.444588
2,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897506,1573714897333,173,76.45839,27.93658,1573714897343,10,163,False,True,1573715000000.0,173,-0.294891,2.194183,8.476486,,1573715000000.0,173,-0.005718,0.119736,0.990181,,1573715000000.0,173,4.354858,-40.33203,-20.925903,,45.64571,1573715000000.0,173,-0.189423,0.116394,0.168015,,1573715000000.0,173.0,-0.296677,2.19838,8.001831,,1573715000000.0,173,-33.30536,-15.632629,-348.3139,,1573715000000.0,173.0,-0.185471,0.116333,0.16893,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897486,153.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69,1573715000000.0,836,-0.131092,-0.444588
3,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897526,1573714897333,193,76.45839,27.93658,1573714897343,10,183,False,True,1573715000000.0,193,-0.50737,2.300736,9.575424,,1573715000000.0,193,-0.008824,0.117493,0.990647,,1573715000000.0,193,4.354858,-40.33203,-20.925903,,45.64571,1573715000000.0,193,-0.375305,0.19043,0.264954,,1573715000000.0,193.0,-0.441528,2.254044,9.113937,,1573715000000.0,193,-33.30536,-15.632629,-348.3139,,1573715000000.0,193.0,-0.371353,0.190369,0.265869,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897542,209.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-85,1573715000000.0,836,-0.131092,-0.444588
4,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1,F2,1573714897546,1573714897333,213,76.45839,27.93658,1573714897343,10,203,False,False,1573715000000.0,213,-0.206909,2.413254,9.927979,,1573715000000.0,213,-0.01083,0.114942,0.991193,,1573715000000.0,213,4.354858,-39.653015,-21.586609,,45.357558,1573715000000.0,213,-0.238419,0.10788,0.336853,,1573715000000.0,213.0,-0.431366,2.359985,9.81604,,1573715000000.0,213,-33.30536,-14.953613,-348.9746,,1573715000000.0,213.0,-0.234467,0.107819,0.337769,,1573714899475,2142,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45,,1573714884460,1573714897549,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-75,1573715000000.0,836,-0.131092,-0.444588


In [107]:
# Visualizing timestamp distribution

# Explore
# print(df_train["ts"].dtype)
# print(df_test["ts"].dtype)

# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le"%col] = le.fit_transform(df[col])

col_enc = ["site_id", "file_id", "floor", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
col_encode(df_train, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    for col in col_list:
        df[col] = df[col].astype(dtype)

convert_dtypes(df_train, ["floor_converted","ts", "start_ts", "diff_start_ts", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",\
             "acce_ts", "diff_acce_ts", \
             "ahrs_ts", "diff_ahrs_ts", \
             "magn_ts", "diff_magn_ts", \
             "gyro_ts", "diff_gyro_ts", \
             "acce_u_ts", "diff_acce_u_ts", \
             "magn_u_ts", "diff_magn_u_ts", \
             "gyro_u_ts", "diff_gyro_u_ts", \
             "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_rssi", \
             "rel_ts", "diff_rel_ts"
            ], float)

# convert ts and wifi_last_see_ts to dates
for df in [df_train]:
    for col in ["ts", "wifi_last_seen_ts"]:
        df["%s_date"%col] = pd.to_datetime(df[col],unit="ms")
        df["%s_day"%col] = df["%s_date"%col].dt.floor("d")
        df["%s_hour"%col] = df["%s_date"%col].dt.floor("h")
        df["%s_minute"%col] = df["%s_date"%col].values.astype("<M8[m]")

# Check
display(df_train.head())

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1.0,F2,1573715000000.0,1573715000000.0,133.0,76.45839,27.93658,1573715000000.0,10.0,123.0,False,True,1573715000000.0,133.0,-0.361328,2.250458,7.886307,,1573715000000.0,133.0,-0.00276,0.106572,0.991325,,1573715000000.0,133.0,5.714416,-40.33203,-19.604492,,45.206894,1573715000000.0,133.0,0.482254,-0.032211,0.103027,,1573715000000.0,133.0,-0.275131,2.052933,7.673828,,1573715000000.0,133.0,-31.9458,-15.632629,-346.9925,,1573715000000.0,133.0,0.486206,-0.032272,0.103943,,1573715000000.0,2142.0,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45.0,,1573715000000.0,1573715000000.0,143.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-80.0,1573715000000.0,836.0,-0.131092,-0.444588,0,46,8,14,75,132,2019-11-14 07:01:37.465999872,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00,2019-11-14 07:01:24.460,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00
1,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1.0,F2,1573715000000.0,1573715000000.0,153.0,76.45839,27.93658,1573715000000.0,10.0,143.0,False,True,1573715000000.0,153.0,-0.442734,2.275589,7.873154,,1573715000000.0,153.0,-0.005194,0.116815,0.99039,,1573715000000.0,153.0,4.354858,-40.33203,-19.604492,,45.055228,1573715000000.0,153.0,0.199417,-0.056717,0.107285,,1573715000000.0,153.0,-0.471466,2.331253,7.944977,,1573715000000.0,153.0,-33.30536,-15.632629,-346.9925,,1573715000000.0,153.0,0.203369,-0.056778,0.1082,,1573715000000.0,2142.0,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45.0,,1573715000000.0,1573715000000.0,153.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69.0,1573715000000.0,836.0,-0.131092,-0.444588,0,46,8,14,75,132,2019-11-14 07:01:37.486000128,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00,2019-11-14 07:01:24.460,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00
2,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1.0,F2,1573715000000.0,1573715000000.0,173.0,76.45839,27.93658,1573715000000.0,10.0,163.0,False,True,1573715000000.0,173.0,-0.294891,2.194183,8.476486,,1573715000000.0,173.0,-0.005718,0.119736,0.990181,,1573715000000.0,173.0,4.354858,-40.33203,-20.925903,,45.64571,1573715000000.0,173.0,-0.189423,0.116394,0.168015,,1573715000000.0,173.0,-0.296677,2.19838,8.001831,,1573715000000.0,173.0,-33.30536,-15.632629,-348.3139,,1573715000000.0,173.0,-0.185471,0.116333,0.16893,,1573715000000.0,2142.0,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45.0,,1573715000000.0,1573715000000.0,153.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69.0,1573715000000.0,836.0,-0.131092,-0.444588,0,46,8,14,75,132,2019-11-14 07:01:37.505999872,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00,2019-11-14 07:01:24.460,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00
3,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1.0,F2,1573715000000.0,1573715000000.0,193.0,76.45839,27.93658,1573715000000.0,10.0,183.0,False,True,1573715000000.0,193.0,-0.50737,2.300736,9.575424,,1573715000000.0,193.0,-0.008824,0.117493,0.990647,,1573715000000.0,193.0,4.354858,-40.33203,-20.925903,,45.64571,1573715000000.0,193.0,-0.375305,0.19043,0.264954,,1573715000000.0,193.0,-0.441528,2.254044,9.113937,,1573715000000.0,193.0,-33.30536,-15.632629,-348.3139,,1573715000000.0,193.0,-0.371353,0.190369,0.265869,,1573715000000.0,2142.0,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45.0,,1573715000000.0,1573715000000.0,209.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-85.0,1573715000000.0,836.0,-0.131092,-0.444588,0,46,8,14,75,132,2019-11-14 07:01:37.526000128,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00,2019-11-14 07:01:24.460,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00
4,5a0546857ecc773753327266,5dccfd0f757dea000608056d,1.0,F2,1573715000000.0,1573715000000.0,213.0,76.45839,27.93658,1573715000000.0,10.0,203.0,False,False,1573715000000.0,213.0,-0.206909,2.413254,9.927979,,1573715000000.0,213.0,-0.01083,0.114942,0.991193,,1573715000000.0,213.0,4.354858,-39.653015,-21.586609,,45.357558,1573715000000.0,213.0,-0.238419,0.10788,0.336853,,1573715000000.0,213.0,-0.431366,2.359985,9.81604,,1573715000000.0,213.0,-33.30536,-14.953613,-348.9746,,1573715000000.0,213.0,-0.234467,0.107819,0.337769,,1573715000000.0,2142.0,289f5718b8c46b97fd41e698770bcb8c1808470e,99bb474dc926f36679e8796c5fe3c5d04064575e,-45.0,,1573715000000.0,1573715000000.0,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-75.0,1573715000000.0,836.0,-0.131092,-0.444588,0,46,8,14,75,132,2019-11-14 07:01:37.545999872,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00,2019-11-14 07:01:24.460,2019-11-14,2019-11-14 07:00:00,2019-11-14 07:01:00


In [108]:
# Calculate moving averages
# Differencing respect to time (as each timestep is unevenly spaced)

In [109]:
# Save the file in parquet
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Saving train data
train_file_name = "indoor_train_2.pkl"

with open(train_file_name, "wb") as file:
    pickle.dump(df_train, file)

# Save them to output
# df_train.to_csv('df_train_2.csv',index=False)
# df_test.to_csv('df_test.csv',index=False)

In [110]:
# Load data it back in
with open(train_file_name, "rb") as file:
    df_train = pickle.load(file)

In [111]:
print((df_train["file_id"].nunique()))
print((df_train["site_id"].nunique()))
print((df_train["site_id"].value_counts()))

50
17
5cd56b64e2acfd2d33b592b3    2154
5cd56b61e2acfd2d33b58d20     652
5a0546857ecc773753327266     492
5cd56b5ae2acfd2d33b58548     466
5cd56b6ae2acfd2d33b59ccb     399
5cd56b67e2acfd2d33b596bd     332
5cd56b5ae2acfd2d33b58546     320
5cd56b5ae2acfd2d33b58544     297
5cd56b64e2acfd2d33b5932f     275
5cd56b6ae2acfd2d33b59ccc     253
5cd56b64e2acfd2d33b59246     250
5c3c44b80379370013e0fd2b     248
5cd56b5ae2acfd2d33b58549     240
5cd56b5ae2acfd2d33b5854a     218
5cd56865eb294480de7167b6     187
5cd56b63e2acfd2d33b591c2     159
5cd56b6ae2acfd2d33b59c90     145
Name: site_id, dtype: int64


In [112]:
test_site_id = sub_df["site"].unique()
train_site_id = df_train["site_id"].unique()
print(test_site_id, "\n")
print(train_site_id, "\n")
a = list(set(test_site_id) & set(train_site_id))
print(a)

['5a0546857ecc773753327266' '5c3c44b80379370013e0fd2b'
 '5d27075f03f801723c2e360f' '5d27096c03f801723c31e5e0'
 '5d27097f03f801723c320d97' '5d27099f03f801723c32511d'
 '5d2709a003f801723c3251bf' '5d2709b303f801723c327472'
 '5d2709bb03f801723c32852c' '5d2709c303f801723c3299ee'
 '5d2709d403f801723c32bd39' '5d2709e003f801723c32d896'
 '5da138274db8ce0c98bbd3d2' '5da1382d4db8ce0c98bbe92e'
 '5da138314db8ce0c98bbf3a0' '5da138364db8ce0c98bc00f1'
 '5da1383b4db8ce0c98bc11ab' '5da138754db8ce0c98bca82f'
 '5da138764db8ce0c98bcaa46' '5da1389e4db8ce0c98bd0547'
 '5da138b74db8ce0c98bd4774' '5da958dd46f8266d0737457b'
 '5dbc1d84c1eb61796cf7c010' '5dc8cea7659e181adb076a3f'] 

['5a0546857ecc773753327266' '5c3c44b80379370013e0fd2b'
 '5cd56865eb294480de7167b6' '5cd56b5ae2acfd2d33b58544'
 '5cd56b5ae2acfd2d33b58546' '5cd56b5ae2acfd2d33b58548'
 '5cd56b5ae2acfd2d33b58549' '5cd56b5ae2acfd2d33b5854a'
 '5cd56b61e2acfd2d33b58d20' '5cd56b63e2acfd2d33b591c2'
 '5cd56b64e2acfd2d33b59246' '5cd56b64e2acfd2d33b592b3'
 '5cd56