In [15]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool

import pickle

pd.set_option("display.max_columns", 100)

In [16]:
# Check out each file. Content, images

root_path = "/home/jupyter/input/"
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204


In [17]:
# Get submission file
sub_df = pd.read_csv("/home/jupyter/input/sample_submission.csv")
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
sub_df = sub_df.drop(columns=["floor", "x", "y"])
grouped_df = sub_df.groupby("file").sample(n=2)
all_file_id = grouped_df["file"].unique()
print(len(grouped_df))
print(len(all_file_id))
display(grouped_df.head())
display(sub_df.head())

1252
626


Unnamed: 0,site_path_timestamp,site,file,timestamp
7168,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,25542
7173,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,60788
7348,5da138b74db8ce0c98bd4774_01c41f1aeba5c48c2c4dd...,5da138b74db8ce0c98bd4774,01c41f1aeba5c48c2c4dd568,53254
7350,5da138b74db8ce0c98bd4774_01c41f1aeba5c48c2c4dd...,5da138b74db8ce0c98bd4774,01c41f1aeba5c48c2c4dd568,65963
6605,5da138764db8ce0c98bcaa46_030b3d94de8acae7c9365...,5da138764db8ce0c98bcaa46,030b3d94de8acae7c936563d,74260


Unnamed: 0,site_path_timestamp,site,file,timestamp
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,18763
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,22328


In [47]:
# 200 train paths come out with ~1000 examples, so multiply train examples by 5 to extract similar no. of examples
# test_num = train_num * 5
# test_num = 100
test_num = len(sub_df)
# test_num = round(len(sub_df) / 2)

# set timestamp cut
# time_stamp_cut = 250 # train 2
# time_stamp_cut = 100 # train 2.1

# print(test_num)
# print(len(sub_df.iloc[:test_num, :]))

In [5]:
# using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
# !cp -r /home/jupyter/input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./

In [6]:
import compute_f
import io_f
import visualize_f
import main
from io_f import read_data_file

In [7]:
# Try working out step_positions for 1 trace file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce_datas)
    headings = compute_headings(ahrs_datas)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, headings)
    rel_positions = compute_rel_positions(stride_lengths, step_headings)
    # only use del if we don't need timestamps
    # rel_positions_del = np.delete(rel_positions, 0, 1)
    return rel_positions

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    d = np.array(magn_datas[2:5])
    return np.mean(np.sqrt(np.sum(d ** 2, axis=0)))

In [8]:
# Methods for preprocessing train data: Timestamp handling
def find_diff_ts(ts, data):
    data_ts = data[0]
    diff_ts = int(data_ts) - int(ts)
    return diff_ts

def find_start_ts(path):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line_data in lines:
        line_data = line_data.strip()
        m = re.search(r"(?<=startTime.)(.*)", line_data)
        start_ts = m.groups(0)
        if m:
            return (start_ts[0])

def find_smallest_diff(t, data):
    data_ts = data[:, [0]]
    diff = []
    for ts in data_ts:
        diff.append(abs(int(t) - int(ts)))
    closest_index = np.argmin(diff) # if multiple records have the same value..?
    return data[closest_index]

In [9]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    z_axis = data[3]
    try:
        accuracy = data[4]
    except IndexError:
        accuracy = np.nan
    return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    bssid = data[2]
    rssi = data[3]
    if len(data) > 5:
        freq = data[4]
        last_seen_ts = data[5]
    else:
        freq = np.nan
        last_seen_ts = data[-1]
    return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    rssi = data[2]
    return [data_ts, diff_ts, ssid, rssi]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    return [data_ts, diff_ts, x_axis, y_axis]

In [10]:
floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

In [11]:
# Try loading to see if it works properly
train_file_name = "../jupyter/indoor_train_3.pkl"

# Load data it back in
with open(train_file_name, "rb") as file:
    df_train = pickle.load(file)

In [12]:
display(df_train.head())

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_100ms,within_200ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5cd56c11e2acfd2d33b6b413,5d0a28ea096be600087ee559,4.0,F5,1560947000000.0,1560947000000.0,1.0,58.555584,99.83644,,,,,,1560947000000.0,123.0,2.164307,3.568893,9.806992,,1560947000000.0,123.0,0.133892,0.147807,0.974019,,1560947000000.0,123.0,1.68,-29.58,-12.84,,32.290314,1560947000000.0,123.0,0.628174,0.762878,-0.085388,,1560947000000.0,123.0,2.488541,3.444473,9.980469,,1560947000000.0,123.0,0.42,-82.2,-156.59999,,1560947000000.0,123.0,0.623077,0.808777,-0.158813,,1560947000000.0,642.0,3179f4bf7224dced6eb4c032dcd06f2ea71a5ba0,8788fab8673e3fb9c4ee2e243defd6da04db7e17,-72.0,,1560947000000.0,1560947000000.0,197.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-87.0,1560947000000.0,969.0,-0.12444,-0.402672,93,11350,20,1744,15004,10751,2019-06-19 12:18:59.196000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:18:00,2019-06-19 12:18:49.288999936,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:18:00
1,5cd56c11e2acfd2d33b6b413,5d0a28ea096be600087ee559,4.0,F5,1560947000000.0,1560947000000.0,11969.0,66.17059,85.414116,,,,,,1560947000000.0,11972.0,0.8004,1.854446,9.540192,,1560947000000.0,11972.0,-0.026937,0.156391,0.928714,,1560947000000.0,11972.0,-14.52,-15.0,-30.179998,,36.696903,1560947000000.0,11972.0,-0.255722,-0.009247,0.282333,,1560947000000.0,11972.0,0.640091,1.733612,9.647858,,1560947000000.0,11972.0,-15.78,-67.619995,-173.93999,,1560947000000.0,11972.0,-0.328018,0.073303,0.222351,,1560947000000.0,11841.0,3179f4bf7224dced6eb4c032dcd06f2ea71a5ba0,8788fab8673e3fb9c4ee2e243defd6da04db7e17,-54.0,,1560947000000.0,1560947000000.0,11575.0,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_f4a01...,-71.0,1560947000000.0,11696.0,-0.466097,-0.539031,93,11350,20,1744,15004,2927,2019-06-19 12:19:11.164000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:19:00,2019-06-19 12:19:10.468000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:19:00
2,5cd56c11e2acfd2d33b6b413,5d0a28e924caf50008a4f241,4.0,F5,1560947000000.0,1560947000000.0,0.0,39.006012,97.94369,,,,,,1560947000000.0,130.0,1.058838,2.773285,8.855835,,1560947000000.0,130.0,-0.047736,-0.147921,-0.987703,,1560947000000.0,130.0,-3.78,-21.72,-26.64,,34.579422,1560947000000.0,130.0,-0.007111,0.353607,1.296356,,1560947000000.0,130.0,0.96312,2.782852,9.098709,,1560947000000.0,130.0,-1.02,-73.2,-173.87999,,1560947000000.0,130.0,-0.012207,0.399506,1.222931,,1560947000000.0,606.0,3179f4bf7224dced6eb4c032dcd06f2ea71a5ba0,8788fab8673e3fb9c4ee2e243defd6da04db7e17,-79.0,,1560947000000.0,1560947000000.0,8824.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-70.0,1560947000000.0,819.0,0.100965,-0.486124,93,11349,20,1744,15004,10751,2019-06-19 12:17:24.368000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:17:00,2019-06-19 12:17:15.960000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:17:00
3,5cd56c11e2acfd2d33b6b413,5d0a28e924caf50008a4f241,4.0,F5,1560947000000.0,1560947000000.0,10176.0,44.875984,86.00827,,,,,,1560947000000.0,10168.0,-0.439087,2.589035,8.77449,,1560947000000.0,10168.0,0.097284,-0.095686,-0.648736,,1560947000000.0,10168.0,-26.699999,-9.059999,-23.58,,36.755815,1560947000000.0,10168.0,-0.317429,0.031677,0.092361,,1560947000000.0,10168.0,-0.339783,2.654831,8.366516,,1560947000000.0,10168.0,-23.939999,-60.54,-170.81999,,1560947000000.0,10168.0,-0.322525,0.077576,0.018936,,1560947000000.0,10469.0,3179f4bf7224dced6eb4c032dcd06f2ea71a5ba0,8788fab8673e3fb9c4ee2e243defd6da04db7e17,-81.0,,1560947000000.0,1560947000000.0,10734.0,0e570c3406b79266b7ada12e3b9314e7bb9dde3e_f4a01...,-78.0,1560947000000.0,10345.0,0.558358,0.079294,93,11349,20,1744,15004,2927,2019-06-19 12:17:34.544000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:17:00,2019-06-19 12:17:32.284000000,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:17:00
4,5cd56c11e2acfd2d33b6b413,5d0a28e924caf50008a4f241,4.0,F5,1560947000000.0,1560947000000.0,18787.0,58.084217,85.200005,,,,,,1560947000000.0,18789.0,0.519241,2.779266,10.147965,,1560947000000.0,18789.0,0.111921,-0.091907,-0.611441,,1560947000000.0,18789.0,-13.38,-11.94,-20.1,,26.936926,1560947000000.0,18789.0,0.044189,0.075668,-0.027374,,1560947000000.0,18789.0,0.464203,2.825928,9.956543,,1560947000000.0,18789.0,-10.62,-63.42,-167.34,,1560947000000.0,18789.0,0.039093,0.121567,-0.1008,,1560947000000.0,18642.0,3179f4bf7224dced6eb4c032dcd06f2ea71a5ba0,8788fab8673e3fb9c4ee2e243defd6da04db7e17,-67.0,,1560947000000.0,1560947000000.0,19285.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-71.0,1560947000000.0,18828.0,0.648444,0.158123,93,11349,20,1744,15004,10751,2019-06-19 12:17:43.155000064,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:17:00,2019-06-19 12:17:42.464999936,2019-06-19,2019-06-19 12:00:00,2019-06-19 12:17:00


In [36]:
# def extract_test_data(df):
#     test_rows = []
#     for index, row in df.iterrows():
#         test_site = row["site"]
#         file_name = row["file"]
#         test_ts = row["timestamp"]

#         test_path = root_path + "test/" + file_name + ".txt" # get test_path from file name
#         start_ts = find_start_ts(test_path)
#         diff_start_ts = int(test_ts) - int(start_ts)
#         path_datas = read_data_file(test_path)
#         acce = path_datas.acce
#         ahrs = path_datas.ahrs
#         magn = path_datas.magn
#         wifi = path_datas.wifi

#         # extract data for each timestamp of waypoints
#         res = []
#         acce_closest = split_axis(find_smallest_diff(test_ts, acce))
#         ahrs_closest = split_axis(find_smallest_diff(test_ts, ahrs))
#         magn_closest = split_axis(find_smallest_diff(test_ts, magn))
#         wifi_closest = split_wifi(find_smallest_diff(test_ts, wifi))
#         test_rows.append([test_site, file_name, np.nan, np.nan, test_ts, np.nan, np.nan, start_ts, diff_start_ts] + \
#                           acce_closest + ahrs_closest + magn_closest + wifi_closest + \
#                          [acce_closest[0], ahrs_closest[0], magn_closest[0], wifi_closest[0]])
#     return test_rows

In [25]:
# Generate test data
def extract_test_data(df):
    test_rows = []
    for index, row in tqdm(df.iterrows()):
        try:
            row_id = row["site_path_timestamp"]
            test_site = row["site"]
            file_name = row["file"]
            test_ts = row["timestamp"]
            test_path = root_path + "test/" + file_name + ".txt" # get test_path from file name
            
            # targets
            floor_converted = np.nan
            floor = np.nan
            x = np.nan
            y = np.nan

            # extract data for each timestamp of waypoints
            start_ts = find_start_ts(test_path)
            path_datas = read_data_file(test_path)
            acce = path_datas.acce
            ahrs = path_datas.ahrs
            magn = path_datas.magn
            gyro = path_datas.gyro
            acce_uncali = path_datas.acce_uncali
            magn_uncali = path_datas.magn_uncali
            gyro_uncali = path_datas.gyro_uncali
            wifi = path_datas.wifi
            # wps = path_datas.waypoint
            ibeacon = path_datas.ibeacon
            rel_positions = calc_rel_positions(acce, ahrs)

            diff_ts_wp_ts = test_ts
            within_100ms = np.nan
            within_200ms = np.nan
            diff_start_ts = test_ts
            diff_start_wp_ts = test_ts
            closest_wp_ts = test_ts
            acce_closest = split_axis(find_smallest_diff(test_ts, acce), test_ts)
            ahrs_closest = split_axis(find_smallest_diff(test_ts, ahrs), test_ts)
            magn_closest = split_axis(find_smallest_diff(test_ts, magn), test_ts)
            magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
            gyro_closest = split_axis(find_smallest_diff(test_ts, gyro), test_ts)
            acce_u_closest = split_axis(find_smallest_diff(test_ts, acce_uncali), test_ts)
            magn_u_closest = split_axis(find_smallest_diff(test_ts, magn_uncali), test_ts)
            gyro_u_closest = split_axis(find_smallest_diff(test_ts, gyro_uncali), test_ts)
            wifi_closest = split_wifi(find_smallest_diff(test_ts, wifi), test_ts)
            if len(ibeacon) > 0:
                beacon_closest = split_beacon(find_smallest_diff(test_ts, ibeacon), test_ts)
            else:
                beacon_closest = [np.nan, np.nan, np.nan, np.nan]
            rel_pos = split_rel_pos(find_smallest_diff(test_ts, rel_positions), test_ts)
            test_rows.append([row_id, test_site, file_name, floor_converted, \
                              floor, test_ts, start_ts, diff_start_ts, x, y, \
                              closest_wp_ts, diff_start_wp_ts, diff_ts_wp_ts, \
                              within_100ms, within_200ms] + \
                              acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                              acce_u_closest + magn_u_closest + gyro_u_closest + \
                              wifi_closest + beacon_closest + rel_pos
                            )
        except Exception as exc:
            print("Error message: ", exc)
            print("extract_test_data error at: ", row_id)
            # print("extract_test_data error")
    return test_rows

In [38]:
# # can read_data_file method read test data
# print(test_paths[0])
# test_path = test_paths[0]
# read_data_file(test_path)
# find_start_ts(test_path)

In [39]:
# # try generating test data
# # kaggle notebook -> 48.8 sec
# # here -> 28.9 sec
# start = time.time()
# test_rows = extract_test_data(sub_df.iloc[:5, :])
# print(f"time to process: ", time.time() - start)

In [40]:
# test_df = pd.DataFrame(test_rows)
# display(test_df.head())
# print(len(test_rows[0]))

In [48]:
# Pool for test data
# kaggle notebook -> 144.7 sec for 30 paths
# here -> 26.6 sec for 30 paths at time_stamp_cut = 2000
# here -> 89.5 sec for 100 paths at time_stamp_cut = 2000
# here -> 51.4 sec for 100 paths at time_stamp_cut = 1000
# -> 5100 sec for ~10,000 examples -> 85 min or so
# sub_df length = 10133

# grouped_df length = 1252 -> 366.2 sec w/ 250 ms cutline -> x10 needs ~3600 sec

def apply_pool_to_df(df, f, pool, num_cores):
    result = pool.map(f, [d for d in np.array_split(df, num_cores)])
    pool.close()
    return result

num_cores = multiprocessing.cpu_count()
pool = Pool(num_cores)
print("cores: ", num_cores)

start = time.time()

# res = apply_pool_to_df(grouped_df, extract_test_data, pool, num_cores)
res = apply_pool_to_df(sub_df.iloc[:test_num, :], extract_test_data, pool, num_cores)

0it [00:00, ?it/s]

cores:  16


634it [20:07,  1.90s/it]
633it [21:35,  2.05s/it]
633it [21:45,  2.06s/it]
633it [22:23,  2.12s/it]
634it [27:05,  2.56s/it]
633it [27:44,  2.63s/it]
633it [29:21,  2.78s/it]
633it [29:39,  2.81s/it]
634it [33:52,  3.21s/it]
633it [34:53,  3.31s/it]
633it [36:31,  3.46s/it]
633it [37:09,  3.52s/it]
633it [37:31,  3.56s/it]
634it [38:51,  3.68s/it]
634it [39:12,  3.71s/it]
633it [43:23,  4.11s/it]


In [49]:
# print(f"time to process {len(grouped_df)} examples of sub_df", time.time() - start)
print(f"time to process {len(sub_df.iloc[:test_num, :])} examples of sub_df", time.time() - start)

time to process 10133 examples of sub_df 2603.4500098228455


In [50]:
col_names = ["site_path_timestamp", "site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_250ms", "within_500ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

df_test = pd.DataFrame(res[0], columns=col_names)
for r in res[1:]:
    df = pd.DataFrame(r, columns=col_names)
    df_test = df_test.append(df)
# df_test = df_test.set_index("site_path_timestamp")

# process 1000 records -> 173.9 sec -> all test records are ~10,000 -> 1740 sec (~29min)
print("test_path count", len(test_paths[:test_num]))
print("length of df made", len(df_test))
display(df_test.head(10))

test_path count 626
length of df made 10133


Unnamed: 0,site_path_timestamp,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_250ms,within_500ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9,0,9,,,9,9,9,,,136.0,127,0.798813,4.30072,7.810059,,136.0,127,0.247101,0.104201,0.474897,,136.0,127,30.561829,-1.228333,-38.301086,,49.015379,136.0,127,-0.039139,-0.507996,-0.148392,,136.0,127,0.578552,4.353989,8.195526,,136.0,127,34.687805,6.938171,-377.32544,,136.0,127,-0.077835,-0.334671,-0.166565,,2340,2331,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,82,73.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-87,1144.0,1135,-0.425353,0.24869
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9017,0,9017,,,9017,9017,9017,,,9012.0,-5,-1.106979,4.056503,9.795456,,9012.0,-5,0.162119,0.185954,0.561409,,9012.0,-5,29.867554,-6.085205,-26.150513,,40.161547,9012.0,-5,-0.045532,-0.498398,-0.557999,,9012.0,-5,-1.214722,4.168442,9.794861,,9012.0,-5,33.99353,2.081299,-365.17487,,9012.0,-5,-0.224304,-0.391663,-0.471771,,9508,491,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43,,1578474570052,9040,23.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69,8830.0,-187,-0.537325,0.140535
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,15326,0,15326,,,15326,15326,15326,,,15326.0,0,-0.572464,3.981689,7.08223,,15326.0,0,0.22507,0.022647,-0.200452,,15326.0,0,-6.207275,14.727783,-39.649963,,42.749941,15326.0,0,-0.185089,-0.110107,-0.424301,,15326.0,0,-0.607178,3.937988,6.426224,,15326.0,0,-2.081299,22.894287,-378.67432,,15326.0,0,0.012177,0.03392,-0.356186,,14714,-612,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,b26914599f6d9ba16b43975394e1eeb9d82f4bab,-41,,1578474578027,15393,67.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-65,15487.0,161,0.295503,0.550062
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,18763,0,18763,,,18763,18763,18763,,,18755.0,-8,-0.751434,4.546112,10.231201,,18755.0,-8,0.225055,0.044806,-0.119175,,18755.0,-8,-1.350403,9.870911,-41.67633,,42.850612,18755.0,-8,0.225586,-0.058456,-0.224564,,18755.0,-8,-0.889694,4.286942,10.229401,,18755.0,-8,2.775574,18.037415,-380.70068,,18755.0,-8,-0.060791,-0.072083,-0.242737,,19587,824,da39a3ee5e6b4b0d3255bfef95601890afd80709,de53ffe7e3c71c9ed5c845fa50e0521efa5f3685,-41,,1578474582717,18733,-30.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-76,18816.0,53,0.193542,0.622533
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,22328,0,22328,,,22328,22328,22328,,,22326.0,-2,-2.089798,4.224701,12.037628,,22326.0,-2,0.242105,0.053464,-0.008162,,22326.0,-2,-1.350403,5.014038,-30.87616,,31.309766,22326.0,-2,-0.648468,-0.011581,0.32991,,22326.0,-2,-2.09639,4.14389,10.923737,,22326.0,-2,2.775574,13.180542,-369.9005,,22326.0,-2,-0.732971,0.115402,0.331451,,22074,-254,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-42,,1578474574861,22289,-39.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-76,22305.0,-23,0.031184,0.561484
5,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,29946,0,29946,,,29946,29946,29946,,,29951.0,5,-0.639496,3.784164,8.645645,,29951.0,5,0.172021,0.13235,0.476357,,29951.0,5,18.76831,5.014038,-24.125671,,30.974797,29951.0,5,0.238907,0.034775,-0.031219,,29951.0,5,-0.722702,3.750641,8.994598,,29951.0,5,22.894287,13.180542,-363.15002,,29951.0,5,0.155457,-0.032669,-0.024887,,29805,-141,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,7805f319f3f591986effe78c5b41143180278f2d,-49,,1578474592997,29922,-24.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-91,27813.0,-2133,-0.356804,0.208853
6,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,40283,0,40283,,,40283,40283,40283,,,40280.0,-3,-1.380524,4.713715,9.550659,,40280.0,-3,0.1561,-0.182848,-0.842845,,40280.0,-3,-22.857666,-24.816895,-26.150513,,42.687241,40280.0,-3,0.264999,0.039032,-0.046127,,40280.0,-3,-1.565476,4.757416,9.759552,,40280.0,-3,-18.73169,-16.65039,-365.17487,,40280.0,-3,0.08783,-0.112564,-0.18309,,39885,-398,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,7805f319f3f591986effe78c5b41143180278f2d,-37,,1578474603277,40356,73.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-81,40239.0,-44,0.48922,-0.300186
7,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,51343,0,51343,,,51343,51343,51343,,,51334.0,-9,-0.610168,4.039154,9.527313,,51334.0,-9,0.137495,0.18464,0.732058,,51334.0,-9,29.173279,-13.023376,-22.10083,,38.84759,51334.0,-9,-0.091873,-0.081879,-0.069565,,51334.0,-9,-0.728088,4.128937,9.322006,,51334.0,-9,33.299255,-4.856873,-361.12518,,51334.0,-9,-0.1604,-0.058243,-0.056854,,52424,1081,7182afc4e5c212133d5d7d76eb3df6c24618302b,b2546cae6e588d38618eacc557dd0385812197cf,-49,,1578474615769,51436,93.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-66,50427.0,-916,-0.453474,-0.00148
8,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,64014,0,64014,,,64014,64014,64014,,,64023.0,9,-0.04454,5.64566,7.022369,,64023.0,9,0.075054,-0.300336,-0.889396,,64023.0,9,-18.000793,-35.22339,-20.075989,,44.359453,64023.0,9,-0.131821,0.139175,-0.159576,,64023.0,9,0.443878,5.044708,8.989212,,64023.0,9,-13.874817,-27.056885,-359.10034,,64023.0,9,-0.057602,-0.147186,-0.275223,,65218,1204,b7e6027447eb1f81327d66cfd3adbe557aabf26c,68127b819a86c95b0847a170ce53a91702f67969,-43,,1578474627341,63995,-19.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-80,64084.0,70,0.304664,-0.326737
9,5a0546857ecc773753327266_05d052dde78384b0c543d...,5a0546857ecc773753327266,05d052dde78384b0c543d89c,,,12,0,12,,,12,12,12,,,133.0,121,-1.07106,5.370331,8.023758,,133.0,121,0.299498,-0.052432,-0.376978,,133.0,121,-22.83783,8.981323,-36.393738,,43.894587,133.0,121,-0.20372,-0.00412,-0.029617,,133.0,121,-1.077652,5.330231,8.058472,,133.0,121,-21.505737,18.037415,-375.97504,,133.0,121,-0.288223,-0.145584,-0.07016,,2212,2200,da39a3ee5e6b4b0d3255bfef95601890afd80709,3d7b301dac8ee0890ea302f81f318dba80607e0e,-49,,1578476993283,150,138.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-68,1303.0,1291,0.395829,0.302582


In [51]:
# Check the diff timestamps -> check if we're not picking up something 
# that is more than like 5 seconds away from sub_df timestamp

def print_minmax(df, cols):
    for col in cols:
        print("col: ", col)
        print("max: ", df[col].max())
        print("min: ", df[col].min())

check_diff_cols = ["diff_acce_ts", "diff_ahrs_ts", "diff_magn_ts", \
                   "diff_gyro_ts", "diff_acce_u_ts", "diff_magn_u_ts", \
                   "diff_gyro_u_ts", "diff_wifi_ts", "diff_beacon_ts", \
                   "diff_rel_ts"]

print_minmax(df_test, check_diff_cols)

col:  diff_acce_ts
max:  334
min:  -355
col:  diff_ahrs_ts
max:  334
min:  -355
col:  diff_magn_ts
max:  334
min:  -355
col:  diff_gyro_ts
max:  334
min:  -355
col:  diff_acce_u_ts
max:  334
min:  -355
col:  diff_magn_u_ts
max:  334
min:  -355
col:  diff_gyro_u_ts
max:  334
min:  -355
col:  diff_wifi_ts
max:  20427
min:  -10636
col:  diff_beacon_ts
max:  163103.0
min:  -138883.0
col:  diff_rel_ts
max:  8652
min:  -14736


In [52]:
# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le"%col] = le.fit_transform(df[col])

col_enc = ["site_id", "file_id", "floor", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
col_encode(df_test, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    for col in col_list:
        df[col] = df[col].astype(dtype)

convert_dtypes(df_test, ["ts", "start_ts", "diff_start_ts", \
                         "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",\
                         "acce_ts", "diff_acce_ts", "ahrs_ts", "diff_ahrs_ts", \
                         "magn_ts", "diff_magn_ts", "gyro_ts", "diff_gyro_ts", \
                         "acce_u_ts", "diff_acce_u_ts", "magn_u_ts", "diff_magn_u_ts", \
                         "gyro_u_ts", "diff_gyro_u_ts", \
                         "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
                         "beacon_ts", "diff_beacon_ts", "beacon_rssi", "rel_ts", "diff_rel_ts"
                        ], float)

# convert ts and wifi_last_see_ts to dates
for df in [df_test]:
    for col in ["ts", "wifi_last_seen_ts"]:
        df["%s_date"%col] = pd.to_datetime(df[col],unit="ms")
        df["%s_day"%col] = df["%s_date"%col].dt.floor("d")
        df["%s_hour"%col] = df["%s_date"%col].dt.floor("h")
        df["%s_minute"%col] = df["%s_date"%col].values.astype("<M8[m]")

# Check
display(df_test.head())

Unnamed: 0,site_path_timestamp,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_250ms,within_500ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9.0,0.0,9.0,,,9.0,9.0,9.0,,,136.0,127.0,0.798813,4.30072,7.810059,,136.0,127.0,0.247101,0.104201,0.474897,,136.0,127.0,30.561829,-1.228333,-38.301086,,49.015379,136.0,127.0,-0.039139,-0.507996,-0.148392,,136.0,127.0,0.578552,4.353989,8.195526,,136.0,127.0,34.687805,6.938171,-377.32544,,136.0,127.0,-0.077835,-0.334671,-0.166565,,2340.0,2331.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,82.0,73.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-87.0,1144.0,1135.0,-0.425353,0.24869,0,6,0,1426,3861,1694,1970-01-01 00:00:00.009,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9017.0,0.0,9017.0,,,9017.0,9017.0,9017.0,,,9012.0,-5.0,-1.106979,4.056503,9.795456,,9012.0,-5.0,0.162119,0.185954,0.561409,,9012.0,-5.0,29.867554,-6.085205,-26.150513,,40.161547,9012.0,-5.0,-0.045532,-0.498398,-0.557999,,9012.0,-5.0,-1.214722,4.168442,9.794861,,9012.0,-5.0,33.99353,2.081299,-365.17487,,9012.0,-5.0,-0.224304,-0.391663,-0.471771,,9508.0,491.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43.0,,1578475000000.0,9040.0,23.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-69.0,8830.0,-187.0,-0.537325,0.140535,0,6,0,1426,514,1694,1970-01-01 00:00:09.017,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:30.052000000,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,15326.0,0.0,15326.0,,,15326.0,15326.0,15326.0,,,15326.0,0.0,-0.572464,3.981689,7.08223,,15326.0,0.0,0.22507,0.022647,-0.200452,,15326.0,0.0,-6.207275,14.727783,-39.649963,,42.749941,15326.0,0.0,-0.185089,-0.110107,-0.424301,,15326.0,0.0,-0.607178,3.937988,6.426224,,15326.0,0.0,-2.081299,22.894287,-378.67432,,15326.0,0.0,0.012177,0.03392,-0.356186,,14714.0,-612.0,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,b26914599f6d9ba16b43975394e1eeb9d82f4bab,-41.0,,1578475000000.0,15393.0,67.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-65.0,15487.0,161.0,0.295503,0.550062,0,6,0,1180,2934,1694,1970-01-01 00:00:15.326,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:38.027000064,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,18763.0,0.0,18763.0,,,18763.0,18763.0,18763.0,,,18755.0,-8.0,-0.751434,4.546112,10.231201,,18755.0,-8.0,0.225055,0.044806,-0.119175,,18755.0,-8.0,-1.350403,9.870911,-41.67633,,42.850612,18755.0,-8.0,0.225586,-0.058456,-0.224564,,18755.0,-8.0,-0.889694,4.286942,10.229401,,18755.0,-8.0,2.775574,18.037415,-380.70068,,18755.0,-8.0,-0.060791,-0.072083,-0.242737,,19587.0,824.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,de53ffe7e3c71c9ed5c845fa50e0521efa5f3685,-41.0,,1578475000000.0,18733.0,-30.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-76.0,18816.0,53.0,0.193542,0.622533,0,6,0,1426,3635,1694,1970-01-01 00:00:18.763,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:42.716999936,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,22328.0,0.0,22328.0,,,22328.0,22328.0,22328.0,,,22326.0,-2.0,-2.089798,4.224701,12.037628,,22326.0,-2.0,0.242105,0.053464,-0.008162,,22326.0,-2.0,-1.350403,5.014038,-30.87616,,31.309766,22326.0,-2.0,-0.648468,-0.011581,0.32991,,22326.0,-2.0,-2.09639,4.14389,10.923737,,22326.0,-2.0,2.775574,13.180542,-369.9005,,22326.0,-2.0,-0.732971,0.115402,0.331451,,22074.0,-254.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-42.0,,1578475000000.0,22289.0,-39.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-76.0,22305.0,-23.0,0.031184,0.561484,0,6,0,1426,514,1694,1970-01-01 00:00:22.328,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:34.860999936,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00


In [53]:
# Save the file in pickle
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Saving train data
test_file_name = "indoor_test_3.pkl"
with open(test_file_name, "wb") as file:
    pickle.dump(df_test, file)

# Save them to output
# df_test.to_csv('df_test.csv')

In [54]:
# Try loading to see if it works properly
test_file_name = "../jupyter/indoor_test_3.pkl"

# Load data it back in
with open(test_file_name, "rb") as file:
    df_test = pickle.load(file)

# df_test = pd.read_csv(test_file_name, encoding='cp932', index_col=0)

In [55]:
display(df_test.tail())

Unnamed: 0,site_path_timestamp,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_250ms,within_500ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
628,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,82589.0,0.0,82589.0,,,82589.0,82589.0,82589.0,,,82597.0,8.0,-0.806244,0.425034,7.01358,,82597.0,8.0,0.056286,0.068959,0.219348,,82597.0,8.0,11.335754,20.547485,-32.72705,,40.27106,82597.0,8.0,-0.021408,-0.088745,-0.277161,,82597.0,8.0,-0.955872,-0.022705,8.465073,,82597.0,8.0,-34.45282,27.749634,-341.18958,,82597.0,8.0,-0.250977,-0.184784,-0.316788,,81664.0,-925.0,0f927dce74ec3475c7a39299e5bffab222ca665d,621fbeab0ad7fa0465f0b82c3b32361a3a848a5d,-50.0,,1573731000000.0,82529.0,-60.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_cf6a3...,-67.0,82360.0,-229.0,-0.604357,1.123231,23,621,0,115,1654,1399,1970-01-01 00:01:22.589,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:21.233999872,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
629,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,85758.0,0.0,85758.0,,,85758.0,85758.0,85758.0,,,85756.0,-2.0,0.453705,0.089233,9.130066,,85756.0,-2.0,0.040001,0.065,0.274802,,85756.0,-2.0,19.604492,23.32306,-34.066772,,45.703897,85756.0,-2.0,0.78447,-0.208054,-0.277161,,85756.0,-2.0,-0.026932,-0.410553,8.431549,,85756.0,-2.0,-26.184082,30.525208,-342.5293,,85756.0,-2.0,0.092575,-0.441513,-0.541565,,85636.0,-122.0,8e69018f6343506344ab13646ccd9447fc2ffb19,3df5a390b1357c32f1c24fdef1c00848ecfdb966,-41.0,,1573731000000.0,85733.0,-25.0,89cb11b04122cef23388b0da06bd426c1f48a9b5_cfc84...,-94.0,85618.0,-140.0,-0.395269,0.649806,23,621,0,902,1042,1400,1970-01-01 00:01:25.758,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:24.911000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
630,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,90895.0,0.0,90895.0,,,90895.0,90895.0,90895.0,,,90891.0,-4.0,-1.377258,1.400055,15.738663,,90891.0,-4.0,-0.000257,0.064427,0.387938,,90891.0,-4.0,28.561401,19.160461,-34.066772,,48.408903,90891.0,-4.0,-0.026733,-0.285828,0.095688,,90891.0,-4.0,-1.008545,0.002441,16.048111,,90891.0,-4.0,-17.227173,26.36261,-342.5293,,90891.0,-4.0,-0.092789,-0.368546,-0.049408,,91521.0,626.0,8e69018f6343506344ab13646ccd9447fc2ffb19,3df5a390b1357c32f1c24fdef1c00848ecfdb966,-49.0,,1573731000000.0,90967.0,72.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-77.0,90891.0,-4.0,-0.586361,0.574534,23,621,0,902,1042,1579,1970-01-01 00:01:30.895,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:30.806000128,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
631,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,96899.0,0.0,96899.0,,,96899.0,96899.0,96899.0,,,96894.0,-5.0,-0.981018,0.999039,7.342789,,96894.0,-5.0,0.042385,0.102878,0.454465,,96894.0,-5.0,27.183533,19.160461,-26.023865,,42.229247,96894.0,-5.0,-0.076797,-0.215515,0.134567,,96894.0,-5.0,-0.981018,0.999039,7.342789,,96894.0,-5.0,-18.605042,27.056885,-334.4864,,96894.0,-5.0,-0.079468,-0.218338,0.135422,,97428.0,529.0,5d998a8668536c4f51004c25f474117fe9555f78,195eb3cc3f2b34a578f0df6082e204b8c333537b,-50.0,,1573731000000.0,96889.0,-10.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-84.0,96657.0,-242.0,-0.839114,0.689733,23,621,0,585,447,1665,1970-01-01 00:01:36.899,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:37.286000128,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
632,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,100447.0,0.0,100447.0,,,100447.0,100447.0,100447.0,,,100449.0,2.0,-1.54007,0.334045,15.380127,,100449.0,2.0,0.033441,0.088533,0.388586,,100449.0,2.0,25.115967,14.997864,-26.023865,,39.153407,100449.0,2.0,-0.068283,-0.240555,-0.203125,,100449.0,2.0,-0.290283,-0.520096,8.933731,,100449.0,2.0,-22.738647,21.505737,-336.4975,,100449.0,2.0,0.259827,-0.034058,-0.347153,,99429.0,-1018.0,aa449fabc4dcb24836d950b5cff91f08e574c3a7,baf415ae85f3997ffb2ad0797952dbbb4832f378,-52.0,,1573731000000.0,99899.0,-548.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-70.0,100370.0,-77.0,-0.543903,0.528341,23,621,0,1100,3085,1579,1970-01-01 00:01:40.447,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:38.243000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00


In [56]:
print((df_test["site_path_timestamp"].nunique()))
print((df_test["file_id"].nunique()))
print((df_test["site_id"].nunique()))
print((df_test["site_id"].value_counts()))

10133
626
24
5d2709d403f801723c32bd39    1223
5dbc1d84c1eb61796cf7c010     923
5da958dd46f8266d0737457b     778
5d2709bb03f801723c32852c     716
5d27096c03f801723c31e5e0     654
5dc8cea7659e181adb076a3f     648
5da138764db8ce0c98bcaa46     573
5d2709e003f801723c32d896     531
5d2709b303f801723c327472     527
5d2709c303f801723c3299ee     509
5da138b74db8ce0c98bd4774     445
5da138754db8ce0c98bca82f     386
5da1383b4db8ce0c98bc11ab     380
5da1382d4db8ce0c98bbe92e     311
5d27097f03f801723c320d97     303
5a0546857ecc773753327266     299
5d2709a003f801723c3251bf     218
5da1389e4db8ce0c98bd0547     174
5da138314db8ce0c98bbf3a0     171
5da138364db8ce0c98bc00f1     139
5da138274db8ce0c98bbd3d2     103
5d27099f03f801723c32511d      49
5d27075f03f801723c2e360f      47
5c3c44b80379370013e0fd2b      26
Name: site_id, dtype: int64


In [57]:
!ls -la -h

total 4.6G
drwxr-xr-x 6 root root 4.0K Mar 25 06:26 .
drwxr-xr-x 1 root root 4.0K Feb 17 07:58 ..
drwxr-xr-x 2 root root 4.0K Mar 25 05:53 .ipynb_checkpoints
drwxr-xr-x 2 root root 4.0K Mar 13 04:33 __pycache__
-rw-r--r-- 1 root root  13K Mar 13 04:24 compute_f.py
-rw-r--r-- 1 root root 2.7K Mar 13 04:14 data_downloader.ipynb
drwxr-xr-x 4 root root 4.0K Mar 13 04:33 github_codes
-rw-r--r-- 1 root root 139K Mar 25 01:18 indoor-preprocess-test-2-gcp.ipynb
-rw-r--r-- 1 root root  64K Mar 25 06:26 indoor-preprocess-test-3-gcp.ipynb
-rw-r--r-- 1 root root  91K Mar 25 03:35 indoor-preprocess-train-2.ipynb
-rw-r--r-- 1 root root 236K Mar 18 10:25 indoor-preprocess-train-3.ipynb
-rw-r--r-- 1 root root 233K Mar 25 06:08 indoor-preprocess-train-4.ipynb
-rw-r--r-- 1 root root 385M Mar 15 09:38 indoor_test_2.pkl
-rw-r--r-- 1 root root  18M Mar 25 06:26 indoor_test_3.pkl
-rw-r--r-- 1 root root 261M Mar 18 10:00 indoor_train_3.pkl
-rw-r--r-- 1 root root 432M Mar 18 07:33 indoor_train_res.pkl
-rw-r--