In [2]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool

import pickle

pd.set_option("display.max_columns", 100)

In [3]:
# Check out each file. Content, images

root_path = "/home/jupyter/input/"
train_paths = glob.glob(root_path + "train" + "/*/*/*")
test_paths = glob.glob(root_path + "test" + "/*")
metafiles = glob.glob(root_path + "metadata" + "/*")

print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

No. Files in Train: 26,925 
No. Files in Test: 626 
No. of metadata files: 204


In [4]:
# Get submission file
sub_df = pd.read_csv("/home/jupyter/input/sample_submission.csv")
sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
sub_df = sub_df.drop(columns=["floor", "x", "y"])
grouped_df = sub_df.groupby("file").sample(n=2)
all_file_id = grouped_df["file"].unique()
print(len(grouped_df))
print(len(all_file_id))
display(grouped_df.head())
display(sub_df.head())

1252
626


Unnamed: 0,site_path_timestamp,site,file,timestamp
7169,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,37134
7179,5da1389e4db8ce0c98bd0547_00ff0c9a71cc37a2ebdd0...,5da1389e4db8ce0c98bd0547,00ff0c9a71cc37a2ebdd0f05,86795
7348,5da138b74db8ce0c98bd4774_01c41f1aeba5c48c2c4dd...,5da138b74db8ce0c98bd4774,01c41f1aeba5c48c2c4dd568,53254
7353,5da138b74db8ce0c98bd4774_01c41f1aeba5c48c2c4dd...,5da138b74db8ce0c98bd4774,01c41f1aeba5c48c2c4dd568,84243
6607,5da138764db8ce0c98bcaa46_030b3d94de8acae7c9365...,5da138764db8ce0c98bcaa46,030b3d94de8acae7c936563d,82260


Unnamed: 0,site_path_timestamp,site,file,timestamp
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,15326
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,18763
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,22328


In [29]:
# 200 train paths come out with ~1000 examples, so multiply train examples by 5 to extract similar no. of examples
# test_num = train_num * 5
# test_num = 100
test_num = len(sub_df)
# test_num = round(len(sub_df) / 2)

# set timestamp cut
time_stamp_cut = 250

# print(test_num)
# print(len(sub_df.iloc[:test_num, :]))

In [30]:
# using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
# !cp -r /home/jupyter/input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./

In [31]:
import compute_f
import io_f
import visualize_f
import main
from io_f import read_data_file

In [32]:
# Try working out step_positions for 1 trace file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce_datas)
    headings = compute_headings(ahrs_datas)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, headings)
    rel_positions = compute_rel_positions(stride_lengths, step_headings)
    # only use del if we don't need timestamps
    # rel_positions_del = np.delete(rel_positions, 0, 1)
    return rel_positions

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    d = np.array(magn_datas[2:5])
    return np.mean(np.sqrt(np.sum(d ** 2, axis=0)))

In [33]:
# Methods for preprocessing train data: Timestamp handling
def find_diff_ts(ts, data):
    data_ts = data[0]
    diff_ts = int(data_ts) - int(ts)
    return diff_ts

def find_start_ts(path):
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line_data in lines:
        line_data = line_data.strip()
        m = re.search(r"(?<=startTime.)(.*)", line_data)
        start_ts = m.groups(0)
        if m:
            return (start_ts[0])

def find_smallest_diff(t, data):
    data_ts = data[:, [0]]
    diff = []
    for ts in data_ts:
        diff.append(abs(int(t) - int(ts)))
    closest_index = np.argmin(diff) # if multiple records have the same value..?
    return data[closest_index]

In [34]:
# Method for preprocessing train data: splitting acce/ahrs/gyro/magn
def split_axis(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    z_axis = data[3]
    try:
        accuracy = data[4]
    except IndexError:
        accuracy = np.nan
    return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# Method for preprocessing train data: splitting wifi
def split_wifi(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    bssid = data[2]
    rssi = data[3]
    if len(data) > 5:
        freq = data[4]
        last_seen_ts = data[5]
    else:
        freq = np.nan
        last_seen_ts = data[-1]
    return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# Method for preprocessing train data: splitting ibeacon
def split_beacon(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    ssid = data[1]
    rssi = data[2]
    return [data_ts, diff_ts, ssid, rssi]

# Method for preprocessing train data: calc rel pos
def split_rel_pos(data, start_ts):
    data_ts = data[0]
    diff_ts = int(data[0]) - int(start_ts)
    x_axis = data[1]
    y_axis = data[2]
    return [data_ts, diff_ts, x_axis, y_axis]

In [35]:
floor_map = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

In [36]:
# def extract_test_data(df):
#     test_rows = []
#     for index, row in df.iterrows():
#         test_site = row["site"]
#         file_name = row["file"]
#         test_ts = row["timestamp"]

#         test_path = root_path + "test/" + file_name + ".txt" # get test_path from file name
#         start_ts = find_start_ts(test_path)
#         diff_start_ts = int(test_ts) - int(start_ts)
#         path_datas = read_data_file(test_path)
#         acce = path_datas.acce
#         ahrs = path_datas.ahrs
#         magn = path_datas.magn
#         wifi = path_datas.wifi

#         # extract data for each timestamp of waypoints
#         res = []
#         acce_closest = split_axis(find_smallest_diff(test_ts, acce))
#         ahrs_closest = split_axis(find_smallest_diff(test_ts, ahrs))
#         magn_closest = split_axis(find_smallest_diff(test_ts, magn))
#         wifi_closest = split_wifi(find_smallest_diff(test_ts, wifi))
#         test_rows.append([test_site, file_name, np.nan, np.nan, test_ts, np.nan, np.nan, start_ts, diff_start_ts] + \
#                           acce_closest + ahrs_closest + magn_closest + wifi_closest + \
#                          [acce_closest[0], ahrs_closest[0], magn_closest[0], wifi_closest[0]])
#     return test_rows

In [37]:
# Generate test data
def extract_test_data(df):
    test_rows = []
    for index, row in tqdm(df.iterrows()):
        try:
            row_id = row["site_path_timestamp"]
            test_site = row["site"]
            file_name = row["file"]
            test_ts = row["timestamp"]
            test_path = root_path + "test/" + file_name + ".txt" # get test_path from file name

            # extract data for each timestamp of waypoints
            start_ts = find_start_ts(test_path)
            path_datas = read_data_file(test_path)
            acce = path_datas.acce
            ahrs = path_datas.ahrs
            magn = path_datas.magn
            gyro = path_datas.gyro
            acce_uncali = path_datas.acce_uncali
            magn_uncali = path_datas.magn_uncali
            gyro_uncali = path_datas.gyro_uncali
            wifi = path_datas.wifi
            wps = path_datas.waypoint
            ibeacon = path_datas.ibeacon
            rel_positions = calc_rel_positions(acce, ahrs)

            # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
            # ts = np.unique(wps[:, [0]])
            ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data

            # extract data for each timestamp of waypoints
            for i, t in enumerate(ts):
                # wp_closest = find_smallest_diff(t, wps)
                # closest_wp_ts = wp_closest[0]
                diff_ts_wp_ts = int(t) - int(test_ts)
                # time_stamp_cut = 2000, only the records within 2 sec of test timestamp
                if abs(diff_ts_wp_ts) < time_stamp_cut:
                    # flag to indicate how close the data point is to the wps
                    within_100ms = True if abs(diff_ts_wp_ts) <= 100 else False
                    within_200ms = True if abs(diff_ts_wp_ts) <= 200 else False
                    # x = wp_closest[1]
                    # y = wp_closest[2]
                    diff_start_ts = int(t) - int(start_ts)
                    diff_start_wp_ts = int(test_ts)
                    acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
                    ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
                    magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
                    magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
                    gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
                    acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
                    magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
                    gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
                    wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
                    if len(ibeacon) > 0:
                        beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
                    else:
                        beacon_closest = [np.nan, np.nan, np.nan, np.nan]
                    rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)

                    # train append example
                    # res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_500ms, within_1000ms] + \
                    #            acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                    #            acce_u_closest + magn_u_closest + gyro_u_closest + \
                    #            wifi_closest + beacon_closest + rel_pos
                    #           )
                    test_rows.append([row_id, test_site, file_name, np.nan, np.nan, \
                                      t, start_ts, diff_start_ts, np.nan, np.nan, test_ts, diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms] + \
                                      acce_closest + ahrs_closest + magn_closest + gyro_closest + \
                                      acce_u_closest + magn_u_closest + gyro_u_closest + \
                                      wifi_closest + beacon_closest + rel_pos
                                    )
                else:
                    continue

        except Exception as exc:
            print("Error message: ", exc)
            print("extract_test_data error at: ", row_id)
            # print("extract_test_data error")
    return test_rows

In [38]:
# # can read_data_file method read test data
# print(test_paths[0])
# test_path = test_paths[0]
# read_data_file(test_path)
# find_start_ts(test_path)

In [39]:
# # try generating test data
# # kaggle notebook -> 48.8 sec
# # here -> 28.9 sec
# start = time.time()
# test_rows = extract_test_data(sub_df.iloc[:5, :])
# print(f"time to process: ", time.time() - start)

In [40]:
# test_df = pd.DataFrame(test_rows)
# display(test_df.head())
# print(len(test_rows[0]))

In [42]:
# Pool for test data
# kaggle notebook -> 144.7 sec for 30 paths
# here -> 26.6 sec for 30 paths at time_stamp_cut = 2000
# here -> 89.5 sec for 100 paths at time_stamp_cut = 2000
# here -> 51.4 sec for 100 paths at time_stamp_cut = 1000
# -> 5100 sec for ~10,000 examples -> 85 min or so
# sub_df length = 10133

# grouped_df length = 1252 -> 366.2 sec w/ 250 ms cutline -> x10 needs ~3600 sec

def apply_pool_to_df(df, f, pool, num_cores):
    result = pool.map(f, [d for d in np.array_split(df, num_cores)])
    pool.close()
    return result

num_cores = multiprocessing.cpu_count()
pool = Pool(num_cores)
print("cores: ", num_cores)

start = time.time()

# res = apply_pool_to_df(grouped_df, extract_test_data, pool, num_cores)
res = apply_pool_to_df(sub_df.iloc[:test_num, :], extract_test_data, pool, num_cores)

0it [00:00, ?it/s]

cores:  16


634it [31:09,  2.95s/it]
633it [33:32,  3.18s/it]
633it [34:17,  3.25s/it]
633it [34:24,  3.26s/it]
633it [40:53,  3.88s/it]
634it [41:08,  3.89s/it]
633it [44:13,  4.19s/it]
633it [45:11,  4.28s/it]
634it [50:36,  4.79s/it]
633it [53:01,  5.03s/it]
633it [53:21,  5.06s/it]
633it [54:21,  5.15s/it]
633it [57:33,  5.46s/it]
634it [57:47,  5.47s/it]
634it [59:09,  5.60s/it]
633it [1:01:45,  5.85s/it]


In [43]:
# print(f"time to process {len(grouped_df)} examples of sub_df", time.time() - start)
print(f"time to process {len(sub_df.iloc[:test_num, :])} examples of sub_df", time.time() - start)

time to process 10133 examples of sub_df 3709.0271265506744


In [44]:
col_names = ["site_path_timestamp", "site_id", "file_id", "floor_converted", "floor", \
             "ts", "start_ts", "diff_start_ts", "x", "y", \
             "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_250ms", "within_500ms", \
             "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
             "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
             "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
             "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
             "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
             "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
             "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
             "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
             "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
             "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
            ]

df_test = pd.DataFrame(res[0], columns=col_names)
for r in res[1:]:
    df = pd.DataFrame(r, columns=col_names)
    df_test = df_test.append(df)
# df_test = df_test.set_index("site_path_timestamp")

# process 1000 records -> 173.9 sec -> all test records are ~10,000 -> 1740 sec (~29min)
print("test_path count", len(test_paths[:test_num]))
print("length of df made", len(df_test))
display(df_test.head(10))

test_path count 626
length of df made 241106


Unnamed: 0,site_path_timestamp,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_250ms,within_500ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,136.0,0,136,,,9,9,127,False,True,136.0,136,0.798813,4.30072,7.810059,,136.0,136,0.247101,0.104201,0.474897,,136.0,136,30.561829,-1.228333,-38.301086,,49.015379,136.0,136,-0.039139,-0.507996,-0.148392,,136.0,136,0.578552,4.353989,8.195526,,136.0,136,34.687805,6.938171,-377.32544,,136.0,136,-0.077835,-0.334671,-0.166565,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,110,110.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91,1144.0,1144,-0.425353,0.24869
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,156.0,0,156,,,9,9,147,False,True,156.0,156,0.026688,4.911835,7.244446,,156.0,156,0.247101,0.104201,0.474897,,156.0,156,29.173279,-1.922607,-36.950684,,47.118252,156.0,156,-0.084946,-0.4478,-0.20752,,156.0,156,0.524673,4.493454,7.43837,,156.0,156,33.299255,6.243896,-375.97504,,156.0,156,-0.049072,-0.504059,-0.15538,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,110,110.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91,1144.0,1144,-0.425353,0.24869
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,176.0,0,176,,,9,9,167,False,True,176.0,176,-0.41864,5.354172,7.295914,,176.0,176,0.254368,0.11151,0.452041,,176.0,176,27.786255,-0.535583,-37.625122,,46.776197,176.0,176,-0.110519,-0.327423,-0.163315,,176.0,176,-0.269592,5.217697,7.327042,,176.0,176,31.912231,7.63092,-376.64948,,176.0,176,-0.094879,-0.443863,-0.214508,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,196.0,0,196,,,9,9,187,False,True,196.0,196,-0.540146,5.456512,7.113358,,196.0,196,0.254368,0.11151,0.452041,,196.0,196,27.786255,-0.535583,-36.950684,,46.235439,196.0,196,-0.267639,-0.278961,-0.078094,,196.0,196,-0.55571,5.455322,7.103775,,196.0,196,31.912231,7.63092,-375.97504,,196.0,196,-0.120453,-0.323486,-0.170303,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,216.0,0,216,,,9,9,207,False,False,216.0,216,-0.078064,5.03154,7.386291,,216.0,216,0.256831,0.115168,0.441099,,216.0,216,27.786255,-0.535583,-38.975525,,47.869138,216.0,216,-0.461517,-0.306656,-0.005112,,216.0,216,-0.302521,5.271561,7.274963,,216.0,216,31.912231,7.63092,-377.99988,,216.0,216,-0.277573,-0.275024,-0.085083,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,216,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71,1144.0,1144,-0.425353,0.24869
5,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,237.0,0,237,,,9,9,228,False,False,237.0,237,0.186493,4.829239,7.793304,,237.0,237,0.255339,0.113015,0.437278,,237.0,237,26.399231,-1.228333,-36.950684,,45.428859,237.0,237,-0.604263,-0.352997,-0.032806,,237.0,237,0.05542,4.89209,7.538925,,237.0,237,30.525208,6.938171,-375.97504,,237.0,237,-0.471451,-0.302719,-0.0121,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,244,244.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-86,1144.0,1144,-0.425353,0.24869
6,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,257.0,0,257,,,9,9,248,False,False,257.0,257,0.383423,4.802292,8.195541,,257.0,257,0.251375,0.108101,0.43634,,257.0,257,27.786255,1.545715,-38.301086,,47.343832,257.0,257,-0.605331,-0.345535,-0.086075,,257.0,257,0.319382,4.796906,8.047699,,257.0,257,31.912231,9.712219,-377.32544,,257.0,257,-0.614197,-0.34906,-0.039795,,2340,2340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,,1578474544726,264,264.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-77,1144.0,1144,-0.425353,0.24869
7,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,8769.0,0,8769,,,9017,9017,-248,False,False,8769.0,8769,-0.777161,3.463348,9.576386,,8769.0,8769,0.153666,0.178047,0.604956,,8769.0,8769,29.173279,-10.247803,-26.150513,,40.496259,8769.0,8769,-0.246338,0.000137,-0.142014,,8769.0,8769,-0.757416,3.665054,8.956299,,8769.0,8769,33.299255,-2.081299,-365.17487,,8769.0,8769,-0.219513,-0.08754,-0.166031,,9508,9508,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43,,1578474570052,8719,8719.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-72,8830.0,8830,-0.537325,0.140535
8,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,8790.0,0,8790,,,9017,9017,-227,False,False,8790.0,8790,-0.579651,3.456757,10.193497,,8790.0,8790,0.151173,0.176681,0.604943,,8790.0,8790,28.48053,-8.860779,-26.824951,,40.115234,8790.0,8790,-0.125427,-0.003052,-0.151062,,8790.0,8790,-0.641296,3.298737,10.080368,,8790.0,8790,32.606506,-0.694275,-365.8493,,8790.0,8790,-0.256271,0.004074,-0.149002,,9508,9508,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43,,1578474570052,8826,8826.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-76,8830.0,8830,-0.537325,0.140535
9,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,8810.0,0,8810,,,9017,9017,-207,False,False,8810.0,8810,-0.175034,3.346024,10.130646,,8810.0,8810,0.149623,0.175935,0.604587,,8810.0,8810,29.173279,-8.166504,-27.500916,,40.91543,8810.0,8810,0.119583,0.025711,-0.208588,,8810.0,8810,-0.449768,3.527985,10.211456,,8810.0,8810,33.299255,0.0,-366.52527,,8810.0,8810,-0.135361,0.000885,-0.158051,,9508,9508,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43,,1578474570052,8826,8826.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-76,8830.0,8830,-0.537325,0.140535


In [45]:
# LabelEncode site_id, file_id, floor_converted, ssid, bssid
def col_encode(df, cols):
    for col in cols:
        le = preprocessing.LabelEncoder()
        df["%s_le"%col] = le.fit_transform(df[col])

col_enc = ["site_id", "file_id", "floor", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
col_encode(df_test, col_enc)

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    for col in col_list:
        df[col] = df[col].astype(dtype)

convert_dtypes(df_test, ["ts", "start_ts", "diff_start_ts", \
                         "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts",\
                         "acce_ts", "diff_acce_ts", "ahrs_ts", "diff_ahrs_ts", \
                         "magn_ts", "diff_magn_ts", "gyro_ts", "diff_gyro_ts", \
                         "acce_u_ts", "diff_acce_u_ts", "magn_u_ts", "diff_magn_u_ts", \
                         "gyro_u_ts", "diff_gyro_u_ts", \
                         "wifi_ts", "diff_wifi_ts", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
                         "beacon_ts", "diff_beacon_ts", "beacon_rssi", "rel_ts", "diff_rel_ts"
                        ], float)

# convert ts and wifi_last_see_ts to dates
for df in [df_test]:
    for col in ["ts", "wifi_last_seen_ts"]:
        df["%s_date"%col] = pd.to_datetime(df[col],unit="ms")
        df["%s_day"%col] = df["%s_date"%col].dt.floor("d")
        df["%s_hour"%col] = df["%s_date"%col].dt.floor("h")
        df["%s_minute"%col] = df["%s_date"%col].values.astype("<M8[m]")

# Check
display(df_test.head())

Unnamed: 0,site_path_timestamp,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_250ms,within_500ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,136.0,0.0,136.0,,,9.0,9.0,127.0,False,True,136.0,136.0,0.798813,4.30072,7.810059,,136.0,136.0,0.247101,0.104201,0.474897,,136.0,136.0,30.561829,-1.228333,-38.301086,,49.015379,136.0,136.0,-0.039139,-0.507996,-0.148392,,136.0,136.0,0.578552,4.353989,8.195526,,136.0,136.0,34.687805,6.938171,-377.32544,,136.0,136.0,-0.077835,-0.334671,-0.166565,,2340.0,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,110.0,110.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91.0,1144.0,1144.0,-0.425353,0.24869,0,6,0,1505,4141,2735,1970-01-01 00:00:00.136,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,156.0,0.0,156.0,,,9.0,9.0,147.0,False,True,156.0,156.0,0.026688,4.911835,7.244446,,156.0,156.0,0.247101,0.104201,0.474897,,156.0,156.0,29.173279,-1.922607,-36.950684,,47.118252,156.0,156.0,-0.084946,-0.4478,-0.20752,,156.0,156.0,0.524673,4.493454,7.43837,,156.0,156.0,33.299255,6.243896,-375.97504,,156.0,156.0,-0.049072,-0.504059,-0.15538,,2340.0,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,110.0,110.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-91.0,1144.0,1144.0,-0.425353,0.24869,0,6,0,1505,4141,2735,1970-01-01 00:00:00.156,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,176.0,0.0,176.0,,,9.0,9.0,167.0,False,True,176.0,176.0,-0.41864,5.354172,7.295914,,176.0,176.0,0.254368,0.11151,0.452041,,176.0,176.0,27.786255,-0.535583,-37.625122,,46.776197,176.0,176.0,-0.110519,-0.327423,-0.163315,,176.0,176.0,-0.269592,5.217697,7.327042,,176.0,176.0,31.912231,7.63092,-376.64948,,176.0,176.0,-0.094879,-0.443863,-0.214508,,2340.0,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,216.0,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71.0,1144.0,1144.0,-0.425353,0.24869,0,6,0,1505,4141,2735,1970-01-01 00:00:00.176,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,196.0,0.0,196.0,,,9.0,9.0,187.0,False,True,196.0,196.0,-0.540146,5.456512,7.113358,,196.0,196.0,0.254368,0.11151,0.452041,,196.0,196.0,27.786255,-0.535583,-36.950684,,46.235439,196.0,196.0,-0.267639,-0.278961,-0.078094,,196.0,196.0,-0.55571,5.455322,7.103775,,196.0,196.0,31.912231,7.63092,-375.97504,,196.0,196.0,-0.120453,-0.323486,-0.170303,,2340.0,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,216.0,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71.0,1144.0,1144.0,-0.425353,0.24869,0,6,0,1505,4141,2735,1970-01-01 00:00:00.196,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,216.0,0.0,216.0,,,9.0,9.0,207.0,False,False,216.0,216.0,-0.078064,5.03154,7.386291,,216.0,216.0,0.256831,0.115168,0.441099,,216.0,216.0,27.786255,-0.535583,-38.975525,,47.869138,216.0,216.0,-0.461517,-0.306656,-0.005112,,216.0,216.0,-0.302521,5.271561,7.274963,,216.0,216.0,31.912231,7.63092,-377.99988,,216.0,216.0,-0.277573,-0.275024,-0.085083,,2340.0,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,216.0,216.0,d9c573b719a17da4836208fc436f87b5ca1aa877_b6589...,-71.0,1144.0,1144.0,-0.425353,0.24869,0,6,0,1505,4141,2735,1970-01-01 00:00:00.216,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00


In [46]:
# Save the file in pickle
# https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# Saving train data
test_file_name = "indoor_test_2.pkl"
with open(test_file_name, "wb") as file:
    pickle.dump(df_test, file)

# Save them to output
# df_test.to_csv('df_test.csv')

In [5]:
# Try loading to see if it works properly
test_file_name = "../jupyter/indoor_test_2.pkl"

# Load data it back in
with open(test_file_name, "rb") as file:
    df_test = pickle.load(file)

# df_test = pd.read_csv(test_file_name, encoding='cp932', index_col=0)

In [6]:
display(df_test.tail())

Unnamed: 0,site_path_timestamp,site_id,file_id,floor_converted,floor,ts,start_ts,diff_start_ts,x,y,closest_wp_ts,diff_start_wp_ts,diff_ts_wp_ts,within_250ms,within_500ms,acce_ts,diff_acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,diff_ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,diff_magn_ts,magn_x,magn_y,magn_z,magn_acc,magn_strength,gyro_ts,diff_gyro_ts,gyro_x,gyro_y,gyro_z,gyro_acc,acce_u_ts,diff_acce_u_ts,acce_u_x,acce_u_y,acce_u_z,acce_u_acc,magn_u_ts,diff_magn_u_ts,magn_u_x,magn_u_y,magn_u_z,magn_u_acc,gyro_u_ts,diff_gyro_u_ts,gyro_u_x,gyro_u_y,gyro_u_z,gyro_u_acc,wifi_ts,diff_wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,beacon_ts,diff_beacon_ts,beacon_ssid,beacon_rssi,rel_ts,diff_rel_ts,rel_x,rel_y,site_id_le,file_id_le,floor_le,wifi_ssid_le,wifi_bssid_le,beacon_ssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
15143,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,100607.0,0.0,100607.0,,,100447.0,100447.0,160.0,False,True,100607.0,100607.0,-0.290283,-0.520096,8.933731,,100607.0,100607.0,0.033441,0.088533,0.388586,,100607.0,100607.0,25.115967,14.997864,-26.023865,,39.153407,100607.0,100607.0,0.262497,-0.031235,-0.348007,,100607.0,100607.0,-0.290283,-0.520096,8.933731,,100607.0,100607.0,-22.738647,21.505737,-336.4975,,100607.0,100607.0,0.259827,-0.034058,-0.347153,,99429.0,99429.0,aa449fabc4dcb24836d950b5cff91f08e574c3a7,baf415ae85f3997ffb2ad0797952dbbb4832f378,-52.0,,1573731000000.0,99899.0,99899.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-70.0,100370.0,100370.0,-0.543903,0.528341,23,621,0,1165,3305,2560,1970-01-01 00:01:40.607,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:38.243000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
15144,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,100627.0,0.0,100627.0,,,100447.0,100447.0,180.0,False,True,100627.0,100627.0,-0.334579,-0.462631,8.874481,,100627.0,100627.0,0.065881,0.079208,0.368783,,100627.0,100627.0,23.049927,14.303589,-28.034973,,39.010915,100627.0,100627.0,0.258774,0.057205,-0.31073,,100627.0,100627.0,-0.334579,-0.462631,8.874481,,100627.0,100627.0,-22.738647,21.505737,-336.4975,,100627.0,100627.0,0.256104,0.054382,-0.309876,,99429.0,99429.0,aa449fabc4dcb24836d950b5cff91f08e574c3a7,baf415ae85f3997ffb2ad0797952dbbb4832f378,-52.0,,1573731000000.0,99899.0,99899.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-70.0,100370.0,100370.0,-0.543903,0.528341,23,621,0,1165,3305,2560,1970-01-01 00:01:40.627,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:38.243000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
15145,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,100646.0,0.0,100646.0,,,100447.0,100447.0,199.0,False,True,100646.0,100646.0,-0.517136,-0.274094,8.735611,,100646.0,100646.0,0.067315,0.080598,0.366332,,100646.0,100646.0,21.672058,14.997864,-30.044556,,39.966103,100646.0,100646.0,0.211899,0.095016,-0.273438,,100646.0,100646.0,-0.517136,-0.274094,8.735611,,100646.0,100646.0,-24.116516,22.200012,-338.50708,,100646.0,100646.0,0.209229,0.092194,-0.272583,,99429.0,99429.0,aa449fabc4dcb24836d950b5cff91f08e574c3a7,baf415ae85f3997ffb2ad0797952dbbb4832f378,-52.0,,1573731000000.0,99899.0,99899.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-70.0,100370.0,100370.0,-0.543903,0.528341,23,621,0,1165,3305,2560,1970-01-01 00:01:40.646,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:38.243000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
15146,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,100666.0,0.0,100666.0,,,100447.0,100447.0,219.0,False,False,100666.0,100666.0,-0.504562,-0.057419,8.43454,,100666.0,100666.0,0.068261,0.082181,0.36403,,100666.0,100666.0,22.36023,14.997864,-29.374695,,39.847064,100666.0,100666.0,0.066483,0.019913,-0.226562,,100666.0,100666.0,-0.504562,-0.057419,8.43454,,100666.0,100666.0,-23.428345,22.200012,-337.83722,,100666.0,100666.0,0.063812,0.01709,-0.225708,,99429.0,99429.0,aa449fabc4dcb24836d950b5cff91f08e574c3a7,baf415ae85f3997ffb2ad0797952dbbb4832f378,-52.0,,1573731000000.0,99899.0,99899.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-70.0,100370.0,100370.0,-0.543903,0.528341,23,621,0,1165,3305,2560,1970-01-01 00:01:40.666,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:38.243000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00
15147,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,,,100686.0,0.0,100686.0,,,100447.0,100447.0,239.0,False,False,100686.0,100686.0,-0.658997,0.208939,8.559647,,100686.0,100686.0,0.068242,0.082547,0.362251,,100686.0,100686.0,22.36023,14.997864,-28.034973,,38.869982,100686.0,100686.0,-0.076797,-0.021103,-0.172241,,100686.0,100686.0,-0.658997,0.208939,8.559647,,100686.0,100686.0,-23.428345,22.200012,-336.4975,,100686.0,100686.0,-0.079468,-0.023926,-0.171387,,99429.0,99429.0,aa449fabc4dcb24836d950b5cff91f08e574c3a7,baf415ae85f3997ffb2ad0797952dbbb4832f378,-52.0,,1573731000000.0,99899.0,99899.0,d9c573b719a17da4836208fc436f87b5ca1aa877_902ba...,-70.0,100370.0,100370.0,-0.543903,0.528341,23,621,0,1165,3305,2560,1970-01-01 00:01:40.686,1970-01-01,1970-01-01,1970-01-01 00:01:00,2019-11-14 11:32:38.243000064,2019-11-14,2019-11-14 11:00:00,2019-11-14 11:32:00


In [7]:
print((df_test["site_path_timestamp"].nunique()))
print((df_test["file_id"].nunique()))
print((df_test["site_id"].nunique()))
print((df_test["site_id"].value_counts()))

10131
626
24
5d2709d403f801723c32bd39    29703
5dbc1d84c1eb61796cf7c010    22209
5da958dd46f8266d0737457b    18165
5d2709bb03f801723c32852c    17239
5dc8cea7659e181adb076a3f    15499
5d27096c03f801723c31e5e0    15364
5da138764db8ce0c98bcaa46    13635
5d2709e003f801723c32d896    12633
5d2709b303f801723c327472    12431
5d2709c303f801723c3299ee    12086
5da138b74db8ce0c98bd4774    10562
5da138754db8ce0c98bca82f     9127
5da1383b4db8ce0c98bc11ab     9030
5da1382d4db8ce0c98bbe92e     7589
5d27097f03f801723c320d97     7298
5a0546857ecc773753327266     6878
5d2709a003f801723c3251bf     5087
5da1389e4db8ce0c98bd0547     4107
5da138314db8ce0c98bbf3a0     3949
5da138364db8ce0c98bc00f1     3317
5da138274db8ce0c98bbd3d2     2429
5d27099f03f801723c32511d     1110
5d27075f03f801723c2e360f     1077
5c3c44b80379370013e0fd2b      582
Name: site_id, dtype: int64
