In [None]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
import lightgbm as lgb

import multiprocessing
from multiprocessing import Pool, Manager

import pickle
import math
import gc
import psutil
from collections import Counter

pd.set_option("display.max_columns", 100)

In [None]:
# Settings and altering components for GCP

# path settings
# root_path is the dataset root; every glob below is built from it, so switching
# environments (see the commented alternative) only requires changing this one line.
root_path = "../input/indoor-location-navigation/"
# root_path = "../jupyter/input/"
# Train files are matched three directory levels below "train"
# (other cells read these levels as site / floor / file).
train_paths = glob.glob(root_path + "train" + "/*/*/*")
# Test files are matched directly under "test".
test_paths = glob.glob(root_path + "test" + "/*")
# Entries directly under "metadata".
metafiles = glob.glob(root_path + "metadata" + "/*")

# function imports using github repo in kaggle kernels
# https://www.kaggle.com/getting-started/71642
!cp -r ../input/indoorlocationcompetition20master/indoor-location-competition-20-master/* ./
from io_f import read_data_file
from compute_f import compute_step_positions, compute_steps, \
compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

# import for gcp settings
# import compute_f
# import io_f
# import visualize_f
# import main
# from io_f import read_data_file
# from compute_f import compute_step_positions, compute_steps, \
# compute_headings, compute_stride_length, compute_step_heading, compute_rel_positions, split_ts_seq

In [None]:
# # Make directory for saving files
# !mkdir train
# !mkdir test

In [None]:
# !ls ./train

In [None]:
# Filter windows in milliseconds: IMU samples closer than each cut to a reference
# timestamp are averaged (one feature set is produced per cut).
# IMU_CUT = 250
IMU_CUTS = [1000, 2000, 5000]
# WPS_CUT = 5000

# train number setting
# NOTE: TRAIN_NUM is only referenced from commented-out cells in this notebook.
# TRAIN_NUM = len(train_paths)
# TRAIN_NUM = round(len(train_paths) / 2)
TRAIN_NUM = 10

# floor translation: maps the floor directory name to an integer floor index.
# NOTE(review): the conventions are not uniform (e.g. "F1" -> 0 but "L1" -> 1) —
# confirm against the floor naming actually used by each site.
FLOOR_MAP = {"B3":-3,"B2":-2,"B1":-1,"F1":0,"1F":0,"F2":1,"2F":1,"F3":2,"3F":2,"F4":3,"4F":3,
             "F5":4,"5F":4,"F6":5,"6F":5,"F7":6,"7F":6,"F8":7,"8F": 7,"F9":8,"9F":8,"F10":9,
             "B":0,"BF":1,"BM":2, "G":0, "M":0, "P1":0,"P2":1, "LG2":-2,"LG1":-1,"LG":0,"LM":0,
             "L1":1,"L2":2,"L3":3,"L4":4,"L5":5,"L6":6,"L7":7,"L8":8,"L9":9,"L10":10,"L11":11}

# Columns to shift to the beginning of df
# NOTE: SHIFT_COLS / SHIFT_COLS_TEST are not referenced by the generator functions
# visible in this notebook (those pick columns by name length), and the IMU names
# listed here carry no cut suffix, unlike the generated "<sensor>_<cut>_<axis>_avg".
SHIFT_COLS = ["rel_y", "rel_x", "rel_diff", \
              "magn_u_z_avg", "magn_u_y_avg", "magn_u_x_avg", \
              "gyro_z_avg", "gyro_y_avg", "gyro_x_avg", \
              "ahrs_z_avg", "ahrs_y_avg", "ahrs_x_avg",  \
              "magn_st", "magn_z_avg", "magn_y_avg", "magn_x_avg", \
              "acce_z_avg", "acce_y_avg", "acce_x_avg", \
              "site_id", "file_id", "floor_int", "floor", \
              "y", "x", "wps_diff", "wifi_ts"]

SHIFT_COLS_TEST = ["rel_y", "rel_x", "rel_diff", \
                   "magn_u_z_avg", "magn_u_y_avg", "magn_u_x_avg", \
                   "gyro_z_avg", "gyro_y_avg", "gyro_x_avg", \
                   "ahrs_z_avg", "ahrs_y_avg", "ahrs_x_avg",  \
                   "magn_st", "magn_z_avg", "magn_y_avg", "magn_x_avg", \
                   "acce_z_avg", "acce_y_avg", "acce_x_avg", \
                   "site_id", "file_id", "floor_int", "floor", \
                   "y", "x", "wps_diff", "wifi_ts", "site_path_timestamp"]

# Columns cast to int / to pandas "category" after NaNs have been filled.
INT_COLS = ["wifi_ts"]
CAT_COLS = ["file_id", "site_id", "floor"]

In [None]:
# Preprocess
print("No. Files in Train: {:,}".format(len(train_paths)), "\n" +
      "No. Files in Test: {:,}".format(len(test_paths)), "\n" +
      "No. of metadata files: {:,}".format(len(metafiles)))

# Reading in 1 file
def pick_example(max_range, paths):
    """Pick one random train file and load the metadata of the floor it belongs to.

    Args:
        max_range: Largest index that may be drawn. Values beyond the last valid
            index of ``paths`` are clamped, so passing ``len(paths)`` is safe.
        paths: List of train file paths shaped ``.../<site>/<floor>/<file>``.

    Returns:
        Tuple ``(path, site, floorNo, floor_plan_filename, json_plan_filename,
        width_meter, height_meter)``; width/height are read from the floor's
        ``floor_info.json`` (``map_info`` section).

    Raises:
        ValueError: If ``paths`` is empty.
    """
    if len(paths) == 0:
        raise ValueError("pick_example: no paths to pick from")
    # random.randint includes BOTH endpoints, so the upper bound must be a valid index.
    last_index = min(max_range, len(paths) - 1)
    example_path = paths[random.randint(0, last_index)]
    path = f"{example_path}"
    # Read site/floor relative to the end of the path so the result does not depend
    # on how many directories precede the dataset root.
    parts = path.split("/")
    site = parts[-3]
    floorNo = parts[-2]
    floor_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_image.png"
    json_plan_filename = f"{root_path}metadata/{site}/{floorNo}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]
    return path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter

path, site, floorNo, floor_plan_filename, \
json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
print("example path: ", path)
print("site: ", site)
print("floorNo: ", floorNo)
print("floor_plan_filename: ", floor_plan_filename)
print("json_plan_filename: ", json_plan_filename)
print("width: {}, height: {} ".format(width_meter, height_meter))

# Same encoding as read_data_file_ed, so the example is read like the real parser reads it.
with open(path, encoding="utf-8") as p:
    lines = p.readlines()
print("No. Lines in 1 example: {:,}". format(len(lines)))

In [None]:
# for line in lines:
#     print(line)

In [None]:
# Redefine the data extraction class

from dataclasses import dataclass

@dataclass
class ReadData:
    """Container for the per-sensor arrays parsed from one path file by read_data_file_ed."""
    # Each IMU array holds rows of [timestamp, x, y, z] (int timestamp, float axes).
    acce: np.ndarray
    acce_uncali: np.ndarray
    gyro: np.ndarray
    gyro_uncali: np.ndarray
    magn: np.ndarray
    magn_uncali: np.ndarray
    ahrs: np.ndarray
    # Rows of [sys_ts, ssid, bssid, "<ssid>_<bssid>", rssi, lastseen_ts], kept as strings.
    wifi: np.ndarray
    # Rows of [ts, "<uuid>_<major>_<minor>", txpower, rssi, distance, mac_address, beacon_ts], kept as strings.
    ibeacon: np.ndarray
    # Rows of [timestamp, x, y].
    waypoint: np.ndarray

def read_data_file_ed(data_filename):
    """Parse one tab-separated path file into a ``ReadData`` of numpy arrays.

    Blank lines and lines starting with ``#`` are ignored. Every other line is
    routed by its second field (the record type):

    * the seven x/y/z sensors -> ``[int(ts), float(x), float(y), float(z)]``
    * ``TYPE_WIFI``   -> ``[sys_ts, ssid, bssid, "<ssid>_<bssid>", rssi, lastseen_ts]``
    * ``TYPE_BEACON`` -> ``[ts, "<uuid>_<major>_<minor>", txpower, rssi, distance,
      mac_address, beacon_ts]`` (the last two taken from the end of the line)
    * ``TYPE_WAYPOINT`` -> ``[int(ts), float(x), float(y)]``

    Wi-Fi and beacon fields are kept as strings. Lines of any other type are dropped.
    """
    # One bucket per sensor that shares the "timestamp, x, y, z" layout.
    xyz_rows = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    wifi_rows = []
    beacon_rows = []
    waypoint_rows = []

    with open(data_filename, 'r', encoding='utf-8') as log_file:
        raw_lines = log_file.readlines()

    for raw in raw_lines:
        stripped = raw.strip()
        if not stripped or stripped[0] == '#':
            continue

        fields = stripped.split('\t')
        record_type = fields[1]

        if record_type in xyz_rows:
            xyz_rows[record_type].append(
                [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]
            )
        elif record_type == 'TYPE_WIFI':
            sys_ts, ssid, bssid, rssi = fields[0], fields[2], fields[3], fields[4]
            lastseen_ts = fields[6]
            wifi_rows.append([sys_ts, ssid, bssid, '_'.join([ssid, bssid]), rssi, lastseen_ts])
        elif record_type == 'TYPE_BEACON':
            ts = fields[0]
            beacon_id = '_'.join([fields[2], fields[3], fields[4]])
            txpower, rssi, distance = fields[5], fields[6], fields[7]
            mac_address, beacon_ts = fields[-2], fields[-1]
            beacon_rows.append([ts, beacon_id, txpower, rssi, distance, mac_address, beacon_ts])
        elif record_type == 'TYPE_WAYPOINT':
            waypoint_rows.append([int(fields[0]), float(fields[2]), float(fields[3])])

    return ReadData(
        np.array(xyz_rows['TYPE_ACCELEROMETER']),
        np.array(xyz_rows['TYPE_ACCELEROMETER_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_GYROSCOPE']),
        np.array(xyz_rows['TYPE_GYROSCOPE_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_MAGNETIC_FIELD']),
        np.array(xyz_rows['TYPE_MAGNETIC_FIELD_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_ROTATION_VECTOR']),
        np.array(wifi_rows),
        np.array(beacon_rows),
        np.array(waypoint_rows),
    )

In [None]:
# Count the waypoints recorded on one randomly chosen floor directory
# (sums len(data.waypoint) over every file of that floor).
train_path_floor = glob.glob(root_path + "train" + "/*/*/")
# train_paths = glob.glob(root_path + "train" + "/*/*/*")
# NOTE: randint(0, 6) only samples among the first seven floor directories and
# assumes at least seven exist.
ex = random.randint(0, 6)
print(train_path_floor[ex])
print("no. of files of that floor: ", len(os.listdir(train_path_floor[ex])))
count = 0
for f in os.listdir(train_path_floor[ex]):
    # The glob pattern ends with "/", so plain concatenation yields a valid file path.
    file_path = train_path_floor[ex] + f
    data = read_data_file_ed(file_path)
    count += len(data.waypoint)
    
print(count)

In [None]:
# path, site, floorNo, floor_plan_filename, json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)
# show_site_png(root_path, site=site)

In [None]:
# Feature candidate
# You can't get the waypoint in test, so use acce and ahrs data to calculate relative positions
def calc_rel_positions(acce_datas, ahrs_datas):
    """Derive relative positions from accelerometer and rotation-vector data.

    Chains the ``compute_f`` helpers: detected steps and headings are turned into
    stride lengths and per-step headings, which are combined into relative
    positions. The result is returned with its timestamp column intact.
    """
    step_ts, _step_idx, acce_max_mins = compute_steps(acce_datas)
    heading_series = compute_headings(ahrs_datas)
    strides = compute_stride_length(acce_max_mins)
    per_step_headings = compute_step_heading(step_ts, heading_series)
    # Callers that do not need the timestamps could drop column 0:
    # np.delete(rel_positions, 0, 1)
    return compute_rel_positions(strides, per_step_headings)

# Feature candidate
# Modify extract_magnetic_strength from github for one magnetic data point
def extract_one_magn_strength(magn_datas):
    """Return the mean Euclidean norm of ``magn_datas``, reduced along axis 0.

    For a single ``(x, y, z)`` triple this is simply ``sqrt(x^2 + y^2 + z^2)``.
    """
    components = np.array(magn_datas)
    squared_total = np.sum(components ** 2, axis=0)
    norms = np.sqrt(squared_total)
    return np.mean(norms)

In [None]:
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

In [None]:
# Common methods
def extract_imu_rep(imu_data, wifi_ts, imu_cut):
    """Average the x/y/z readings of the IMU samples close to a reference timestamp.

    Args:
        imu_data: Array of rows ``[timestamp, x, y, z]``. May be empty (a sensor
            missing from the file), in which case NaNs are returned.
        wifi_ts: Reference timestamp; anything ``int()`` accepts.
        imu_cut: Window size; a sample is kept when ``|timestamp - wifi_ts| < imu_cut``.

    Returns:
        Tuple ``(x_avg, y_avg, z_avg)``, or three NaNs when no sample falls inside
        the window ("no imu" is printed in that case).
    """
    imu_data = np.asarray(imu_data)
    if imu_data.size:
        # int64 explicitly: millisecond epoch timestamps do not fit a 32-bit platform int.
        sample_ts = imu_data[:, 0].astype(np.int64)
        in_window = np.abs(sample_ts - int(wifi_ts)) < imu_cut
        imu_filtered = imu_data[in_window]
    else:
        # np.array([]) is 1-D, so the column slicing above would raise IndexError.
        imu_filtered = imu_data

    if imu_filtered.shape[0] == 0:
        print("no imu")
        return np.nan, np.nan, np.nan

    return imu_filtered[:, 1].mean(), imu_filtered[:, 2].mean(), imu_filtered[:, 3].mean()

def shift_columns(cols, df):
    """Return ``df`` with every column in ``cols`` moved to the front.

    Columns are moved one after another, so the LAST name yielded by ``cols``
    ends up as the first column. ``cols`` may be any iterable (including a
    one-shot iterator such as ``reversed(...)``).

    Raises:
        ValueError: If a name in ``cols`` is not a column of ``df``.
    """
    # Work out the final order on a plain list and re-index once; re-indexing
    # inside the loop copied the whole (very wide) frame for every moved column.
    ordered = list(df.columns)
    for col in cols:
        ordered.insert(0, ordered.pop(ordered.index(col)))
    return df[ordered]

# convert data types of certain columns
def convert_dtypes(df, col_list, dtype):
    """Cast each column named in ``col_list`` to ``dtype``, modifying ``df`` in place.

    Nothing is returned; the caller's frame is updated directly.
    """
    for column_name in col_list:
        recast = df[column_name].astype(dtype)
        df[column_name] = recast

---
## Train generator
---

In [None]:
# Train specific methods
def extract_nearest_wps(wps_data, wifi_ts):
    """Find the row of ``wps_data`` whose timestamp (column 0) is closest to ``wifi_ts``.

    Args:
        wps_data: 2-D array whose first column holds timestamps (waypoints or
            relative positions).
        wifi_ts: Reference timestamp; anything ``int()`` accepts.

    Returns:
        Tuple ``(abs_time_difference, nearest_row)``. On ties the earliest row wins.

    Raises:
        ValueError: If ``wps_data`` has no rows.
    """
    wps_data = np.asarray(wps_data)
    if wps_data.size == 0:
        raise ValueError("extract_nearest_wps: no rows to search for the nearest timestamp")
    # int64 explicitly: millisecond epoch timestamps do not fit a 32-bit platform int.
    row_ts = wps_data[:, 0].astype(np.int64)
    diffs = np.abs(row_ts - int(wifi_ts))
    nearest_idx = int(np.argmin(diffs))
    return int(diffs[nearest_idx]), wps_data[nearest_idx]

def extract_train_path(path):
    """Split a train file path into its site, file and floor components.

    The path is expected to end in ``<site_id>/<floor>/<file_id>.<ext>``; the
    components are read relative to the END of the path so the result does not
    depend on how many directories precede the dataset root.

    Returns:
        Tuple ``(site_id, file_id, floor_int, floor)`` where ``floor_int`` is the
        ``FLOOR_MAP`` translation of the floor directory name.

    Raises:
        ValueError: If the path has fewer than three components or the floor
            name is missing from ``FLOOR_MAP``. (Previously a bare ``except``
            printed a message and returned None, which made callers fail later
            with an unrelated unpacking TypeError.)
    """
    parts = f"{path}".split("/")
    try:
        site_id, floor, file_name = parts[-3:]
        floor_int = FLOOR_MAP[floor]
    except (ValueError, KeyError) as exc:
        print("extract_path error")
        raise ValueError(f"cannot parse site/floor/file from train path: {path!r}") from exc
    file_id = file_name.split(".")[0]
    return site_id, file_id, floor_int, floor

def make_wifi_df_train(path):
    """Build one single-row feature frame per Wi-Fi scan timestamp of a train file.

    Each row holds the RSSI of every BSSID seen at that timestamp (one column per
    BSSID) plus: the scan timestamp, the nearest waypoint (time gap, x, y), the
    site/file/floor parsed from ``path``, IMU averages around the nearest
    waypoint's timestamp for every window in ``IMU_CUTS``, and the nearest
    relative position computed from accelerometer + rotation-vector data.

    Returns:
        List of single-row DataFrames; an empty list when the file has no Wi-Fi data.
    """
    datas = read_data_file_ed(path)

    # Bail out before any expensive step detection when there is nothing to emit.
    wifi_datas = datas.wifi
    if wifi_datas.shape[0] == 0:
        print("no wifi data at: ", path)
        return []

    # Put IMU data in dict for later iteration
    imu_dict = {}
    imu_dict["acce"] = datas.acce
    imu_dict["magn"] = datas.magn
    imu_dict["ahrs"] = datas.ahrs
    imu_dict["gyro"] = datas.gyro
    # Only use magn for uncalibrated data, as it seems more important in initial modeling result
    imu_dict["magn_uncali"] = datas.magn_uncali

    # ibeacon data is only used for test files
    wps = datas.waypoint
    rel_positions = calc_rel_positions(imu_dict["acce"], imu_dict["ahrs"])

    # The path components are identical for every scan of this file: parse them once.
    site_id, file_id, f, floor = extract_train_path(path)

    # Columns 0, 2, 4 of a wifi row are: scan timestamp, bssid, rssi
    dfs = []
    df = pd.DataFrame(wifi_datas[:, [0, 2, 4]])
    for wifi_ts, g in df.groupby(0):
        # Keep the first reading of each bssid, then pivot to one column per bssid
        g = g.drop_duplicates(subset=1)
        tmp = g.iloc[:, 1:]
        feat = tmp.set_index(1).T
        feat["wifi_ts"] = wifi_ts

        # get closest wps
        closest_wps = extract_nearest_wps(wps, wifi_ts)
        feat["wps_diff"] = closest_wps[0]
        feat["x"] = closest_wps[1][1]
        feat["y"] = closest_wps[1][2]

        # floor and other path data
        feat["floor"] = floor
        feat["floor_int"] = f
        feat["file_id"] = file_id
        feat["site_id"] = site_id

        # Loop over IMU_CUTS; the window is centred on the nearest waypoint's timestamp
        for key, imu in imu_dict.items():
            for imu_cut in IMU_CUTS:
                imu_avgs = extract_imu_rep(imu, closest_wps[1][0], imu_cut)
                feat[f"{key}_{imu_cut}_x_avg"] = imu_avgs[0]
                feat[f"{key}_{imu_cut}_y_avg"] = imu_avgs[1]
                feat[f"{key}_{imu_cut}_z_avg"] = imu_avgs[2]
                if key == "magn":
                    feat[f"{key}_{imu_cut}_st_avg"] = extract_one_magn_strength(imu_avgs)

        # get closest relative positions that was worked out with acce and ahrs data
        rel_pos = extract_nearest_wps(rel_positions, wifi_ts)
        feat["rel_diff"] = rel_pos[0]
        feat["rel_x"] = rel_pos[1][1]
        feat["rel_y"] = rel_pos[1][2]

        dfs.append(feat)

    return dfs


def make_train_df(paths_df, site_list):
    """Build the Wi-Fi/IMU feature frame of every site in ``site_list``.

    Relies on the module-level multiprocessing ``pool`` (created in the run cell)
    to parse the files of one site in parallel. Writing the per-site CSV is
    currently commented out, so each frame is built and then discarded.

    Args:
        paths_df: DataFrame with at least ``path`` and ``site_id`` columns.
        site_list: Iterable of site ids to process.
    """
    for site in site_list:
        site_paths = paths_df.loc[paths_df["site_id"] == site, "path"].unique()
        dfs_all = pool.map(make_wifi_df_train, tqdm(site_paths))
        dfs_unpack = [row for file_rows in dfs_all for row in file_rows]
        if not dfs_unpack:
            # pd.concat raises on an empty list (no file of this site had Wi-Fi data).
            print("no wifi rows for site: ", site)
            continue
        wifi_df = pd.concat(dfs_unpack)
        # Columns with names shorter than 17 characters are moved to the front.
        # NOTE(review): presumably this separates feature names from BSSID columns,
        # but generated names such as "magn_uncali_1000_x_avg" are longer than 17
        # and are therefore NOT moved — confirm this is intended.
        front_cols = [col for col in wifi_df.columns if len(col) < 17]
        wifi_df = shift_columns(reversed(front_cols), wifi_df)
        # display(wifi_df.head())
        wifi_df = wifi_df.fillna(-999)
        convert_dtypes(wifi_df, tqdm(INT_COLS), int)
        convert_dtypes(wifi_df, tqdm(CAT_COLS), "category")
        # display(wifi_df.head())
        # wifi_df.to_csv(f"./train/{site}_train.csv", index=False)
        del wifi_df

In [None]:
# train_path filtering

def extract_path_for_grouplist(path):
    """Return ``[path, site_id, file_id]`` for a train file path.

    The path is expected to end in ``<site_id>/<floor>/<file_id>.<ext>``. The
    components are read relative to the END of the path, so absolute paths and
    dataset roots of a different depth yield the same result.

    Raises:
        IndexError: If the path has fewer than three ``/``-separated components.
    """
    parts = f"{path}".split("/")
    site_id = parts[-3]
    file_id = parts[-1].split(".")[0]
    return [path, site_id, file_id]

# Build a (path, site_id, file_id) table of all train files; it feeds both the
# full path list and the sampled path list below.
path_list = [extract_path_for_grouplist(item) for item in train_paths]
df_paths = pd.DataFrame(path_list, columns=["path", "site_id", "file_id"])
site_id_path_list = df_paths["site_id"].unique()

# grouped_paths_list -> It takes 3 records from every site_id
# NOTE: sample(n=3) is unseeded, so the selection differs between runs.
# NOTE(review): sampling without replacement presumably fails for a site with
# fewer than 3 files — confirm every site has at least 3.
grouped_paths_df = df_paths.groupby("site_id").sample(n=3)
grouped_paths_list = list(grouped_paths_df["path"].unique())
display(grouped_paths_df.head())
print(len(df_paths))

In [None]:
start = time.time()
num_cores = multiprocessing.cpu_count()
# `pool` is read as a global by make_train_df. A single worker is used here;
# pass num_cores instead to parallelise.
pool = Pool(1)

try:
    # # Checking purposes
    # # 9 paths, 4 cores -> 28.5 sec
    # # 9 paths, 1 core  -> 42.0 sec
    grouped_paths_df = grouped_paths_df.iloc[:9,:]
    # grouped_paths_df = grouped_paths_df.sample(n=100)
    train_sites_list = grouped_paths_df["site_id"].unique()
    make_train_df(grouped_paths_df, train_sites_list)

    # REAL training
    # train_sites_list = df_paths["site_id"].unique()
    # make_train_df(df_paths, train_sites_list)

    print("time to extract data: ", time.time() - start)
finally:
    # Always release the worker processes, even when feature extraction fails;
    # join() waits for them to exit after close().
    pool.close()
    pool.join()

---
## Test generator
---

In [None]:
# Test specific methods
def extract_nearest_wifi(wifi_datas, timestamp):
    """Keep only the Wi-Fi rows whose scan timestamp (column 0) is closest to ``timestamp``.

    Every row that ties for the smallest absolute time gap is returned, in its
    original order.
    """
    gaps = [abs(int(timestamp) - int(scan_ts)) for scan_ts in wifi_datas[:, 0]]
    smallest_gap = min(gaps)
    nearest_rows = [idx for idx, gap in enumerate(gaps) if gap == smallest_gap]
    return wifi_datas[nearest_rows]

def extract_correct_ts(ibeacon_datas, timestamp):
    """Shift ``timestamp`` by the offset between a beacon row's two clocks.

    The beacon rows closest in time to ``timestamp`` are selected and flattened;
    the first value (a row timestamp) and the last value (the final row's
    trailing timestamp field) are then combined as
    ``last - (first - timestamp)``.

    Returns:
        The corrected integer timestamp, or ``np.nan`` when there is no beacon
        data ("no beacon" is printed in that case).
    """
    if ibeacon_datas.shape[0] == 0:
        print("no beacon")
        return np.nan

    gaps = [abs(int(timestamp) - int(row_ts)) for row_ts in ibeacon_datas[:, 0]]
    smallest_gap = min(gaps)
    nearest_rows = [idx for idx, gap in enumerate(gaps) if gap == smallest_gap]
    flat_values = ibeacon_datas[nearest_rows].flatten()

    last_seen_ts = int(flat_values[-1])
    row_ts = int(flat_values[0])
    return last_seen_ts - (row_ts - int(timestamp))

def make_wifi_df_test(zipped_paths):
    """Build the feature row(s) for one submission entry of the test set.

    Args:
        zipped_paths: Tuple ``(site_id, file_id, timestamp, site_path_timestamp)``
            taken from the sample submission.

    Only the Wi-Fi scan(s) closest in time to ``timestamp`` are used. Each row
    mirrors the train layout (targets and floor are NaN) and additionally carries
    ``site_path_timestamp`` and the beacon-corrected timestamp ``correct_wps_ts``.
    IMU averages are taken around ``timestamp`` for every window in ``IMU_CUTS``.

    Returns:
        List of single-row DataFrames; an empty list when the file has no Wi-Fi data.
    """
    site_id, file_id, timestamp, site_path_timestamp = zipped_paths
    # Built from root_path so the test files follow the same root as the train globs.
    file_path = root_path + "test/" + file_id + ".txt"
    datas = read_data_file_ed(file_path)

    # Bail out before any expensive step detection when there is nothing to emit.
    wifi_datas = datas.wifi
    if wifi_datas.shape[0] == 0:
        # Report the file actually being processed (the old message printed the
        # unrelated global `path` left over from the example cell).
        print("no wifi data at: ", file_path)
        return []

    # Put IMU data in dict for later iteration
    imu_dict = {}
    imu_dict["acce"] = datas.acce
    imu_dict["magn"] = datas.magn
    imu_dict["ahrs"] = datas.ahrs
    imu_dict["gyro"] = datas.gyro
    # Only use magn for uncalibrated data, as it seems more important in initial modeling result
    imu_dict["magn_uncali"] = datas.magn_uncali

    ibeacon_datas = datas.ibeacon
    rel_positions = calc_rel_positions(imu_dict["acce"], imu_dict["ahrs"])

    # Keep only the scan(s) nearest to the requested timestamp
    wifi_datas = extract_nearest_wifi(wifi_datas, timestamp)

    # Corrected timestamp derived from the beacon rows; it depends only on the
    # file and `timestamp`, so compute it once (NaN when there is no beacon data).
    correct_ts = extract_correct_ts(ibeacon_datas, timestamp)

    # Columns 0, 2, 4 of a wifi row are: scan timestamp, bssid, rssi
    dfs = []
    df = pd.DataFrame(wifi_datas[:, [0, 2, 4]])
    for wifi_ts, g in df.groupby(0):
        # Keep the first reading of each bssid, then pivot to one column per bssid
        g = g.drop_duplicates(subset=1)
        tmp = g.iloc[:, 1:]
        feat = tmp.set_index(1).T
        feat["site_path_timestamp"] = site_path_timestamp
        feat["correct_wps_ts"] = correct_ts
        # feat["timestamp"] = timestamp
        # feat["wifi_ts"] = wifi_ts
        # Scan time expressed on the corrected clock (NaN propagates when no beacon)
        feat["wifi_ts"] = correct_ts + (int(wifi_ts) - int(timestamp))

        # gap between the scan and the requested timestamp; targets are unknown in test
        feat["wps_diff"] = abs(int(wifi_ts) - int(timestamp))
        feat["x"] = np.nan
        feat["y"] = np.nan

        # floor is unknown in test; file/site come from the submission entry
        feat["floor"] = np.nan
        feat["floor_int"] = np.nan
        feat["file_id"] = file_id
        feat["site_id"] = site_id

        # Loop over IMU_CUTS
        for key, imu in imu_dict.items():
            for imu_cut in IMU_CUTS:
                imu_avgs = extract_imu_rep(imu, timestamp, imu_cut)
                feat[f"{key}_{imu_cut}_x_avg"] = imu_avgs[0]
                feat[f"{key}_{imu_cut}_y_avg"] = imu_avgs[1]
                feat[f"{key}_{imu_cut}_z_avg"] = imu_avgs[2]
                if key == "magn":
                    feat[f"{key}_{imu_cut}_st_avg"] = extract_one_magn_strength(imu_avgs)

        # get closest relative positions that was worked out with acce and ahrs data
        rel_pos = extract_nearest_wps(rel_positions, wifi_ts)
        feat["rel_diff"] = rel_pos[0]
        feat["rel_x"] = rel_pos[1][1]
        feat["rel_y"] = rel_pos[1][2]

        dfs.append(feat)

    return dfs

def make_test_df(zipped_path, site):
    """Build the test feature frame of one site and write it to ``./test/<site>_test.csv``.

    Relies on the module-level multiprocessing ``pool`` (created in the run cell).

    Args:
        zipped_path: List of ``(site_id, file_id, timestamp, site_path_timestamp)``
            tuples belonging to ``site``.
        site: Site id, used for the output file name.
    """
    dfs_all = pool.map(make_wifi_df_test, tqdm(zipped_path))
    dfs_unpack = [row for file_rows in dfs_all for row in file_rows]
    if not dfs_unpack:
        # pd.concat raises on an empty list (no entry of this site had Wi-Fi data).
        print("no wifi rows for site: ", site)
        return
    wifi_df = pd.concat(dfs_unpack)
    # Columns with names shorter than 20 characters are moved to the front.
    # NOTE(review): presumably this separates feature names from BSSID columns,
    # but generated names such as "magn_uncali_1000_x_avg" are longer than 20
    # and are therefore NOT moved — confirm this is intended.
    front_cols = [col for col in wifi_df.columns if len(col) < 20]
    wifi_df = shift_columns(reversed(front_cols), wifi_df)
    wifi_df = wifi_df.fillna(-999)
    convert_dtypes(wifi_df, tqdm(INT_COLS), int)
    convert_dtypes(wifi_df, tqdm(CAT_COLS), "category")
    # print(wifi_df.iloc[:, :30].info())
    # The cell that created ./test is commented out, so make sure the directory exists.
    os.makedirs("./test", exist_ok=True)
    wifi_df.to_csv(f"./test/{site}_test.csv", index=False)
    del wifi_df

In [None]:
# Get submission file
sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
# "site_path_timestamp" is "<site_id>_<file_id>_<timestamp>"; split it in one
# vectorized pass instead of building a pd.Series for every row.
sub_df[["site_id", "file_id", "timestamp"]] = sub_df["site_path_timestamp"].str.split("_", expand=True)
# Target columns are placeholders in the sample submission and are not needed here.
sub_df = sub_df.drop(columns=["floor", "x", "y"])
# sub_df_site_list = sub_df["site_id"].unique()

In [None]:
start = time.time()
num_cores = multiprocessing.cpu_count()
# `pool` is read as a global by make_test_df.
pool = Pool(num_cores)

try:
    # 100 records:  33.47870922088623 sec
    # comment out to run all
    # sub_df = sub_df.sample(n=100)
    sub_df = sub_df.iloc[:9, :]
    test_sites = sub_df["site_id"].unique()

    # Run generator for each building
    for site in test_sites:
        sub_df_filtered = sub_df[sub_df["site_id"] == site]
        site_file_zip = list(zip(sub_df_filtered["site_id"], \
                                 sub_df_filtered["file_id"], \
                                 sub_df_filtered["timestamp"], \
                                 sub_df_filtered["site_path_timestamp"]))
        make_test_df(site_file_zip, site)

    # display(wifi_df.head())

    print("time to extract data: ", time.time() - start)
finally:
    # Always release the worker processes, even when feature extraction fails;
    # join() waits for them to exit after close().
    pool.close()
    pool.join()

In [None]:
# start = time.time()

# num_cores = multiprocessing.cpu_count()
# print(f"num_cores={num_cores}")
# pool = Pool(num_cores)

# # 10 paths:  6.070369720458984
# # 100 paths:  87.05400061607361
# # dfs_all = pool.map(make_wifi_df, tqdm(train_paths[:TRAIN_NUM]))
# dfs_all = pool.map(make_wifi_df, tqdm(grouped_paths_list[:10]))

# # time to process:  11.514546155929565
# # dfs_all = []
# # for path in train_paths[:TRAIN_NUM]:
# #     dfs_all.append(make_wifi_df(path))

# print(len(dfs_all))
# print("time to extract data: ", time.time() - start)
# pool.close()

In [None]:
# start = time.time()

# num_cores = multiprocessing.cpu_count()
# pool = Pool(num_cores)

# # Do this for each building

# # 10 paths:  8.992910146713257
# # 100 paths:  2454.589078426361
# dfs_unpack = [row for df in dfs_all for row in df]
# wifi_df = pd.concat(dfs_unpack)

# print("time for df conversion: ", time.time() - start)
# print(len(wifi_df.columns))
# print(len(wifi_df))
# display(wifi_df.head())
# pool.close()

In [None]:
# wifi_df.iloc[:,:50].info()

In [None]:
# start = time.time()

# # move columns
# cols = ["acce_z_avg", "acce_y_avg", "acce_x_avg", \
#         "site_id", "file_id", "floor_int", "floor", \
#         "y", "x", "wps_diff", "wifi_ts"]

# for col in cols:
#     df_cols = list(wifi_df.columns)
#     df_cols.insert(0, df_cols.pop(df_cols.index(col)))
#     wifi_df = wifi_df[df_cols]
  
# # Fillna
# wifi_df = wifi_df.fillna(-999)

# display(wifi_df.head())
# print(len(wifi_df))

# print("time to shift columns: ", time.time() - start)
# print(wifi_df.iloc[:,:50].info())

In [None]:
# print("available RAM:", psutil.virtual_memory())

# train_file_name = "indoor_train_5.pkl"

# with open(train_file_name, "wb") as file:
#     pickle.dump(wifi_df, file)

# del wifi_df
# del dfs_unpack
# del dfs_all
# gc.collect()

# print("available RAM after cleanup:", psutil.virtual_memory())

In [None]:
# # Load data it back in
# train_file_name = "indoor_train_5.pkl"

# with open(train_file_name, "rb") as file:
#     df_train = pickle.load(file)

In [None]:
# print("df len: ", len(df_train), "\n")
# print("site_id nunique: ", df_train["site_id"].nunique(), "\n")
# print("site_id value_counts: ", df_train["site_id"].value_counts(), "\n")
# print("file_id nunique: ", df_train["file_id"].nunique(), "\n")
# print("x value_counts: ", df_train["x"].value_counts(), "\n")
# print("y value_counts: ", df_train["y"].value_counts(), "\n")
# print("wifi_ts nunique: ", df_train["wifi_ts"].nunique(), "\n")
# print("wps_diff nunique: ", df_train["wps_diff"].nunique(), "\n")
# display(df_train.head())

In [None]:
# df_train_pp = df_train.loc[:, ["site_id", "x", "y", "acce_x_avg", "acce_y_avg", "acce_z_avg"]]
# display(df_train_pp.head())
# sns.pairplot(df_train_pp, hue="site_id")

In [None]:
# # Check the wps_diff distribution
# # Need to filter out those wps that are above 5000ms difference from wifi_ts
# f, ax = plt.subplots(figsize=(8, 8))
# f.patch.set_facecolor("white")
# sns.distplot(df_train["wps_diff"])
# plt.show()

In [None]:
# df_train_slim = df_train[df_train["wps_diff"] < WPS_CUT]
# perc = round(len(df_train_slim)/len(df_train)*100, 2)

# print("no of records: ", len(df_train))
# print(f"Filter df_train with {WPS_CUT}, it retains {perc} % of data")

In [None]:
# # Visualizing timestamp distribution

# # LabelEncode site_id, file_id, floor_converted, ssid, bssid
# # def col_encode(df, cols):
# #     for col in cols:
# #         le = preprocessing.LabelEncoder()
# #         df["%s_le"%col] = le.fit_transform(df[col])

# # col_enc = ["site_id", "file_id", "wifi_ssid", "wifi_bssid", "beacon_ssid"]
# # col_encode(df_train, tqdm(col_enc))

# # convert data types of certain columns
# def convert_dtypes(df, col_list, dtype):
#     for col in col_list:
#         df[col] = df[col].astype(dtype)

# convert_dtypes(df_train, tqdm(["wifi_ts"]), int)
# convert_dtypes(df_train, tqdm(["file_id", "site_id", "floor"]), "category")

# # Check
# display(df_train.head())

In [None]:
# # Methods for preprocessing train data: Timestamp handling
# def find_diff_ts(ts, data):
#     data_ts = data[0]
#     diff_ts = int(data_ts) - int(ts)
#     return diff_ts

# def find_start_ts(path):
#     with open(path, 'r', encoding='utf-8') as file:
#         lines = file.readlines()

#     for line_data in lines:
#         line_data = line_data.strip()
#         m = re.search(r"(?<=startTime.)(.*)", line_data)
#         start_ts = m.groups(0)
#         if m:
#             return (start_ts[0])

# def find_smallest_diff(t, data):
#     if data.size == 0:
#         return np.array([])
#     else:
#         data_ts = data[:, [0]]
#         diff = []
#         for ts in data_ts:
#             diff.append(abs(int(t) - int(ts)))
#         closest_index = np.argmin(diff) # if multiple records have the same value..?
#         return data[closest_index]

In [None]:
# # Method for preprocessing train data: splitting acce/ahrs/gyro/magn
# def split_axis(data, start_ts):
#     if data.size == 0:
#         # print("no axis data")
#         return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
#     else:
#         data_ts = data[0]
#         diff_ts = int(data[0]) - int(start_ts)
#         x_axis = data[1]
#         y_axis = data[2]
#         z_axis = data[3]
#         try:
#             accuracy = data[4]
#         except IndexError:
#             accuracy = np.nan
#         return [data_ts, diff_ts, x_axis, y_axis, z_axis, accuracy]

# # Method for preprocessing train data: splitting wifi
# def split_wifi(data, start_ts):
#     if data.size == 0:
#         # print("no wifi data")
#         return [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
#     else:
#         data_ts = data[0]
#         diff_ts = int(data[0]) - int(start_ts)
#         ssid = data[1]
#         bssid = data[2]
#         rssi = data[3]
#         if len(data) > 5:
#             freq = data[4]
#             last_seen_ts = data[5]
#         else:
#             freq = np.nan
#             last_seen_ts = data[-1]
#         return [data_ts, diff_ts, ssid, bssid, rssi, freq, last_seen_ts]

# # Method for preprocessing train data: splitting ibeacon
# def split_beacon(data, start_ts):
#     if data.size == 0:
#         # print("no beacon data")
#         return [np.nan, np.nan, np.nan, np.nan]
#     else:
#         data_ts = data[0]
#         diff_ts = int(data[0]) - int(start_ts)
#         ssid = data[1]
#         rssi = data[2]
#         return [data_ts, diff_ts, ssid, rssi]

# # Method for preprocessing train data: calc rel pos
# def split_rel_pos(data, start_ts):
#     if data.size == 0:
#         # print("no rel_pos data")
#         return [np.nan, np.nan, np.nan, np.nan]
#     else:
#         data_ts = data[0]
#         diff_ts = int(data[0]) - int(start_ts)
#         x_axis = data[1]
#         y_axis = data[2]
#         return [data_ts, diff_ts, x_axis, y_axis]

In [None]:
# # Extract path and other data
# def extract_path(path, floor_map):
#     # split path
#     try:
#         ex_path = f"{path}"
#         ex_paths = ex_path.split("/")
#         site_id = ex_paths[4]
#         floor = ex_paths[5]
#         f = floor_map[floor]
#         file_id = ex_paths[6].split(".")[0]
#         return [site_id, file_id, f, floor]
#     except:
#         print("extract_path error")

# # Definitely needs to be refactored
# def extract_data(path):
#     start_ts = find_start_ts(path)
#     path_datas = read_data_file(path)
#     acce = path_datas.acce
#     ahrs = path_datas.ahrs
#     magn = path_datas.magn
#     gyro = path_datas.gyro
#     acce_uncali = path_datas.acce_uncali
#     magn_uncali = path_datas.magn_uncali
#     gyro_uncali = path_datas.gyro_uncali
#     wifi = path_datas.wifi
#     wps = path_datas.waypoint
#     ibeacon = path_datas.ibeacon
#     rel_positions = calc_rel_positions(acce, ahrs)

#     # Changed from: just extracting wps time stamps -> take all acce uncalib timestamps
#     # ts = np.unique(wps[:, [0]])
#     if acce_uncali.any():
#         # print("acce_uncali")
#         ts = np.unique(acce_uncali[:, [0]]) # take uncalibrated access, as sometimes access has less data
#     elif acce.any():
#         # print("acce")
#         ts = np.unique(acce[:, [0]])
#     else:
#         print("no acce or acce_uncali")

#     # extract data for each timestamp of waypoints
#     res = []
#     for t in ts:
#         try:
#             wp_closest = find_smallest_diff(t, wps)
#             closest_wp_ts = wp_closest[0]
#             diff_ts_wp_ts = abs(int(t) - int(closest_wp_ts))
#             # time_stamp_cut = 2000, only the records within 2 sec of waypoint are kept
#             if diff_ts_wp_ts < time_stamp_cut:
#                 # flag to indicate how close the data point is to the wps
#                 # print("diff_ts_wp_ts", diff_ts_wp_ts)
#                 within_100ms = True if abs(diff_ts_wp_ts) <= 100 else False
#                 within_200ms = True if abs(diff_ts_wp_ts) <= 200 else False
#                 x = wp_closest[1]
#                 y = wp_closest[2]
#                 # print("x, y: ", x, y)
#                 diff_start_ts = int(t) - int(start_ts)
#                 diff_start_wp_ts = int(closest_wp_ts) - int(start_ts)
#                 # print("diff_start_ts, diff_start_wp_ts: ", diff_start_ts, diff_start_wp_ts)
#                 acce_closest = split_axis(find_smallest_diff(t, acce), start_ts)
#                 ahrs_closest = split_axis(find_smallest_diff(t, ahrs), start_ts)
#                 magn_closest = split_axis(find_smallest_diff(t, magn), start_ts)
#                 magn_closest.append(extract_one_magn_strength(magn_closest)) # append magnetic strength only for the magn data
#                 gyro_closest = split_axis(find_smallest_diff(t, gyro), start_ts)
#                 # print("acce: ", acce_closest)
#                 # print("ahrs: ", ahrs_closest)
#                 # print("magn: ", magn_closest)
#                 # print("gyro: ", gyro_closest)
#                 acce_u_closest = split_axis(find_smallest_diff(t, acce_uncali), start_ts)
#                 magn_u_closest = split_axis(find_smallest_diff(t, magn_uncali), start_ts)
#                 gyro_u_closest = split_axis(find_smallest_diff(t, gyro_uncali), start_ts)
#                 # print("acce_u_closest: ", acce_u_closest)
#                 # print("magn_u_closest: ", magn_u_closest)
#                 # print("gyro_u_closest: ", gyro_u_closest)
#                 wifi_closest = split_wifi(find_smallest_diff(t, wifi), start_ts)
#                 if len(ibeacon) > 0:
#                     beacon_closest = split_beacon(find_smallest_diff(t, ibeacon), start_ts)
#                 else:
#                     beacon_closest = [np.nan, np.nan, np.nan, np.nan]
#                 rel_pos = split_rel_pos(find_smallest_diff(t, rel_positions), start_ts)
#                 # print([t, x, y, int(closest_wp_ts), acce_closest, acce_u_closest])
#                 res.append([int(t), start_ts, diff_start_ts, x, y, int(closest_wp_ts), diff_start_wp_ts, diff_ts_wp_ts, within_100ms, within_200ms] + \
#                            acce_closest + ahrs_closest + magn_closest + gyro_closest + \
#                            acce_u_closest + magn_u_closest + gyro_u_closest + \
#                            wifi_closest + beacon_closest + rel_pos
#                           )
#             else:
#                 # print("no wp made it through timestamp cut")
#                 continue
#         except Exception as exc:
#             pass
#             # print("Error message: ", exc)
#             # print("extract_test_data error")
#     return res

In [None]:
# # %%timeit

# # 5.55 ms ± 1.76 ms per loop
# path, site, floorNo, floor_plan_filename, \
# json_plan_filename, width_meter, height_meter = pick_example(len(train_paths), train_paths)

# def one_trace_to_rows(path, floor_map):
#     try:
#         path_info = extract_path(path, floor_map)
#         data = extract_data(path)
#         # rows = list(itertools.chain(path_info, *data))
#         rows = []
#         for d in data:
#             row = path_info + d
#             rows.append(row)
#             # print("row: ", row)
#         return rows
#     except:
#         print("one_trace_to_rows error at: ", path)

# # path -> train/5cd56bdbe2acfd2d33b663c0/L3/5dfc8108241c3600064049b9.txt
# # time w/ for loop with 1 train_path -> 11.6
# # time w/ itertools.chain for 1 train_path -> 11.8
# start = time.time()
# path_info = extract_path(path, floor_map)
# print("path: ", path_info)
# rows = one_trace_to_rows(path, floor_map)
# print("time to process one train_path", time.time() - start)
# #print("col count: ", len(rows[0]))
# print("rows: ", rows)

In [None]:
# # Run row making function for all training paths
# # print(train_paths[:10])
# import time
# start = time.time()

# all_rows = []
# for train_path in train_paths[:10]:
#     rows = one_trace_to_rows(train_path, floor_map)
#     all_rows.extend(rows)

# one_trace_df = pd.DataFrame(all_rows)
# display(len(one_trace_df))

# # The timings below are for creating the old version of the training data (waypoints only)
# # without Pool
# # 10 -> 1.64 sec
# # 100 -> 28.12 sec
# # 1000 -> 286.67 sec
# # to process training (~26,000 files) -> ~7500 sec (~2hours)
# print(time.time() - start)

# with Pool
# no need for wrapper with pool.starmap -> https://qiita.com/okiyuki99/items/a54797cb44eb4ae571f6

# Memo about Pool
# with Pool
# 10 -> 1.09 sec
# 100 -> 12.35 sec
# 1000 -> 113.87 sec
# to process training (~26,000 files) -> ~3000 sec (~50min)

In [None]:
# # Check if we can make df

# # column names
# col_names = ["site_id", "file_id", "floor_converted", "floor", \
#              "ts", "start_ts", "diff_start_ts", "x", "y", \
#              "closest_wp_ts", "diff_start_wp_ts", "diff_ts_wp_ts", "within_100ms", "within_200ms", \
#              "acce_ts", "diff_acce_ts", "acce_x", "acce_y", "acce_z", "acce_acc", \
#              "ahrs_ts", "diff_ahrs_ts", "ahrs_x", "ahrs_y", "ahrs_z", "ahrs_acc", \
#              "magn_ts", "diff_magn_ts", "magn_x", "magn_y", "magn_z", "magn_acc", "magn_strength",\
#              "gyro_ts", "diff_gyro_ts", "gyro_x", "gyro_y", "gyro_z", "gyro_acc", \
#              "acce_u_ts", "diff_acce_u_ts", "acce_u_x", "acce_u_y", "acce_u_z", "acce_u_acc", \
#              "magn_u_ts", "diff_magn_u_ts", "magn_u_x", "magn_u_y", "magn_u_z", "magn_u_acc", \
#              "gyro_u_ts", "diff_gyro_u_ts", "gyro_u_x", "gyro_u_y", "gyro_u_z", "gyro_u_acc", \
#              "wifi_ts", "diff_wifi_ts", "wifi_ssid", "wifi_bssid", "wifi_rssi", "wifi_freq", "wifi_last_seen_ts", \
#              "beacon_ts", "diff_beacon_ts", "beacon_ssid", "beacon_rssi", \
#              "rel_ts", "diff_rel_ts", "rel_x", "rel_y"
#             ]

# print(len(col_names))

# df = pd.DataFrame(rows, columns=col_names)
# print("df len: ", len(df))
# print("site_id nunique: ", df["site_id"].nunique())
# print("file_id nunique: ", df["file_id"].nunique())
# print("x value_counts: ", df["x"].value_counts())
# print("y value_counts: ", df["y"].value_counts())
# print("event ts nunique: ", df["ts"].nunique())
# print("start ts nunique: ", df["start_ts"].nunique()) # should be one
# print("diff_ts_wp_ts value_counts: ", df["diff_ts_wp_ts"].value_counts())
# print("diff_ts_wp_ts nunique: ", df["diff_ts_wp_ts"].nunique())
# print("within_100ms value_counts: ", df["within_100ms"].value_counts())
# print("within_100ms nunique: ", df["within_100ms"].nunique())
# print("within_100ms count: ", df["within_100ms"].count())
# print("within_200ms value_counts: ", df["within_200ms"].value_counts())
# print("within_200ms nunique: ", df["within_200ms"].nunique())
# print("within_200ms count: ", df["within_200ms"].count())
# display(df.head())

In [None]:
# # Set pool
# num_cores = multiprocessing.cpu_count()
# print(f"num_cores={num_cores}")
# # args = [(p, floor_map) for p in train_paths[:train_num]]
# args = [(p, floor_map) for p in grouped_paths_list]
# pool = Pool(num_cores)

# start = time.time()
# # w/ 250ms settings, 3 random samples from each site_id
# # 2 paths -> 18.7 sec
# # 10 paths -> 315 sec (df len is 1994)
# # 100 paths -> 708 sec (df len is 7183)
# # all ~ 600 paths -> 

# # errors
# # grouped_paths_list -> 100 paths -> site_id: 8 errors, 27 correct
# # grouped_paths_list -> 100 paths -> file_id: 23 errors, 77 correct

# # all in one go -> xxx sec
# # array_split -> 5891.8 sec

# # all in one go
# # res = pool.starmap(one_trace_to_rows, args)

# # split the args
# res = []
# for arg in tqdm(np.array_split(args, 50)):
#     res.extend(pool.starmap(one_trace_to_rows, arg))

In [None]:
############################## KEEP THIS CELL FOR LATER REF ##############################

# Error in ~20% of the train paths -> caused by not having acce_uncali to create the event timestamps

# error files
# /5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bbc852a70008c01607.txt
# /5cd56b5ae2acfd2d33b58548/2F/5cf214bda50dc300099d34cc.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085df529994a0008202661.txt
# /5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt
# /5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt
# /5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt
# /5cd56b5ae2acfd2d33b5854a/B1/5d073ba64a19c000086c559b.txt
# /5cd56b5ae2acfd2d33b5854a/F1/5d07603e4cae4f000a2db525.txt
# /5cd56b63e2acfd2d33b591c2/F2/5d0b0668912a980009fe91f2.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt
# /5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt
# /5cd56b64e2acfd2d33b592b3/F2/5d0c9321c99c56000836df18.txt
# /5cd56b64e2acfd2d33b592b3/F3/5d0c9952ea565d0008e34e8b.txt
# /5cd56b64e2acfd2d33b592b3/F4/5d0c9d65ea565d0008e34ea2.txt
# /5cd56b5ae2acfd2d33b58549/5F/5d0613514a19c000086c432a.txt
# /5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt
# /5cd56b79e2acfd2d33b5b74e/F3/5d0b015e2f8a26000805ba44.txt
# /5cd56b79e2acfd2d33b5b74e/F1/5d0af3452f8a26000805b830.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt
# /5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt
# /5cd56b6be2acfd2d33b59d1f/F3/5d0896415125450008037c76.txt

# base_path = "../input/indoor-location-navigation/train"
# error_files = [
#     "/5cd56b5ae2acfd2d33b58548/1F/5cf20b29718b08000848aa0a.txt",
#     "/5cd56b61e2acfd2d33b58d20/F2/5d085dea4a2bd40008d47468.txt",
#     "/5cd56b61e2acfd2d33b58d20/F4/5d086c44d85da00008644fce.txt",
#     "/5cd56b5ae2acfd2d33b5854a/F3/5d078bab0e86b60008036348.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbfb2f8a26000805b9cb.txt",
#     "/5cd56b63e2acfd2d33b591c2/F1/5d0afbf92f8a26000805b9c9.txt",
#     "/5cd56b5ae2acfd2d33b58549/2F/5d11a6089c50c70008fe89bc.txt",
#     "/5cd56b79e2acfd2d33b5b74e/F3/5d0b01522f8a26000805ba3e.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a1545125450008037d87.txt",
#     "/5cd56b6be2acfd2d33b59d1f/F1/5d08a14e3f461f0008dac56c.txt"
# ]

# working_path = "../input/indoor-location-navigation/train/5d2709c303f801723c3299ee/1F/5dad7d6daa1d300006faa80c.txt"
# error_paths = [base_path + e for e in error_files]
# rows = one_trace_to_rows(error_paths[1], floor_map)
# print(rows)

In [None]:
# start = time.time()

# df_train = pd.DataFrame(res[0], columns=col_names)
# for r in res[1:]:
#     df = pd.DataFrame(r, columns=col_names)
#     df_train = df_train.append(df, ignore_index=True)

# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))

In [None]:
# def list_to_df(row_list):
#     df_train = pd.DataFrame(row_list[0], columns=col_names)
#     for r in row_list[1:]:
#         df = pd.DataFrame(r, columns=col_names)
#         df_train = df_train.append(df)
#     return df_train

# start = time.time()
# pool = Pool(num_cores)

# df_train = pool.map(list_to_df, tqdm(res))

# # print("train_path count", len(train_paths[:train_num]))
# print("time to process", time.time() - start)
# print("length of df made", len(df_train))
# display(df_train.head(10))
# pool.close()

In [None]:
# Calculate moving averages
# Differencing with respect to time (as the timesteps are unevenly spaced)

In [None]:
# # Save the train data as a pickle file
# # https://www.kaggle.com/pedrocouto39/fast-reading-w-pickle-feather-parquet-jay
# # https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models

# # Saving train data
# train_file_name = "indoor_train_4.pkl"

# with open(train_file_name, "wb") as file:
#     pickle.dump(df_train, file)

# # Save them to output
# # df_train.to_csv('df_train_2.csv',index=False)
# # df_test.to_csv('df_test.csv',index=False)

In [None]:
# # Load the data back in
# with open(train_file_name, "rb") as file:
#     df_train = pickle.load(file)

In [None]:
# print("df len: ", len(df_train), "\n")
# print("file_id unique: ", (df_train["file_id"].nunique()), "\n")
# print("site_id unique: ", (df_train["site_id"].nunique()), "\n")
# print("site_id value_counts: ", (df_train["site_id"].value_counts()))
# display(df_train.head())

In [None]:
# # Get submission file
# sub_df = pd.read_csv("/kaggle/input/indoor-location-navigation/sample_submission.csv")
# sub_df[["site", "file", "timestamp"]] = sub_df["site_path_timestamp"].apply(lambda x: pd.Series(x.split("_")))
# sub_df = sub_df.drop(columns=["floor", "x", "y"])
# # grouped_df = sub_df.groupby("file").sample(n=2)
# # all_file_id = grouped_df["file"].unique()
# # print(len(grouped_df))
# # print(len(all_file_id))
# # display(grouped_df.head())
# display(sub_df.head())

# test_site_id = sub_df["site"].unique()
# train_site_id = df_train["site_id"].unique()
# print(test_site_id, "\n")
# print(train_site_id, "\n")
# a = list(set(test_site_id) & set(train_site_id))
# print(a)