# Overview

[The dataset is linked here](https://www.kaggle.com/ravishah1/indoor-location-navigation-sensor-data) - this notebook contains the code I used to create it.

I have a discussion post about this dataset with a lot of information explaining it, which I suggest reading. I use multiprocessing `Pool` to speed up the creation of this dataset. I learned a lot about feature generation from [this notebook](https://www.kaggle.com/higepon/generate-wifi-features-5-times-faster).

Things to note: this dataset intentionally does not include WiFi features; instead it contains other types of data computed from sensors such as acce and ahrs using the competition's GitHub code - [see my tutorial here](https://www.kaggle.com/ravishah1/understanding-the-indoor-loc-github-data-eda) about using the GitHub code for this competition. As a result, this is not a good starting dataset, since something with WiFi is better; however, it may still be useful for fine-tuning your results. Also, you can probably get better results by using postprocessing rather than this dataset; I myself am still deciding whether I want to use these features. I created and published this dataset just to help anyone who can find a good way to use it.

Good luck, and if you end up using this dataset, please comment with your results.

# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import os
import sys

import seaborn as sns
import matplotlib.pyplot as plt

from dataclasses import dataclass
import scipy.signal as signal

import multiprocessing
from multiprocessing import Pool
from pathlib import Path

import time
import psutil
import math
from tqdm import tqdm
from dataclasses import dataclass
from contextlib import contextmanager
from math import floor, ceil

import warnings
warnings.filterwarnings('ignore')

In [None]:
!git clone --depth 1 https://github.com/location-competition/indoor-location-competition-20 indoor_location_competition_20

In [None]:
from indoor_location_competition_20.io_f import read_data_file

from indoor_location_competition_20.compute_f import split_ts_seq
from indoor_location_competition_20.compute_f import correct_trajectory
from indoor_location_competition_20.compute_f import correct_positions
from indoor_location_competition_20.compute_f import init_parameters_filter
from indoor_location_competition_20.compute_f import get_rotation_matrix_from_vector
from indoor_location_competition_20.compute_f import get_orientation
from indoor_location_competition_20.compute_f import compute_steps
from indoor_location_competition_20.compute_f import compute_stride_length
from indoor_location_competition_20.compute_f import compute_headings
from indoor_location_competition_20.compute_f import compute_step_heading
from indoor_location_competition_20.compute_f import compute_rel_positions
from indoor_location_competition_20.compute_f import compute_step_positions

# Generate Dataset

### Helpers

In [None]:
"""Functions"""
@contextmanager
def timer(name: str):
    """
    Context manager that reports how long the wrapped block took and how the
    process memory reading changed while it ran.

    On exit (whether or not the block raised) one line is printed to stderr:
    the current memory reading in GB, the signed change since entry, the
    elapsed wall-clock seconds, and ``name``.
    """
    started = time.time()
    proc = psutil.Process(os.getpid())
    # First field of memory_info(), scaled from bytes to GB (2**30 bytes).
    mem_before = proc.memory_info()[0] / 2. ** 30
    try:
        yield
    finally:
        mem_after = proc.memory_info()[0] / 2. ** 30
        change = mem_after - mem_before
        # The sign is printed separately from the magnitude.
        if change >= 0:
            sign = '+'
        else:
            sign = '-'
        magnitude = math.fabs(change)
        elapsed = time.time() - started
        print(f"[{mem_after:.1f}GB({sign}{magnitude:.1f}GB): {elapsed:.3f}sec] {name}", file=sys.stderr)
        
def time_float_to_str(time: float):
    """Return ``time`` as a string after truncating it to an integer with ``int()``."""
    whole = int(time)
    return str(whole)
        
"""Variables"""
# Paths (defined first so every other location is derived from one root)
data_path = "../input/indoor-location-navigation"
train_path = f"{data_path}/train"
test_path = f"{data_path}/test"

# Sample Submission Important Info
ss = pd.read_csv(f"{data_path}/sample_submission.csv")

# Each "site_path_timestamp" value is split on "_" into its three parts.
# The vectorised string split avoids building one pd.Series per row.
sub_df = ss["site_path_timestamp"].str.split("_", expand=True)
sub_df.columns = ['site', 'path', 'timestamp']

# Unique sites in the submission, and one sub-frame of rows per site.
sites = sub_df.site.unique()
building_dfs = [building_df for _, building_df in sub_df.groupby('site')]

In [None]:
# Display the unique path ids of the first per-site frame.
building_dfs[0]["path"].unique()

### Database Generation Using GitHub read_data_file

In [None]:
def generate_db(file_path: str):
    """
    Parse one trace file with the GitHub ``read_data_file`` helper.

    Returns the tuple ``(acce, ahrs, gyro, magn, posi)``, where ``posi`` is
    the parsed object's ``waypoint`` attribute and the other four are the
    attributes of the same name.
    """
    parsed = read_data_file(file_path)
    return parsed.acce, parsed.ahrs, parsed.gyro, parsed.magn, parsed.waypoint

### Sensor DataFrame Generation

In [None]:
def generate_sensor_df(file_path: str, floor: str, path: str, train: bool):
    """
    Build the per-step sensor dataframe for one trace file using the GitHub
    compute helpers.

    The sensor dataframe is indexed by the step indexes from ``compute_steps``
    and has the columns timestamp, acce_max, acce_min, acce_std, stride_length,
    step_heading, rel_pos_x, rel_pos_y, floor and path. Timestamps are stored
    as strings (see ``time_float_to_str``).

    When ``train`` is true a ``(sensor_df, posi_df)`` tuple is returned, where
    ``posi_df`` holds the waypoints (timestamp, x, y, path); otherwise only
    ``sensor_df`` is returned.
    """
    # gyro and magn are read from the file but not used by this function.
    acce, ahrs, _gyro, _magn, posi = generate_db(file_path)

    # Step detection and the quantities derived from it.
    step_timestamps, step_indexs, step_acce_max_mins = compute_steps(acce)
    stride_lengths = compute_stride_length(step_acce_max_mins)
    step_headings = compute_step_heading(step_timestamps, compute_headings(ahrs))
    rel_positions = compute_rel_positions(stride_lengths, step_headings)

    # Sensor DF: one row per detected step.
    sensor_df = pd.DataFrame(
        step_acce_max_mins,
        index=step_indexs,
        columns=["timestamp", "acce_max", "acce_min", "acce_std"],
    )
    sensor_df["stride_length"] = stride_lengths[:, 1]
    sensor_df["step_heading"] = step_headings[:, 1]
    sensor_df["rel_pos_x"] = rel_positions[:, 1]
    sensor_df["rel_pos_y"] = rel_positions[:, 2]
    sensor_df["timestamp"] = sensor_df["timestamp"].apply(time_float_to_str)
    sensor_df["floor"] = floor
    sensor_df["path"] = path

    if not train:
        return sensor_df

    # Waypoint DF: only built for train files.
    posi_df = pd.DataFrame(posi, columns=['timestamp', 'x', 'y'])
    posi_df["timestamp"] = posi_df["timestamp"].apply(time_float_to_str)
    posi_df["path"] = path
    return sensor_df, posi_df

### Generate Train Data

In [None]:
def generate_one_train(site: str):
    """
    Create the sensor-train and waypoint-train CSV files for a single site.

    Every file found under ``<train_path>/<site>/<floor>/`` is passed to
    ``generate_sensor_df``; the floor directory name is used as the floor
    label and the file name up to its first "." as the path id. The results
    are written to ``./train/<site>_sensor_train.csv`` and
    ``./train/<site>_waypoint_train.csv`` (without the index).
    """
    # Make the function usable on its own; exist_ok also covers several
    # worker processes creating the directory at the same time.
    os.makedirs("./train", exist_ok=True)

    # Collect the per-file frames and concatenate once at the end;
    # concatenating inside the loop re-copies all accumulated rows each time.
    sensor_frames = []
    waypoint_frames = []

    for floor_dir in glob.glob(os.path.join(train_path, site, "*")):
        floor_name = os.path.basename(floor_dir)
        for trace_file in glob.glob(os.path.join(floor_dir, "*")):
            path = os.path.basename(trace_file).split(".")[0]

            sensor_df, posi_df = generate_sensor_df(
                file_path=trace_file, floor=floor_name, path=path, train=True
            )
            sensor_frames.append(sensor_df)
            waypoint_frames.append(posi_df)

    # pd.concat raises on an empty list; fall back to an empty frame so a site
    # without files still produces (empty) CSV files as before.
    sensor_data = pd.concat(sensor_frames) if sensor_frames else pd.DataFrame()
    waypoints = pd.concat(waypoint_frames) if waypoint_frames else pd.DataFrame()

    sensor_data.to_csv(f"./train/{site}_sensor_train.csv", index=False)
    waypoints.to_csv(f"./train/{site}_waypoint_train.csv", index=False)

In [None]:
def generate_train():
    """
    Create all train files by running ``generate_one_train`` for every entry
    of ``sites`` in a multiprocessing Pool with one worker per CPU core.
    """
    os.makedirs("./train", exist_ok=True)

    num_cores = multiprocessing.cpu_count()
    print(f"num_cores={num_cores}")
    # The context manager shuts the pool down once map() has returned, so the
    # worker processes do not outlive this call.
    with Pool(num_cores) as pool:
        pool.map(generate_one_train, sites)

In [None]:
# Build all train CSV files; timer prints elapsed time and memory change to stderr.
with timer("Generating Train Dataset"):
    generate_train()

### Generate Test Data

In [None]:
def generate_one_test(building_df: pd.DataFrame):
    """
    Create the sensor-test CSV file for a single site.

    ``building_df`` must have ``site`` and ``path`` columns; the first unique
    ``site`` value names the output file, and every unique ``path`` value is
    read from ``<test_path>/<path>.txt``. The floor column is filled with the
    placeholder "TBD". The result is written to
    ``./test/<site>_sensor_test.csv`` (without the index).
    """
    # Make the function usable on its own; exist_ok also covers several
    # worker processes creating the directory at the same time.
    os.makedirs("./test", exist_ok=True)

    site = str(building_df["site"].unique()[0])

    # Collect the per-path frames and concatenate once at the end;
    # concatenating inside the loop re-copies all accumulated rows each time.
    sensor_frames = [
        generate_sensor_df(
            file_path=f"{test_path}/{path}.txt", floor="TBD", path=path, train=False
        )
        for path in building_df["path"].unique()
    ]

    # pd.concat raises on an empty list; fall back to an empty frame so an
    # input without paths still produces an (empty) CSV file as before.
    sensor_data = pd.concat(sensor_frames) if sensor_frames else pd.DataFrame()

    sensor_data.to_csv(f"./test/{site}_sensor_test.csv", index=False)

In [None]:
def generate_test():
    """
    Create all test files by running ``generate_one_test`` for every frame in
    ``building_dfs`` in a multiprocessing Pool with one worker per CPU core.
    """
    os.makedirs("./test", exist_ok=True)

    num_cores = multiprocessing.cpu_count()
    print(f"num_cores={num_cores}")
    # The context manager shuts the pool down once map() has returned, so the
    # worker processes do not outlive this call.
    with Pool(num_cores) as pool:
        pool.map(generate_one_test, building_dfs)

In [None]:
# Build all test CSV files; timer prints elapsed time and memory change to stderr.
with timer("Generate Test Dataset"):
    generate_test()