# Floor estimation by WIFI timestamp

The floor of the test data can be estimated from the timestamps of the TYPE_WIFI or TYPE_BEACON records. This is because the ordinary timestamps of the test data are relative values, but the timestamps of these records are absolute values and can be compared with the train data.  
Imagine an investigator walking around the floors of a building: they finish one floor and then move on to the next one. The timestamps of data recorded on the same floor are therefore expected to be close to each other.   
If this inference is correct, the floor of a test path is likely to be the same as the floor of the train data whose TYPE_WIFI timestamp is close to that of the test path.

<h1 style='background:navy; border:0; color:white'><center>1. Preparation</center></h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import re
import os
import sys
import shutil
import warnings; warnings.simplefilter('ignore')

In [None]:
# Dataset locations. Train files are globbed later as TRAIN_DIR/<siteid>/<floor>/*.txt.
TRAIN_DIR = '/kaggle/input/indoor-location-navigation/train'  # directory where train data is stored
ORG_TEST_DIR = '/kaggle/input/indoor-location-navigation/test'  # directory where test data is stored
TEST_DIR = '/kaggle/working/test/'  # directory for classifying and storing test data by siteID (one sub-folder per siteID)

### In order to predict the floor for each site, divide the test files into folders by siteID.

In [None]:
test_files = glob(ORG_TEST_DIR + '/*.txt')

# Create parent directory (exist_ok so that re-running the cell does not fail)
os.makedirs(TEST_DIR, exist_ok=True)

# Create a directory for each siteID and copy the test files
for file in test_files:
    # Reset per file so that a file without a SiteID header is never copied
    # into the directory found for the previous file.
    site_dir = None
    with open(file, encoding='utf-8') as f:
        for line in f:
            if 'SiteID:' in line:
                # The site id is the second tab-separated field of the header line;
                # strip() guards against a trailing newline when it is the last field.
                siteid = line.split('\t')[1].replace('SiteID:', '').strip()
                site_dir = os.path.join(TEST_DIR, siteid)
                os.makedirs(site_dir, exist_ok=True)
                break
    if site_dir is None:
        print('SiteID not found, skipped:', file)
        continue
    shutil.copy(file, site_dir)

### Define functions

In [None]:
# function to convert floor to number
def floor2int(floor):
    """Convert a floor name to its integer floor number.

    Recognised names are a single letter F, B or L combined with a number,
    in either order (e.g. 'F1', '1F', 'B2', '2B', 'L3', '10F').

    * F<n> / L<n> map to n - 1 (so 'F1' is floor 0).
    * B<n> maps to -n (so 'B1' is floor -1).

    Returns None for anything else, including non-string input such as NaN.
    """
    # Non-strings (e.g. NaN from an unfilled floor) cannot be matched by re.
    if not isinstance(floor, str):
        return None
    # \d+ rather than \d so that floors with two or more digits are handled too.
    match = re.fullmatch(r'([FBL])(\d+)|(\d+)([FBL])', floor)
    if match is None:
        return None
    letter = match.group(1) or match.group(4)
    number = int(match.group(2) or match.group(3))
    if letter == 'B':
        return -number
    return number - 1
    
# function to extract timestamp of TYPE_WIFI
def extract_wifi_time(file):
    """Return the timestamps of all TYPE_WIFI lines in a path file.

    Every line containing the text 'TYPE_WIFI' contributes its seventh
    tab-separated field, stripped of surrounding whitespace, as a string.
    (Presumably that field is the absolute "last seen" timestamp of the
    access point - confirm against the data format description.)

    file: path of the text file to read (UTF-8).
    Returns a list of strings in file order; empty if there is no such line.
    """
    with open(file, encoding='utf-8') as handle:
        return [
            record.split('\t')[6].strip()
            for record in handle
            if 'TYPE_WIFI' in record
        ]

# function to extract timestamp of TYPE_WIFI (version 2)
def extract_wifi_time2(file):
    """Return the timestamp of the FIRST TYPE_WIFI line of a path file.

    Unlike extract_wifi_time, reading stops at the first line whose second
    tab-separated field is exactly 'TYPE_WIFI'; its seventh field (stripped)
    is returned as a one-element list of strings.

    Lines with fewer than seven fields (blank lines, tab-less headers,
    truncated records) are skipped instead of raising IndexError.

    file: path of the text file to read (UTF-8).
    Returns a list with zero or one string.
    """
    with open(file, encoding='utf-8') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) > 6 and fields[1] == 'TYPE_WIFI':
                return [fields[6].strip()]
    return []

# function to plot
def stripplot(data, ylim=(0,1)):
    """Draw a strip plot of timestamps grouped by floor / path.

    data: DataFrame with a 'floor_or_path' column (x axis) and a 'time'
          column (y axis).
    ylim: (low, high) limits for the y axis. The default (0, 1) is a
          sentinel meaning "keep the limits chosen by the plot".

    The y axis gets roughly 25 evenly spaced ticks and a grid.
    """
    fig, ax = plt.subplots(1, 1, figsize=(25, 6))
    # Print tick values in full instead of offset / scientific notation.
    ax.ticklabel_format(useOffset=False, style='plain')
    _ = sns.stripplot(data=data, x='floor_or_path', y='time', ax=ax)
    # set axis
    plt.xticks(rotation=45)
    if ylim != (0,1):
        ax.set_ylim(ylim)
    # set yticks
    start, end = ax.get_ylim()
    # Integer step as before; fall back to a fractional step when the range is
    # narrower than 25, because a step of 0 makes arange fail.
    stepsize = int((end - start) / 25) or (end - start) / 25
    # np.arange directly: the pd.np alias no longer exists in current pandas.
    ax.yaxis.set_ticks(np.arange(start, end, stepsize))
    ax.grid()

<h1 style='background:navy; border:0; color:white'><center>2. Example</center></h1>

Let's check how the timestamp of TYPE_WIFI is distributed in the data of one site. Here, as an example, the data of siteid: 5d27097f03f801723c320d97 is used.

In [None]:
%%time
### create a DataFrame for each floor
siteid = '5d27097f03f801723c320d97'
floors = os.listdir(os.path.join(TRAIN_DIR, siteid))
test_files = glob(os.path.join(TEST_DIR, siteid, '*.txt'))

# convert train data to DataFrame: one row per WiFi timestamp, labelled with its floor.
# Rows are collected in a list and turned into a DataFrame once, instead of
# concatenating a new DataFrame per file.
train_records = []
for floor in floors:
    for train_file in glob(os.path.join(TRAIN_DIR, siteid, floor, '*.txt')):
        train_records.extend((wifi_time, floor) for wifi_time in extract_wifi_time(train_file))
train_df = pd.DataFrame(train_records, columns=['time', 'floor_or_path'])

# convert test data to DataFrame: one row per WiFi timestamp, labelled with its path id
test_records = []
for test_file in test_files:
    # The path id is the file name without extension; taking it from the
    # basename keeps this independent of how deep TEST_DIR is nested.
    file_name = os.path.splitext(os.path.basename(test_file))[0]
    test_records.extend((wifi_time, file_name) for wifi_time in extract_wifi_time(test_file))
test_df = pd.DataFrame(test_records, columns=['time', 'floor_or_path'])

# combine train_df and test_df
df = pd.concat([train_df, test_df], ignore_index=True)
df['time'] = df['time'].astype('int64')
df = df.drop_duplicates(ignore_index=True)
df = df.sort_values('time').reset_index(drop=True)
df

In [None]:
# Let's plot it. On the x axis, floor names are train data and path ids (the long hex names) are test data.
stripplot(data=df)

It seems that one floor may be surveyed multiple times.

In [None]:
# Zoom in on the groups with small timestamps for clarity (excluding the last four).
# The y-limits are passed explicitly so that only this time window is shown.
df_tmp = df.query("floor_or_path in ['F1', 'B1', 'B2', '7727672abec7d70216173223', '698dc1d1a1885908e8fbfe4c', 'F2', '4fbd93217986b45372ebedd4']")
stripplot(data=df_tmp, ylim=(1573970000000, 1573990000000))

The time ranges of different floors do not overlap. Also, each test path appears to fall inside the time range of one of the train floors. The floor of 7727 ~ and 698d ~ seems to be B2, and that of 4fbd ~ seems to be F2.

In [None]:
# Now zoom in on the remaining groups (everything not plotted above), which have the large timestamps.
df_tmp = df.query("floor_or_path not in ['F1', 'B1', 'B2', '7727672abec7d70216173223', '698dc1d1a1885908e8fbfe4c', 'F2', '4fbd93217986b45372ebedd4']")
stripplot(data=df_tmp, ylim=(1574042000000, 1574060000000))

Here too, the time ranges of different floors do not overlap, and every test path appears to fall inside the time range of one of the train floors.  
The floor of ea6f ~ to b760 ~ seems to be F3, and the floor of d505 ~ to bf55 ~ seems to be F4.

<h1 style='background:navy; border:0; color:white'><center>3. Accuracy</center></h1>

In [None]:
# create DataFrame of train and test data
siteids = os.listdir(TEST_DIR)

# One row per file. The floor and the path id are taken from the last path
# components, so the result does not depend on how deep TRAIN_DIR / TEST_DIR
# are nested, and a site without any train file simply contributes no rows.
train_rows = []
test_rows = []
for siteid in siteids:
    # train: TRAIN_DIR/<siteid>/<floor>/<path>.txt
    for file_path in glob(os.path.join(TRAIN_DIR, siteid, '*/*.txt')):
        floor_dir, file_name = os.path.split(file_path)
        train_rows.append((siteid, os.path.basename(floor_dir), file_name.replace('.txt', ''), file_path))

    # test: TEST_DIR/<siteid>/<path>.txt
    for file_path in glob(os.path.join(TEST_DIR, siteid, '*')):
        file_name = os.path.basename(file_path)
        test_rows.append((siteid, file_name.replace('.txt', ''), file_path))

df_train_files = pd.DataFrame(train_rows, columns=['siteid', 'floor', 'path', 'file_path'])
df_test_files = pd.DataFrame(test_rows, columns=['siteid', 'path', 'file_path'])

In [None]:
len_floor_paths = 0
accs = 0

# For every site: order the train paths by their first WiFi timestamp and count
# how often a path is on the same floor as the path just before it.
for siteid in siteids:
    site_mask = df_train_files['siteid'] == siteid
    df_train_files_onesite = df_train_files[site_mask].reset_index(drop=True)

    # Create a path and wifi_time pair
    pair_frames = [pd.DataFrame(columns=['path', 'wifi_time'])]
    for path_id, file in zip(df_train_files_onesite['path'], df_train_files_onesite['file_path']):
        wifi_time = extract_wifi_time2(file)
        pair_frames.append(pd.DataFrame(data={'path': [path_id] * len(wifi_time), 'wifi_time': wifi_time}))
    tmp_df = pd.concat(pair_frames)
    tmp_df = tmp_df.astype({'wifi_time': 'float'}).sort_values(by='wifi_time')

    # Attach the floor of each path and order the paths in time
    val_df = df_train_files_onesite.merge(tmp_df, on='path')
    floor_path_df = val_df.sort_values('wifi_time').set_index('path')[['floor', 'wifi_time']]
    # floor of the previous path in time (missing for the first path of the site)
    floor_path_df['floor_shift'] = floor_path_df['floor'].shift(1)

    same_as_previous = floor_path_df['floor'] == floor_path_df['floor_shift']
    len_floor_paths += len(floor_path_df)
    accs += int(same_as_previous.sum())

print('Accuracy:', accs/len_floor_paths)

<h1 style='background:navy; border:0; color:white'><center>4. Floor estimation</center></h1>

Estimate the floor by sorting the paths of each site by timestamp and filling the floor of each test path with the floor of the preceding train path.

In [None]:
%%time
# add dataset column
df_train_files['dataset'] = 'train'
df_test_files['dataset'] = 'test'

# combine df_train_files and df_test_files (test rows have no 'floor' yet, so it is NaN there)
df_all = pd.concat([df_train_files, df_test_files]).reset_index(drop=True)

# add wifi_time
# create a path and wifi_time pair (extract_wifi_time2 yields at most one timestamp per file)
path_time_rows = [
    (path, wifi_time)
    for path, file in zip(df_all['path'], df_all['file_path'])
    for wifi_time in extract_wifi_time2(file)
]
tmp_df = pd.DataFrame(path_time_rows, columns=['path', 'wifi_time'])
tmp_df['wifi_time'] = tmp_df['wifi_time'].astype('float')

wifi_time_df = pd.merge(df_all, tmp_df, on='path')
# Order the paths in time WITHIN each site. Sorting all sites together would let
# a test path inherit the floor of a train path that belongs to another site.
wifi_time_df = wifi_time_df.sort_values(['siteid', 'wifi_time'])
wifi_time_df = wifi_time_df.reset_index(drop=True)

# Fill the floor of each test path with the floor of the previous train path of
# the same site. A test path that precedes every train path of its site has no
# previous floor, so it falls back to the next train path of that site.
floor_by_site = wifi_time_df.groupby('siteid')['floor']
filled_floor = floor_by_site.ffill().fillna(floor_by_site.bfill())

floor_path_df = wifi_time_df[['floor', 'path', 'dataset']].copy()
floor_path_df['floor'] = filled_floor
floor_path_df = floor_path_df[floor_path_df['dataset'] == 'test']
floor_path_df = floor_path_df.drop('dataset', axis=1).reset_index(drop=True)
floor_path_df

In [None]:
# convert floor names to numbers
# (floor2int returns None for a name that matches none of its patterns, so such rows get no floor number)
floor_path_df['floor'] = floor_path_df['floor'].map(floor2int)
floor_path_df

### Create submission file

In [None]:
submission_org = pd.read_csv('/kaggle/input/indoor-location-navigation/sample_submission.csv')
# separate site_path_timestamp into its site, path and timestamp parts
tmp_df = submission_org['site_path_timestamp'].str.split('_', expand=True)
tmp_df.columns = ['site', 'path', 'timestamp']
submission = pd.concat([tmp_df, submission_org[['site_path_timestamp', 'floor', 'x', 'y']]], axis=1)
# merge with floor_path_df
# Both frames have a 'floor' column, so the merge yields 'floor_x' (sample value)
# and 'floor_y' (estimated value). how='left' keeps every row of the sample
# submission even when a path has no estimate, instead of silently dropping it.
submission = submission.merge(floor_path_df, on='path', how='left')
submission.head()

In [None]:
# Remove the helper columns and the sample floor, leaving
# site_path_timestamp, x, y and the estimated floor (in that order).
helper_columns = ['site', 'path', 'timestamp', 'floor_x']
submission = submission.drop(columns=helper_columns)
# Rename by position: the last remaining column is the estimated floor.
submission.columns = ['site_path_timestamp', 'x', 'y', 'floor']
# Restore the column order of the submission format.
submission = submission.loc[:, ['site_path_timestamp', 'floor', 'x', 'y']]
submission

The submission DataFrame has been created; write it out with `to_csv` to obtain the submission file. Of course, you need to estimate x and y separately.