<div>
    <h1 align="center"> Snap to Grid & Fix the timestamps - Part(3)</h1>
    <h2 align="center">Identify the position of a smartphone in a shopping mall</h2>
    <h3 align="center">By: Somayyeh Gholami & Mehran Kazeminia</h3>
</div>

<div class="alert alert-success">  
</div>

# Description:

### - In this notebook, we want to improve the score of our previous notebook (No. 2). We chose "generated6", which has a score of "5.265". The address of our previous notebook is as follows:

https://www.kaggle.com/mehrankazeminia/2-3-indoor-navigation-comparative-method

### - This notebook reuses code from the following notebook. Thanks again for sharing this great notebook. "Data Visualization" is of particular importance in this challenge, because the location of the corridors is important :)

https://www.kaggle.com/robikscube/indoor-navigation-snap-to-grid-post-processing

### - Next, we used the following excellent notebook for "Fix the timestamps". 

https://www.kaggle.com/tomooinubushi/postprocessing-based-on-leakage

### =======================================================

### For more information, you can refer to the following address:

https://www.kaggle.com/c/indoor-location-navigation/discussion/230153

## >>> Good Luck <<<



<div class="alert alert-success">  
</div>

# If you find this work useful, please don't forget to upvote :)

<div class="alert alert-success">  
</div>

# Import 

In [None]:
import json
import numpy as np
import pandas as pd 
import matplotlib.pylab as plt

from scipy.spatial.distance import cdist

%matplotlib inline

<div class="alert alert-success">  
</div>

# Helper Functions

In [None]:
def split_col(df):
    """
    Prepend ``site``, ``path`` and ``timestamp`` columns to a submission frame.

    The three values are obtained by splitting ``site_path_timestamp`` on
    underscores. The input frame is left untouched; a new frame is returned
    with the parsed columns first, followed by every original column.
    """
    key_parts = df['site_path_timestamp'].str.split('_', expand=True)
    key_parts = key_parts.rename(columns={0: 'site', 1: 'path', 2: 'timestamp'})
    return pd.concat([key_parts, df], axis=1).copy()

# Lookup from floor-name strings to integer floor numbers.
# Both the "F<n>" and "<n>F" spellings map floor 1 to 0; "B1"/"B2" map to -1/-2.
floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2,
             "F4":3, "F5":4, "F6":5, "F7":6,"F8":7,"F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5,
             "7F":6, "8F": 7, "9F":8}


def plot_preds(
    site,
    floorNo,
    sub=None,
    true_locs=None,
    base="../input/indoor-location-navigation",
    show_train=True,
    show_preds=True,
    fix_labels=True,
    map_floor=None
):
    """
    Plots predictions on floorplan map.

    Parameters
    ----------
    site : site identifier used to locate the metadata folder and filter rows.
    floorNo : floor name used to filter ``sub`` (and the map, unless
        ``map_floor`` is given).
    sub : DataFrame with ``site``, ``floorNo``, ``path``, ``x``, ``y`` columns;
        required when ``show_preds`` is True.
    true_locs : DataFrame with the same columns holding training waypoints;
        required when ``show_train`` is True.
    base : root folder that contains ``metadata/<site>/<floor>/``.
    show_train : draw training waypoints as grey ``+`` markers.
    show_preds : draw one line per predicted path.
    fix_labels : collapse duplicate legend labels and move the legend
        outside the axes.
    map_floor : use a different floor's map

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes that were drawn on.

    Raises
    ------
    ValueError : if a layer is requested but its DataFrame is None.
    """
    if show_train and true_locs is None:
        raise ValueError("true_locs is required when show_train=True")
    if show_preds and sub is None:
        raise ValueError("sub is required when show_preds=True")

    if map_floor is None:
        map_floor = floorNo

    # Prepare width_meter & height_meter (taken from the .json file)
    metadata_dir = f"{base}/metadata/{site}/{map_floor}"
    with open(f"{metadata_dir}/floor_info.json") as json_file:
        json_data = json.load(json_file)

    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]

    floor_img = plt.imread(f"{metadata_dir}/floor_image.png")
    img_rows, img_cols = floor_img.shape[0], floor_img.shape[1]

    fig, ax = plt.subplots(figsize=(12, 12))
    plt.imshow(floor_img)

    # NOTE(review): x is scaled by rows/height and y by cols/width, exactly as
    # before; this is only consistent if the image and the map share one
    # aspect ratio - confirm before changing.
    if show_train:
        # Training points are matched against the floor whose map is shown.
        true_locs = true_locs.query('site == @site and floorNo == @map_floor').copy()
        true_locs["x_"] = true_locs["x"] * img_rows / height_meter
        true_locs["y_"] = (true_locs["y"] * -1 * img_cols / width_meter) + img_rows
        true_locs.groupby("path").plot(
            x="x_",
            y="y_",
            style="+",
            ax=ax,
            label="train waypoint location",
            color="grey",
            alpha=0.5,
        )

    if show_preds:
        sub = sub.query('site == @site and floorNo == @floorNo').copy()
        sub["x_"] = sub["x"] * img_rows / height_meter
        sub["y_"] = (sub["y"] * -1 * img_cols / width_meter) + img_rows
        for path, path_data in sub.groupby("path"):
            path_data.plot(
                x="x_",
                y="y_",
                style=".-",
                ax=ax,
                title=f"{site} - floor - {floorNo}",
                alpha=1,
                label=path,
            )

    if fix_labels:
        # Every training path shares one label; keep a single legend entry per label.
        handles, labels = ax.get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        plt.legend(
            by_label.values(), by_label.keys(), loc="center left", bbox_to_anchor=(1, 0.5)
        )
    return fig, ax

def sub_process(sub, train_waypoints):
    """
    Enrich a submission frame with parsed key columns and training-grid flags.

    The result contains the ``site``/``path``/``timestamp`` columns produced by
    ``split_col``, the ``floorNo`` matched from ``train_waypoints`` and a boolean
    ``isTrainWaypoint`` that is True where (site, floor, x, y) equals a training
    waypoint exactly.

    Note: an ``isTrainWaypoint`` column set to True is written onto the
    ``train_waypoints`` frame that is passed in.
    """
    train_waypoints['isTrainWaypoint'] = True

    floor_lookup = train_waypoints[['site', 'floorNo', 'floor']].drop_duplicates()
    waypoint_lookup = (
        train_waypoints[['x', 'y', 'site', 'floor', 'isTrainWaypoint']].drop_duplicates()
    )

    result = split_col(sub[['site_path_timestamp', 'floor', 'x', 'y']]).copy()
    # No explicit keys: pandas joins on the columns both frames share.
    result = result.merge(floor_lookup, how='left')
    result = result.merge(waypoint_lookup, how='left', on=['site', 'x', 'y', 'floor'])
    # Rows without an exact waypoint match come back as NaN from the left join.
    result['isTrainWaypoint'] = result['isTrainWaypoint'].fillna(False)
    return result.copy()


<div class="alert alert-success">  
</div>

# Data Set

In [None]:
# Training waypoints: these are used below as the "grid" that predictions snap to.
train_waypoints = pd.read_csv('../input/indoor-location-train-waypoints/train_waypoints.csv')
# Base submission (generated6.csv) enriched by sub_process with site/path/floorNo columns.
sub = sub_process(pd.read_csv('../input/2-3-indoor-navigation-comparative-method/generated6.csv'),
                 train_waypoints)

<div class="alert alert-success">  
</div>

# Find the closest "grid" point for each prediction.

In [None]:
def add_xy(df):
    """Add an ``xy`` column of ``(x, y)`` tuples to ``df`` in place and return ``df``."""
    df['xy'] = list(zip(df['x'], df['y']))
    return df

def closest_point(point, points):
    """Return the element of ``points`` that has the smallest ``cdist`` distance to ``point``."""
    distances = cdist([point], points)
    nearest_idx = distances.argmin()
    return points[nearest_idx]

sub = add_xy(sub)
train_waypoints = add_xy(train_waypoints)

# For every (site, floor) group, attach the nearest training waypoint to each
# prediction as matched_point / x_ / y_.
ds = []
for (site, myfloor), d in sub.groupby(['site','floor']):
    # Work on a private copy so new columns are not written onto a groupby slice.
    d = d.copy()
    on_floor = (train_waypoints['floor'] == myfloor) & (train_waypoints['site'] == site)
    # Built once per group instead of once per prediction.
    grid_points = train_waypoints.loc[on_floor, 'xy'].tolist()
    if len(grid_points) == 0:
        # No grid for this floor: keep the rows (a submission needs every row)
        # and match each prediction to itself, so later snapping is a no-op.
        print(f'Skipping {site} {myfloor}')
        d['matched_point'] = d['xy']
    else:
        # One distance matrix per group; the row-wise argmin is the nearest grid point.
        nearest_idx = cdist(d[['x', 'y']].to_numpy(), grid_points).argmin(axis=1)
        d['matched_point'] = [grid_points[i] for i in nearest_idx]
    d['x_'] = [p[0] for p in d['matched_point']]
    d['y_'] = [p[1] for p in d['matched_point']]
    ds.append(d)

sub = pd.concat(ds)


<div class="alert alert-success">  
</div>

# Apply a Threshold and "Snap to Grid"

In [None]:
def snap_to_grid(sub, threshold):
    """
    Replace predictions by their nearest grid point when close enough.

    Columns read:    x, y (predictions), x_, y_ (closest grid points), dist.
    Columns written: _x_, _y_ (post-processed predictions), added to ``sub``
                     itself; a copy of the updated frame is returned.

    A row is snapped only when its ``dist`` is strictly below ``threshold``.
    """
    within = sub['dist'] < threshold
    for new_col, pred_col, grid_col in (('_x_', 'x', 'x_'), ('_y_', 'y', 'y_')):
        # Start from the raw prediction, then overwrite the rows that snap.
        sub[new_col] = sub[pred_col]
        sub.loc[within, new_col] = sub.loc[within][grid_col]
    return sub.copy()

# Calculate the distances
# (Euclidean distance between each prediction (x, y) and its matched grid point (x_, y_).)
sub['dist'] = np.sqrt( (sub.x-sub.x_)**2 + (sub.y-sub.y_)**2 )

# Snap every prediction whose matched grid point is closer than the threshold.
sub_pp = snap_to_grid(sub, threshold=7.55)

# Keep the snapped coordinates under the standard 'x'/'y' names.
sub_pp = sub_pp[['site_path_timestamp','floor','_x_','_y_','site','path','floorNo']] \
    .rename(columns={'_x_':'x', '_y_':'y'})

<div class="alert alert-success">  
</div>

# Save Post Processed Submission.

In [None]:
# Write only the four submission columns, without the index.
sub_pp[['site_path_timestamp','floor','x','y']] \
    .to_csv('submission_snap_to_grid.csv', index=False)

In [None]:
# Keep the snapped submission in memory; it is the input to the leak post-processing below.
submission_snap_to_grid = sub_pp[['site_path_timestamp','floor','x','y']]

<div class="alert alert-success">  
</div>

# Postprocessing with leaked feature

In [None]:
import json
import re
import gc
import pickle
import itertools
import pandas as pd
import numpy as np
from glob import glob
from datetime import datetime as dt
from pathlib import Path
from tqdm import tqdm
import datetime
ts_conv = np.vectorize(datetime.datetime.fromtimestamp) # ut(10 digit) -> date

# pandas settings -----------------------------------------
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = '{:,.5f}'.format

# Graph drawing -------------------------------------------
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import rc
from matplotlib_venn import venn2, venn2_circles
from matplotlib import animation as ani
from IPython.display import Image
from pylab import imread

plt.rcParams["patch.force_edgecolor"] = True
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.set_style("whitegrid", {'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

%matplotlib inline
%config InlineBackend.figure_format='retina'

# ML -------------------------------------------
from sklearn.preprocessing import LabelEncoder


import dill
from collections import defaultdict, OrderedDict
from scipy.spatial import distance

In [None]:
def unpickle(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: ``pickle.load`` can execute arbitrary code; only use on trusted files.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

def to_pickle(filename, obj):
    """Pickle ``obj`` into ``filename`` using the highest available protocol."""
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)



class FeatureStore():
    """
    Loader for one training path file (``train/<site>/<floor>/<path>.txt``).

    ``load_df`` parses the text file and exposes one DataFrame per sensor type
    listed in ``df_types`` as an attribute of the same name (for example
    ``self.wifi``), plus ``meta_info_df`` built from the ``#`` header lines.
    Site metadata is delegated to a ``SiteInfo`` instance in ``self.site_info``.
    """

    # Floor label -> integer floor number.
    # necessary to re-check
    floor_convert = {
        '1F': 0, '2F': 1, '3F': 2, '4F': 3, '5F': 4,
        '6F': 5, '7F': 6, '8F': 7, '9F': 8,
        'B': -1, 'B1': -1, 'B2': -2, 'B3': -3,
        'BF': -1, 'BM': -1,
        'F1': 0, 'F2': 1, 'F3': 2, 'F4': 3, 'F5': 4,
        'F6': 5, 'F7': 6, 'F8': 7, 'F9': 8, 'F10': 9,
        'L1': 0, 'L2': 1, 'L3': 2, 'L4': 3, 'L5': 4,
        'L6': 5, 'L7': 6, 'L8': 7, 'L9': 8, 'L10': 9,
        'L11': 10,
        'G': 0, 'LG1': 0, 'LG2': 1, 'LM': 0, 'M': 0,
        'P1': 0, 'P2': 1,
    }

    # Sensor record types; each becomes an attribute after load_df().
    df_types = [
        'accelerometer',
        'accelerometer_uncalibrated',
        'beacon',
        'gyroscope',
        'gyroscope_uncalibrated',
        'magnetic_field',
        'magnetic_field_uncalibrated',
        'rotation_vector',
        'waypoint',
        'wifi',
    ]

    # Column names per record type.
    # https://github.com/location-competition/indoor-location-competition-20
    df_type_cols = {
        'accelerometer': ["timestamp", "x", "y", "z", "accuracy"],
        'accelerometer_uncalibrated': ["timestamp", "x", "y", "z",
                                       "x2", "y2", "z2", "accuracy"],
        'beacon': ["timestamp", "uuid", "major_id", "minor_id", "tx_power",
                   "rssi", "distance", "mac_addr", "timestamp2"],
        'gyroscope': ["timestamp", "x", "y", "z", "accuracy"],
        'gyroscope_uncalibrated': ["timestamp", "x", "y", "z",
                                   "x2", "y2", "z2", "accuracy"],
        'magnetic_field': ["timestamp", "x", "y", "z", "accuracy"],
        'magnetic_field_uncalibrated': ["timestamp", "x", "y", "z",
                                        "x2", "y2", "z2", "accuracy"],
        'rotation_vector': ["timestamp", "x", "y", "z", "accuracy"],
        'waypoint': ["timestamp", "x", "y"],
        'wifi': ["timestamp", "ssid", "bssid", "rssi", "frequency",
                 "last_seen_timestamp"],
    }

    # Target dtype of every column, per record type.
    dtype_dict = {
        "accelerometer": {
            "timestamp": int, "x": float, "y": float, "z": float, "accuracy": int,
        },
        "accelerometer_uncalibrated": {
            "timestamp": int, "x": float, "y": float, "z": float,
            "x2": float, "y2": float, "z2": float, "accuracy": int,
        },
        "beacon": {
            "timestamp": int, "uuid": str, "major_id": str, "minor_id": str,
            "tx_power": int, "rssi": int, "distance": float, "mac_addr": str,
            "timestamp2": int,
        },
        "gyroscope": {
            "timestamp": int, "x": float, "y": float, "z": float, "accuracy": int,
        },
        "gyroscope_uncalibrated": {
            "timestamp": int, "x": float, "y": float, "z": float,
            "x2": float, "y2": float, "z2": float, "accuracy": int,
        },
        "magnetic_field": {
            "timestamp": int, "x": float, "y": float, "z": float, "accuracy": int,
        },
        "magnetic_field_uncalibrated": {
            "timestamp": int, "x": float, "y": float, "z": float,
            "x2": float, "y2": float, "z2": float, "accuracy": int,
        },
        "rotation_vector": {
            "timestamp": int, "x": float, "y": float, "z": float, "accuracy": int,
        },
        "waypoint": {"timestamp": int, "x": float, "y": float, "z": float},
        "wifi": {
            "timestamp": int, "ssid": str, "bssid": str, "rssi": int,
            "frequency": int, "last_seen_timestamp": int,
        },
    }

    def __init__(self, site_id, floor, path_id,
                 input_path="../input/indoor-location-navigation/",
                 save_path="../mid"):
        """Store the identifiers, check ``input_path`` and create ``save_path``.

        Raises ``KeyError`` when ``floor`` is not in ``floor_convert`` and
        ``AssertionError`` when ``input_path`` does not exist.
        """
        self.site_id = site_id.strip()
        self.floor = floor.strip()
        self.n_floor = self.floor_convert[self.floor]
        self.path_id = path_id.strip()

        self.input_path = input_path
        assert Path(input_path).exists(), f"input_path do not exist: {input_path}"

        self.save_path = save_path
        Path(save_path).mkdir(parents=True, exist_ok=True)

        self.site_info = SiteInfo(site_id=self.site_id, floor=self.floor, input_path=self.input_path)

    def _flatten(self, l):
        """Concatenate an iterable of iterables into one flat list."""
        return [item for group in l for item in group]

    def multi_line_spliter(self, s):
        """Split a physical line that holds several ``TYPE_`` records.

        Every record after the first is assumed to start 14 characters before
        its ``TYPE_`` marker (timestamp plus separator).
        """
        type_starts = [m.start() for m in re.finditer("TYPE_", s)]
        cut_points = [0]
        cut_points.extend(pos - 14 for pos in type_starts[1:])
        cut_points.append(len(s))
        return [s[begin:end] for begin, end in zip(cut_points, cut_points[1:])]

    def load_df(self, ):
        """Parse the path file into ``meta_info_df`` and one DataFrame per sensor type."""
        path = str(Path(self.input_path)/f"train/{self.site_id}/{self.floor}/{self.path_id}.txt")
        with open(path) as f:
            raw_lines = f.readlines()

        # Undo records that were written onto the same physical line.
        modified_data = []
        for line in raw_lines:
            if line.count("TYPE_") > 1:
                modified_data.extend(self.multi_line_spliter(line))
            else:
                modified_data.append(line)
        del raw_lines

        meta_lines = [d for d in modified_data if d[0] == "#"]
        record_lines = [d for d in modified_data if d[0] != "#"]

        self.meta_info_len = len(meta_lines)
        meta_tokens = self._flatten([d.split("\t") for d in meta_lines])
        self.meta_info_df = pd.DataFrame(
            [m.replace("\n", "").split(":") for m in meta_tokens if m != "#"]
        )

        data_df = pd.DataFrame([d.replace("\n", "").split("\t") for d in record_lines])
        for sensor in self.df_types:
            expected_cols = self.df_type_cols[sensor]
            # select data type
            df_s = data_df[data_df[1] == f"TYPE_{sensor.upper()}"]
            if len(df_s) == 0:
                setattr(self, sensor, pd.DataFrame(columns=expected_cols))
                continue

            # remove empty cols (and the record-type column, index 1)
            na_info = df_s.isna().sum(axis=0) == len(df_s)
            kept = [i for i in na_info[na_info == False].index if i != 1]
            df_s = df_s[kept].reset_index(drop=True)

            # A record may carry fewer fields than the full schema; name what is there.
            df_s.columns = expected_cols[:len(df_s.columns)]

            # set dtype
            df_s = df_s.astype({c: self.dtype_dict[sensor][c] for c in df_s.columns})

            # set DataFrame to attr
            setattr(self, sensor, df_s)

    def get_site_info(self, keep_raw=False):
        """Load floor metadata through the attached ``SiteInfo``."""
        self.site_info.get_site_info(keep_raw=keep_raw)

    def load_all_data(self, keep_raw=False):
        """Load the sensor frames and then the site metadata."""
        self.load_df()
        self.get_site_info(keep_raw=keep_raw)

    def __getitem__(self, item):
        """Return the frame for a known sensor type, or ``None`` for any other key."""
        if item not in self.df_types:
            return None
        return getattr(self, item)

    def save(self, ):
        # to be implemented
        pass
    
    
class SiteInfo():
    """
    Floor-level metadata (map size and GeoJSON outlines) for one site/floor.

    Files are read from ``<input_path>/metadata/<site_id>/<floor>/``.
    """

    def __init__(self, site_id, floor, input_path="../input/indoor-location-navigation/"):
        """Store the identifiers; raises ``AssertionError`` if ``input_path`` is missing."""
        self.site_id = site_id
        self.floor = floor
        self.input_path = input_path
        assert Path(input_path).exists(), f"input_path do not exist: {input_path}"

    def get_site_info(self, keep_raw=False):
        """Read ``floor_info.json`` and ``geojson_map.json``.

        Sets ``site_height``, ``site_width``, ``map_type``, ``features``,
        ``floor_coordinates`` (geometry of the first feature) and
        ``store_coordinates`` (geometry of every other feature). The parsed
        JSON documents are kept as ``floor_info`` / ``geojson_map`` only when
        ``keep_raw`` is True.
        """
        floor_dir = f"{self.input_path}/metadata/{self.site_id}/{self.floor}"

        with open(f"{floor_dir}/floor_info.json", "r") as f:
            self.floor_info = json.load(f)
        map_info = self.floor_info["map_info"]
        self.site_height = map_info["height"]
        self.site_width = map_info["width"]
        if not keep_raw:
            del self.floor_info

        with open(f"{floor_dir}/geojson_map.json", "r") as f:
            self.geojson_map = json.load(f)
        self.map_type = self.geojson_map["type"]
        self.features = self.geojson_map["features"]

        self.floor_coordinates = self.features[0]["geometry"]["coordinates"]
        self.store_coordinates = [feature["geometry"]["coordinates"]
                                  for feature in self.features[1:]]

        if not keep_raw:
            del self.geojson_map

    def show_site_image(self):
        """Show the floor image with axes spanning the map width and height."""
        path = f"{self.input_path}/metadata/{self.site_id}/{self.floor}/floor_image.png"
        plt.imshow(imread(path), extent=[0, self.site_width, 0, self.site_height])

    def draw_polygon(self, size=8, only_floor=False):
        """Draw the floor outline (and filled stores unless ``only_floor``); return the axes.

        The figure is ``size`` wide and its height follows the outline's y/x ratio.
        """
        fig = plt.figure()
        ax = plt.subplot(111)

        xmax, xmin, ymax, ymin = self._draw(self.floor_coordinates, ax, calc_minmax=True)
        if not only_floor:
            self._draw(self.store_coordinates, ax, fill=True)
        plt.legend([])

        ratio = (ymax - ymin) / (xmax - xmin)

        self.x_size = size
        self.y_size = size*ratio

        fig.set_figwidth(size)
        fig.set_figheight(size*ratio)
        # plt.show()
        return ax

    def _draw(self, coordinates, ax, fill=False, calc_minmax=False):
        """Draw every ring in ``coordinates`` on ``ax``.

        An entry with ``np.ndim`` 2 is a single ring; an entry with ``np.ndim``
        3 is a list of rings. Rings are filled when ``fill`` is True, otherwise
        drawn as lines. Returns ``(xmax, xmin, ymax, ymin)`` when
        ``calc_minmax`` is True, else ``None``.
        """
        xmax, ymax = -np.inf, -np.inf
        xmin, ymin = np.inf, np.inf
        for shape in coordinates:
            ndim = np.ndim(shape)
            if ndim == 2:
                rings, fill_alpha = [shape], 0.7
            elif ndim == 3:
                rings, fill_alpha = shape, 0.6
            else:
                assert False, f"ndim of coordinates should be 2 or 3: {ndim}"

            for ring in rings:
                ring_df = pd.DataFrame(ring)
                if fill:
                    ax.fill(ring_df[0], ring_df[1], alpha=fill_alpha)
                else:
                    ring_df.plot.line(x=0, y=1, style="-", ax=ax)

                if calc_minmax:
                    xmax = max(xmax, ring_df[0].max())
                    xmin = min(xmin, ring_df[0].min())

                    ymax = max(ymax, ring_df[1].max())
                    ymin = min(ymin, ring_df[1].min())

        return (xmax, xmin, ymax, ymin) if calc_minmax else None
        


In [None]:
# train_meta_data
# One row per training path file, with site / floor / path ids parsed from the file path.
train_meta = glob("../input/indoor-location-navigation/train/*/*/*")
train_meta_org = pd.DataFrame(train_meta)
# Path components 4, 5, 6 of "../input/indoor-location-navigation/train/<site>/<floor>/<file>".
train_meta = train_meta_org[0].str.split("/", expand=True)[[4, 5, 6]]
train_meta.columns = ["site_id", "floor", "path_id"]
# regex=False: ".txt" must be treated literally regardless of the pandas version's default.
train_meta["path_id"] = train_meta["path_id"].str.replace(".txt", "", regex=False)
train_meta["path"] = train_meta_org[0]
#train_meta.head()

In [None]:
def pickle_dump_dill(obj, path):
    """Serialise ``obj`` to the file at ``path`` with ``dill``."""
    with open(path, mode='wb') as sink:
        dill.dump(obj, sink)


def pickle_load_dill(path):
    """Return the object stored with ``dill`` in the file at ``path``."""
    with open(path, mode='rb') as source:
        return dill.load(source)

In [None]:
sample_sub = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')

# Split the "site_path_timestamp" key once, vectorised, instead of twice via a row-wise apply.
test_meta = sample_sub["site_path_timestamp"].str.split("_", expand=True)
test_meta.columns = ["site_id", "path_id", "timestamp"]

# Sites that appear in the test set, in order of first appearance.
test_sites = test_meta["site_id"].unique().tolist()

# One row per (site, path) pair.
test_meta=test_meta.drop('timestamp', axis=1)
test_meta = test_meta.drop_duplicates(subset=["site_id", "path_id"]).reset_index(drop=True)

# Get first and last waypoints in train dataset

In [None]:
# Set to True to rebuild the per-path summary from the raw training files;
# otherwise the precomputed CSV is loaded.
create_train_meta_sub=False
if create_train_meta_sub:
    train_meta_sub=train_meta[train_meta['site_id'].isin(test_sites)].reset_index(drop=True)
    summary_cols = ['start_time', 'end_time',
                    'start_wp_time', 'start_wp_x', 'start_wp_y',
                    'end_wp_time', 'end_wp_x', 'end_wp_y', 'n_floor']
    # Collect one record per path and build the frame once, so each column
    # gets its natural dtype instead of being initialised as int and patched
    # cell by cell with float values.
    records = []
    for t in tqdm(train_meta_sub.itertuples(index=False), total=len(train_meta_sub)):
        feature = FeatureStore(
            site_id=t.site_id, floor=t.floor, path_id=t.path_id)
        feature.load_all_data()
        meta = feature.meta_info_df
        first_wp = feature.waypoint.iloc[0]
        last_wp = feature.waypoint.iloc[-1]
        records.append({
            'start_time': int(meta.loc[meta[0] == 'startTime', 1].iloc[0]),
            'end_time': int(meta.loc[meta[0] == 'endTime', 1].iloc[0]),
            'start_wp_time': int(first_wp['timestamp']),
            'start_wp_x': first_wp['x'],
            'start_wp_y': first_wp['y'],
            'end_wp_time': int(last_wp['timestamp']),
            'end_wp_x': last_wp['x'],
            'end_wp_y': last_wp['y'],
            'n_floor': feature.n_floor,
        })
    train_meta_sub = pd.concat(
        [train_meta_sub, pd.DataFrame(records, columns=summary_cols)], axis=1)
    train_meta_sub.to_csv('train_meta_sub.csv', index=False)
else:
    train_meta_sub = pd.read_csv('../input/indoor-public/train_meta_sub.csv')

In [None]:
# Preview the first 50 rows of the per-path summary table.
train_meta_sub[:50]

In [None]:
import seaborn as sns
# One box plot per test site: distribution of path start times on each floor.
for test_site in test_sites:
    fig, ax = plt.subplots()
    sns.boxplot(x='floor', y='start_time',
                data=train_meta_sub[train_meta_sub.site_id==test_site], ax=ax)
    # Without a title the per-site figures cannot be told apart.
    ax.set_title(test_site)

In [None]:
def read_txt(file):
    """
    Read a path file and return its lines, one sensor record per element.

    Some physical lines contain several ``TYPE_`` records written back to back.
    Such a line is cut so that every record after the first starts 14
    characters before its ``TYPE_`` marker (timestamp plus separator), the
    same rule as ``FeatureStore.multi_line_spliter``. The splitting is done
    here directly because no module-level ``multi_line_spliter`` exists.
    """
    with open(file) as f:
        txt = f.readlines()

    modified_data = []
    for s in txt:
        if s.count("TYPE_") > 1:
            type_starts = [m.start() for m in re.finditer("TYPE_", s)]
            cuts = [0] + [pos - 14 for pos in type_starts[1:]] + [len(s)]
            modified_data.extend(s[begin:end] for begin, end in zip(cuts, cuts[1:]))
        else:
            modified_data.append(s)
    return modified_data


def _flatten(l):
    return list(itertools.chain.from_iterable(l))


def get_feature_test(site_id, path_id, input_path, sample_sub):
    """
    Load one test path file into an attribute container.

    Parameters
    ----------
    site_id : stored on the result as ``site_id``.
    path_id : name (without extension) of the file under ``<input_path>/test``.
    input_path : competition data root.
    sample_sub : DataFrame with ``path_id`` and ``timestamp`` columns; its
        timestamps for this path become the rows of ``waypoint``.

    Returns
    -------
    An ``OrderedDict`` used purely as an attribute holder: one DataFrame per
    entry of ``FeatureStore.df_types``, plus ``meta_info_df``, ``n_floor``
    (always 0) and ``site_id``. ``waypoint`` has x = y = 0 placeholders and
    ``wifi.last_seen_timestamp`` is shifted by the estimated clock gap.
    """
    file = f"{input_path}/test/{path_id}.txt"
    content = read_txt(file)
    data_df = pd.DataFrame([d.replace("\n", "").split("\t")
                            for d in content if d[0] != "#"])
    data_dict = OrderedDict()
    for sensor in FeatureStore.df_types:
        expected_cols = FeatureStore.df_type_cols[sensor]
        # select data type
        df_s = data_df[data_df[1] == f"TYPE_{sensor.upper()}"]
        if len(df_s) == 0:
            setattr(data_dict, sensor, pd.DataFrame(columns=expected_cols))
            continue

        # remove empty cols (and the record-type column, index 1)
        na_info = df_s.isna().sum(axis=0) == len(df_s)
        kept = [i for i in na_info[na_info == False].index if i != 1]
        df_s = df_s[kept].reset_index(drop=True)

        # A record may carry fewer fields than the full schema; name what is there.
        df_s.columns = expected_cols[:len(df_s.columns)]

        # set dtype
        for c in df_s.columns:
            df_s[c] = df_s[c].astype(FeatureStore.dtype_dict[sensor][c])
        setattr(data_dict, sensor, df_s)

    data_dict.meta_info_df = pd.DataFrame([m.replace("\n", "").split(":")
                                           for m in _flatten([d.split("\t")
                                                              for d in content if d[0] == "#"]) if m != "#"])
    meta = data_dict.meta_info_df
    # The value of startTime/endTime is taken from the token that follows the
    # key. A missing key (or a key on the last row) is skipped rather than
    # crashing; callers fall back to sensor timestamps in that case.
    for key in ('startTime', 'endTime'):
        hits = np.where(meta[0] == key)[0]
        if len(hits) == 0:
            continue
        row = int(hits[0])
        if (row + 1) in meta.index:
            meta.loc[row, 1] = meta.loc[row + 1, 0]

    data_dict.waypoint['timestamp'] = sample_sub[sample_sub.path_id ==
                                                 path_id].timestamp.values.astype(int)
    data_dict.waypoint['x'] = 0
    data_dict.waypoint['y'] = 0
    data_dict.n_floor = 0
    data_dict.site_id = site_id

    # gap = offset between the second clock (timestamp2 / last_seen_timestamp)
    # and the record timestamps. It is a difference, matching the Wi-Fi branch
    # below; the previous "+" summed the two clocks.
    if len(data_dict.beacon) > 0:
        gap = data_dict.beacon.loc[0, 'timestamp2'] - \
            data_dict.beacon.loc[0, 'timestamp']
    else:
        gap = (data_dict.wifi.last_seen_timestamp.values -
               data_dict.wifi.timestamp.values).max()+210.14426803816337  # from mean gap
    data_dict.wifi.last_seen_timestamp = data_dict.wifi.last_seen_timestamp-gap
    return data_dict

# Postprocessing based on leaked feature.

In [None]:
def leak_postprocessing(submission_df,train_meta, postprocess_start=True, postprocess_end=True, postprocess_floor=True,start_threshold=5500,end_threshold=6500,
                        input_path='/kaggle/input/indoor-location-navigation/'):
    """
    Correct a submission using the timing of neighbouring training paths.

    For every test path, the training path of the same site that ended last
    before the test path started ("previous") and the one that started first
    after it ended ("next") are looked up, after shifting the test times by
    the estimated clock ``gap``.

    Parameters
    ----------
    submission_df : DataFrame with ``site_path_timestamp``, ``floor``, ``x``,
        ``y``. It is NOT modified; a corrected copy is returned.
    train_meta : DataFrame with ``site_id``, ``start_time``, ``end_time``,
        ``start_wp_x/y``, ``end_wp_x/y`` and ``n_floor`` columns.
    postprocess_start : move the first waypoint of a path onto the previous
        path's last waypoint when the time difference is below ``start_threshold``.
    postprocess_end : move the last waypoint of a path onto the next path's
        first waypoint when the time difference is below ``end_threshold``.
    postprocess_floor : overwrite the floor of the whole path with the floor
        of the previous/next training path (the closer one when they differ).
    start_threshold, end_threshold : limits in the same time unit as the timestamps.
    input_path : competition data root (holds ``sample_submission.csv`` and ``test/``).

    Returns
    -------
    DataFrame with the columns of ``submission_df`` and corrected values.
    """
    # Work on a copy: writing into the caller's frame made consecutive calls
    # with different flags accumulate each other's corrections.
    submission_df = submission_df.copy()
    submission_df[["site_id", "path_id", "timestamp"]] = (
        submission_df["site_path_timestamp"].str.split("_", expand=True))
    # Compare timestamps numerically; string comparison is only correct when
    # all values have the same number of digits.
    ts_numeric = submission_df["timestamp"].astype("int64")

    start_counter = 0
    end_counter = 0
    floor_counter = 0

    sample_sub = pd.read_csv(f"{input_path}/sample_submission.csv")
    sample_sub = sample_sub["site_path_timestamp"].str.split("_", expand=True)
    sample_sub.columns = ["site_id", "path_id", "timestamp"]

    def _meta_time(meta, key):
        """Return the integer header value stored for ``key``, or None if absent/empty."""
        rows = meta[meta[0] == key]
        if len(rows) == 0 or 1 not in rows.columns:
            return None
        value = rows[1].iloc[0]
        if value is None or pd.isna(value) or value == "":
            return None
        return int(value)

    submission_df_unique=submission_df.drop_duplicates(
        subset=["site_id", "path_id"]).reset_index(drop=True)
    for i in tqdm(range(len(submission_df_unique.path_id))):
        t = submission_df_unique.iloc[i]
        site_id=t.site_id
        path_id=t.path_id
        feature = get_feature_test(site_id, path_id, input_path, sample_sub)

        # Header times, falling back to the sensor timestamp range.
        start_time = _meta_time(feature.meta_info_df, 'startTime')
        if start_time is None:
            start_time = int(np.nanmin([feature.accelerometer.timestamp.min(
            ), feature.wifi.timestamp.min(), feature.beacon.timestamp.min()]))
        end_time = _meta_time(feature.meta_info_df, 'endTime')
        if end_time is None:
            end_time = int(np.nanmax([feature.accelerometer.timestamp.max(
            ), feature.wifi.timestamp.max(), feature.beacon.timestamp.max()]))

        # gap = offset between the second clock and the record timestamps.
        # It is a difference, matching the Wi-Fi branch; the previous "+"
        # summed the two clocks.
        if len(feature.beacon) > 0:
            gap = feature.beacon.loc[0, 'timestamp2'] - \
                feature.beacon.loc[0, 'timestamp']
        else:
            # NOTE(review): get_feature_test has already subtracted its own gap
            # from wifi.last_seen_timestamp, so this estimate may collapse to
            # roughly zero for paths without beacons - confirm.
            gap = (feature.wifi.last_seen_timestamp.values -
                   feature.wifi.timestamp.values).max()+210.14426803816337  # from mean gap
        site_id = feature.site_id
        train_meta_site = train_meta[train_meta.site_id == site_id]

        path_mask = submission_df.path_id == path_id
        path_ts = ts_numeric[path_mask]

        # Reset per path so a neighbour from an earlier iteration is never reused.
        nearest_endpoint = None
        nearest_startpoint = None

        #postprocess start point based on leakage
        train_meta_site_end = train_meta_site[(
            start_time+gap) > train_meta_site.end_time]
        if len(train_meta_site_end) > 0:
            nearest_endpoint = train_meta_site_end.loc[
                train_meta_site_end.end_time.idxmax()]
            if postprocess_start and (start_time + gap - nearest_endpoint.end_time < start_threshold):
                first_row = path_mask & (ts_numeric == path_ts.min())
                submission_df.loc[first_row, 'x'] = nearest_endpoint.end_wp_x
                submission_df.loc[first_row, 'y'] = nearest_endpoint.end_wp_y
                start_counter += 1

        #postprocess end point based on leakage
        train_meta_site_start = train_meta_site[train_meta_site.start_time > (
            end_time+gap)]
        if len(train_meta_site_start) > 0:
            nearest_startpoint = train_meta_site_start.loc[
                train_meta_site_start.start_time.idxmin()]
            if postprocess_end and (nearest_startpoint.start_time - end_time - gap < end_threshold):
                last_row = path_mask & (ts_numeric == path_ts.max())
                submission_df.loc[last_row, 'x'] = nearest_startpoint.start_wp_x
                submission_df.loc[last_row, 'y'] = nearest_startpoint.start_wp_y
                end_counter += 1

        #postprocess floor based on leakage
        if postprocess_floor:
            chosen_floor = None
            if (nearest_endpoint is not None) and (nearest_startpoint is not None):
                if nearest_endpoint.n_floor == nearest_startpoint.n_floor:
                    chosen_floor = nearest_endpoint.n_floor
                else:
                    # Same gap-adjusted differences as the threshold tests
                    # above; without the gap the comparison always favoured
                    # the previous path.
                    diff_start_time = start_time + gap - nearest_endpoint.end_time
                    diff_end_time = nearest_startpoint.start_time - end_time - gap
                    if diff_start_time < diff_end_time:
                        chosen_floor = nearest_endpoint.n_floor
                    elif diff_end_time < diff_start_time:
                        chosen_floor = nearest_startpoint.n_floor
                    # Exact tie: leave the predicted floor unchanged.
            elif nearest_endpoint is not None:
                chosen_floor = nearest_endpoint.n_floor
            elif nearest_startpoint is not None:
                chosen_floor = nearest_startpoint.n_floor

            if chosen_floor is not None:
                submission_df.loc[path_mask, 'floor'] = chosen_floor
                floor_counter += path_mask.sum()

    print(str(start_counter) + ' start points are postprocessed.')
    print(str(end_counter) + ' end points are postprocessed.')
    print(str(floor_counter) + ' floors are postprocessed.')
    submission_df = submission_df.drop(
        ["site_id", "path_id", "timestamp"], axis=1)
    return submission_df

In [None]:
# Alias only (no copy): both names refer to the same DataFrame object.
submission_df = submission_snap_to_grid

In [None]:
# Variant with only the start-point correction enabled; saved to its own CSV.
submission_df_leak_start = leak_postprocessing(submission_df,train_meta_sub, postprocess_start=True, postprocess_end=False, postprocess_floor=False)
submission_df_leak_start.to_csv(
    'submission_df_leak_start.csv', index=False)

In [None]:
# Variant with only the end-point correction enabled; saved to its own CSV.
submission_df_leak_end = leak_postprocessing(submission_df,train_meta_sub, postprocess_start=False, postprocess_end=True, postprocess_floor=False)
submission_df_leak_end.to_csv(
    'submission_df_leak_end.csv', index=False)

In [None]:
# Variant with only the floor correction enabled; saved to its own CSV.
submission_df_leak_floor = leak_postprocessing(submission_df,train_meta_sub, postprocess_start=False, postprocess_end=False, postprocess_floor=True)
submission_df_leak_floor.to_csv(
    'submission_df_leak_floor.csv', index=False)

In [None]:
# Variant with start-point, end-point and floor corrections all enabled; saved to its own CSV.
submission_df_leak_all = leak_postprocessing(submission_df,train_meta_sub, postprocess_start=True, postprocess_end=True, postprocess_floor=True)
submission_df_leak_all.to_csv(
    'submission_df_leak_all.csv', index=False)

# Data Visualization

We only plot the "End point" option, because this option produces the best public score.

In [None]:
def split_col(df):
    """
    Prepend ``site``, ``path`` and ``timestamp`` columns to a submission frame.

    The ``site_path_timestamp`` column is split on ``"_"`` and the first three
    pieces are named ``site``, ``path`` and ``timestamp`` (kept as strings).
    These columns are placed in front of all original columns of ``df``.

    NOTE: this re-defines the ``split_col`` helper from the "Helper Functions"
    section of this notebook with the same logic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``site_path_timestamp``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    key_parts = df["site_path_timestamp"].str.split("_", expand=True)
    key_parts = key_parts.rename(columns={0: "site", 1: "path", 2: "timestamp"})
    combined = pd.concat([key_parts, df], axis=1)
    return combined.copy()


def _floor_rows_in_pixels(df, site, floorNo, img_shape, width_meter, height_meter):
    """
    Select one site/floor from ``df`` and add pixel-space columns ``x_``/``y_``.

    The selection is copied before the new columns are written, so the
    DataFrame passed in by the caller is left untouched.
    """
    rows = df[(df["site"] == site) & (df["floorNo"] == floorNo)].copy()
    img_height_px, img_width_px = img_shape[0], img_shape[1]
    # Horizontal axis: map width corresponds to the image columns.
    rows["x_"] = rows["x"] * img_width_px / width_meter
    # Vertical axis: map height corresponds to the image rows. The value is
    # subtracted from the image height so that y = 0 lands on the bottom row.
    rows["y_"] = img_height_px - rows["y"] * img_height_px / height_meter
    return rows


def plot_preds(
    site,
    floorNo,
    sub=None,
    true_locs=None,
    base="../input/indoor-location-navigation",
    show_train=True,
    show_preds=True,
):
    """
    Plots predictions on floorplan map.

    Parameters
    ----------
    site, floorNo :
        Identify the floor; used to build the metadata path
        ``{base}/metadata/{site}/{floorNo}/`` and to filter the data frames.
    sub : pandas.DataFrame, optional
        Predictions with columns ``site``, ``floorNo``, ``path``, ``x``, ``y``.
        Skipped when ``None`` or when ``show_preds`` is false.
    true_locs : pandas.DataFrame, optional
        Training waypoints with the same columns. Skipped when ``None`` or
        when ``show_train`` is false.
    base : str
        Root directory that contains the ``metadata`` folder.
    show_train, show_preds : bool
        Toggle the two layers.

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes that were drawn on.

    Notes
    -----
    ``x``/``y`` are scaled with the ``width``/``height`` values read from
    ``floor_info.json`` (named ``*_meter`` here, so presumably meters).
    Neither ``sub`` nor ``true_locs`` is modified.
    """
    floor_dir = f"{base}/metadata/{site}/{floorNo}"

    # Prepare width_meter & height_meter (taken from the .json file)
    with open(f"{floor_dir}/floor_info.json") as json_file:
        map_info = json.load(json_file)["map_info"]
    width_meter = map_info["width"]
    height_meter = map_info["height"]

    floor_img = plt.imread(f"{floor_dir}/floor_image.png")

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(floor_img)
    # Set the title once on the axes so the figure is labelled even when no
    # prediction path is drawn for this floor.
    ax.set_title(f"{site} - floor - {floorNo}")

    if show_train and true_locs is not None:
        train_floor = _floor_rows_in_pixels(
            true_locs, site, floorNo, floor_img.shape, width_meter, height_meter
        )
        # Every path gets the same label; callers can de-duplicate the legend.
        for _path, path_data in train_floor.groupby("path"):
            path_data.plot(
                x="x_",
                y="y_",
                style="+",
                ax=ax,
                label="train waypoint location",
                color="grey",
                alpha=0.5,
            )

    if show_preds and sub is not None:
        preds_floor = _floor_rows_in_pixels(
            sub, site, floorNo, floor_img.shape, width_meter, height_meter
        )
        for path, path_data in preds_floor.groupby("path"):
            path_data.plot(
                x="x_",
                y="y_",
                style=".-",
                ax=ax,
                alpha=1,
                label=path,
            )
    return fig, ax


In [None]:
# Split the submission key into site / path / timestamp columns for plotting.
sub = split_col(submission_df_leak_end)

true_locs = pd.read_csv("../input/indoor-location-train-waypoints/train_waypoints.csv")

# Add floor No to sub file.
# No join keys are given, so pandas merges on the columns the two frames share
# (presumably `site` and `floor` - verify against the submission columns).
sub = sub.merge(true_locs[["site", "floor", "floorNo"]].drop_duplicates())


# One figure per (site, floor); only the group keys are needed because
# plot_preds filters the full frames itself.
for (site, floorNo), _floor_rows in sub.groupby(["site", "floorNo"]):
    fig, ax = plot_preds(site, floorNo, sub, true_locs)
    # Remove duplicate labels
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(
        by_label.values(), by_label.keys(), loc="center left", bbox_to_anchor=(1, 0.5)
    )
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open
    # figures on backends that keep them alive after show().
    plt.close(fig)
    

<div class="alert alert-success">  
</div>

# Submission

In [None]:
# Use the variant produced with only the end-point post-processing enabled
# as the final submission.
sub = submission_df_leak_end

# Write the submission file without the DataFrame index column.
sub.to_csv("submission.csv", index=False)

<div class="alert alert-success">  
</div>