# Let's connect the dots
<!-- ![]() -->

<img src="https://i.pinimg.com/736x/1a/f2/b5/1af2b59eb6002d59981b5fd81035a649.jpg" width="400" height="400" />


This notebook shows how we can leverage some knowledge about the training data to post-process our predictions and gain a leaderboard (LB) boost.

The idea is:
1. The training data `waypoint` locations are not precise measurements. Instead they are locations unique to the building structure, uniformly distributed throughout the available hallway space.
2. We can use the waypoints from the training data as a "grid" and snap our predictions to these grid points.
3. I only snap a prediction to the grid when its closest grid point is nearer than a distance threshold; predictions farther away than that are left unchanged.

I'm sure more ideas will follow.

To demonstrate this post-processing I am using the submission file from a top-scoring notebook.

In [None]:
# Helper Functions
import pandas as pd
import numpy as np

import json
import matplotlib.pylab as plt

from pathlib import Path

def split_col(df):
    """
    Prepend `site`, `path` and `timestamp` columns parsed from the id column.

    The `site_path_timestamp` column is split on underscores; the first three
    pieces are named `site`, `path` and `timestamp` (any further pieces keep
    their integer column labels). All original columns follow, unchanged.

    Returns a new DataFrame; the input frame is not modified.
    """
    id_parts = df['site_path_timestamp'].str.split('_', expand=True)
    id_parts = id_parts.rename(columns={0: 'site', 1: 'path', 2: 'timestamp'})
    return pd.concat([id_parts, df], axis=1).copy()

# Floor label -> integer floor number. "B2"/"B1" map to -2/-1, and both the
# "F<n>" and "<n>F" spellings map to n - 1 for n = 1..9 (so "F1"/"1F" is 0).
floor_map = {"B2": -2, "B1": -1}
floor_map.update({f"F{level}": level - 1 for level in range(1, 10)})
floor_map.update({f"{level}F": level - 1 for level in range(1, 10)})


def _to_pixel_coords(locs, img_shape, width_meter, height_meter):
    """Return a copy of `locs` with image-pixel columns `x_` and `y_` added."""
    img_height, img_width = img_shape[0], img_shape[1]
    locs = locs.copy()
    # x runs along the image width, so it is scaled by width pixels per meter.
    locs["x_"] = locs["x"] * img_width / width_meter
    # y is scaled by height pixels per meter and flipped, because imshow
    # draws row 0 at the top of the axes.
    locs["y_"] = img_height - locs["y"] * img_height / height_meter
    return locs


def plot_preds(
    site,
    floorNo,
    sub=None,
    true_locs=None,
    base="../input/indoor-location-navigation",
    show_train=True,
    show_preds=True,
    fix_labels=True,
    map_floor=None
):
    """
    Plots predictions on floorplan map.

    site : site id; selects rows and the metadata directory
    floorNo : floor label; selects the prediction rows to draw
    sub : DataFrame with `site`, `floorNo`, `path`, `x`, `y` columns
        (required when show_preds is True)
    true_locs : DataFrame with the same columns holding the training
        waypoints (required when show_train is True)
    base : directory containing `metadata/<site>/<floor>/floor_image.png`
        and `floor_info.json`
    show_train : draw the training waypoints of `map_floor` as grey crosses
    show_preds : draw the predictions of `floorNo`, one line per path
    fix_labels : collapse duplicate legend entries and put the legend to the
        right of the plot
    map_floor : use a different floor's map (defaults to floorNo)

    Returns the matplotlib (fig, ax) pair.
    Raises ValueError when a frame that has to be drawn was not supplied.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if show_train and true_locs is None:
        raise ValueError("true_locs is required when show_train=True")
    if show_preds and sub is None:
        raise ValueError("sub is required when show_preds=True")

    if map_floor is None:
        map_floor = floorNo
    # Prepare width_meter & height_meter (taken from the .json file)
    floor_plan_filename = f"{base}/metadata/{site}/{map_floor}/floor_image.png"
    json_plan_filename = f"{base}/metadata/{site}/{map_floor}/floor_info.json"
    with open(json_plan_filename) as json_file:
        json_data = json.load(json_file)

    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]

    floor_img = plt.imread(floor_plan_filename)

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(floor_img)

    if show_train:
        # Training waypoints are taken from the floor whose map is drawn.
        on_map_floor = (true_locs["site"] == site) & (true_locs["floorNo"] == map_floor)
        train_px = _to_pixel_coords(
            true_locs[on_map_floor], floor_img.shape, width_meter, height_meter
        )
        train_px.groupby("path").plot(
            x="x_",
            y="y_",
            style="+",
            ax=ax,
            label="train waypoint location",
            color="grey",
            alpha=0.5,
        )

    if show_preds:
        on_floor = (sub["site"] == site) & (sub["floorNo"] == floorNo)
        preds_px = _to_pixel_coords(
            sub[on_floor], floor_img.shape, width_meter, height_meter
        )
        for path, path_data in preds_px.groupby("path"):
            path_data.plot(
                x="x_",
                y="y_",
                style=".-",
                ax=ax,
                title=f"{site} - floor - {floorNo}",
                alpha=1,
                label=path,
            )
    if fix_labels:
        # Every path adds its own legend entry; keep one handle per label.
        handles, labels = ax.get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        plt.legend(
            by_label.values(), by_label.keys(), loc="center left", bbox_to_anchor=(1, 0.5)
        )
    return fig, ax

def sub_process(sub, train_waypoints):
    """
    Prepare a submission frame for post processing.

    sub : submission DataFrame with `site_path_timestamp`, `floor`, `x`, `y`
    train_waypoints : training waypoints with `site`, `floorNo`, `floor`,
        `x`, `y` columns

    Returns a new DataFrame with `site`, `path`, `timestamp` split out of the
    id column, the `floorNo` label looked up by (`site`, `floor`), and a
    boolean `isTrainWaypoint` that is True where a prediction coincides
    exactly with a training waypoint on the same site and floor.

    Neither input frame is modified.
    """
    # Flag a copy, so the caller's train_waypoints does not gain a column.
    flagged = train_waypoints.assign(isTrainWaypoint=True)

    sub = split_col(sub[['site_path_timestamp', 'floor', 'x', 'y']])
    sub = sub.merge(
        flagged[['site', 'floorNo', 'floor']].drop_duplicates(),
        how='left',
        on=['site', 'floor'],
    )
    sub = sub.merge(
        flagged[['x', 'y', 'site', 'floor', 'isTrainWaypoint']].drop_duplicates(),
        how='left',
        on=['site', 'x', 'y', 'floor'],
    )
    # Rows without a match come back as NaN; comparing with True turns the
    # column into plain booleans.
    sub['isTrainWaypoint'] = sub['isTrainWaypoint'].eq(True)
    return sub.copy()

In [None]:
# Site ids that the plotting loop further down iterates over
# (presumably the sites that appear in the test set - confirm).
test_buildings = [
    "5a0546857ecc773753327266",
    "5c3c44b80379370013e0fd2b",
    "5d27075f03f801723c2e360f",
    "5d27096c03f801723c31e5e0",
    "5d27097f03f801723c320d97",
    "5d27099f03f801723c32511d",
    "5d2709a003f801723c3251bf",
    "5d2709b303f801723c327472",
    "5d2709bb03f801723c32852c",
    "5d2709c303f801723c3299ee",
    "5d2709d403f801723c32bd39",
    "5d2709e003f801723c32d896",
    "5da138274db8ce0c98bbd3d2",
    "5da1382d4db8ce0c98bbe92e",
    "5da138314db8ce0c98bbf3a0",
    "5da138364db8ce0c98bc00f1",
    "5da1383b4db8ce0c98bc11ab",
    "5da138754db8ce0c98bca82f",
    "5da138764db8ce0c98bcaa46",
    "5da1389e4db8ce0c98bd0547",
    "5da138b74db8ce0c98bd4774",
    "5da958dd46f8266d0737457b",
    "5dbc1d84c1eb61796cf7c010",
    "5dc8cea7659e181adb076a3f",
]

## Step 1: Identify training waypoints
As an example I'll plot the training waypoints on the map for a given floor.

In [None]:
# Blend Subs
# Load the five submissions and split their id column into site / path / timestamp.
sub1, sub2, sub3, sub4, sub5 = (
    split_col(pd.read_csv(csv_path))
    for csv_path in (
        '../input/indoor-location-train-waypoints/6.771LB_submission.csv',
        '../input/indoor-location-train-waypoints/7.274LB_submission.csv',
        '../input/indoor-location-train-waypoints/7.518LB_submission_LSTM.csv',
        '../input/indoor-location-train-waypoints/7.661LB_LSTM_submission.csv',
        '../input/indoor-location-train-waypoints/7.745LB_submission.csv',
    )
)

# Blend
merge_keys = ['site_path_timestamp', 'floor']
pred_cols = ['site_path_timestamp', 'floor', 'x', 'y']

# The first merge renames both pairs of coordinates to x_s1/y_s1 and x_s2/y_s2.
sub = sub1.merge(
    sub2[pred_cols],
    on=merge_keys,
    how='left',
    suffixes=('_s1', '_s2'),
).copy()
# Placeholder x / y columns: they make the merges below collide on x / y, so
# the incoming coordinates pick up the _s3 / _s4 / _s5 suffixes.
sub['x'] = sub1['x']
sub['y'] = sub1['y']

for suffix, other in (('_s3', sub3), ('_s4', sub4), ('_s5', sub5)):
    sub = sub.merge(
        other[pred_cols],
        on=merge_keys,
        how='left',
        suffixes=('', suffix),
    ).copy()

# The blend is the unweighted mean of the five predictions per coordinate.
sub['x'] = sub[['x_s1', 'x_s2', 'x_s3', 'x_s4', 'x_s5']].mean(axis=1)
sub['y'] = sub[['y_s1', 'y_s2', 'y_s3', 'y_s4', 'y_s5']].mean(axis=1)

In [None]:
# Site and floor used for the example plots
example_site = '5dbc1d84c1eb61796cf7c010'
example_floorNo = 'F3'

train_waypoints = pd.read_csv('../input/indoor-location-train-waypoints/train_waypoints.csv')
# sub = sub_process(pd.read_csv('../input/indoor-location-train-waypoints/6.578LB_submission.csv'),
#                  train_waypoints)
# NOTE: this rebinds `sub`, replacing whatever an earlier cell assigned to it.
sub = sub_process(
    sub=pd.read_csv('../input/indoor-location-post-processing/submission.csv'),
    train_waypoints=train_waypoints,
)

# Plot the training Data For an example Floor
plot_preds(
    site=example_site,
    floorNo=example_floorNo,
    sub=sub,
    true_locs=train_waypoints,
    show_preds=False,
)
plt.show()

## Step 2: Find the closest "grid" point for each prediction.

We can find the closest "grid" point to our predictions using the `cdist` function in scipy.

In [None]:
from scipy.spatial.distance import cdist

def add_xy(df):
    """
    Add an `xy` column holding `(x, y)` tuples built from columns `x` and `y`.

    The frame is modified in place and also returned.
    """
    df['xy'] = list(zip(df['x'], df['y']))
    return df

def closest_point(point, points):
    """ Return the entry of `points` that lies nearest to `point`. """
    # cdist gives a 1 x len(points) distance matrix, so the flat argmin is
    # the position of the nearest candidate.
    distances = cdist([point], points)
    nearest = distances.argmin()
    return points[nearest]

sub = add_xy(sub)
train_waypoints = add_xy(train_waypoints)

# For every (site, floor) group of predictions, look up the nearest training
# waypoint of that site and floor and store it as matched_point / x_ / y_.
ds = []
for (site, myfloor), d in sub.groupby(['site','floor']):
    # Copy so the new columns go onto an independent frame, not a groupby slice.
    d = d.copy()
    on_floor = (train_waypoints['floor'] == myfloor) & (train_waypoints['site'] == site)
    # Build the candidate list once per group instead of once per prediction.
    grid_points = list(train_waypoints.loc[on_floor, 'xy'])
    if not grid_points:
        # No training waypoints for this site/floor. Keep the rows anyway, so
        # they are not lost from `sub`; the empty grid columns leave them
        # without a point to snap to.
        print(f'Skipping {site} {myfloor}')
        d['matched_point'] = None
        d['x_'] = np.nan
        d['y_'] = np.nan
        ds.append(d)
        continue
    d['matched_point'] = [closest_point(xy, grid_points) for xy in d['xy']]
    d['x_'] = [matched[0] for matched in d['matched_point']]
    d['y_'] = [matched[1] for matched in d['matched_point']]
    ds.append(d)

sub = pd.concat(ds)

In [None]:
# for building in test_buildings:
#     for floor_dir in Path(f'../input/indoor-location-navigation/train/{building}').iterdir():
#         floor = str(floor_dir).split('/')[-1]
#         plot_preds(building, floor, sub,
#            train_waypoints, show_preds=True)
#         plt.show()

In [None]:
# Example of raw predictions
# plot_preds(example_site, example_floorNo, sub,
#            train_waypoints, show_preds=True)
# plt.show()

## Step 3: Apply a Threshold and "Snap to Grid"

I've found a threshold of 3-8 works well on the LB. But this most likely will be a function of how good your predictions are to start with.

In [None]:
def snap_to_grid(sub, threshold):
    """
    Replace predictions by their closest grid point when it is near enough.

    Reads the columns x, y (predicted points), x_, y_ (closest grid points)
    and dist, and writes _x_, _y_ (the predictions after post processing):
    rows with dist strictly below `threshold` take x_, y_; all other rows
    keep x, y.

    The two new columns are added to `sub` itself; a copy of the updated
    frame is returned.
    """
    within_threshold = sub['dist'] < threshold
    sub['_x_'] = sub['x'].mask(within_threshold, sub['x_'])
    sub['_y_'] = sub['y'].mask(within_threshold, sub['y_'])
    return sub.copy()

# Euclidean distance from each prediction (x, y) to its closest grid point (x_, y_)
squared_dist = (sub['x'] - sub['x_']) ** 2 + (sub['y'] - sub['y_']) ** 2
sub['dist'] = np.sqrt(squared_dist)

sub_pp = snap_to_grid(sub, threshold=5)

# Keep the id / lookup columns and expose the snapped coordinates as x, y.
kept_columns = ['site_path_timestamp', 'floor', '_x_', '_y_', 'site', 'path', 'floorNo']
sub_pp = sub_pp[kept_columns].rename(columns={'_x_': 'x', '_y_': 'y'})

Let's take a look at the predictions after post-processing.

In [None]:
for building in test_buildings:
    train_dir = Path(f'../input/indoor-location-navigation/train/{building}')
    for floor_dir in train_dir.iterdir():
        # The directory name is the floor label.
        floor = floor_dir.name
        # Draw the raw predictions first, then the post-processed ones.
        for preds in (sub, sub_pp):
            fig, _ = plot_preds(building, floor, preds,
                                train_waypoints, show_preds=True)
            plt.show()
            # Close the figure once shown, so figures do not pile up in
            # memory while looping over every floor.
            plt.close(fig)

In [None]:
# Plot example after post processing
# plot_preds(example_site, example_floorNo, sub_pp,
#            train_waypoints, show_preds=True)
# plt.show()

Not perfect, but it looks much better!

![](https://media3.giphy.com/media/Hkoamb0iCaQbZmM9ok/giphy.gif?cid=ecf05e47ra5yjuw0f93p7ti3gpmsyst6whhdexd06vyygyvw&rid=giphy.gif)

## Evaluate The Change in Predictions

In [None]:
# How far post processing moved each prediction: distance between (x, y) and (_x_, _y_)
squared_shift = (sub['x'] - sub['_x_']) ** 2 + (sub['y'] - sub['_y_']) ** 2
sub['dist_pp_change'] = np.sqrt(squared_shift)

fig, axs = plt.subplots(1, 2, figsize=(15, 5))
# Left: every prediction.
sub['dist_pp_change'].plot.hist(
    bins=30,
    ax=axs[0],
    title='Distance Changed by Post Processing',
)
# Right: only the predictions that moved by more than 0.1.
moved = sub.loc[sub['dist_pp_change'] > 0.1, 'dist_pp_change']
moved.plot.hist(
    bins=30,
    ax=axs[1],
    title='Distance Changed (Excluding <0.1 Change)',
)

plt.show()

In [None]:
# Mean post-processing shift per (site, floorNo); the 20 smallest are drawn
# as horizontal bars.
(
    sub.groupby(['site', 'floorNo'])['dist_pp_change']
    .mean()
    .to_frame()
    .sort_values('dist_pp_change')
    .head(20)
    .plot(kind='barh')
)

## Final Step: Save Post Processed Submission.

In [None]:
# Write only the four submission columns, without the index.
sub_pp.to_csv(
    'submission_snap_to_grid.csv',
    columns=['site_path_timestamp', 'floor', 'x', 'y'],
    index=False,
)

In [None]:
# df = pd.read_csv('../input/indoor-location-post-processing/submission.csv')

In [None]:
# sub_pp[sub_pp['x'] < 0]

In [None]:
# sub_pp = sub_pp.sort_index()

In [None]:
# sub_pp.loc[1697]

In [None]:
# i = 3626
# sub_pp.loc[3628, 'x':'y'] = (sub_pp.loc[3625, 'x':'y'] + sub_pp.loc[3629, 'x':'y']) / 2

In [None]:
# sub_pp.iloc[3620: 3630]

# FIN