I would like to share with you a simple post-processing step that I used.

It is not powerful on its own, but it can be applied more than once, including after other post-processing steps, to correct any misalignment they introduce.

It's very poorly coded, but I'd be very happy to get feedback.

In [None]:
import os
from glob import glob
from pathlib import Path
import warnings


import numpy as np
import pandas as pd

# Suppress every warning issued through the `warnings` module for the rest of
# the run (this includes NumPy RuntimeWarnings such as division by zero).
warnings.simplefilter('ignore')

In [None]:
# Root of the competition input; every file below is read relative to it.
dir = Path("../input/google-smartphone-decimeter-challenge")

# The train/test baseline location tables and the sample submission, loaded
# in that order from their CSV files.
train_base, test_base, sub = (
    pd.read_csv(dir / csv_name)
    for csv_name in (
        "baseline_locations_train.csv",
        "baseline_locations_test.csv",
        "sample_submission.csv",
    )
)


def get_groundtruth(path: Path) -> pd.DataFrame:
    """Load every ``ground_truth.csv`` found under ``path`` into one frame.

    Files are located with the pattern ``train/*/*/ground_truth.csv``
    relative to ``path``.  The ``latDeg``, ``lngDeg`` and
    ``heightAboveWgs84EllipsoidM`` columns are replaced by copies carrying a
    ``t_`` prefix (appended after the remaining columns), so they do not
    collide with identically named columns when merged with other tables.

    Args:
        path: Directory that contains the ``train`` folder.

    Returns:
        The concatenation of all matching files with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    # Search below the argument (not a module-level directory) and sort the
    # matches so the row order does not depend on filesystem listing order.
    csv_paths = sorted(glob(str(Path(path) / 'train/*/*/ground_truth.csv')))
    if not csv_paths:
        raise FileNotFoundError(
            f"no ground_truth.csv found under {Path(path) / 'train'}"
        )

    # A single concat over all files avoids re-copying a growing frame on
    # every iteration.
    output_df = pd.concat(
        [pd.read_csv(csv_path) for csv_path in csv_paths],
        ignore_index=True,
    )

    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
    for col in _columns:
        output_df['t_' + col] = output_df[col]
    return output_df.drop(columns=_columns)

# Attach the ground-truth columns to the baseline rows, matching on
# collection, phone and timestamp (pandas' default inner join).
train_base = pd.merge(
    train_base,
    get_groundtruth(dir),
    on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'],
)

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance between two points.

    All four arguments are angles in degrees; scalars and NumPy/pandas
    arrays are both accepted because only NumPy ufuncs are applied.  The
    result is expressed in the unit of the sphere radius used below
    (6,367,000 -- presumably metres).
    """
    RADIUS = 6_367_000

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    d = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * RADIUS * np.arcsin(d ** 0.5)

def check_score(input_df: pd.DataFrame) -> pd.DataFrame:
    """Print error statistics for predicted vs. true coordinates.

    Args:
        input_df: Frame with the columns ``latDeg``, ``lngDeg``,
            ``t_latDeg``, ``t_lngDeg`` and ``phone``.  It is not modified.

    Returns:
        A copy of ``input_df`` with a ``meter`` column holding the haversine
        distance between (``latDeg``, ``lngDeg``) and
        (``t_latDeg``, ``t_lngDeg``) for every row.

    Two lines are printed: the mean of ``meter`` and a score that is the
    average of the 50th and 95th ``meter`` percentiles taken per ``phone``
    value (NaN when the frame has no rows).
    """
    output_df = input_df.copy()

    # calc_haversine only applies NumPy ufuncs, so whole columns can be
    # passed at once instead of calling it row by row through apply().
    # Plain arrays keep the assignment positional whatever the index is.
    output_df['meter'] = calc_haversine(
        input_df['latDeg'].to_numpy(),
        input_df['lngDeg'].to_numpy(),
        input_df['t_latDeg'].to_numpy(),
        input_df['t_lngDeg'].to_numpy(),
    )

    meter_score = output_df['meter'].mean()
    print(f'error meter: {meter_score}')

    scores = []
    for phone in output_df['phone'].unique():
        _index = output_df['phone'] == phone
        # Both percentiles in one call: [p50, p95] for this phone.
        scores.extend(np.percentile(output_df.loc[_index, 'meter'], [50, 95]))

    # An empty frame has no phones; report NaN rather than dividing by zero.
    score = sum(scores) / len(scores) if scores else float('nan')
    print(f'score: {score}')

    return output_df

# Check Degrees

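The process is very simple.
For three consecutive points A, B, C of the same phone, if the angle ∠ABC is less than n°, the coordinates of B are corrected: B is moved halfway toward the point on segment AC that is linearly interpolated at B's timestamp.
In this notebook, n = 155.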

In [None]:
def add_check_degrees_features(input_df):
    """Add neighbour, interpolation and turn-angle columns to a track.

    Rows are treated as consecutive points of one track in their existing
    order (assumes the caller passes a single phone's rows sorted by time --
    this function neither groups nor sorts).

    Args:
        input_df: Frame with ``latDeg``, ``lngDeg`` and
            ``millisSinceGpsEpoch`` columns.  It is not modified.

    Returns:
        A copy of ``input_df`` with these columns appended:

        * ``<col>_pre`` / ``<col>_pro`` -- value of the previous / next row
          for ``latDeg``, ``lngDeg`` and ``millisSinceGpsEpoch``;
        * ``latDeg_mean_point`` / ``lngDeg_mean_point`` -- the point on the
          segment previous->next, linearly interpolated at the row's own
          timestamp;
        * ``degree`` -- the angle in degrees at the row between the vectors
          to the previous and to the next point, measured directly in
          (latDeg, lngDeg) space.

        The first and last rows have no neighbour on one side, so their
        derived values are NaN.  ``degree`` is also NaN when a neighbour
        coincides with the row itself (zero-length vector).
    """
    output_df = input_df.copy()

    for col in ("latDeg", "lngDeg", "millisSinceGpsEpoch"):
        output_df[f"{col}_pre"] = output_df[col].shift(1)
        output_df[f"{col}_pro"] = output_df[col].shift(-1)

    # Fraction of the previous->next time span that has elapsed at this row;
    # shared by the latitude and longitude interpolation.
    elapsed_ratio = (
        (output_df["millisSinceGpsEpoch"] - output_df["millisSinceGpsEpoch_pre"])
        / (output_df["millisSinceGpsEpoch_pro"] - output_df["millisSinceGpsEpoch_pre"])
    )
    for col in ("latDeg", "lngDeg"):
        output_df[f"{col}_mean_point"] = (
            output_df[f"{col}_pre"]
            + (output_df[f"{col}_pro"] - output_df[f"{col}_pre"]) * elapsed_ratio
        )

    # Vectors from each point to its previous and next neighbour.
    back_lat = (output_df["latDeg_pre"] - output_df["latDeg"]).to_numpy()
    back_lng = (output_df["lngDeg_pre"] - output_df["lngDeg"]).to_numpy()
    fwd_lat = (output_df["latDeg_pro"] - output_df["latDeg"]).to_numpy()
    fwd_lng = (output_df["lngDeg_pro"] - output_df["lngDeg"]).to_numpy()

    inner = back_lat * fwd_lat + back_lng * fwd_lng
    # Zero-length vectors give 0/0 -> NaN; that is the intended result, so
    # the floating-point warnings are silenced locally.
    with np.errstate(divide="ignore", invalid="ignore"):
        cos_angle = inner / (
            np.hypot(back_lat, back_lng) * np.hypot(fwd_lat, fwd_lng)
        )
        # Rounding can push the cosine of (anti)parallel vectors marginally
        # outside [-1, 1], which would make arccos return NaN for perfectly
        # straight or fully reversed tracks.  NaN inputs stay NaN.
        cos_angle = np.clip(cos_angle, -1.0, 1.0)
        output_df["degree"] = np.rad2deg(np.arccos(cos_angle))

    return output_df


def check_degrees(input_df, threshold_deg=155):
    """Pull sharply-angled track points toward their interpolated position.

    The frame is split by (``collectionName``, ``phoneName``).  Within each
    group ``add_check_degrees_features`` supplies, per row, the turn angle
    ``degree`` and the time-interpolated point between the neighbouring
    rows.  Every row whose angle is below ``threshold_deg`` is replaced by
    the midpoint between its current position and that interpolated point;
    all other rows (including those with a NaN angle, e.g. the first and
    last row of a group) are left untouched.

    Args:
        input_df: Frame with at least ``collectionName``, ``phoneName``,
            ``latDeg``, ``lngDeg`` and the columns required by
            ``add_check_degrees_features``.  It is not modified.
        threshold_deg: Angles strictly below this value (in degrees) trigger
            a correction.  Defaults to 155.

    Returns:
        A copy of ``input_df`` with the same columns and row order, with
        ``latDeg`` / ``lngDeg`` updated.
    """
    output_df = input_df.copy()

    # Results are written back by row position, so they land on the right
    # rows even when a phone's rows are not contiguous in the frame.
    new_lat = output_df["latDeg"].to_numpy(dtype=float, copy=True)
    new_lng = output_df["lngDeg"].to_numpy(dtype=float, copy=True)

    # Maps each (collection, phone) pair to the integer positions of its
    # rows, in their original relative order.
    row_positions = output_df.groupby(
        ["collectionName", "phoneName"], sort=False
    ).indices

    for positions in row_positions.values():
        degree_df = add_check_degrees_features(output_df.iloc[positions])
        degree = degree_df["degree"].to_numpy()
        lat_mp = degree_df["latDeg_mean_point"].to_numpy()
        lng_mp = degree_df["lngDeg_mean_point"].to_numpy()

        with np.errstate(invalid="ignore"):
            # NaN angles compare False and are therefore never corrected.
            needs_fix = degree < threshold_deg
        # Never overwrite a valid coordinate with a NaN/inf interpolation
        # (possible when neighbouring timestamps coincide).
        needs_fix &= np.isfinite(lat_mp) & np.isfinite(lng_mp)

        target = positions[needs_fix]
        new_lat[target] = (new_lat[target] + lat_mp[needs_fix]) / 2
        new_lng[target] = (new_lng[target] + lng_mp[needs_fix]) / 2

    output_df["latDeg"] = new_lat
    output_df["lngDeg"] = new_lng
    return output_df[input_df.columns]

This post-processing can be applied multiple times to increase the accuracy.

In [None]:
print("<train_base score>")
train_base = check_score(train_base)
print("******************************")
print("******************************")


print("<check_degrees score>")
# Number of check_degrees passes to report on, mapped to the header printed
# for each report.
_report_labels = {
    1: "- once -",
    10: "- 10 times -",
    50: "- 50 times- ",
    100: "- 100 times -",
    300: "- 300 times -",
}

# check_degrees is applied cumulatively: the frame after N passes is reused to
# produce the frame after N+1 passes instead of restarting from train_base for
# every report.  check_score works on a copy, so scoring a snapshot does not
# affect the running frame.
_scored = {}
_smoothed = train_base
for _n_passes in range(1, max(_report_labels) + 1):
    _smoothed = check_degrees(_smoothed)
    if _n_passes in _report_labels:
        print(_report_labels[_n_passes])
        _scored[_n_passes] = check_score(_smoothed)
        print("------------------------------")

# Keep the per-count results available under their original names.
once = _scored[1]
_10_times = _scored[10]
_50_times = _scored[50]
_100_times = _scored[100]
_300_times = _scored[300]

The accuracy improves as the number of passes increases.

A single pass takes only a short time, but running tens to hundreds of passes as described above takes a long time.

I used this post-processing in the following way.

baseline  ->  check_degrees  ->  other post-processing  ->  check_degrees  ->  other post-processing  ->  check_degrees  ->  ... 

In many cases, it is possible to correct misalignments caused by other post-processing.

However, the accuracy may deteriorate for post-processing that uses the train ground truth.