In [None]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np


# Helper function to load a GNSS log as a DataFrame

In [None]:
def gnss_log_to_raw_dataframe(path):
    """Parse a GNSS log text file and return its ``Raw`` section as a DataFrame.

    Lines starting with ``#`` are treated as headers. A header whose first
    comma-separated field is ``Raw`` supplies the column names; every
    non-header line whose first field is ``Raw`` supplies one data row.
    All other lines (notes, version numbers, other sections, blank lines)
    are ignored, since only the ``Raw`` section is returned.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the log file. The names of the two directories directly
        above the file are used as the collection name and the phone name,
        i.e. ``.../<collectionName>/<phoneName>/<log file>``.

    Returns
    -------
    pandas.DataFrame
        One row per ``Raw`` record. Every column except ``CodeType`` is
        converted to a numeric dtype when possible; columns that cannot be
        converted are left as strings. Two extra columns, ``collectionName``
        and ``phoneName``, are appended.

    Raises
    ------
    ValueError
        If the file contains ``Raw`` data rows but no ``Raw`` header line.
    """
    log_path = Path(path)
    # Use pathlib rather than splitting on '/', so the lookup does not depend
    # on the platform's path separator.
    phone = log_path.parent.name
    collection_name = log_path.parent.parent.name

    raw_columns = []
    raw_rows = []
    # Stream the file line by line instead of loading it fully into memory.
    with open(log_path) as f_open:
        for line in f_open:
            is_header = line.startswith('#')
            fields = line.strip('#').strip().split(',')
            if fields[0] != 'Raw':
                # Not part of the Raw section: skip it.
                continue
            if is_header:
                raw_columns = fields[1:]
            else:
                raw_rows.append(fields[1:])

    if raw_rows and not raw_columns:
        raise ValueError(
            "Raw data rows found but no '# Raw,...' header line in {}".format(log_path)
        )

    raw_df = pd.DataFrame(raw_rows, columns=raw_columns)

    # Every field was read as text, so convert the columns to numbers explicitly.
    for col in raw_df.columns:
        if col == 'CodeType':
            continue
        try:
            raw_df[col] = pd.to_numeric(raw_df[col])
        except (ValueError, TypeError):
            # Genuinely non-numeric column: keep the original strings.
            pass

    raw_df['collectionName'] = collection_name
    raw_df['phoneName'] = phone

    return raw_df

# Load data

In [None]:
# Reload the ground truth, raw GNSS logs and derived data for the training set.
datapath = Path('../input/google-smartphone-decimeter-challenge/')
train_dir = datapath / "train"

# Materialise the path lists (sorted) so that the concatenation order is
# reproducible across filesystems and each progress bar can report the real
# number of files instead of a hard-coded count.
ground_truths = sorted(train_dir.rglob("ground_truth.csv"))
derived_files = sorted(train_dir.rglob("*_derived.csv"))
raw_files = sorted(train_dir.rglob("*_GnssLog.txt"))
n_files = len(ground_truths)

# tqdm takes its total from len() of each list.
df_gt = pd.concat([pd.read_csv(filepath) for filepath in tqdm(ground_truths, desc="Reading ground truth data")], ignore_index=True)
df_raw_train = pd.concat([gnss_log_to_raw_dataframe(filepath) for filepath in tqdm(raw_files, desc="Reading raw data")], ignore_index=True)
df_derived = pd.concat([pd.read_csv(filepath) for filepath in tqdm(derived_files, desc="Loading data")], ignore_index=True)

# Apply tip 1: map each derived timestamp to the preceding raw epoch

In [None]:
# Time of each raw measurement in whole milliseconds.
# Integer floor division keeps the result exact (a float64 cannot represent
# nanosecond values of this magnitude exactly), and the explicit int64 avoids
# platforms where plain `int` is only 32 bits wide.
df_raw_train['MillisSinceGpsEpoch'] = (
    (df_raw_train['TimeNanos'] - df_raw_train['FullBiasNanos']) // 1000000
).astype(np.int64)

# The timestamps in derived are one epoch ahead. For every
# (collectionName, phoneName) group, map each derived timestamp to the latest
# raw epoch that is strictly earlier than it.
corrected_groups = []
for (collection_name, phone_name), derived_group in df_derived.groupby(['collectionName', 'phoneName']):
    in_group = (df_raw_train['collectionName'] == collection_name) & (df_raw_train['phoneName'] == phone_name)

    # np.unique returns the epochs sorted, which np.searchsorted requires.
    raw_timestamps = np.unique(df_raw_train.loc[in_group, 'MillisSinceGpsEpoch'].to_numpy())

    # Work on a copy so the slice handed out by groupby is never written to.
    derived_group = derived_group.copy()
    derived_millis = derived_group['millisSinceGpsEpoch'].to_numpy()

    # insert_at[i] is the number of raw epochs strictly earlier than derived_millis[i].
    insert_at = np.searchsorted(raw_timestamps, derived_millis)

    # Rows with no earlier raw epoch (including groups without any raw data)
    # keep their original timestamp; indexing with -1 would silently wrap
    # around to the *last* raw epoch.
    has_prior = insert_at > 0
    corrected_millis = derived_millis.copy()
    corrected_millis[has_prior] = raw_timestamps[insert_at[has_prior] - 1]
    derived_group['millisSinceGpsEpoch'] = corrected_millis

    corrected_groups.append(derived_group)

# Concatenate once instead of growing a DataFrame inside the loop.
if corrected_groups:
    df_derived_corrected = pd.concat(corrected_groups)
else:
    df_derived_corrected = df_derived.iloc[:0].copy()
    



# Apply tip 5: keep only rows whose epoch time minus received SV time is between 0 and 300 ms

In [None]:
# Difference, in milliseconds, between the corrected epoch timestamp and the
# received SV time (stored in nanoseconds, hence the division by 1e6).
delta_millis = df_derived_corrected['millisSinceGpsEpoch'].sub(
    df_derived_corrected['receivedSvTimeInGpsNanos'].div(1e6)
)

# Keep only rows whose difference lies strictly between 0 and 300 ms.
where_good_signals = delta_millis.gt(0) & delta_millis.lt(300)

# Copy so the filtered frame is independent of df_derived_corrected.
df_derived_corrected_filtered = df_derived_corrected.loc[where_good_signals].copy()


In [None]:
# Shape (rows, columns) of the timestamp-corrected derived data, before the tip-5 filter.
df_derived_corrected.shape

In [None]:
# Shape (rows, columns) after the tip-5 filter; compare with the unfiltered shape to see how many rows were dropped.
df_derived_corrected_filtered.shape