**[NOTE] This notebook's method is not good, so I'd like to know what information would help me reproduce the baseline.**


We are currently working on reproducing the baseline using the derived file, and Parrot (@hyperc) has a great notebook that can be used as a reference, but it doesn't score as well as the actual baseline.

According to [this discussion](https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/238583),
> It has 4+N states, where 4 refers to the user's position in ECEF and clock offset (x, y, z, t), and N states are inter-signal biases (ISB) for the number of non-GPS-L1 signal types. For instance, if the device measures signals of GPS L1 frequency, GLO G1 frequency, GPS L5 frequency, GAL E1 frequency at the same epoch, the number of non-GPS-L1 signal types equals 3 (i.e. N=3).

And according to [data overview](https://www.kaggle.com/c/google-smartphone-decimeter-challenge/data),
> [train/test]/[drive_id]/[phone_name]/[phone_name]_derived.csv - GNSS intermediate values derived from raw GNSS measurements, provided for convenience.
> The baseline locations are computed using correctedPrM and the satellite positions, using a standard Weighted Least Squares (WLS) solver, with the phone's position (x, y, z), clock bias (t), and isrbM for each unique signal type as states for each epoch.


So, based on the notebook published by Parrot (@hyperc), I fixed it to also estimate isrbM for each signal type, **but the score got worse**. 
  
**I would be grateful if you could point out what I am doing wrong in reproducing the baseline.**

### Reference
-  Parrot(@hyperc)'s notebook 
https://www.kaggle.com/hyperc/gsdc-reproducing-baseline-wls-on-one-measurement 

- Information provided by host
https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/238583

- Data Overview
https://www.kaggle.com/c/google-smartphone-decimeter-challenge/data  

In [None]:
import os
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from pathlib import Path
import pathlib
from tqdm.notebook import tqdm
INPUT = '../input/google-smartphone-decimeter-challenge/'
root = Path(INPUT)

In [None]:
def ecef2lla(x, y, z):
    # x, y and z are scalars or vectors in meters
    x = np.array([x]).reshape(np.array([x]).shape[-1], 1)
    y = np.array([y]).reshape(np.array([y]).shape[-1], 1)
    z = np.array([z]).reshape(np.array([z]).shape[-1], 1)

    a=6378137
    a_sq=a**2
    e = 8.181919084261345e-2
    e_sq = 6.69437999014e-3

    f = 1/298.257223563
    b = a*(1-f)

    # calculations:
    r = np.sqrt(x**2 + y**2)
    ep_sq  = (a**2-b**2)/b**2
    ee = (a**2-b**2)
    f = (54*b**2)*(z**2)
    g = r**2 + (1 - e_sq)*(z**2) - e_sq*ee*2
    c = (e_sq**2)*f*r**2/(g**3)
    s = (1 + c + np.sqrt(c**2 + 2*c))**(1/3.)
    p = f/(3.*(g**2)*(s + (1./s) + 1)**2)
    q = np.sqrt(1 + 2*p*e_sq**2)
    r_0 = -(p*e_sq*r)/(1+q) + np.sqrt(0.5*(a**2)*(1+(1./q)) - p*(z**2)*(1-e_sq)/(q*(1+q)) - 0.5*p*(r**2))
    u = np.sqrt((r - e_sq*r_0)**2 + z**2)
    v = np.sqrt((r - e_sq*r_0)**2 + (1 - e_sq)*z**2)
    z_0 = (b**2)*z/(a*v)
    h = u*(1 - b**2/(a*v))
    phi = np.arctan((z + ep_sq*z_0)/r)
    lambd = np.arctan2(y, x)

    return phi*180/np.pi, lambd*180/np.pi, h

def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
      np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

In [None]:
def make_gt(recal):
    if recal:
        p = pathlib.Path(INPUT)
        gt_files = list(p.glob('train/*/*/ground_truth.csv'))
        print('ground_truth.csv count : ', len(gt_files))

        gts = []
        for gt_file in tqdm(gt_files):
            gts.append(pd.read_csv(gt_file))
        ground_truth = pd.concat(gts)
        ground_truth.to_csv('gt.csv',index=False)
    else:
        ground_truth = pd.read_csv('gt.csv')
    return ground_truth
    
gt = make_gt(recal=True)

In [None]:
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

def get_train_score(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

def get_train_score_df(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='left')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    return df

In [None]:
def gnss_log_to_dataframes(path):
    '''Load GNSS Log'''
    print('Loading ' + path, flush = True)
    gnss_section_names = {'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for dataline in datalines:
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            gnss_map[dataline[0]] = dataline[1:]
        elif not is_header:
            datas[dataline[0]].append(dataline[1:])

    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            results[k][col] = pd.to_numeric(results[k][col])

    return results

In [None]:
def apply_tips1(raw_df, derived_df):
    # Create a new column in df_raw that corresponds to derivedの['millisSinceGpsEpoch']
    raw_df['millisSinceGpsEpoch'] = np.floor((raw_df['TimeNanos'] - raw_df['FullBiasNanos']) / 1000000.0).astype(int)
        
    # Change each value in df_derived['MillisSinceGpsEpoch'] to be the prior epoch.
    raw_timestamps = raw_df['millisSinceGpsEpoch'].unique()
    derived_timestamps = derived_df['millisSinceGpsEpoch'].unique()

    # The timestamps in derived are one epoch ahead. We need to map each epoch
    # in derived to the prior one (in Raw).
    indexes = np.searchsorted(raw_timestamps, derived_timestamps)
    from_t_to_fix_derived = dict(zip(derived_timestamps, raw_timestamps[indexes-1]))
    derived_df['millisSinceGpsEpoch'] = np.array(list(map(lambda v: from_t_to_fix_derived[v], derived_df['millisSinceGpsEpoch'])))
    return derived_df

def apply_tips5(derived_df):
    delta_millis = derived_df['millisSinceGpsEpoch'] - derived_df['receivedSvTimeInGpsNanos'] / 1e6
    where_good_signals = (delta_millis > 0) & (delta_millis < 300)
    return derived_df[where_good_signals]

In [None]:
output_dir = './'
os.makedirs(output_dir, exist_ok=True)

In [None]:
base_train = pd.read_csv(root/ 'baseline_locations_train.csv')
base_train.loc[:,['px','py','pz']] = 0
base_train.head()

## WLS
do only SJC area to reduce calculation time

In [None]:
# original distance function(# Parrot's method)
def distance(x, **kwargs):
    satx = kwargs["xSatPosMRotated"] - x[0]
    saty = kwargs["ySatPosMRotated"] - x[1]
    satz = kwargs["zSatPosMRotated"] - x[2]
    weight = kwargs["uncertaintyWeight"]
    prm = kwargs["correctedPrM"]

    d = weight * (np.sqrt(satx**2 + saty**2 +satz**2) + x[3] - prm)
    return d

# Set up least squares methods
def distance_v2(x, **kwargs):
    satx = kwargs["xSatPosMRotated"] - x[0]
    saty = kwargs["ySatPosMRotated"] - x[1]
    satz = kwargs["zSatPosMRotated"] - x[2]
    weight = kwargs["uncertaintyWeight"]
    prm = kwargs["correctedPrM"]
    
    isrbms = [k for k in kwargs.keys() if "_isrbM" in k]
    N = len(isrbms)
    isrbms_loss = 0
    for i in range(N):
        isrbms_loss += x[4+i] - kwargs[isrbms[i]]
        # isrbms_loss += x[4+i]

    d = weight * (np.sqrt(satx**2 + saty**2 +satz**2) + x[3] - prm + isrbms_loss)
    return d

In [None]:
# Parrot's method
def estimate_train_position_by_derived(args):
    (collection_name, phone_name), base_df = args
    if "SJC" not in collection_name:
        return base_df
    # Train df here only contains one collection and one measurement
    derived_df = pd.read_csv(root / f"train/{collection_name}/{phone_name}/{phone_name}_derived.csv")
    gnss_df = gnss_log_to_dataframes(str(root / f"train/{collection_name}/{phone_name}/{phone_name}_GnssLog.txt"))
    raw_df = gnss_df['Raw']
    
    # fixed epoch
    derived_df = apply_tips1(raw_df, derived_df)
    derived_df = apply_tips5(derived_df)
    derived_df = derived_df.sort_values('millisSinceGpsEpoch')
    
    # fixed pseudorange
    derived_df['correctedPrM'] = derived_df.apply(lambda r: r.rawPrM + r.satClkBiasM - r.isrbM - r.ionoDelayM - r.tropoDelayM,axis=1)
    
    # transmission time=pseudorange/light speed
    # diff between received time and send time
    light_speed = 299_792_458
    derived_df['transmissionTimeSeconds'] = derived_df['correctedPrM'] / light_speed

    # Compute true sat positions at arrival time
    omega_e = 7.2921151467e-5
    derived_df['xSatPosMRotated'] = \
        np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
        + np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']

    derived_df['ySatPosMRotated'] = \
        - np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
        + np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']
    derived_df['zSatPosMRotated'] = derived_df['zSatPosM']
    
    # weight for WLS
    derived_df['uncertaintyWeight'] = 1 / derived_df['rawPrUncM']

    output_df = pd.DataFrame()
    d_list = []
    x_list = []
    y_list = []
    z_list = []
    epoch_list = []
    
    # calc position each epoch
    for epoch, df in derived_df.groupby('millisSinceGpsEpoch'): 
        # estimate position by WLS
        x0 = [0]*4     
        opt_res = opt.least_squares(distance, x0, kwargs=df.to_dict(orient="list"))

        # Optimiser yields a position in the ECEF coordinates
        opt_res_pos = opt_res.x
        d = distance(opt_res_pos, **df.to_dict(orient="list"))

        # ECEF position to lat/long
        wls_estimated_pos = ecef2lla(*opt_res_pos[:3])
        wls_estimated_pos = np.squeeze(wls_estimated_pos)
        d_list.append(d)
        x_list.append(wls_estimated_pos[0])
        y_list.append(wls_estimated_pos[1])
        z_list.append(wls_estimated_pos[2])
        epoch_list.append(epoch)

    output_df["latDeg"] = x_list
    output_df["lngDeg"] = y_list
    output_df['heightAboveWgs84EllipsoidM'] = z_list
    output_df["dist"] = d_list
    output_df['millisSinceGpsEpoch'] = epoch_list
    output_df['collectionName'] = collection_name
    output_df['phoneName'] = phone_name

    output_df.to_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv', index=False)
    return output_df

In [None]:
# fixed method(but not good)
def estimate_train_position_by_derived_v2(args):
    (collection_name, phone_name), base_df = args
    if "SJC" not in collection_name:
        return base_df
    # Train df here only contains one collection and one measurement
    derived_df = pd.read_csv(root / f"train/{collection_name}/{phone_name}/{phone_name}_derived.csv")
    gnss_df = gnss_log_to_dataframes(str(root / f"train/{collection_name}/{phone_name}/{phone_name}_GnssLog.txt"))
    raw_df = gnss_df['Raw']
    
    # fixed epoch
    derived_df = apply_tips1(raw_df, derived_df)
    derived_df = apply_tips5(derived_df)
    derived_df = derived_df.sort_values('millisSinceGpsEpoch')
    
    # fixed pseudorange
    derived_df['correctedPrM'] = derived_df.apply(lambda r: r.rawPrM + r.satClkBiasM - r.isrbM - r.ionoDelayM - r.tropoDelayM,axis=1)
    
    # transmission time=pseudorange/light speed
    # diff between received time and send time
    light_speed = 299_792_458
    derived_df['transmissionTimeSeconds'] = derived_df['correctedPrM'] / light_speed

    # Compute true sat positions at arrival time
    omega_e = 7.2921151467e-5
    derived_df['xSatPosMRotated'] = \
        np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
        + np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']

    derived_df['ySatPosMRotated'] = \
        - np.sin(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['xSatPosM'] \
        + np.cos(omega_e * derived_df['transmissionTimeSeconds']) * derived_df['ySatPosM']
    derived_df['zSatPosMRotated'] = derived_df['zSatPosM']
    
    # weight for WLS
    derived_df['uncertaintyWeight'] = 1 / derived_df['rawPrUncM']

    output_df = pd.DataFrame()
    d_list = []
    x_list = []
    y_list = []
    z_list = []
    epoch_list = []
    
    # calc position each epoch
    for epoch, df in derived_df.groupby('millisSinceGpsEpoch'): 
        
        ################  
        #Fixed point ↓
        ################
        # the number of signal type (not GPS_L1)
        N = len([i for i in df["signalType"].unique() if i != "GPS_L1"])

        # estimate position by WLS
        # X -> 4 + N (4 = x,y,z,t)
        x0 = [0]*(4 + N)  
    
        for signal_type in df["signalType"].unique():
            if signal_type != "GPS_L1":
                df[f"{signal_type}_isrbM"] = 0
                df.loc[df["signalType"]==signal_type, f"{signal_type}_isrbM"] = df.loc[df["signalType"]==signal_type, "isrbM"].values   
        
        opt_res = opt.least_squares(distance_v2, x0, kwargs=df.to_dict(orient="list"))

        # Optimiser yields a position in the ECEF coordinates
        opt_res_pos = opt_res.x
        d = distance_v2(opt_res_pos, **df.to_dict(orient="list"))
        #################
        # Fixed point ↑
        #################
        
        # ECEF position to lat/long
        wls_estimated_pos = ecef2lla(*opt_res_pos[:3])
        wls_estimated_pos = np.squeeze(wls_estimated_pos)
        d_list.append(d)
        x_list.append(wls_estimated_pos[0])
        y_list.append(wls_estimated_pos[1])
        z_list.append(wls_estimated_pos[2])
        epoch_list.append(epoch)

    output_df["latDeg"] = x_list
    output_df["lngDeg"] = y_list
    output_df['heightAboveWgs84EllipsoidM'] = z_list
    output_df["dist"] = d_list
    output_df['millisSinceGpsEpoch'] = epoch_list
    output_df['collectionName'] = collection_name
    output_df['phoneName'] = phone_name

    output_df.to_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv', index=False)
    return output_df

## original method by parrot

In [None]:
import multiprocessing

gr = base_train.groupby(['collectionName','phoneName'])
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(estimate_train_position_by_derived, gr)
    dfs = tqdm(dfs, total=len(gr))
    dfs = list(dfs)
all_derived_df = pd.concat(dfs).sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch']).reset_index(drop=True)     

In [None]:
from tqdm.notebook import tqdm
df_list = []
count = 0
for (collection_name, phone_name), base_df in tqdm(base_train.groupby(['collectionName','phoneName'])):
    # break
    if "SJC" in collection_name:
        print(f"\n{collection_name} {phone_name}")
        base_df = base_df.sort_values('millisSinceGpsEpoch')
        target_gt = gt[(gt['collectionName']==collection_name)&(gt['phoneName']==phone_name)].sort_values('millisSinceGpsEpoch').reset_index(drop=True)
        
        # 
        derived_df = pd.read_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv')
        derived_df = derived_df[~derived_df["millisSinceGpsEpoch"].duplicated()].sort_values("millisSinceGpsEpoch").reset_index(drop=True)
        derived_df = derived_df.rename(columns={"latDeg":"_latDeg", "lngDeg":"_lngDeg"})
        print(base_df.shape, target_gt.shape, derived_df.shape)
        base_score = get_train_score(base_df, target_gt)
        print("baseline:", base_score)

        derived_df = pd.merge_asof(base_df, derived_df[["millisSinceGpsEpoch", "_latDeg", "_lngDeg"]], on=["millisSinceGpsEpoch"], tolerance=10, direction='nearest')

        # replace if data is nan 
        derived_df.loc[derived_df["_latDeg"].isna(), "_latDeg"] = derived_df.loc[derived_df["_latDeg"].isna(), "latDeg"].values
        derived_df.loc[derived_df["_lngDeg"].isna(), "_lngDeg"] = derived_df.loc[derived_df["_lngDeg"].isna(), "lngDeg"].values

        derived_df = derived_df.drop(["latDeg", "lngDeg"], axis=1).rename(columns={"_latDeg":"latDeg","_lngDeg":"lngDeg"})
        df_list.append(derived_df)
        derived_score = get_train_score(derived_df, target_gt)
        print("derived:", derived_score)
corrected_base_df = pd.concat(df_list).reset_index(drop=True)

There is a difference of ~10 from the actual score.  
Next I'll try a fixed version that estimates isrbM for each signal type.

## Fixed version(but not good)

In [None]:
gr = base_train.groupby(['collectionName','phoneName'])
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(estimate_train_position_by_derived_v2, gr)
    dfs = tqdm(dfs, total=len(gr))
    dfs = list(dfs)
all_derived_df = pd.concat(dfs).sort_values(['collectionName', 'phoneName', 'millisSinceGpsEpoch']).reset_index(drop=True)     

In [None]:
from tqdm.notebook import tqdm
df_list = []
count = 0
for (collection_name, phone_name), base_df in tqdm(base_train.groupby(['collectionName','phoneName'])):
    # break
    if "SJC" in collection_name:
        print(f"\n{collection_name} {phone_name}")
        base_df = base_df.sort_values('millisSinceGpsEpoch')
        target_gt = gt[(gt['collectionName']==collection_name)&(gt['phoneName']==phone_name)].sort_values('millisSinceGpsEpoch').reset_index(drop=True)
        
        # 
        derived_df = pd.read_csv(output_dir + f'{collection_name}_{phone_name}_derived.csv')
        derived_df = derived_df[~derived_df["millisSinceGpsEpoch"].duplicated()].sort_values("millisSinceGpsEpoch").reset_index(drop=True)
        derived_df = derived_df.rename(columns={"latDeg":"_latDeg", "lngDeg":"_lngDeg"})
        print(base_df.shape, target_gt.shape, derived_df.shape)
        base_score = get_train_score(base_df, target_gt)
        print("baseline:", base_score)

        derived_df = pd.merge_asof(base_df, derived_df[["millisSinceGpsEpoch", "_latDeg", "_lngDeg"]], on=["millisSinceGpsEpoch"], tolerance=10, direction='nearest')

        # replace if data is nan 
        derived_df.loc[derived_df["_latDeg"].isna(), "_latDeg"] = derived_df.loc[derived_df["_latDeg"].isna(), "latDeg"].values
        derived_df.loc[derived_df["_lngDeg"].isna(), "_lngDeg"] = derived_df.loc[derived_df["_lngDeg"].isna(), "lngDeg"].values

        derived_df = derived_df.drop(["latDeg", "lngDeg"], axis=1).rename(columns={"_latDeg":"latDeg","_lngDeg":"lngDeg"})
        df_list.append(derived_df)
        derived_score = get_train_score(derived_df, target_gt)
        print("derived:", derived_score)
corrected_base_df = pd.concat(df_list).reset_index(drop=True)

The error was larger than the original one.  
According to the discussion, we need to estimate isrbM to reproduce baseline, but I don't know how to do that.  
**I would be very happy if someone could tell me**.  