In [1]:
import pandas as pd
import pathlib
import numpy as np
from tqdm.notebook import tqdm

In [2]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    RADIUS = 6_367_000
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist

    
def percentile50(x):
    return np.percentile(x, 50)
def percentile95(x):
    return np.percentile(x, 95)

def get_train_score1(df, gt):
    gt = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    # df = df.merge(gt, on=['collectionName', 'phoneName', 'millisSinceGpsEpoch'], how='inner')
    df = df.merge(gt, on=['phone', 'millisSinceGpsEpoch'], how='inner')
    # calc_distance_error
    df['err'] = calc_haversine(df['latDeg_gt'], df['lngDeg_gt'], df['latDeg'], df['lngDeg'])
    # calc_evaluate_score
    # df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95]) # phoneによってgroupbyし、gtと予測値の差(err)の50%,95%値を求める
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score

In [3]:
INPUT = '../input/google-smartphone-decimeter-challenge'
p = pathlib.Path(INPUT)
gt_files = list(p.glob('train/*/*/ground_truth.csv'))
print('ground_truth.csv count : ', len(gt_files))

gts = []
for gt_file in tqdm(gt_files):
    gts.append(pd.read_csv(gt_file))
ground_truth = pd.concat(gts)
ground_truth['phone'] = ground_truth['collectionName'] + '_' + ground_truth['phoneName']

ground_truth.csv count :  73


  0%|          | 0/73 [00:00<?, ?it/s]

### Method:Make train data

In [4]:
#  making ground truth file
def make_gt(path, collectionName, phoneName):
    # ground_truth
    p = pathlib.Path(path)
    gt_files = list(p.glob('train/*/*/ground_truth.csv'))

    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    ground_truth = pd.concat(gts)
    
    # baseline
    cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    baseline = pd.read_csv(path + '/baseline_locations_train.csv', usecols=cols)
    ground_truth = ground_truth.merge(baseline, how='inner', on=cols[:3], suffixes=('_gt', '_bs'))
    # ground_truth["millisSinceGpsEpoch"] = ground_truth["millisSinceGpsEpoch"]//1000
    if (collectionName is None) or (phoneName is None):
        return ground_truth
    else:
        return ground_truth[(ground_truth['collectionName'] == collectionName) & (ground_truth['phoneName'] == phoneName)]
    

def make_tag(df, tag_v):
    df.loc[df['speedMps'] < tag_v, 'tag'] = 1
    df.loc[df['speedMps'] >= tag_v, 'tag'] = 0
    return df


# loading gnss file
def gnss_log_to_dataframes(path):
    print('Loading ' + path, flush=True)
    gnss_section_names = {'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for dataline in datalines:
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            try:
                gnss_map[dataline[0]] = dataline[1:]
            except:
                pass
        elif not is_header:
            try:
                datas[dataline[0]].append(dataline[1:])
            except:
                pass
    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            try:
                results[k][col] = pd.to_numeric(results[k][col])
            except:
                pass
    return results


def add_IMU(df, INPUT, cname, pname):
    path = INPUT + "/train/"+cname+"/"+pname+"/"+pname+"_GnssLog.txt"
    gnss_dfs = gnss_log_to_dataframes(path)
    acce_df = gnss_dfs["UncalAccel"]
    magn_df = gnss_dfs["UncalMag"]
    gyro_df = gnss_dfs["UncalGyro"]
    
    acce_df["millisSinceGpsEpoch"] = acce_df["utcTimeMillis"] - 315964800000
    acce_df["millisSinceGpsEpoch"] = acce_df["millisSinceGpsEpoch"]//1000 +18
    magn_df["millisSinceGpsEpoch"] = magn_df["utcTimeMillis"] - 315964800000
    magn_df["millisSinceGpsEpoch"] = magn_df["millisSinceGpsEpoch"]//1000 +18
    gyro_df["millisSinceGpsEpoch"] = gyro_df["utcTimeMillis"] - 315964800000
    gyro_df["millisSinceGpsEpoch"] = gyro_df["millisSinceGpsEpoch"]//1000 +18
    
    acce_df["x_f_acce"] = acce_df["UncalAccelZMps2"]
    acce_df["y_f_acce"] = acce_df["UncalAccelXMps2"]
    acce_df["z_f_acce"] = acce_df["UncalAccelYMps2"]
    # magn 
    magn_df["x_f_magn"] = magn_df["UncalMagZMicroT"]
    magn_df["y_f_magn"] = magn_df["UncalMagYMicroT"]
    magn_df["z_f_magn"] = magn_df["UncalMagXMicroT"]
    # gyro
    gyro_df["x_f_gyro"] = gyro_df["UncalGyroXRadPerSec"]
    gyro_df["y_f_gyro"] = gyro_df["UncalGyroYRadPerSec"]
    gyro_df["z_f_gyro"] = gyro_df["UncalGyroZRadPerSec"]    

    df = pd.merge_asof(df[["collectionName", "phoneName", "millisSinceGpsEpoch", "latDeg_gt", "lngDeg_gt", "latDeg_bs", "lngDeg_bs", "heightAboveWgs84EllipsoidM", "speedMps"]].sort_values('millisSinceGpsEpoch'), acce_df[["millisSinceGpsEpoch", "x_f_acce", "y_f_acce", "z_f_acce"]].sort_values('millisSinceGpsEpoch'), on='millisSinceGpsEpoch', direction='nearest')
    df = pd.merge_asof(df[["collectionName", "phoneName", "millisSinceGpsEpoch", "latDeg_gt", "lngDeg_gt", "latDeg_bs", "lngDeg_bs", "heightAboveWgs84EllipsoidM", "speedMps", "x_f_acce", "y_f_acce", "z_f_acce"]].sort_values('millisSinceGpsEpoch'), magn_df[["millisSinceGpsEpoch", "x_f_magn", "y_f_magn", "z_f_magn"]].sort_values('millisSinceGpsEpoch'), on='millisSinceGpsEpoch', direction='nearest')
    df = pd.merge_asof(df[["collectionName", "phoneName", "millisSinceGpsEpoch", "latDeg_gt", "lngDeg_gt", "latDeg_bs", "lngDeg_bs", "heightAboveWgs84EllipsoidM", "speedMps", "x_f_acce", "y_f_acce", "z_f_acce", "x_f_magn", "y_f_magn", "z_f_magn"]].sort_values('millisSinceGpsEpoch'), gyro_df[["millisSinceGpsEpoch", "x_f_gyro", "y_f_gyro", "z_f_gyro"]].sort_values('millisSinceGpsEpoch'), on='millisSinceGpsEpoch', direction='nearest')
    return df

def make_train(INPUT, train_cname, tag_v):
    # make ground_truth file
    gt = make_gt(INPUT, None, None)
    train_df = pd.DataFrame()
    for cname in train_cname:
        phone_list = gt[gt['collectionName'] == cname]['phoneName'].drop_duplicates()
        for pname in phone_list:
            df = gt[(gt['collectionName'] == cname) & (gt['phoneName'] == pname)]
            df = add_IMU(df, INPUT, cname, pname)
            train_df = pd.concat([train_df, df])
    # make tag
    train_df = make_tag(train_df, tag_v)
    return train_df

###  Method:Model(Light GBM)

In [10]:
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


def lgbm(train, test, col, lgb_params):
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(train[col], train['tag'])
    preds = model.predict(test[col])
    print('confusion matrix :  \n', confusion_matrix(preds, test['tag']))
    print('accuracy score : ', accuracy_score(preds, test['tag']))
    return preds

### Method:Confirm Score

In [5]:
def get_train_score(df):
    # calc_distance_error
    df['err'] =  calc_haversine(df.latDeg_bs, df.lngDeg_bs, 
    df.latDeg_gt, df.lngDeg_gt)
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score


def percentile50(x):
    return np.percentile(x, 50)


def percentile95(x):
    return np.percentile(x, 95)


def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(a**0.5)
    dist = 6_367_000 * c
    return dist

### Initial value

Please adjust the parameters as you like.

In [6]:
INPUT = '../input/google-smartphone-decimeter-challenge'

train_cname = ['2021-04-29-US-SJC-2', '2021-03-10-US-SVL-1']
test_cname = ['2021-04-28-US-SJC-1']
tag_v = 0.5
col = ["x_f_acce", "y_f_acce", "z_f_acce", "x_f_magn", "y_f_magn", "z_f_magn", "x_f_gyro", "y_f_gyro", "z_f_gyro"]

# parameter
lgb_params = {
    'num_leaves': 90,
    'n_estimators': 125,
}

### Main

In [7]:
# make train&test
train_df = make_train(INPUT, train_cname, tag_v)
test_df = make_train(INPUT, test_cname, tag_v)

Loading ../input/google-smartphone-decimeter-challenge/train/2021-04-29-US-SJC-2/Pixel4/Pixel4_GnssLog.txt
Loading ../input/google-smartphone-decimeter-challenge/train/2021-04-29-US-SJC-2/SamsungS20Ultra/SamsungS20Ultra_GnssLog.txt
Loading ../input/google-smartphone-decimeter-challenge/train/2021-03-10-US-SVL-1/Pixel4XL/Pixel4XL_GnssLog.txt
Loading ../input/google-smartphone-decimeter-challenge/train/2021-03-10-US-SVL-1/SamsungS20Ultra/SamsungS20Ultra_GnssLog.txt
Loading ../input/google-smartphone-decimeter-challenge/train/2021-04-28-US-SJC-1/Pixel4/Pixel4_GnssLog.txt
Loading ../input/google-smartphone-decimeter-challenge/train/2021-04-28-US-SJC-1/SamsungS20Ultra/SamsungS20Ultra_GnssLog.txt


In [11]:
# prediction with light gbm
test_df['preds'] = lgbm(train_df, test_df, col, lgb_params)

confusion matrix :  
 [[2686 1411]
 [   0    0]]
accuracy score :  0.6556016597510373


In [13]:
train1 = train_df.copy()
train1['phone'] = train1['collectionName'] + '_' + train1['phoneName']
train1 = train1.rename(columns={'latDeg_bs':'latDeg', 'lngDeg_bs':'lngDeg'})
train1 = train1[['phone', 'collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'tag']]
get_train_score1(train1, ground_truth)

13.245089919926727

In [46]:
def remove_lowSpeed(_df, window=100):
    df = _df.copy()
    df['latDeg'] = df['latDeg'].astype(float)
    df['lngDeg'] = df['lngDeg'].astype(float)

    # phoneごとに補間する
    dfs = []
    for _, df in df.groupby('phone'):
        _index = df[df['tag']==1].index
        for idx in _index:
            window_b = idx-int(window/2)
            window_f = idx+int(window/2)
            if window_b>0 and window_f<len(df):
                df.loc[idx, 'latDeg'] = df.loc[window_b:window_f, 'latDeg'].mean()
                df.loc[idx, 'lngDeg'] = df.loc[window_b:window_f, 'lngDeg'].mean()
        df = df.interpolate(method='linear',
                            limit=None,
                            limit_direction='both')
        dfs.append(df)
    df = pd.concat(dfs)
    return df[['phone','millisSinceGpsEpoch','latDeg','lngDeg','tag']]

In [47]:
train2 = remove_lowSpeed(train1, window=20)
print(get_train_score1(train2, ground_truth))

12.527358949606755


In [27]:
import plotly.express as px
def visualize_trafic(df, center={"lat":37.6458, "lon":-122.4056}, zoom=9):
    fig = px.scatter_mapbox(df,
                            # Here, plotly gets, (x,y) coordinates
                            lat="latDeg",
                            lon="lngDeg",
                            
                            #Here, plotly detects color of series
                            color="tag",
                            
                            labels="phone",
                            zoom=zoom,
                            center=center,
                            height=600,
                            width=800)
    fig.update_layout(mapbox_style='stamen-terrain')
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.update_layout(title_text="GPS trafic")
    fig.show()

In [28]:
visualize_trafic(train2)

### Visualize data

Visualizing that speed is 0 or not.

Looking at the figure, it appears that multipath is occurring where the car is determined to be stationary😄

In [14]:
import plotly.express as px
fig = px.scatter_mapbox(test_df,
                    # Here, plotly gets, (x,y) coordinates
                    lat="latDeg_bs",
                    lon="lngDeg_bs",
                    text='phoneName',

                    #Here, plotly detects color of series
                    color="preds",
                    labels="collectionName",

                    zoom=14.5,
                    center={"lat":37.334, "lon":-121.89},
                    height=600,
                    width=800)
fig.update_layout(mapbox_style='stamen-terrain')
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_layout(title_text="GPS trafic")
fig.show()