# Reference
1. https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
1. https://www.kaggle.com/asobod11138/gsdc-neuralnet-keras (multi-threading)

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px
from multiprocessing import Pool
import multiprocessing as multi

# Set Path and Load Dataset

In [None]:
# Root directory of the competition data set.
PATH = Path("../input/google-smartphone-decimeter-challenge")
# Baseline location tables for the train and test splits.
train_df = pd.read_csv(PATH / "baseline_locations_train.csv")
test_df = pd.read_csv(PATH / "baseline_locations_test.csv")

In [None]:
# Print the (rows, columns) shape, then preview the first rows of the train table.
print(train_df.shape)
train_df.head()

In [None]:
# Print the (rows, columns) shape, then preview the first rows of the test table.
print(test_df.shape)
test_df.head()

# Define a Function to Load GnssLog.txt Files

In [None]:
def gnss_log_to_dataframes(path):
    """Parse a GnssLog text file into one DataFrame per log section.

    A line starting with ``#`` whose first comma-separated field is a known
    section name is taken as that section's column header (the remaining
    fields are the column names). Any other ``#`` line (notes, version
    numbers, etc.) is ignored. A line not starting with ``#`` is a data row
    and is stored under the section named by its first field; rows whose
    first field is not a known section (blank lines included) are dropped.

    Args:
        path: Location of the log file to read.

    Returns:
        A dict mapping every known section name to a DataFrame of its rows.
        Sections without rows yield an empty frame. Every column except
        'CodeType' is converted with ``pd.to_numeric`` when the conversion
        succeeds and is left as parsed strings otherwise.

    Raises:
        ValueError: Propagated from ``pd.DataFrame`` when a section's rows do
            not match the number of columns in its header.
    """
    gnss_section_names = {'Raw','UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    headers = {name: [] for name in gnss_section_names}
    rows = {name: [] for name in gnss_section_names}

    # Stream the file line by line instead of holding every line in memory.
    with open(path) as log_file:
        for line in log_file:
            is_header = line.startswith('#')
            fields = line.strip('#').strip().split(',')
            section = fields[0]
            # Skip notes, version numbers, blank lines and unknown sections.
            if section not in gnss_section_names:
                continue
            if is_header:
                headers[section] = fields[1:]
            else:
                rows[section].append(fields[1:])

    results = {
        name: pd.DataFrame(section_rows, columns=headers[name])
        for name, section_rows in rows.items()
    }

    # pandas doesn't infer numeric types from lists of strings by default.
    for frame in results.values():
        for col in frame.columns:
            if col == 'CodeType':
                continue
            try:
                frame[col] = pd.to_numeric(frame[col])
            except (ValueError, TypeError):
                # Non-numeric content (or a duplicated column name, where
                # frame[col] is not a Series): keep the values unchanged.
                pass
    return results

# Define a Function to Load All Data

In [None]:
def get_addtional_data(df : pd.DataFrame, path: Path, train = True):
    """Merge the per-phone side files of one data split onto ``df``.

    Every file matching ``<path>/<train|test>/*/*/*`` is visited. The two
    directory names directly above a file are used as its collection name
    and phone name. Files are dispatched on their file name:

    * contains ``ground_truth``: read as CSV; the columns ``latDeg``,
      ``lngDeg`` and ``heightAboveWgs84EllipsoidM`` are re-added with a
      ``t_`` prefix and the unprefixed originals are dropped.
    * contains ``derived.csv``: read as CSV unchanged.
    * contains ``GnssLog.txt``: parsed with ``gnss_log_to_dataframes``.
      Each non-empty section gets ``collectionName`` and ``phoneName``
      columns plus ``millisSinceGpsEpoch`` computed from ``utcTimeMillis``
      (``UnixTimeMillis`` is first renamed to that for 'Status' and 'Fix').

    Each section that collected at least one row is then joined onto ``df``
    with ``pd.merge_asof``: nearest ``millisSinceGpsEpoch`` within a
    tolerance of 100000, matched per ``collectionName`` and ``phoneName``.

    Args:
        df: Frame to enrich; it must contain ``millisSinceGpsEpoch``,
            ``collectionName`` and ``phoneName`` (the merge keys used below).
        path: Root directory holding the ``train`` and ``test`` folders.
        train: Read the ``train`` folder when true, otherwise ``test``.

    Returns:
        The merged DataFrame, sorted by ``millisSinceGpsEpoch`` whenever at
        least one merge happened; ``df`` itself when nothing was merged.
    """
    # A fixed order (the original used a set) keeps the merge sequence, and
    # therefore the suffixes pandas gives to clashing columns, reproducible.
    # NOTE(review): several sections share column names, so repeated merges
    # rely on pandas' default _x/_y suffixes - confirm the resulting schema.
    section_names = ('GroundTruth', 'Derived', 'Raw', 'UncalAccel', 'UncalGyro',
                     'UncalMag', 'Fix', 'Status', 'OrientationDeg')
    _columns = ['latDeg', 'lngDeg', 'heightAboveWgs84EllipsoidM']
    # Unix time in milliseconds of the GPS epoch (1980-01-06 00:00:00 UTC).
    gps_epoch_unix_millis = 315964800000

    # Gather frames per section and concatenate once at the end rather than
    # re-copying the accumulated frame for every file.
    collected = {section: [] for section in section_names}

    # Use the caller-supplied root; previously the global PATH was read and
    # the ``path`` argument was shadowed by the loop variable.
    split_dir = Path(path) / ("train" if train else "test")

    for file_path in tqdm(sorted(glob(str(split_dir / "*/*/*")))):
        print(file_path)
        collectionName, phoneName, file_name = Path(file_path).parts[-3:]

        if 'ground_truth' in file_name: # get ground truth data
            _df = pd.read_csv(file_path)
            _df[['t_'+col for col in _columns]] = _df[_columns]
            _df = _df.drop(columns=_columns)
            collected['GroundTruth'].append(_df)

        elif 'derived.csv' in file_name: # get derived data
            collected['Derived'].append(pd.read_csv(file_path))

        elif 'GnssLog.txt' in file_name: # get gnss log data (it is dict)
            for key, value in gnss_log_to_dataframes(file_path).items():
                if value.shape[0] == 0: # empty log bypass
                    continue

                # Additional metadata for merging with the original data frame
                value['collectionName'] = collectionName
                value['phoneName'] = phoneName
                if key in ("Status", "Fix"):
                    value.rename(columns = {'UnixTimeMillis':'utcTimeMillis'}, inplace = True)
                value["millisSinceGpsEpoch"] = value["utcTimeMillis"] - gps_epoch_unix_millis

                collected[key].append(value)

    for section in section_names:
        frames = collected[section]
        if not frames:
            continue
        value = pd.concat(frames)
        if value.shape[0] == 0:
            continue
        # merge_asof requires both sides to be sorted on the "on" key.
        df = pd.merge_asof(df.sort_values('millisSinceGpsEpoch'),
              value.sort_values('millisSinceGpsEpoch'),
              on="millisSinceGpsEpoch", by=["collectionName", "phoneName"],
              direction='nearest',tolerance=100000)

    return df
    
                
    
    

# Save To Pickle File

In [None]:
# Merge the train-split side files onto the baseline train table.
output = get_addtional_data(train_df, PATH, train = True)

# NOTE(review): pandas infers pickle compression from the file suffix, and
# ".gzip" is not among the suffixes its documentation lists (".gz", ".bz2",
# ".zip", ".xz", ...), so this file is presumably written uncompressed -
# confirm. If compression is wanted, pass compression="gzip" both here and
# in the matching pd.read_pickle call.
output.to_pickle("gsdc_train.pkl.gzip")

In [None]:
# Merge the test-split side files onto the baseline test table.
output = get_addtional_data(test_df, PATH, train = False)

# NOTE(review): pandas infers pickle compression from the file suffix, and
# ".gzip" is not among the suffixes its documentation lists (".gz", ".bz2",
# ".zip", ".xz", ...), so this file is presumably written uncompressed -
# confirm. If compression is wanted, pass compression="gzip" both here and
# in the matching pd.read_pickle call.
output.to_pickle("gsdc_test.pkl.gzip")

In [None]:
%clear

# Load Pickle File

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path
import plotly.express as px

In [None]:
# Same data root as defined earlier in the file; repeated here, presumably so
# this section can run on its own - confirm.
PATH = Path("../input/google-smartphone-decimeter-challenge")

In [None]:
# Reload the merged train table from the pickle file written earlier.
df_train = pd.read_pickle("gsdc_train.pkl.gzip")

In [None]:
# Print the (rows, columns) shape, then preview the first rows of the reloaded train table.
print(df_train.shape)
df_train.head()

In [None]:
# Reload the merged test table from the pickle file written earlier.
df_test = pd.read_pickle("gsdc_test.pkl.gzip")

In [None]:
# Print the (rows, columns) shape, then preview the first rows of the reloaded test table.
print(df_test.shape)
df_test.head()