In [1]:
import os
from datasets import load_dataset, concatenate_datasets
from tqdm.auto import tqdm

# Collect every .json file directly under ./data/ in lexicographic path order.
files = sorted(
    os.path.join("./data/", entry)
    for entry in os.listdir("./data/")
    if entry.endswith(".json")
)

In [2]:
def preprocess(example):
    """Flatten one raw trip record into plain start-time and event fields.

    Unwraps the ``$date`` wrapper of ``startTime`` and of each event's
    ``recorded_at`` entry, keeps only the positional attributes of every
    geolocation event, and orders the events by timestamp.

    Args:
        example: Mapping with a ``startTime`` entry of the form
            ``{'$date': ...}`` and a ``geolocationEvents`` iterable of event
            mappings.

    Returns:
        dict: ``{'startTime': ..., 'geolocationEvents': [...]}`` with the
        event list in ascending ``timestamp`` order.
    """
    start_time = example['startTime']['$date']

    # Attributes copied verbatim from each raw event, in output key order.
    copied_fields = ('accuracy', 'latitude', 'longitude',
                     'altitude', 'speed', 'heading')

    events = []
    for raw_event in example["geolocationEvents"]:
        flat_event = {'timestamp': raw_event['recorded_at']['$date']}
        for field in copied_fields:
            flat_event[field] = raw_event[field]
        events.append(flat_event)

    # list.sort is stable, so events sharing a timestamp keep their input order.
    events.sort(key=lambda item: item['timestamp'])

    return {'startTime': start_time, 'geolocationEvents': events}


def select_columns_and_preprocess(chunk):
    """Load one JSON file, preprocess its records, and keep the trip columns.

    Args:
        chunk: Value passed to ``load_dataset`` as ``data_files``.

    Returns:
        The ``'train'`` split with ``preprocess`` applied, restricted to the
        ``userId``, ``geolocationEvents`` and ``startTime`` columns.
    """
    dataset = load_dataset("json", data_files=chunk, split='train')

    # os.cpu_count() may return None (which would make min() raise TypeError),
    # and an empty split would otherwise request 0 workers; always use at
    # least one worker and never more than there are rows.
    num_workers = max(1, min(os.cpu_count() or 1, len(dataset)))

    dataset = dataset.map(preprocess, num_proc=num_workers)
    dataset = dataset.select_columns(
        ['userId', 'geolocationEvents', 'startTime'])
    return dataset


# Preprocess each file in turn (with a progress bar) and stack the results
# into a single dataset.
dataset = concatenate_datasets(
    list(map(select_columns_and_preprocess, tqdm(files)))
)

  0%|          | 0/36 [00:00<?, ?it/s]

In [3]:
# Show the combined dataset's summary (feature names and row count).
dataset

Dataset({
    features: ['userId', 'geolocationEvents', 'startTime'],
    num_rows: 73432
})

In [4]:
# Upload the combined dataset to the "rik1599/playngo-trips" repository on the Hub.
# NOTE(review): the dataset keeps `userId` together with geolocation traces —
# confirm the repository visibility and anonymisation are appropriate before publishing.
dataset.push_to_hub("rik1599/playngo-trips")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/37 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rik1599/playngo-trips/commit/df14aaa35dd6af67628d256fbe2e124c8fc85f19', commit_message='Upload dataset', commit_description='', oid='df14aaa35dd6af67628d256fbe2e124c8fc85f19', pr_url=None, pr_revision=None, pr_num=None)