In [1]:
import os
from datasets import load_dataset, concatenate_datasets
from tqdm.auto import tqdm

# Collect the path of every ".json" entry found directly under ./data/,
# in sorted order so the file sequence is the same on every run.
files = sorted(
    os.path.join("./data/", entry)
    for entry in os.listdir("./data/")
    if entry.endswith(".json")
)

In [17]:
# Load only the first JSON file to inspect the raw record layout.
first_file = files[0]
dataset = load_dataset("json", data_files=first_file, split="train")

# Display a single raw record (row index 6).
dataset[6]

{'_id': {'$oid': '63ed595aaea1450668c4521e'},
 'clientId': 'bike_1676497598487',
 'userId': 'u_2a1da12fb19d44c2b2cc083d151154e8',
 'nickname': 'giulii.amigoni',
 'territoryId': 'L',
 'multimodalId': 'multimodal_1676497598487',
 'freeTrackingTransport': 'bike',
 'geolocationEvents': [{'accuracy': 2,
   'activity_confidence': 100,
   'activity_type': 'still',
   'altitude': 253.6,
   'battery_is_charging': False,
   'battery_level': 0.59,
   'created_at': {'$date': '2023-02-15T22:14:50.372Z'},
   'device_model': 'M2004J19C',
   'geocoding': [9.2559827, 45.7422482],
   'heading': 37.16,
   'is_moving': True,
   'latitude': 45.7422482,
   'longitude': 9.2559827,
   'multimodalId': 'multimodal_1676497598487',
   'recorded_at': {'$date': '2023-02-15T21:51:07Z'},
   'sharedTravelId': None,
   'speed': 23.66,
   'travelId': 'bike_1676497598487',
   'userId': 'u_2a1da12fb19d44c2b2cc083d151154e8',
   'uuid': '4043d327-50ae-4922-aa7e-b4192e367d74'},
  {'accuracy': 2,
   'activity_confidence': 100

In [3]:
def preprocess(example):
    """Flatten one raw record into the fields used downstream.

    Args:
        example: A raw record (mapping) that provides
            ``example['startTime']['$date']``,
            ``example['validationResult']['valid']`` and a
            ``'geolocationEvents'`` list whose items carry the keys read
            below (including ``event['recorded_at']['$date']``).

    Returns:
        dict with three keys:
            * ``'startTime'``: the ``$date`` value of the record's start time.
            * ``'is_valid'``: the ``valid`` flag of the validation result.
            * ``'geolocationEvents'``: one reduced dict per event (accuracy,
              altitude, heading, is_moving, latitude, longitude, speed,
              timestamp), ordered chronologically by ``timestamp``.

    Raises:
        KeyError / TypeError: if any of the fields listed above is missing
            or is not subscriptable.
    """

    def chronological_key(event):
        # The timestamps are ISO-8601 strings that may or may not carry a
        # fractional-second part ("...:07Z" vs "...:07.262Z"). Comparing the
        # raw strings misorders such pairs because '.' sorts before 'Z', so
        # compare (whole-second part, fraction digits) instead: an absent
        # fraction becomes '' and sorts first within its second.
        # NOTE(review): assumes every timestamp uses the same "Z" (UTC)
        # suffix, as in the records inspected so far - confirm.
        whole_seconds, _, fraction = event['timestamp'].rstrip('Z').partition('.')
        return whole_seconds, fraction

    data = {}
    data['startTime'] = example['startTime']['$date']
    data['is_valid'] = example['validationResult']['valid']

    # Keep only the per-event fields needed downstream.
    events = [{
        'accuracy': event['accuracy'],
        'altitude': event['altitude'],
        'heading': event['heading'],
        'is_moving': event['is_moving'],
        'latitude': event['latitude'],
        'longitude': event['longitude'],
        'speed': event['speed'],
        'timestamp': event['recorded_at']['$date']
    } for event in example["geolocationEvents"]]

    # Stable sort: events with identical keys keep their original order.
    data['geolocationEvents'] = sorted(events, key=chronological_key)

    return data


def select_columns_and_preprocess(chunks):
    """Load, preprocess and merge every JSON file yielded by ``chunks``.

    Each file is loaded as its own dataset, passed through ``preprocess``,
    reduced to the columns ``userId``, ``geolocationEvents``, ``startTime``
    and ``is_valid``, and filtered to rows whose ``is_valid`` is truthy.
    The per-file results are then concatenated in the order the files were
    yielded.

    Files are handled in a plain loop rather than by recursion, so the
    number of files is not bounded by the interpreter's recursion limit and
    the final concatenation happens once.

    Args:
        chunks: An iterable (or iterator) of JSON file paths.

    Returns:
        The merged dataset, or ``None`` when ``chunks`` yields nothing (or
        only files that load as empty datasets).
    """
    kept_columns = ['userId', 'geolocationEvents', 'startTime', 'is_valid']
    processed = []

    for chunk in chunks:
        dataset = load_dataset("json", data_files=chunk, split='train')
        print(dataset)

        if len(dataset) == 0:
            # An empty split would give num_proc=0 below and there is
            # nothing to preprocess anyway, so skip it.
            continue

        # os.cpu_count() may return None; never ask for more workers than rows.
        num_proc = min(os.cpu_count() or 1, len(dataset))
        dataset = dataset.map(preprocess, num_proc=num_proc)
        dataset = dataset.select_columns(kept_columns)
        dataset = dataset.filter(lambda x: x['is_valid'])
        processed.append(dataset)

    if not processed:
        return None
    if len(processed) == 1:
        # Nothing to merge: hand back the single dataset unchanged.
        return processed[0]
    return concatenate_datasets(processed)


# Process only the first two files, wrapped in a progress bar that the
# function consumes as an iterator.
progress = tqdm(files[:2], desc="Loading and preprocessing datasets")
dataset = select_columns_and_preprocess(iter(progress))

Loading and preprocessing datasets:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['_id', 'clientId', 'userId', 'nickname', 'territoryId', 'multimodalId', 'freeTrackingTransport', 'geolocationEvents', 'started', 'complete', 'validating', 'deviceInfo', 'startTime', 'validationResult', '_class', 'sharedTravelId', 'toCheck', 'changedValidity', 'overriddenDistances'],
    num_rows: 2385
})
Dataset({
    features: ['_id', 'clientId', 'userId', 'nickname', 'territoryId', 'multimodalId', 'freeTrackingTransport', 'geolocationEvents', 'started', 'complete', 'validating', 'deviceInfo', 'startTime', 'validationResult', '_class', 'toCheck', 'sharedTravelId', 'changedValidity', 'overriddenDistances'],
    num_rows: 197
})


In [11]:
import random

# Display one record picked at random from `dataset`.
sample = random.choice(dataset)
sample

{'_id': {'$oid': '63fce139dec1652a06d260b4'},
 'clientId': 'walk_1677516094559',
 'userId': 'u_3a51a8e9d99c4b67ab260121dcc9cca8',
 'nickname': 'Greenlantern',
 'territoryId': 'L',
 'multimodalId': 'multimodal_1677516094559',
 'freeTrackingTransport': 'walk',
 'geolocationEvents': [{'accuracy': 9,
   'activity_confidence': 100,
   'activity_type': 'still',
   'altitude': 250.9,
   'battery_is_charging': False,
   'battery_level': 0.58,
   'created_at': {'$date': '2023-02-27T16:58:33.751Z'},
   'device_model': 'CPH2197',
   'geocoding': [9.3977996, 45.8476558],
   'heading': 207.81,
   'is_moving': True,
   'latitude': 45.8476558,
   'longitude': 9.3977996,
   'multimodalId': 'multimodal_1677516094559',
   'recorded_at': {'$date': '2023-02-27T16:49:13.262Z'},
   'sharedTravelId': None,
   'speed': 1.5,
   'travelId': 'walk_1677516094559',
   'userId': 'u_3a51a8e9d99c4b67ab260121dcc9cca8',
   'uuid': 'ae7ffdf8-7f27-4f21-9fe6-152dbbb7a3a6'},
  {'accuracy': 9,
   'activity_confidence': 100,