# Prepocessing of GBFS-Data from Voi

In [None]:
# imports
import json
import os

import pandas as pd

from tqdm import tqdm

In [None]:
# constants
DATA_DIR = "../../data/v/collection/"
FILE_TRAILING = "_vehicles_snap.json"

OUTPUT_FILE = "../processed_data/voi_demand.pickle"

In [3]:
docs = [doc for doc in sorted(os.listdir(DATA_DIR)) if doc.endswith(FILE_TRAILING)]

In [6]:
docs

['1739285705_1739285644_vehicles_snap.json',
 '1739286005_1739285943_vehicles_snap.json',
 '1739286306_1739286244_vehicles_snap.json',
 '1739286606_1739286552_vehicles_snap.json',
 '1739286906_1739286844_vehicles_snap.json',
 '1739287207_1739287146_vehicles_snap.json',
 '1739287507_1739287445_vehicles_snap.json',
 '1739287808_1739287748_vehicles_snap.json',
 '1739288108_1739288044_vehicles_snap.json',
 '1739288408_1739288344_vehicles_snap.json',
 '1739288709_1739288644_vehicles_snap.json',
 '1739289009_1739288947_vehicles_snap.json',
 '1739289309_1739289247_vehicles_snap.json',
 '1739289610_1739289547_vehicles_snap.json',
 '1739289910_1739289848_vehicles_snap.json',
 '1739290211_1739290144_vehicles_snap.json',
 '1739290511_1739290448_vehicles_snap.json',
 '1739290811_1739290748_vehicles_snap.json',
 '1739291112_1739291047_vehicles_snap.json',
 '1739291412_1739291346_vehicles_snap.json',
 '1739291713_1739291647_vehicles_snap.json',
 '1739292013_1739291949_vehicles_snap.json',
 '17392923

In [5]:
len(docs)

35755

In [None]:
def extract_time(filename):
    return filename.split("_")[0]

Unnamed: 0,filename,timestamp
0,1739285705_1739285644_vehicles_snap.json,2025-02-11 14:55:05
1,1739286005_1739285943_vehicles_snap.json,2025-02-11 15:00:05
2,1739286306_1739286244_vehicles_snap.json,2025-02-11 15:05:06
3,1739286606_1739286552_vehicles_snap.json,2025-02-11 15:10:06
4,1739286906_1739286844_vehicles_snap.json,2025-02-11 15:15:06


In [None]:
def convert_docs_to_df(docs_temp):
    """
    Convert a list of JSON files to a DataFrame containing filename and timestamp.
    """

    df = pd.DataFrame(docs_temp, columns=["filename"])

    df["timestamp"] = df["filename"].apply(extract_time)
    df["timestamp"] = pd.to_datetime(df["timestamp"].astype("int"), unit="s")
    df.sort_values(by="timestamp", inplace=True)
    return df

In [None]:
df = convert_docs_to_df(docs)
df.head()

In [8]:
df.describe()

Unnamed: 0,timestamp
count,35755
mean,2025-04-16 18:31:12.666088704
min,2025-02-11 14:55:05
25%,2025-03-15 15:16:01.500000
50%,2025-04-17 10:30:03
75%,2025-05-18 12:51:11.500000
max,2025-06-18 15:06:39


In [None]:
def detect_pickups(file_df):
    """
    Detects bike pickups by comparing consecutive snapshots.
    A pickup is when a bike is present in one snapshot but missing in the next.

    Args:
        file_df: DataFrame with filenames and timestamps

    Returns:
        DataFrame with pickup events
    """
    pickups = []

    # Iterate through consecutive pairs of snapshots
    for i in tqdm(range(len(file_df) - 1), desc="Detecting pickups"):
        current_file = file_df.iloc[i]["filename"]
        next_file = file_df.iloc[i + 1]["filename"]
        current_time = file_df.iloc[i]["timestamp"]
        next_time = file_df.iloc[i + 1]["timestamp"]

        try:
            # Load current snapshot
            with open(f"{DATA_DIR}{current_file}", "r") as fh:
                current_data = json.load(fh)
                current_bikes = {
                    bike["bike_id"]: bike for bike in current_data["data"]["bikes"]
                }

            # Load next snapshot
            with open(f"{DATA_DIR}{next_file}", "r") as fh:
                next_data = json.load(fh)
                next_bikes = {bike["bike_id"] for bike in next_data["data"]["bikes"]}

            # Find bikes that disappeared (were picked up)
            for bike_id, bike_data in current_bikes.items():
                if bike_id not in next_bikes:
                    pickup_data = {
                        "bike_id": bike_id,
                        "lat": bike_data.get("lat"),
                        "lon": bike_data.get("lon"),
                        "current_range_meters": bike_data.get("current_range_meters"),
                        "current_fuel_percent": bike_data.get("current_fuel_percent"),
                        "event_type": "pickup",
                        "event_time_start_range": current_time,
                        "event_time_end_range": next_time,
                    }
                    pickups.append(pickup_data)

            # Explicitly free memory
            del current_data, next_data, current_bikes, next_bikes

        except Exception as e:
            print(f"Error processing files {current_file} and {next_file}: {e}")

    if pickups:
        pickup_df = pd.DataFrame(pickups)
        return pickup_df
    else:
        return pd.DataFrame()

In [9]:
pick_ups_df = detect_pickups(df)
pick_ups_df.head()

Detecting pickups: 100%|██████████| 35754/35754 [51:39<00:00, 11.54it/s]  


Unnamed: 0,bike_id,lat,lon,current_range_meters,current_fuel_percent,event_type,event_time_start_range,event_time_end_range
0,VOJ:Vehicle:ed086d78-6194-4a8b-a37c-390f1f07533c,48.815282,9.235892,69600.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
1,VOJ:Vehicle:e1b69fc3-bf67-4eb8-a091-f2bc178d2f8d,48.810246,9.150238,76000.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
2,VOJ:Vehicle:f1339d7c-0637-4a19-957d-fab5a843bda0,48.772645,9.160812,59200.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
3,VOJ:Vehicle:149ddb57-a0b6-4bd2-a77a-aebb2132abfd,48.817641,9.155951,51200.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
4,VOJ:Vehicle:2030e8bd-f73f-454a-9065-c129a5ec6ca9,48.723247,9.106427,76000.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05


In [10]:
pick_ups_df.describe()

Unnamed: 0,lat,lon,current_range_meters,event_time_start_range,event_time_end_range
count,1083965.0,1083965.0,1083965.0,1083965,1083965
mean,48.9386,8.798506,51049.52,2025-04-21 08:16:40.666987008,2025-04-21 08:31:13.610288384
min,38.10037,-1.864902,800.0,2025-02-11 14:55:05,2025-02-11 15:00:05
25%,48.78061,8.41637,36000.0,2025-03-25 00:02:31,2025-03-25 00:07:32
50%,48.88818,8.710556,52800.0,2025-04-21 18:45:06,2025-04-21 18:50:07
75%,49.00934,9.170909,68000.0,2025-05-21 13:31:52,2025-05-21 13:36:52
max,52.49668,13.41007,80000.0,2025-06-18 15:01:38,2025-06-18 15:06:39
std,0.247613,0.3590984,19595.31,,


In [None]:
def detect_dropoffs(file_df):
    """
    Detects bike dropffs by comparing consecutive snapshots.
    A dropoff is when a bike is present in one snapshot but missing in the previous.

    Args:
        file_df: DataFrame with filenames and timestamps

    Returns:
        DataFrame with dropoff events
    """
    dropoffs = []

    # Iterate through consecutive pairs of snapshots
    for i in tqdm(range(len(file_df) - 1), desc="Detecting dropoffs"):
        current_file = file_df.iloc[i]["filename"]
        next_file = file_df.iloc[i + 1]["filename"]
        current_time = file_df.iloc[i]["timestamp"]
        next_time = file_df.iloc[i + 1]["timestamp"]

        try:
            # Load current snapshot
            with open(f"{DATA_DIR}{current_file}", "r") as fh:
                current_data = json.load(fh)
                current_bikes = {
                    bike["bike_id"] for bike in current_data["data"]["bikes"]
                }

            # Load next snapshot
            with open(f"{DATA_DIR}{next_file}", "r") as fh:
                next_data = json.load(fh)
                next_bikes = {
                    bike["bike_id"]: bike for bike in next_data["data"]["bikes"]
                }

            # Find bikes that dropped off
            for bike_id, bike_data in next_bikes.items():
                if bike_id not in current_bikes:
                    dropoff_data = {
                        "bike_id": bike_id,
                        "lat": bike_data.get("lat"),
                        "lon": bike_data.get("lon"),
                        "current_range_meters": bike_data.get("current_range_meters"),
                        "current_fuel_percent": bike_data.get("current_fuel_percent"),
                        "event_type": "dropoff",
                        "event_time_start_range": current_time,
                        "event_time_end_range": next_time,
                    }
                    dropoffs.append(dropoff_data)

            # Explicitly free memory
            del current_data, next_data, current_bikes, next_bikes

        except Exception as e:
            print(f"Error processing files {current_file} and {next_file}: {e}")

    if dropoffs:
        dropoff_df = pd.DataFrame(dropoffs)
        return dropoff_df
    else:
        return pd.DataFrame()

In [12]:
dropoffs_df = detect_dropoffs(df)
dropoffs_df.head()

Detecting dropoffs:   0%|          | 0/35754 [00:00<?, ?it/s]

Detecting dropoffs: 100%|██████████| 35754/35754 [48:55<00:00, 12.18it/s]  


Unnamed: 0,bike_id,lat,lon,current_range_meters,current_fuel_percent,event_type,event_time_start_range,event_time_end_range
0,VOJ:Vehicle:17e7a145-4dcf-4c22-835d-14167fdb3725,48.715877,9.000152,41600.0,,dropoff,2025-02-11 14:55:05,2025-02-11 15:00:05
1,VOJ:Vehicle:301bd7b2-8dcd-4ea3-93f3-fe7d1957943e,48.773468,9.175542,25600.0,,dropoff,2025-02-11 14:55:05,2025-02-11 15:00:05
2,VOJ:Vehicle:e4184254-5c61-46a4-9d61-28abeec0c20a,48.780381,9.178467,24800.0,,dropoff,2025-02-11 14:55:05,2025-02-11 15:00:05
3,VOJ:Vehicle:529725c7-b255-41ee-90fd-456da62d256a,48.800321,9.237849,46400.0,,dropoff,2025-02-11 14:55:05,2025-02-11 15:00:05
4,VOJ:Vehicle:a6d4d529-a45a-4930-8c1e-7d050fedc9e7,48.830956,9.161216,51200.0,,dropoff,2025-02-11 14:55:05,2025-02-11 15:00:05


In [13]:
dropoffs_df.describe()

Unnamed: 0,lat,lon,current_range_meters,event_time_start_range,event_time_end_range
count,1084366.0,1084366.0,1084366.0,1084366,1084366
mean,48.93853,8.798849,49491.14,2025-04-21 08:06:12.269644288,2025-04-21 08:20:46.855586560
min,48.29636,5.443516,800.0,2025-02-11 14:55:05,2025-02-11 15:00:05
25%,48.78057,8.416345,33600.0,2025-03-24 23:02:26,2025-03-24 23:07:26
50%,48.88802,8.71075,51200.0,2025-04-21 18:25:04,2025-04-21 18:30:05
75%,49.00934,9.17094,67200.0,2025-05-21 13:11:50,2025-05-21 13:16:50
max,61.39511,24.96076,80000.0,2025-06-18 15:01:38,2025-06-18 15:06:39
std,0.2482973,0.3591403,19922.77,,


In [14]:
len(dropoffs_df)

1084366

In [15]:
results_df = pd.concat([pick_ups_df, dropoffs_df], ignore_index=True)
results_df.head()

Unnamed: 0,bike_id,lat,lon,current_range_meters,current_fuel_percent,event_type,event_time_start_range,event_time_end_range
0,VOJ:Vehicle:ed086d78-6194-4a8b-a37c-390f1f07533c,48.815282,9.235892,69600.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
1,VOJ:Vehicle:e1b69fc3-bf67-4eb8-a091-f2bc178d2f8d,48.810246,9.150238,76000.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
2,VOJ:Vehicle:f1339d7c-0637-4a19-957d-fab5a843bda0,48.772645,9.160812,59200.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
3,VOJ:Vehicle:149ddb57-a0b6-4bd2-a77a-aebb2132abfd,48.817641,9.155951,51200.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05
4,VOJ:Vehicle:2030e8bd-f73f-454a-9065-c129a5ec6ca9,48.723247,9.106427,76000.0,,pickup,2025-02-11 14:55:05,2025-02-11 15:00:05


In [16]:
results_df.describe()

Unnamed: 0,lat,lon,current_range_meters,event_time_start_range,event_time_end_range
count,2168331.0,2168331.0,2168331.0,2168331,2168331
mean,48.93857,8.798677,50270.19,2025-04-21 08:11:26.410209280,2025-04-21 08:26:00.174982144
min,38.10037,-1.864902,800.0,2025-02-11 14:55:05,2025-02-11 15:00:05
25%,48.78059,8.416359,34400.0,2025-03-24 23:27:28,2025-03-24 23:32:29
50%,48.88809,8.710662,52000.0,2025-04-21 18:35:05,2025-04-21 18:40:06
75%,49.00934,9.170924,68000.0,2025-05-21 13:21:51,2025-05-21 13:26:51
max,61.39511,24.96076,80000.0,2025-06-18 15:01:38,2025-06-18 15:06:39
std,0.2479554,0.3591193,19775.1,,


In [None]:
results_df.value_counts("event_type")

event_type
dropoff    1084366
pickup     1083965
Name: count, dtype: int64

In [18]:
results_df.to_pickle(OUTPUT_FILE)

In [None]:
# extract fleet size in bounds
STUTTGART_BOUNDS = [9.1, 48.7, 9.3, 48.8]


def extract_fleet_size(file_df):
    """
    Extracts the fleet size within specified bounds.

    Args:
        file_df: DataFrame with filenames and timestamps

    Returns:
        DataFrame with fleet size per snapshot
    """
    fleet_sizes = []

    for i in tqdm(range(len(file_df)), desc="Detecting dropoffs"):
        current_file = file_df.iloc[i]["filename"]
        current_time = file_df.iloc[i]["timestamp"]

        try:
            with open(f"{DATA_DIR}{current_file}", "r") as fh:
                current_data = json.load(fh)
                current_bikes = {
                    bike["bike_id"]: bike for bike in current_data["data"]["bikes"]
                }

            fleet_counter = 0
            for bike_id, bike_data in current_bikes.items():
                if (
                    STUTTGART_BOUNDS[0] <= bike_data["lon"] <= STUTTGART_BOUNDS[2]
                    and STUTTGART_BOUNDS[1] <= bike_data["lat"] <= STUTTGART_BOUNDS[3]
                ):
                    fleet_counter += 1

            fleet_sizes.append(
                {
                    "timestamp": current_time,
                    "fleet_size": fleet_counter,
                    "full_fleet_size": len(current_bikes),
                }
            )

            del current_data, current_bikes

        except Exception as e:
            print(f"Error processing file {current_file}: {e}")

    if fleet_sizes:
        fleet_sizes_df = pd.DataFrame(fleet_sizes)
        return fleet_sizes_df
    else:
        return pd.DataFrame()

In [6]:
fleet_sizes_df = extract_fleet_size(df)
fleet_sizes_df.head()

Detecting dropoffs:  99%|█████████▉| 35417/35755 [37:14<00:16, 20.81it/s]  

Error processing file 1750157618_1750157186_vehicles_snap.json: 'lon'


Detecting dropoffs: 100%|██████████| 35755/35755 [37:32<00:00, 15.87it/s]


Unnamed: 0,timestamp,fleet_size,full_fleet_size
0,2025-02-11 14:55:05,909,3944
1,2025-02-11 15:00:05,909,3933
2,2025-02-11 15:05:06,907,3897
3,2025-02-11 15:10:06,904,3889
4,2025-02-11 15:15:06,906,3920


In [7]:
fleet_sizes_df.describe()

Unnamed: 0,timestamp,fleet_size,full_fleet_size
count,35754,35754.0,35754.0
mean,2025-04-16 18:28:43.610169344,872.34027,4197.258684
min,2025-02-11 14:55:05,781.0,3783.0
25%,2025-03-15 15:14:46.249999872,853.0,3939.0
50%,2025-04-17 10:27:32.500000,872.0,4292.0
75%,2025-05-18 12:47:26,893.0,4346.0
max,2025-06-18 15:06:39,944.0,4515.0
std,,28.402981,197.494072
