# Preprocessing of GBFS-Data from Bolt

In [2]:
# imports 
import json
import os

import pandas as pd
import geopandas as gpd

from tqdm import tqdm

from concurrent.futures import ProcessPoolExecutor

In [21]:
# constants
# Directory that holds the raw vehicle snapshot files
DATA_DIR = '../../data/b_stu/'
# Only files whose name ends with this suffix are treated as snapshots
FILE_TRAILING = '_vehicles_snap.json'

# Destination path of the pickled result DataFrame
OUTPUT_FILE = '../processed_data/bolt_demand.pickle'

In [4]:
# Names (not full paths) of all snapshot files in DATA_DIR, in lexicographic order;
# entries that do not end with FILE_TRAILING are ignored.
docs = [doc for doc in sorted(os.listdir(DATA_DIR)) if doc.endswith(FILE_TRAILING)]

In [5]:
docs

['1739284777_1739284565_vehicles_snap.json',
 '1739285077_1739284867_vehicles_snap.json',
 '1739285377_1739285168_vehicles_snap.json',
 '1739285678_1739285524_vehicles_snap.json',
 '1739285978_1739285887_vehicles_snap.json',
 '1739286278_1739286191_vehicles_snap.json',
 '1739286579_1739286191_vehicles_snap.json',
 '1739286879_1739286551_vehicles_snap.json',
 '1739287179_1739286906_vehicles_snap.json',
 '1739287479_1739287265_vehicles_snap.json',
 '1739287780_1739287568_vehicles_snap.json',
 '1739288080_1739287927_vehicles_snap.json',
 '1739288380_1739288286_vehicles_snap.json',
 '1739288681_1739288286_vehicles_snap.json',
 '1739288981_1739288646_vehicles_snap.json',
 '1739289281_1739288947_vehicles_snap.json',
 '1739289581_1739289250_vehicles_snap.json',
 '1739289882_1739289605_vehicles_snap.json',
 '1739290182_1739289965_vehicles_snap.json',
 '1739290482_1739290267_vehicles_snap.json',
 '1739290783_1739290569_vehicles_snap.json',
 '1739291083_1739290925_vehicles_snap.json',
 '17392913

In [6]:
len(docs)

21580

In [7]:
def extract_time(filename):
    """
    Return the part of ``filename`` that precedes its first underscore.

    For a snapshot name such as '1739284777_1739284565_vehicles_snap.json'
    this yields '1739284777'. The value is returned as a string; no numeric
    or datetime conversion happens here. A name without any underscore is
    returned unchanged.
    """
    prefix, _separator, _rest = filename.partition('_')
    return prefix

def convert_docs_to_df(docs_temp):
    """
    Convert a list of snapshot filenames to a DataFrame of filename and timestamp.

    The text before the first underscore of each filename is parsed as an
    integer number of seconds since the Unix epoch and converted to a
    datetime. Rows are returned sorted by that timestamp; the original
    positional index labels are kept (the index is not reset).

    Args:
        docs_temp: list of filenames such as
            '1739284777_1739284565_vehicles_snap.json'.

    Returns:
        DataFrame with the columns 'filename' and 'timestamp', ordered by
        'timestamp' (ties keep their input order).

    Raises:
        ValueError: if a filename prefix cannot be parsed as an integer.
    """
    df = pd.DataFrame(docs_temp, columns=['filename'])

    # Vectorised prefix extraction instead of a row-wise apply.
    epoch_text = df['filename'].str.split('_', n=1).str[0]

    # Explicit 64-bit width: plain 'int' maps to a 32-bit type on some platforms.
    df['timestamp'] = pd.to_datetime(epoch_text.astype('int64'), unit='s')

    # A stable sort keeps the input order of equal timestamps deterministic.
    return df.sort_values(by='timestamp', kind='stable')

df = convert_docs_to_df(docs)
df.head()

Unnamed: 0,filename,timestamp
0,1739284777_1739284565_vehicles_snap.json,2025-02-11 14:39:37
1,1739285077_1739284867_vehicles_snap.json,2025-02-11 14:44:37
2,1739285377_1739285168_vehicles_snap.json,2025-02-11 14:49:37
3,1739285678_1739285524_vehicles_snap.json,2025-02-11 14:54:38
4,1739285978_1739285887_vehicles_snap.json,2025-02-11 14:59:38


In [8]:
df.describe()

Unnamed: 0,timestamp
count,21580
mean,2025-03-21 14:49:21.170945536
min,2025-02-11 14:39:37
25%,2025-03-02 08:48:35
50%,2025-03-21 20:43:09.500000
75%,2025-04-09 14:55:07.249999872
max,2025-04-28 09:13:04


In [9]:
def detect_pickups(file_df):
    """
    Detect bike pickups by comparing consecutive snapshots.

    A pickup is recorded for every bike that is present in one snapshot but
    missing from the next one. The exact moment of the pickup is unknown; it
    lies between the timestamps of the two snapshots, which are stored as
    'event_time_start_range' and 'event_time_end_range'.

    Snapshots are compared in the row order of ``file_df`` (rows i and i+1),
    so the caller is expected to pass the rows in chronological order. Every
    file is read from DATA_DIR only once: the snapshot parsed as "next" for
    one pair is reused as "current" for the following pair. A pair in which
    either file cannot be read or parsed is reported via ``print`` and
    skipped.

    Args:
        file_df: DataFrame with a 'filename' and a 'timestamp' column, one
            row per snapshot.

    Returns:
        DataFrame with one row per pickup event (columns: bike_id, lat, lon,
        current_range_meters, current_fuel_percent, event_type,
        event_time_start_range, event_time_end_range). The bike attributes
        are taken from the last snapshot in which the bike was still listed.
        An empty DataFrame without columns is returned if nothing was found.
    """
    def _load_bikes(filename):
        """Return ({bike_id: bike record}, None), or (None, exception) on failure."""
        try:
            # JSON is UTF-8; do not depend on the platform's default encoding.
            with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as fh:
                snapshot = json.load(fh)
            return {bike['bike_id']: bike for bike in snapshot['data']['bikes']}, None
        except Exception as e:
            # Deliberately broad (best effort): an unreadable or malformed
            # snapshot must only invalidate the pairs it takes part in.
            return None, e

    # Pull the columns out once instead of doing an iloc row lookup per iteration.
    filenames = file_df['filename'].tolist()
    timestamps = file_df['timestamp'].tolist()

    pickups = []
    # Load result of the previous pair's "next" file; None if it was never loaded.
    carried = None

    # Iterate through consecutive pairs of snapshots
    for i in tqdm(range(len(file_df) - 1), desc="Detecting pickups"):
        current_file, next_file = filenames[i], filenames[i + 1]

        current_bikes, error = carried if carried is not None else _load_bikes(current_file)
        carried = None
        if error is None:
            carried = _load_bikes(next_file)
            next_bikes, error = carried
        if error is not None:
            print(f"Error processing files {current_file} and {next_file}: {error}")
            continue

        # Find bikes that disappeared (were picked up)
        for bike_id, bike_data in current_bikes.items():
            if bike_id not in next_bikes:
                pickups.append({
                    'bike_id': bike_id,
                    'lat': bike_data.get('lat'),
                    'lon': bike_data.get('lon'),
                    'current_range_meters': bike_data.get('current_range_meters'),
                    'current_fuel_percent': bike_data.get('current_fuel_percent'),
                    'event_type': 'pickup',
                    'event_time_start_range': timestamps[i],
                    'event_time_end_range': timestamps[i + 1]
                })

    # Convert to DataFrame
    if pickups:
        return pd.DataFrame(pickups)
    return pd.DataFrame()

In [10]:
pick_ups_df = detect_pickups(df)
pick_ups_df.head()

Detecting pickups: 100%|██████████| 21579/21579 [12:32<00:00, 28.67it/s]


Unnamed: 0,bike_id,lat,lon,current_range_meters,current_fuel_percent,event_type,event_time_start_range,event_time_end_range
0,BLT:Vehicle:b6069f5c-58b8-4f69-9671-94b88988d4a0,48.70842,9.003853,10080.0,0.39,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
1,BLT:Vehicle:740666c5-38af-4cd9-8ea8-d7a4b623294b,48.826118,9.238379,23800.0,0.44,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
2,BLT:Vehicle:c736bf7f-9e1b-40df-af6c-afb04271040e,48.748035,9.074088,22400.0,0.42,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
3,BLT:Vehicle:f7b43738-0836-4443-9d7a-723976f84870,48.809673,9.183686,14000.0,0.3,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
4,BLT:Vehicle:50f1520d-feea-47e2-afb3-56ec8064d9b5,48.741364,9.165754,46900.0,0.77,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37


In [11]:
pick_ups_df.describe()

Unnamed: 0,lat,lon,current_range_meters,current_fuel_percent,event_time_start_range,event_time_end_range
count,197933.0,197933.0,197933.0,197571.0,197933,197933
mean,48.776247,9.173208,33844.531028,0.592263,2025-03-21 20:26:46.688596224,2025-03-21 20:37:03.452214016
min,48.654331,8.959231,0.0,0.01,2025-02-11 14:39:37,2025-02-11 14:44:37
25%,48.762234,9.156836,18200.0,0.37,2025-03-03 13:53:33,2025-03-03 13:58:33
50%,48.77898,9.176451,35000.0,0.61,2025-03-22 14:51:58,2025-03-22 14:56:58
75%,48.800785,9.208868,49000.0,0.81,2025-04-08 15:06:57,2025-04-08 15:11:58
max,48.863571,9.34978,63000.0,1.0,2025-04-28 09:03:03,2025-04-28 09:08:04
std,0.033116,0.053636,17069.400941,0.242323,,


In [12]:
def detect_dropoffs(file_df):
    """
    Detect bike dropoffs by comparing consecutive snapshots.

    A dropoff is recorded for every bike that is present in one snapshot but
    missing from the previous one. The exact moment of the dropoff is
    unknown; it lies between the timestamps of the two snapshots, which are
    stored as 'event_time_start_range' and 'event_time_end_range'.

    Snapshots are compared in the row order of ``file_df`` (rows i and i+1),
    so the caller is expected to pass the rows in chronological order. Every
    file is read from DATA_DIR only once: the snapshot parsed as "next" for
    one pair is reused as "current" for the following pair. A pair in which
    either file cannot be read or parsed is reported via ``print`` and
    skipped.

    Args:
        file_df: DataFrame with a 'filename' and a 'timestamp' column, one
            row per snapshot.

    Returns:
        DataFrame with one row per dropoff event (columns: bike_id, lat, lon,
        current_range_meters, current_fuel_percent, event_type,
        event_time_start_range, event_time_end_range). The bike attributes
        are taken from the first snapshot in which the bike is listed again.
        An empty DataFrame without columns is returned if nothing was found.
    """
    def _load_bikes(filename):
        """Return ({bike_id: bike record}, None), or (None, exception) on failure."""
        try:
            # JSON is UTF-8; do not depend on the platform's default encoding.
            with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as fh:
                snapshot = json.load(fh)
            return {bike['bike_id']: bike for bike in snapshot['data']['bikes']}, None
        except Exception as e:
            # Deliberately broad (best effort): an unreadable or malformed
            # snapshot must only invalidate the pairs it takes part in.
            return None, e

    # Pull the columns out once instead of doing an iloc row lookup per iteration.
    filenames = file_df['filename'].tolist()
    timestamps = file_df['timestamp'].tolist()

    dropoffs = []
    # Load result of the previous pair's "next" file; None if it was never loaded.
    carried = None

    # Iterate through consecutive pairs of snapshots
    for i in tqdm(range(len(file_df) - 1), desc="Detecting dropoffs"):
        current_file, next_file = filenames[i], filenames[i + 1]

        current_bikes, error = carried if carried is not None else _load_bikes(current_file)
        carried = None
        if error is None:
            carried = _load_bikes(next_file)
            next_bikes, error = carried
        if error is not None:
            print(f"Error processing files {current_file} and {next_file}: {error}")
            continue

        # Find bikes that appeared (were dropped off)
        for bike_id, bike_data in next_bikes.items():
            if bike_id not in current_bikes:
                dropoffs.append({
                    'bike_id': bike_id,
                    'lat': bike_data.get('lat'),
                    'lon': bike_data.get('lon'),
                    'current_range_meters': bike_data.get('current_range_meters'),
                    'current_fuel_percent': bike_data.get('current_fuel_percent'),
                    'event_type': 'dropoff',
                    'event_time_start_range': timestamps[i],
                    'event_time_end_range': timestamps[i + 1]
                })

    # Convert to DataFrame
    if dropoffs:
        return pd.DataFrame(dropoffs)
    return pd.DataFrame()

In [13]:
dropoffs_df = detect_dropoffs(df)
dropoffs_df.head()

Detecting dropoffs:   0%|          | 0/21579 [00:00<?, ?it/s]

Detecting dropoffs: 100%|██████████| 21579/21579 [04:25<00:00, 81.38it/s]


Unnamed: 0,bike_id,lat,lon,current_range_meters,current_fuel_percent,event_type,event_time_start_range,event_time_end_range
0,BLT:Vehicle:3c1f5136-6d31-4aae-9ac9-38f239320aeb,48.832756,9.222763,9100.0,0.23,dropoff,2025-02-11 14:39:37,2025-02-11 14:44:37
1,BLT:Vehicle:6e2f0941-817a-43f1-9dfc-24d17bae3a15,48.749332,9.080224,21700.0,0.41,dropoff,2025-02-11 14:39:37,2025-02-11 14:44:37
2,BLT:Vehicle:18e2be66-f679-47d2-a2db-ade50eadc11d,48.785801,9.169739,6300.0,0.19,dropoff,2025-02-11 14:39:37,2025-02-11 14:44:37
3,BLT:Vehicle:12fbd7dc-75e3-4dbe-b4a0-348b2168e3cf,48.782501,9.252641,31500.0,0.55,dropoff,2025-02-11 14:39:37,2025-02-11 14:44:37
4,BLT:Vehicle:50556b52-ebcc-469d-ad89-356e7872d479,48.782974,9.179928,13300.0,0.29,dropoff,2025-02-11 14:39:37,2025-02-11 14:44:37


In [14]:
dropoffs_df.describe()

Unnamed: 0,lat,lon,current_range_meters,current_fuel_percent,event_time_start_range,event_time_end_range
count,197910.0,197910.0,197910.0,197327.0,197910,197910
mean,48.77627,9.173236,32248.345308,0.569228,2025-03-21 20:44:47.923919872,2025-03-21 20:55:05.468020992
min,48.654045,8.959249,0.0,0.01,2025-02-11 14:39:37,2025-02-11 14:44:37
25%,48.762238,9.156837,16800.0,0.35,2025-03-03 14:48:36,2025-03-03 14:53:36
50%,48.778973,9.17647,32900.0,0.59,2025-03-22 15:01:58,2025-03-22 15:06:59
75%,48.800823,9.208897,47600.0,0.79,2025-04-08 15:11:58,2025-04-08 15:16:58
max,48.863583,9.34982,63000.0,1.0,2025-04-28 09:03:03,2025-04-28 09:08:04
std,0.033111,0.053602,17194.469039,0.243888,,


In [15]:
len(dropoffs_df)

197910

In [18]:
results_df = pd.concat([pick_ups_df, dropoffs_df], ignore_index=True)
results_df.head()

Unnamed: 0,bike_id,lat,lon,current_range_meters,current_fuel_percent,event_type,event_time_start_range,event_time_end_range
0,BLT:Vehicle:b6069f5c-58b8-4f69-9671-94b88988d4a0,48.70842,9.003853,10080.0,0.39,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
1,BLT:Vehicle:740666c5-38af-4cd9-8ea8-d7a4b623294b,48.826118,9.238379,23800.0,0.44,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
2,BLT:Vehicle:c736bf7f-9e1b-40df-af6c-afb04271040e,48.748035,9.074088,22400.0,0.42,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
3,BLT:Vehicle:f7b43738-0836-4443-9d7a-723976f84870,48.809673,9.183686,14000.0,0.3,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37
4,BLT:Vehicle:50f1520d-feea-47e2-afb3-56ec8064d9b5,48.741364,9.165754,46900.0,0.77,pickup,2025-02-11 14:39:37,2025-02-11 14:44:37


In [19]:
results_df.describe()

Unnamed: 0,lat,lon,current_range_meters,current_fuel_percent,event_time_start_range,event_time_end_range
count,395843.0,395843.0,395843.0,394898.0,395843,395843
mean,48.776258,9.173222,33046.484541,0.580753,2025-03-21 20:35:47.274845696,2025-03-21 20:46:04.428682752
min,48.654045,8.959231,0.0,0.01,2025-02-11 14:39:37,2025-02-11 14:44:37
25%,48.762234,9.156836,17500.0,0.36,2025-03-03 14:23:34,2025-03-03 14:28:35
50%,48.778976,9.176459,34300.0,0.6,2025-03-22 14:51:58,2025-03-22 14:56:58
75%,48.800804,9.20888,48300.0,0.8,2025-04-08 15:06:57,2025-04-08 15:11:58
max,48.863583,9.34982,63000.0,1.0,2025-04-28 09:03:03,2025-04-28 09:08:04
std,0.033113,0.053619,17150.603338,0.243379,,


In [20]:
results_df.value_counts('event_type')

event_type
pickup     197933
dropoff    197910
Name: count, dtype: int64

In [22]:
results_df.to_pickle(OUTPUT_FILE)