In [21]:
import json
import os
import re
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

In [22]:
# Folders the notebook reads its input files from.
stations_dir, status_dir, free_bike_dir = (
    'data/info',
    'data/status',
    'data/free_bike',
)

In [23]:
def _list_snapshot_files(directory):
    """Return the sorted names of the regular, non-hidden files in `directory`.

    `os.listdir` returns entries in arbitrary order and also returns hidden
    entries and sub-directories (for example `.ipynb_checkpoints`), which
    cannot be opened and parsed as JSON. Sorting makes the processing order
    the same on every run.
    """
    return sorted(
        entry
        for entry in os.listdir(directory)
        if not entry.startswith('.')
        and os.path.isfile(os.path.join(directory, entry))
    )


stations_files = _list_snapshot_files(stations_dir)
status_files = _list_snapshot_files(status_dir)
free_bike_files = _list_snapshot_files(free_bike_dir)

In [24]:
# Load every stations file into one frame, tagging each row with the
# timestamp encoded at the start of the file name.
stations_data_list = []

for file in tqdm(stations_files, desc='file'):

    # Build the path from `stations_dir` so the loop reads from the same
    # directory that `stations_files` was listed from.
    with open(os.path.join(stations_dir, file), 'r') as file_object:
        try:
            stations_json_load = json.load(file_object)
        except json.JSONDecodeError as e:
            # Report and skip unparseable files instead of aborting the load.
            print(f'error on {file}: {e}')
            continue

    # One row per entry under data.stations, with the top-level
    # `last_updated` value repeated on each row. `filter` keeps only the
    # listed columns and ignores any that are absent.
    time_stations_data = (
        pd.json_normalize(
            data=stations_json_load,
            record_path=[
                ['data', 'stations']
            ],
            meta='last_updated',
        )
        .filter(items=[
            'last_updated',
            'station_id',
            'short_name',
            'name',
            'capacity',
            'lat',
            'lon'
        ])
    )

    # The file name must start with the integer timestamp. Require at least
    # one digit so a malformed name fails with a clear message rather than
    # an opaque int('') error.
    timestamp_match = re.match(r'\d+', file)
    if timestamp_match is None:
        raise ValueError(
            f'file name {file!r} does not start with a numeric timestamp'
        )
    file_timestamp = int(timestamp_match.group())

    time_stations_data['status_last_updated_fetched_timestamp'] = file_timestamp

    stations_data_list.append(time_stations_data)

stations_data = pd.concat(stations_data_list)

file: 100%|██████████| 123/123 [00:02<00:00, 46.48it/s]


In [25]:
# Load every status file into one frame, one row per station per file.
status_data_list = []

for file in tqdm(status_files):

    # Build the path from `status_dir` so the loop reads from the same
    # directory that `status_files` was listed from.
    with open(os.path.join(status_dir, file), 'r') as file_object:
        try:
            status_json_load = json.load(file_object)
        except json.JSONDecodeError as e:
            # Report and skip unparseable files instead of aborting the load.
            print(f'error on {file}: {e}')
            continue

    # One row per entry under data.stations, with the top-level
    # `last_updated` value repeated on each row. `filter` keeps only the
    # listed columns and ignores any that are absent.
    time_status_data = pd.json_normalize(
        data=status_json_load,
        record_path=[
            ['data', 'stations']
        ],
        meta='last_updated',
    ).filter(items=[
        'last_updated',
        'station_id',
        'station_status',
        'is_renting',
        'is_returning',
        'num_docks_available',
        'num_bikes_available',
        'num_ebikes_available',
        'num_bikes_disabled',
        'num_docks_disabled',
        'num_ebikes_disabled',
        'valet.active'
    ])

    status_data_list.append(time_status_data)


status_data = pd.concat(status_data_list)


100%|██████████| 123/123 [00:03<00:00, 31.92it/s]


Convert the epoch-second timestamps to timezone-aware America/New_York times

In [26]:
def _epoch_seconds_to_new_york(epoch_seconds):
    """Convert a Series of Unix epoch seconds to tz-aware America/New_York datetimes.

    Vectorised equivalent of calling
    `pd.Timestamp(value, unit='s', tz='America/New_York')` on every row:
    the values are read as UTC epoch seconds and then converted to the
    target time zone.
    """
    # The column may be object dtype (e.g. when it comes from a
    # json_normalize `meta` field), so coerce to a numeric dtype first.
    return (
        pd.to_datetime(pd.to_numeric(epoch_seconds), unit='s', utc=True)
        .dt.tz_convert('America/New_York')
    )


status_data['last_updated'] = _epoch_seconds_to_new_york(
    status_data['last_updated']
)

stations_data['status_last_updated_fetched_timestamp'] = _epoch_seconds_to_new_york(
    stations_data['status_last_updated_fetched_timestamp']
)

Check that every timestamp in the stations data has a matching `last_updated` timestamp in the status data

In [27]:
# Every stations-side timestamp must appear among the status-side timestamps.
fetched_timestamps = stations_data['status_last_updated_fetched_timestamp']
has_matching_status = fetched_timestamps.isin(status_data['last_updated'])
assert has_matching_status.all()

In [28]:
# Inner-join stations rows to status rows on (timestamp, station_id).
# Overlapping non-key columns from the stations side get a `_stations`
# suffix; the status side keeps its original names.
dataset = pd.merge(
    stations_data,
    status_data,
    how='inner',
    left_on=['status_last_updated_fetched_timestamp', 'station_id'],
    right_on=['last_updated', 'station_id'],
    suffixes=('_stations', None),
)

Check that each station_id corresponds to a single physical location

In [29]:
# A station is a single physical location only if neither coordinate ever
# changes, so count the distinct values of both `lat` and `lon` per station
# (checking `lat` alone would miss a station that moved along its latitude).
coordinate_counts = dataset.groupby('station_id')[['lat', 'lon']].nunique()
assert coordinate_counts.max().max() == 1, (
    'station_ids without exactly one location: '
    f"{coordinate_counts[(coordinate_counts > 1).any(axis=1)].index.tolist()}"
)

In [30]:
# Take the first row seen for each station and keep its coordinates,
# indexed by station_id.
first_row_per_station = ~dataset['station_id'].duplicated()
stations_locations = (
    dataset
    .loc[first_row_per_station, ['station_id', 'lat', 'lon']]
    .set_index('station_id')
)

In [31]:
# stations_geo = gpd.GeoDataFrame(
#     index=stations_locations.index,
#     geometry=gpd.points_from_xy(
#         stations_locations['lon'],
#         stations_locations['lat'],
#         crs='epsg:4326'
#     )
# )

In [32]:
# Columns that are not wanted in the final table.
columns_to_drop = [
    'name',
    'lat',
    'lon',
    'status_last_updated_fetched_timestamp',
    'last_updated_stations',
]

# Drop them and index the table by (last_updated, station_id).
# NOTE: this overwrites `dataset`, so re-running the cell without first
# re-running the merge fails because the columns are already gone.
dataset = (
    dataset
    .drop(columns=columns_to_drop)
    .set_index(['last_updated', 'station_id'])
)

In [33]:
# Preview the first ten rows of the indexed dataset.
dataset.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,short_name,capacity,is_renting,is_returning,num_docks_available,num_bikes_available,num_ebikes_available,num_bikes_disabled,num_docks_disabled
last_updated,station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-01-26 04:39:20-05:00,6edca550-d78f-4c5d-ad2c-79d1ce88c48d,32093,19,0,0,11,0,0,6,0
2025-01-26 04:39:20-05:00,1890204129337198944,31396,11,1,1,1,10,0,0,0
2025-01-26 04:39:20-05:00,08263aa1-1f3f-11e7-bf6b-3863bb334450,31643,19,1,1,14,5,3,0,0
2025-01-26 04:39:20-05:00,08262494-1f3f-11e7-bf6b-3863bb334450,31909,15,1,1,2,12,1,1,0
2025-01-26 04:39:20-05:00,c0ec45a3-ec59-4c82-9671-13d9c122be30,32256,12,1,1,7,5,0,0,0
2025-01-26 04:39:20-05:00,0825e3b0-1f3f-11e7-bf6b-3863bb334450,31516,19,1,1,9,8,2,2,0
2025-01-26 04:39:20-05:00,08259c89-1f3f-11e7-bf6b-3863bb334450,32021,15,1,1,12,3,0,0,0
2025-01-26 04:39:20-05:00,0824a703-1f3f-11e7-bf6b-3863bb334450,31612,23,1,1,12,11,4,0,0
2025-01-26 04:39:20-05:00,08258475-1f3f-11e7-bf6b-3863bb334450,32009,9,1,1,5,4,0,0,0
2025-01-26 04:39:20-05:00,0826402c-1f3f-11e7-bf6b-3863bb334450,31917,15,1,1,8,7,5,0,0


Save the merged dataset to disk

In [34]:
# dataset.to_parquet('dataset.parquet') 

In [35]:
# Move last_updated and station_id back from the index into ordinary columns
# so they are written out with each record.
dataset_normal_json = dataset.reset_index()

# Write one JSON object per line (JSON Lines layout).
output_path = 'dataset_normal.json'
dataset_normal_json.to_json(
    path_or_buf=output_path,
    lines=True,
    orient='records',
)


In [36]:
# stations_geo.to_file('stations_geo.geojson')