# Bike Rebalances Table Creation
Citibike does not provide data on bike rebalances. However, a bike that starts a trip from a station other than the one where its previous trip ended was likely either rebalanced or taken out of service. We will assume the former for this preliminary exercise and consider ways to make this more robust in the future.

This notebook creates rebalance tables for the years 2014-2020.

In [None]:
import pandas as pd
import gc
import os

In [None]:
# Output directory for the rebalance parquet files written by this notebook.
REBALANCE_DIR = "data/rebalance_parquet/"

In [None]:
# Make sure the output directory exists before anything is written to it.
# exist_ok=True keeps the cell safe to re-run, and passing REBALANCE_DIR itself
# (rather than its dirname) works whether or not the path ends with a slash.
os.makedirs(REBALANCE_DIR, exist_ok=True)

# Create Annual Rebalance Tables

In [None]:
# Full candidate range of years, immediately narrowed to a single year so the
# notebook finishes quickly.
years = list(range(2013, 2022))
years = [2019]  # to make this go fast

# Preferred dtype for each imported column.
# NOTE(review): this mapping is defined here but is not applied anywhere in the
# visible notebook - confirm whether it is still needed.
col_types = {
    "starttime": "datetime64",
    "stoptime": "datetime64",
    "startstationid": "category",
    "endstationid": "category",
    "bikeid": "category",
}

# Columns to read from each trip file (same names and order as col_types).
trip_columns = list(col_types)

# One-row dummy trip used to shift one copy of the trips by a single row when the
# two copies are lined up side by side.
placeholder_time = pd.to_datetime("2010-09-01")
offset = pd.DataFrame(
    {
        "starttime": placeholder_time,
        "startstationid": 0,
        "stoptime": placeholder_time,
        "endstationid": 0,
        "bikeid": 0,
    },
    index=[0],
)

# Columns taken from the previous trip (where it ended) and from the current trip
# (where it started). Only these are copied when the two shifted frames are built.
prev_trip_cols = ["bikeid", "stoptime", "endstationid"]
next_trip_cols = ["bikeid", "starttime", "startstationid"]

for year in years:
    # Each year is processed on its own, so a bike's first trip of a year has no
    # predecessor in the frame and is never reported as a rebalance.

    # load data from parquet
    print("starting...", year, "----------------------")
    print("loading...")
    trips = pd.read_parquet(
        "data/tripdata_parquet/NY/" + str(year) + ".parquet",
        engine="pyarrow",
        columns=trip_columns,
    ).reset_index(drop=True)  # discard the stored (dask) index outright

    # order trips sequentially by bike
    print("sorting...")
    trips = trips.sort_values(by=["bikeid", "starttime"])

    # trips1 gets the dummy row on top, so its row i describes the END of the
    # previous trip in sort order.
    print("creating trips1...")
    trips1 = pd.concat(
        [offset[prev_trip_cols], trips[prev_trip_cols]], ignore_index=True
    ).rename(columns={"bikeid": "bikeid1"})

    # trips2 gets the dummy row at the bottom, so its row i describes the START of
    # the current trip and both frames have the same length.
    print("creating trips2...")
    trips2 = pd.concat(
        [trips[next_trip_cols], offset[next_trip_cols]], ignore_index=True
    ).rename(columns={"bikeid": "bikeid2"})

    # concat horizontally - a ride would start from the same endstation unless rebalanced
    print("concating trips1 and trips 2...")
    trips = pd.concat([trips1, trips2], axis=1)

    # remove temp dataframes from memory
    del trips1, trips2
    gc.collect()

    # filter using rebalance criteria
    print("filtering....")
    # consecutive rows must belong to the same bike (this also discards the two
    # rows that were padded with the dummy bikeid 0)
    same_bike = trips.bikeid1 == trips.bikeid2
    # NaN != NaN evaluates to True, so without this guard any trip with an unknown
    # station id would be counted as a rebalance
    stations_known = trips.endstationid.notna() & trips.startstationid.notna()
    moved = trips.startstationid != trips.endstationid

    # format and export
    print("formatting...")
    trips = (
        trips.loc[same_bike & stations_known & moved]
        .drop(columns=["bikeid2"])
        .rename(
            columns={
                "bikeid1": "bikeid",
                "stoptime": "prevtrip_stoptime",
                "endstationid": "prevtrip_endstationid",
            }
        )
    )
    print("exporting to parquet...")
    # write into the directory that was created at the top of the notebook
    rebal_filepath = os.path.join(REBALANCE_DIR, str(year) + "_rebalances.parquet")
    trips.to_parquet(rebal_filepath, engine="pyarrow")

print("complete!")

starting... 2019 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
complete!


# Create Master Rebalance Table

In [None]:
# Years to combine into the master table; narrowed to one year right after so the
# notebook runs quickly.
years = [
    2014,
    2015,
    2016,
    2017,
    2018,
    2019,
    2020,
]  # 2013 and 2021+ intentionally excluded
years = [2019]

# Annual rebalance files, located in the shared output directory.
rebal_files = [
    os.path.join(REBALANCE_DIR, str(y) + "_rebalances.parquet") for y in years
]

# create list of dataframes and concat
rebal_dfs = []
for rebal_file in rebal_files:
    print("appending..." + rebal_file)
    rebal_dfs.append(pd.read_parquet(rebal_file))
print("concating dfs...")
# ignore_index: the annual files may carry row labels left over from filtering,
# and those labels can repeat from one year to the next
rebal = pd.concat(rebal_dfs, ignore_index=True)

# export to parquet
print("exporting...")
rebal.to_parquet(
    os.path.join(REBALANCE_DIR, "all_rebalances.parquet"), engine="pyarrow"
)
print("complete!")

appending...data/rebalance_parquet/2019_rebalances.parquet
concating dfs...
exporting...
complete!


# Create Featured Rebalance Table
Merge with the stations data to add features such as borough and elevation for exploratory data analysis (EDA).

In [None]:
def _station_features(stations_raw, col_rename):
    """Return station attributes prepared for a merge onto the rebalance table.

    Parameters
    ----------
    stations_raw : pandas.DataFrame
        Station table as loaded from the stations parquet file.
    col_rename : dict
        Mapping from the station table's column names to the names wanted in the
        merged result.

    Returns
    -------
    pandas.DataFrame
        A new frame without the latitude/longitude/zipcode columns and with the
        remaining columns renamed; ``stations_raw`` itself is not modified.
    """
    return stations_raw.drop(columns=["latitude", "longitude", "zipcode"]).rename(
        columns=col_rename
    )


# load dataframes (the stations file is read once and reused for both merges)
stations = pd.read_parquet("data/stations/stations.parquet")
rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances.parquet")

# merge startstation features
start_rename = {
    "stationid": "startstationid",
    "stationname": "startstationname",
    "capacity": "startcapacity",
    "neighbourhood": "startneighborhood",
    "boro": "startboro",
    "elevation_ft": "startelevation_ft",
}
rebal = rebal.merge(
    _station_features(stations, start_rename), how="left", on="startstationid"
)

# merge prevtrip_endstation features
prevtrip_rename = {
    "stationid": "prevtrip_endstationid",
    "stationname": "prevtrip_endstationname",
    "capacity": "prevtrip_capacity",
    "neighbourhood": "prevtrip_neighborhood",
    "boro": "prevtrip_boro",
    "elevation_ft": "prevtrip_elevation_ft",
}
rebal = rebal.merge(
    _station_features(stations, prevtrip_rename),
    how="left",
    on="prevtrip_endstationid",
)

rebal.to_parquet(
    "data/rebalance_parquet/all_rebalances_features.parquet", engine="pyarrow"
)

# Create Pairwise Rebalance Table (just for mapping)

In [None]:
# Load the featured rebalance table and derive the year in which each rebalanced
# bike started its next trip.
rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances_features.parquet")
# pd.to_datetime rather than astype("datetime64"): pandas 2.x rejects the
# unit-less "datetime64" dtype string with a TypeError
rebal["starttime"] = pd.to_datetime(rebal["starttime"])
rebal["rebal_year"] = rebal["starttime"].dt.year
rebal.head()

Unnamed: 0,bikeid,prevtrip_stoptime,prevtrip_endstationid,starttime,startstationid,startstationname,startcapacity,startneighborhood,startboro,startelevation_ft,prevtrip_endstationname,prevtrip_capacity,prevtrip_neighborhood,prevtrip_boro,prevtrip_elevation_ft,rebal_year
0,14529,2019-01-28 09:02:57.4310,313.0,2019-02-05 17:26:22.126,3055.0,Greene Ave & Nostrand Ave,23.0,,Brooklyn,50.0,Washington Ave & Park Ave,55.0,,Brooklyn,30.53,2019
1,14529,2019-03-09 08:41:54.4070,3290.0,2019-03-12 12:19:24.601,391.0,Clark St & Henry St,43.0,Brooklyn Heights,,78.17,E 89 St & York Ave,35.0,Manhattan Community Board 8,Manhattan,19.52,2019
2,14529,2019-03-24 19:25:11.2450,539.0,2019-03-25 15:58:35.635,3129.0,Queens Plaza North & Crescent St,40.0,Queensbridge,Queens,21.7,Metropolitan Ave & Bedford Ave,31.0,,Brooklyn,35.19,2019
3,14529,2019-03-29 18:17:57.0590,486.0,2019-04-17 17:56:25.764,3418.0,Plaza St West & Flatbush Ave,33.0,Park Slope,,123.58,Broadway & W 29 St,41.0,Midtown South,Manhattan,42.55,2019
4,14529,2019-04-19 09:35:15.2270,127.0,2019-04-19 14:07:38.259,229.0,Great Jones St,23.0,NoHo,Manhattan,41.56,Barrow St & Hudson St,31.0,West Village,Manhattan,17.22,2019


In [None]:
# Count rebalances per (year, destination station, origin station) and order the
# pairs from most to least frequent.
pair_keys = ["rebal_year", "startstationname", "prevtrip_endstationname"]
pair_counts = rebal.groupby(pair_keys)[["bikeid"]].count().reset_index()
pair_counts = pair_counts.sort_values(by="bikeid", ascending=False)
pair_counts = pair_counts.rename(columns={"bikeid": "rebal_count"})

# Keep only the pairs that were rebalanced more than ~10 times.
frequent_pairs = pair_counts.loc[pair_counts.rebal_count > 10]

# Station name -> station id lookups; the first id seen for each name is kept.
to_station_ids = rebal[["startstationid", "startstationname"]].drop_duplicates(
    subset="startstationname"
)
from_station_ids = rebal[
    ["prevtrip_endstationid", "prevtrip_endstationname"]
].drop_duplicates(subset="prevtrip_endstationname")

# Attach the ids to each pair and give them direction-based column names.
rebpair = frequent_pairs.merge(to_station_ids, how="left", on="startstationname")
rebpair = rebpair.merge(from_station_ids, how="left", on="prevtrip_endstationname")
rebpair = rebpair.rename(
    columns={
        "startstationid": "stationid_to",
        "prevtrip_endstationid": "stationid_from",
    }
)

# Human-readable label for the route: "<origin> to <destination>".
origin_name = rebpair.prevtrip_endstationname.astype("str")
destination_name = rebpair.startstationname.astype("str")
rebpair["rebal_route"] = origin_name + " to " + destination_name

rebpair

Unnamed: 0,rebal_year,startstationname,prevtrip_endstationname,rebal_count,stationid_to,stationid_from,rebal_route
0,2019,E 9 St & Avenue C,E 10 St & Avenue A,1807,394.0,445.0,E 10 St & Avenue A to E 9 St & Avenue C
1,2019,E 7 St & Avenue A,Cooper Square & E 7 St,1531,432.0,3263.0,Cooper Square & E 7 St to E 7 St & Avenue A
2,2019,E 13 St & Avenue A,E 10 St & Avenue A,1307,3711.0,445.0,E 10 St & Avenue A to E 13 St & Avenue A
3,2019,W 63 St & Broadway,W 52 St & 6 Ave,1276,3158.0,3443.0,W 52 St & 6 Ave to W 63 St & Broadway
4,2019,E 11 St & Avenue B,E 10 St & Avenue A,1226,3718.0,445.0,E 10 St & Avenue A to E 11 St & Avenue B
...,...,...,...,...,...,...,...
10218,2019,31 St & Broadway,E 27 St & 1 Ave,11,3593.0,2012.0,E 27 St & 1 Ave to 31 St & Broadway
10219,2019,3 Ave & E 62 St,Vernon Blvd & 30 Rd,11,3134.0,3610.0,Vernon Blvd & 30 Rd to 3 Ave & E 62 St
10220,2019,3 Ave & E 62 St,Broadway & E 22 St,11,3134.0,402.0,Broadway & E 22 St to 3 Ave & E 62 St
10221,2019,31 Ave & 30 St,Bedford Ave & Nassau Ave,11,3603.0,3107.0,Bedford Ave & Nassau Ave to 31 Ave & 30 St


In [None]:
# Persist the pairwise rebalance table alongside the other rebalance outputs.
pairs_path = "data/rebalance_parquet/rebalance_pairs.parquet"
rebpair.to_parquet(pairs_path, engine="pyarrow")