# Bike Rebalances Table Creation
Citibike does not provide data regarding bike rebalances; however, a bike that starts a trip from a station other than the one where its previous trip ended was likely either rebalanced or taken out of service. We will assume the former for this preliminary exercise and consider ways to make this more robust in the future.

This notebook creates annual rebalance tables for the years 2014-2021 and a combined master table covering 2014-2020.

In [None]:
import pandas as pd
import gc
import os

In [None]:
# directory that holds the rebalance parquet outputs; it is created in the next step if missing
REBALANCE_DIR = "data/rebalance_parquet/"

In [None]:
# Create the output directory (and any missing parents).
# exist_ok=True makes the cell safe to re-run, and passing REBALANCE_DIR itself
# (rather than its dirname) works whether or not the path ends with a slash.
os.makedirs(REBALANCE_DIR, exist_ok=True)

# Create Annual Rebalance Tables

In [None]:
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

# columns to load from each annual trips file
trip_columns = [
    "starttime",
    "stoptime",
    "startstationid",
    "endstationid",
    "bikeid",
]
# dtypes needed for chronological sorting; the unit is spelled out because
# pandas >= 2.0 rejects the unit-less "datetime64" alias in astype()
col_types = {
    "starttime": "datetime64[ns]",
    "stoptime": "datetime64[ns]",
}

# single dummy row used to shift the trips table by one position when pairing
# each trip with the trip on the previous row
offset = pd.DataFrame(
    {
        "starttime": pd.to_datetime("2010-09-01"),
        "startstationid": 0,
        "stoptime": pd.to_datetime("2010-09-01"),
        "endstationid": 0,
        "bikeid": 0,
    },
    index=[0],
)


def _find_rebalances(trips, offset, verbose=False):
    """Return the rebalance table derived from a sorted trips table.

    Each trip is paired with the trip on the previous row. A pair counts as a
    rebalance when both rows belong to the same bike and the later trip starts
    at a station other than the one where the earlier trip ended.

    Parameters
    ----------
    trips : DataFrame with the columns listed in ``trip_columns``; it must
        already be sorted by ``bikeid`` then ``starttime`` so that adjacent
        rows are consecutive trips of the same bike.
    offset : one-row DataFrame with the same columns, used to shift the table.
    verbose : when True, print the step-by-step progress messages.

    Returns
    -------
    DataFrame with columns ``bikeid``, ``prevtrip_stoptime``,
    ``prevtrip_endstationid``, ``starttime``, ``startstationid`` and
    ``rebal_time_hr`` (hours between the previous stop and the next start).
    """
    log = print if verbose else (lambda *args: None)

    # dummy row first: row i holds trip i-1 (the "previous" trip)
    log("creating trips1...")
    prev_trips = (
        pd.concat([offset, trips])
        .reset_index(drop=True)
        .rename(columns={"bikeid": "bikeid1"})
    )

    # dummy row last: row i holds trip i (the "next" trip)
    log("creating trips2...")
    next_trips = (
        pd.concat([trips, offset])
        .reset_index(drop=True)
        .rename(columns={"bikeid": "bikeid2"})
    )

    # concat horizontally - a ride would start from the same endstation unless rebalanced
    log("concating trips1 and trips 2...")
    paired = pd.concat(
        [
            prev_trips[["bikeid1", "stoptime", "endstationid"]],
            next_trips[["bikeid2", "starttime", "startstationid"]],
        ],
        axis=1,
    )

    # remove temp dataframes from memory
    del prev_trips, next_trips
    gc.collect()

    # filter using rebalance criteria
    log("filtering....")
    moved = (paired.bikeid1 == paired.bikeid2) & (
        paired.startstationid != paired.endstationid
    )

    # drop/rename return new frames, so the column assignment below never
    # writes into a slice of `paired` (avoids SettingWithCopyWarning)
    log("formatting...")
    rebal = (
        paired.loc[moved]
        .drop(columns=["bikeid2"])
        .rename(
            columns={
                "bikeid1": "bikeid",
                "stoptime": "prevtrip_stoptime",
                "endstationid": "prevtrip_endstationid",
            }
        )
    )

    # add rebalance time column, in hours
    # (pd.Timedelta is used because numpy is not imported in this notebook)
    rebal["rebal_time_hr"] = (
        rebal.starttime - rebal.prevtrip_stoptime
    ) / pd.Timedelta(hours=1)
    return rebal


for year in years:

    # load data from parquet
    # NOTE(review): input is read from "../data/..." while the rest of the
    # notebook uses "data/..." - confirm which working directory is intended
    print("starting...", year, "----------------------")
    print("loading...")
    trips = pd.read_parquet(
        "../data/tripdata_parquet/NY/" + str(year) + ".parquet",
        engine="pyarrow",
        columns=trip_columns,
    ).reset_index(drop=True)  # discard the stored (dask) index

    # convert date cols to enable sorting
    print("converting date types...")
    trips = trips.astype(col_types)

    # order trips sequentially by bike and start time
    print("sorting...")
    trips = trips.sort_values(by=["bikeid", "starttime"])

    rebal = _find_rebalances(trips, offset, verbose=True)

    # catch bad trip records (indicates a ride was started while another ride with that bikeid was in progress)
    neg = rebal.loc[rebal.rebal_time_hr < 0]

    # removes first offending record from rides table and continues looping until none are left
    # loop is required because fixing first real bad record *may or may not* fix subsequent bad records for a given bike
    # see section at end of rebalance_eda notebook for more details on this issue
    if not neg.empty:
        print("up to", neg.shape[0], "bad records found - begin drop loop...")

    count = 0
    while not neg.empty:
        count += 1
        print("dropping bad record", count)

        # match on bike AND stop time: matching on stop time alone would also
        # delete unrelated trips of other bikes that ended at the same instant
        bad_bike = neg.bikeid.iloc[0]
        bad_stop = neg.prevtrip_stoptime.iloc[0]
        trips = trips[~((trips.bikeid == bad_bike) & (trips.stoptime == bad_stop))]

        # rebuild the rebalance table and update negative rebalances
        rebal = _find_rebalances(trips, offset)
        neg = rebal.loc[rebal.rebal_time_hr < 0]

    # export into the directory the later cells read from
    print("exporting to parquet...")
    rebal_filepath = os.path.join(REBALANCE_DIR, str(year) + "_rebalances.parquet")
    rebal.to_parquet(rebal_filepath, engine="pyarrow")

    # remove df from memory before the next iteration
    del trips
    del rebal
    gc.collect()

print("complete!")

# Create Master Rebalance Table

In [None]:
# years that make up the master table - 2013 and 2021+ intentionally excluded
years = list(range(2014, 2021))

# one annual rebalance file per year
rebal_files = [f"data/rebalance_parquet/{yr}_rebalances.parquet" for yr in years]

# read the annual tables one at a time, then stack them into a single frame
rebal_dfs = []
for rebal_file in rebal_files:
    print(f"appending...{rebal_file}")
    annual = pd.read_parquet(rebal_file)
    rebal_dfs.append(annual)
    del annual  # the list keeps the only reference that is needed

print("concating dfs...")
rebal = pd.concat(rebal_dfs)

# write the combined table, then release it
print("exporting...")
rebal.to_parquet("data/rebalance_parquet/all_rebalances.parquet", engine="pyarrow")
del rebal
gc.collect()
print("complete!")

appending...data/rebalance_parquet/2015_rebalances.parquet
appending...data/rebalance_parquet/2017_rebalances.parquet
concating dfs...
exporting...
complete!


# Create Featured Rebalance Table
Merge with the stations data to include features such as boro and elevation for EDA.

In [None]:
# load dataframes
stations = pd.read_parquet("data/stations/stations.parquet")
rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances.parquet")

# merge startstation features
col_rename = {
    "stationid": "startstationid",
    "stationname": "startstationname",
    "capacity": "startcapacity",
    "neighbourhood": "startneighborhood",
    "boro": "startboro",
    "elevation_ft": "startelevation_ft",
}
stations.rename(columns=col_rename, inplace=True)
stations.drop(columns=(["latitude", "longitude", "zipcode"]), inplace=True)
rebal = rebal.merge(stations, how="left", on="startstationid")

# merge prevtrip_endstation features
col_rename = {
    "stationid": "prevtrip_endstationid",
    "stationname": "prevtrip_endstationname",
    "capacity": "prevtrip_capacity",
    "neighbourhood": "prevtrip_neighborhood",
    "boro": "prevtrip_boro",
    "elevation_ft": "prevtrip_elevation_ft",
}
stations = pd.read_parquet("data/stations/stations.parquet")
stations.rename(columns=col_rename, inplace=True)
stations.drop(columns=(["latitude", "longitude", "zipcode"]), inplace=True)
rebal = rebal.merge(stations, how="left", on="prevtrip_endstationid")

rebal.to_parquet(
    "data/rebalance_parquet/all_rebalances_features.parquet", engine="pyarrow"
)

# Create Pairwise Rebalance Table (just for mapping)

In [None]:
# load the featured rebalance table
rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances_features.parquet")

# make sure starttime is a datetime column; the unit is explicit because
# pandas >= 2.0 rejects the unit-less "datetime64" alias in astype()
rebal = rebal.astype({"starttime": "datetime64[ns]"})

# calendar year in which the bike started its next trip
rebal["rebal_year"] = rebal["starttime"].dt.year
rebal.head()

Unnamed: 0,bikeid,prevtrip_stoptime,prevtrip_endstationid,starttime,startstationid,startstationname,startcapacity,startneighborhood,startboro,startelevation_ft,prevtrip_endstationname,prevtrip_capacity,prevtrip_neighborhood,prevtrip_boro,prevtrip_elevation_ft,rebal_year
0,14529,2014-01-15 21:59:03,358.0,2014-01-16 09:10:01,300.0,Shevchenko Pl & E 7 St,,East Village,Manhattan,39.34,Christopher St & Greenwich St,50.0,West Village,Manhattan,18.11,2014
1,14529,2014-01-17 13:23:19,2023.0,2014-01-17 16:18:23,168.0,W 18 St & 6 Ave,47.0,Manhattan Community Board 5,Manhattan,33.25,E 55 St & Lexington Ave,,Manhattan Community Board 5,Manhattan,43.76,2014
2,14529,2014-01-23 13:17:05,348.0,2014-01-27 12:52:52,438.0,St Marks Pl & 1 Ave,47.0,East Village,Manhattan,25.69,W Broadway & Spring St,46.0,Manhattan Community Board 2,Manhattan,16.39,2014
3,14529,2014-01-27 13:03:12,151.0,2014-02-07 11:01:18,488.0,W 39 St & 9 Ave,44.0,Garment District,Manhattan,32.55,Cleveland Pl & Spring St,33.0,Manhattan Community Board 2,Manhattan,33.36,2014
4,14529,2014-02-07 21:35:11,229.0,2014-03-21 08:42:17,173.0,Broadway & W 49 St,,Theater District,Manhattan,54.58,Great Jones St,23.0,NoHo,Manhattan,41.56,2014


In [None]:
# group by pairs and count number per pair
rebpair = (
    rebal.groupby(["rebal_year", "startstationname", "prevtrip_endstationname"])[
        ["bikeid"]
    ]
    .count()
    .reset_index()
    .sort_values(by="bikeid", ascending=False)
)
rebpair.rename(columns={"bikeid": "rebal_count"}, inplace=True)

# filter for only pairs that have been rebalanced more than ~10 times
rebpair = rebpair.loc[rebpair.rebal_count > 10]

# add id columns
rebpair = rebpair.merge(
    rebal[["startstationid", "startstationname"]].drop_duplicates(
        subset="startstationname"
    ),
    how="left",
    on="startstationname",
).merge(
    rebal[["prevtrip_endstationid", "prevtrip_endstationname"]].drop_duplicates(
        subset="prevtrip_endstationname"
    ),
    how="left",
    on="prevtrip_endstationname",
)
rebpair.rename(
    columns={
        "startstationid": "stationid_to",
        "prevtrip_endstationid": "stationid_from",
    },
    inplace=True,
)

# create a single string for rebalance route
rebpair["rebal_route"] = (
    rebpair.prevtrip_endstationname.astype("str")
    + " to "
    + rebpair.startstationname.astype("str")
)

rebpair

Unnamed: 0,rebal_year,startstationname,prevtrip_endstationname,rebal_count,stationid_to,stationid_from,rebal_route
0,2014,Broadway & W 41 St,W 41 St & 8 Ave,8597,465.0,477.0,W 41 St & 8 Ave to Broadway & W 41 St
1,2016,E 14 St & Avenue B,E 7 St & Avenue A,7332,511.0,432.0,E 7 St & Avenue A to E 14 St & Avenue B
2,2014,Broadway & W 36 St,W 33 St & 7 Ave,4172,267.0,492.0,W 33 St & 7 Ave to Broadway & W 36 St
3,2016,E 20 St & FDR Drive,E 7 St & Avenue A,4117,487.0,432.0,E 7 St & Avenue A to E 20 St & FDR Drive
4,2015,E 14 St & Avenue B,E 7 St & Avenue A,4036,511.0,432.0,E 7 St & Avenue A to E 14 St & Avenue B
...,...,...,...,...,...,...,...
148343,2017,W 74 St & Columbus Ave,Central Park West & W 76 St,11,3172.0,3160.0,Central Park West & W 76 St to W 74 St & Colum...
148344,2016,Bond St & Fulton St,Front St & Washington St,11,3232.0,2000.0,Front St & Washington St to Bond St & Fulton St
148345,2016,E 52 St & 2 Ave,Broadway & W 55 St,11,441.0,468.0,Broadway & W 55 St to E 52 St & 2 Ave
148346,2016,E 39 St & 2 Ave,Avenue D & E 3 St,11,518.0,302.0,Avenue D & E 3 St to E 39 St & 2 Ave


In [None]:
# write the pairwise rebalance table to parquet
rebpair.to_parquet("data/rebalance_parquet/rebalance_pairs.parquet", engine="pyarrow")