# Bike Rebalances Table Creation
Citibike does not provide data regarding bike rebalances. However, a bike that starts a trip from a station other than the one where its previous trip ended was likely either rebalanced or taken out of service. We will assume the former for this preliminary exercise and consider ways to make this more robust in the future.

This notebook creates annual rebalance tables for 2013-2021 and a master rebalance table covering 2014-2020.

In [None]:
import pandas as pd
import gc

# Create Annual Rebalance Tables

In [None]:
# years for which an annual rebalance table is written
years = list(range(2013, 2022))

# columns to import from the yearly trip files
trip_columns = ["starttime", "stoptime", "startstationid", "endstationid", "bikeid"]

# preferred dtype for each imported column
# NOTE(review): col_types is not referenced by the cells shown here - confirm it is still needed
col_types = dict(
    starttime="datetime64",
    stoptime="datetime64",
    startstationid="category",
    endstationid="category",
    bikeid="category",
)

# one-row dummy dataframe: the yearly loop places it before / after the sorted trips
# so that the two copies are shifted by one row relative to each other
placeholder_time = pd.to_datetime("2010-09-01")
offset = pd.DataFrame(
    {
        "starttime": placeholder_time,
        "startstationid": 0,
        "stoptime": placeholder_time,
        "endstationid": 0,
        "bikeid": 0,
    },
    index=[0],
)

# Build one rebalance table per year: a row is kept when a bike's trip starts at a
# station different from the one where the same bike's previous trip (within that
# year's file) ended.
for year in years:

    # load data from parquet
    print("starting...", year, "----------------------")
    print("loading...")
    # drop=True discards the index stored with the file (the dask index) directly,
    # instead of materialising it as a column and dropping it afterwards
    trips = pd.read_parquet(
        f"data/tripdata_parquet/NY/{year}.parquet",
        engine="pyarrow",
        columns=trip_columns,
    ).reset_index(drop=True)

    # order trips sequentially by bike
    print("sorting...")
    trips = trips.sort_values(by=["bikeid", "starttime"])

    # trips1 is pushed down one row by the dummy row, so row i holds the trip that
    # precedes row i of trips2 in the sorted order
    print("creating trips1...")
    trips1 = (
        pd.concat([offset, trips])
        .reset_index(drop=True)
        .rename(columns={"bikeid": "bikeid1"})
    )

    # trips2 keeps the sorted order; the trailing dummy row only equalises the length
    print("creating trips2...")
    trips2 = (
        pd.concat([trips, offset])
        .reset_index(drop=True)
        .rename(columns={"bikeid": "bikeid2"})
    )

    # concat horizontally - a ride would start from the same endstation unless rebalanced
    print("concating trips1 and trips 2...")
    trips = pd.concat(
        [
            trips1[["bikeid1", "stoptime", "endstationid"]],
            trips2[["bikeid2", "starttime", "startstationid"]],
        ],
        axis=1,
    )

    # remove temp dataframes from memory
    del trips1, trips2
    gc.collect()

    # filter using rebalance criteria: same bike on both sides of the row, but the
    # start station differs from the previous trip's end station
    print("filtering....")
    is_rebalance = (trips.bikeid1 == trips.bikeid2) & (
        trips.startstationid != trips.endstationid
    )
    # reassign rather than mutate in place: in-place edits on a filtered slice are
    # what pandas reports as SettingWithCopyWarning
    trips = trips.loc[is_rebalance].drop(columns=["bikeid2"])

    # format and export
    print("formatting...")
    trips = trips.rename(
        columns={
            "bikeid1": "bikeid",
            "stoptime": "prevtrip_stoptime",
            "endstationid": "prevtrip_endstationid",
        }
    )
    print("exporting to parquet...")
    rebal_filepath = f"data/rebalance_parquet/{year}_rebalances.parquet"
    trips.to_parquet(rebal_filepath, engine="pyarrow")

print("complete!")

starting... 2013 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2014 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2015 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2016 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2017 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2018 ----------------------
loading...
sorting...
creating trips1...

# Create Master Rebalance Table

In [None]:
# years that go into the master table; 2013 and 2021+ intentionally excluded
years = list(range(2014, 2021))

# one annual rebalance file per year
rebal_files = [f"data/rebalance_parquet/{y}_rebalances.parquet" for y in years]

# read every annual table, then stack them into a single dataframe
rebal_dfs = []
for rebal_file in rebal_files:
    print("appending..." + rebal_file)
    annual_rebal = pd.read_parquet(rebal_file)
    rebal_dfs.append(annual_rebal)
print("concating dfs...")
rebal = pd.concat(rebal_dfs)

# export to parquet
print("exporting...")
master_path = "data/rebalance_parquet/all_rebalances.parquet"
rebal.to_parquet(master_path, engine="pyarrow")
print("complete!")

appending...data/rebalance_parquet/2014_rebalances.parquet
appending...data/rebalance_parquet/2015_rebalances.parquet
appending...data/rebalance_parquet/2016_rebalances.parquet
appending...data/rebalance_parquet/2017_rebalances.parquet
appending...data/rebalance_parquet/2018_rebalances.parquet
appending...data/rebalance_parquet/2019_rebalances.parquet
appending...data/rebalance_parquet/2020_rebalances.parquet
concating dfs...
exporting...
complete!


# Create Featured Rebalance Table
Merge with the stations data to include features such as borough and elevation for exploratory data analysis (EDA).

In [None]:
# station columns that are not carried into the featured table
unused_cols = ["latitude", "longitude", "zipcode"]
stations_path = "data/stations/stations.parquet"

# load dataframes
stations = pd.read_parquet(stations_path)
rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances.parquet")

# attach the features of the station each trip starts from
col_rename = {
    "stationid": "startstationid",
    "stationname": "startstationname",
    "capacity": "startcapacity",
    "neighbourhood": "startneighborhood",
    "boro": "startboro",
    "elevation_ft": "startelevation_ft",
}
stations = stations.rename(columns=col_rename).drop(columns=unused_cols)
rebal = rebal.merge(stations, how="left", on="startstationid")

# attach the features of the station where the previous trip ended;
# the stations file is read again to get back the original column names
col_rename = {
    "stationid": "prevtrip_endstationid",
    "stationname": "prevtrip_endstationname",
    "capacity": "prevtrip_capacity",
    "neighbourhood": "prevtrip_neighborhood",
    "boro": "prevtrip_boro",
    "elevation_ft": "prevtrip_elevation_ft",
}
stations = (
    pd.read_parquet(stations_path)
    .rename(columns=col_rename)
    .drop(columns=unused_cols)
)
rebal = rebal.merge(stations, how="left", on="prevtrip_endstationid")

# export the featured table
features_path = "data/rebalance_parquet/all_rebalances_features.parquet"
rebal.to_parquet(features_path, engine="pyarrow")

# Create Pairwise Rebalance Table (just for mapping)

In [None]:
# load the featured rebalance table and derive the year of each rebalance
rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances_features.parquet")
# pandas >= 2.0 raises TypeError for the unit-less "datetime64" dtype in astype,
# so the unit is spelled out
rebal = rebal.astype({"starttime": "datetime64[ns]"})
# year of the trip that starts after the rebalance
rebal["rebal_year"] = rebal["starttime"].dt.year
rebal.head()

Unnamed: 0,bikeid,prevtrip_stoptime,prevtrip_endstationid,starttime,startstationid,startstationname,startcapacity,startneighborhood,startboro,startelevation_ft,prevtrip_endstationname,prevtrip_capacity,prevtrip_neighborhood,prevtrip_boro,prevtrip_elevation_ft
0,14529,10/1/2014 19:18:57,285.0,2014-10-01 23:27:47,345.0,W 13 St & 6 Ave,,West Village,Manhattan,27.15,Broadway & E 14 St,78.0,East Village,Manhattan,40.24
1,14529,10/1/2014 23:36:23,438.0,2014-10-10 01:46:00,2002.0,Wythe Ave & Metropolitan Ave,57.0,,Brooklyn,26.09,St Marks Pl & 1 Ave,47.0,East Village,Manhattan,25.69
2,14529,10/10/2014 09:22:03,243.0,2014-10-10 12:58:01,366.0,Clinton Ave & Myrtle Ave,33.0,,Brooklyn,59.95,Fulton St & Rockwell Pl,,Fort Greene,,34.05
3,14529,10/10/2014 13:25:54,285.0,2014-10-10 15:20:17,116.0,W 17 St & 8 Ave,50.0,Chelsea,Manhattan,20.55,Broadway & E 14 St,78.0,East Village,Manhattan,40.24
4,14529,10/10/2014 18:48:50,477.0,2014-10-16 07:59:55,301.0,E 2 St & Avenue B,58.0,East Village,Manhattan,22.46,W 41 St & 8 Ave,71.0,Theater District,Manhattan,42.46


In [None]:
# count rebalances per (year, start station, previous end station) combination,
# largest counts first
pair_keys = ["rebal_year", "startstationname", "prevtrip_endstationname"]
pair_counts = rebal.groupby(pair_keys)[["bikeid"]].count().reset_index()
pair_counts = pair_counts.sort_values(by="bikeid", ascending=False)
rebpair = pair_counts.rename(columns={"bikeid": "rebal_count"})

# keep only pairs that have been rebalanced more than 10 times
rebpair = rebpair[rebpair["rebal_count"] > 10]

# id lookups: the first id seen in rebal for each station name
start_ids = rebal[["startstationid", "startstationname"]].drop_duplicates(
    subset="startstationname"
)
prev_end_ids = rebal[
    ["prevtrip_endstationid", "prevtrip_endstationname"]
].drop_duplicates(subset="prevtrip_endstationname")

# add id columns: the start station is the "to" side of the rebalance,
# the previous trip's end station is the "from" side
rebpair = rebpair.merge(start_ids, how="left", on="startstationname")
rebpair = rebpair.merge(prev_end_ids, how="left", on="prevtrip_endstationname")
rebpair = rebpair.rename(
    columns={
        "startstationid": "stationid_to",
        "prevtrip_endstationid": "stationid_from",
    }
)

# create a single string for rebalance route
route_from = rebpair["prevtrip_endstationname"].astype("str")
route_to = rebpair["startstationname"].astype("str")
rebpair["rebal_route"] = route_from + " to " + route_to

rebpair[:20]

Unnamed: 0,rebal_year,startstationname,prevtrip_endstationname,rebal_count,stationid_to,stationid_from,rebal_route
0,2014,Broadway & W 41 St,W 41 St & 8 Ave,8565,465.0,477.0,W 41 St & 8 Ave to Broadway & W 41 St
1,2016,E 14 St & Avenue B,E 7 St & Avenue A,6108,511.0,432.0,E 7 St & Avenue A to E 14 St & Avenue B
2,2014,Broadway & W 36 St,W 33 St & 7 Ave,4161,267.0,492.0,W 33 St & 7 Ave to Broadway & W 36 St
3,2014,W 45 St & 8 Ave,W 42 St & 8 Ave,3899,2021.0,529.0,W 42 St & 8 Ave to W 45 St & 8 Ave
4,2014,E 43 St & Vanderbilt Ave,W 44 St & 5 Ave,3893,318.0,484.0,W 44 St & 5 Ave to E 43 St & Vanderbilt Ave
5,2015,W 52 St & 5 Ave,W 41 St & 8 Ave,3741,520.0,477.0,W 41 St & 8 Ave to W 52 St & 5 Ave
6,2016,E 20 St & FDR Drive,E 7 St & Avenue A,3445,487.0,432.0,E 7 St & Avenue A to E 20 St & FDR Drive
7,2014,Broadway & W 51 St,W 51 St & 6 Ave,3405,500.0,510.0,W 51 St & 6 Ave to Broadway & W 51 St
8,2016,W 52 St & 5 Ave,W 33 St & 7 Ave,3147,520.0,492.0,W 33 St & 7 Ave to W 52 St & 5 Ave
9,2015,Pershing Square North,E 47 St & Park Ave,3112,519.0,359.0,E 47 St & Park Ave to Pershing Square North


In [None]:
# write the pairwise rebalance table to parquet
pairs_path = "data/rebalance_parquet/rebalance_pairs.parquet"
rebpair.to_parquet(pairs_path, engine="pyarrow")