# Bike Rebalances EDA
Citibike does not provide data on bike rebalances. However, a bike that starts a trip from a station other than the one where its previous trip ended was likely either rebalanced or taken out of service. We will assume the former for this preliminary exercise and consider ways to make this more robust in the future.

This notebook creates annual rebalance tables for 2013-2021 and a combined master table for 2014-2020.

In [None]:
import pandas as pd
import gc

# Create Annual Rebalance Tables

In [None]:
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

# columns to load from each annual trip file
trip_columns = [
    "starttime",
    "stoptime",
    "startstationid",
    "endstationid",
    "bikeid",
]

# NOTE(review): these preferred dtypes are declared but never applied in this
# cell; the name is kept in case another cell refers to it.
col_types = {
    "starttime": "datetime64",
    "stoptime": "datetime64",
    "startstationid": "category",
    "endstationid": "category",
    "bikeid": "category",
}

# One-row dummy frame used to shift the trip table by one row, so that every
# trip can be lined up next to the trip in the row before it.
# NOTE(review): the dummy row uses bikeid 0 -- assumes no real bike has id 0.
offset = pd.DataFrame(
    {
        "starttime": pd.to_datetime("2010-09-01"),
        "startstationid": 0,
        "stoptime": pd.to_datetime("2010-09-01"),
        "endstationid": 0,
        "bikeid": 0,
    },
    index=[0],
)


def find_rebalances(trips, offset):
    """Return the trips that start somewhere other than where the bike last ended.

    Each row of ``trips`` is paired with the row directly before it. A pair is
    kept when both rows belong to the same bike and the later trip's start
    station differs from the earlier trip's end station.

    Parameters
    ----------
    trips : pandas.DataFrame
        Trip table with columns ``starttime``, ``stoptime``,
        ``startstationid``, ``endstationid`` and ``bikeid``. It must already be
        sorted by ``bikeid`` then ``starttime``; the pairing relies on
        consecutive rows being consecutive trips of one bike.
    offset : pandas.DataFrame
        One-row dummy frame with the same columns, used as padding to shift
        the table by one row.

    Returns
    -------
    pandas.DataFrame
        Columns ``bikeid``, ``prevtrip_stoptime``, ``prevtrip_endstationid``,
        ``starttime`` and ``startstationid``, one row per detected rebalance.
    """
    # dummy row first: row i of this frame holds the trip *before* trip i
    print("creating trips1...")
    prev_trips = (
        pd.concat([offset, trips])
        .reset_index(drop=True)
        .rename(columns={"bikeid": "bikeid1"})
    )

    # dummy row last: row i of this frame holds trip i itself
    print("creating trips2...")
    next_trips = (
        pd.concat([trips, offset])
        .reset_index(drop=True)
        .rename(columns={"bikeid": "bikeid2"})
    )

    # concat horizontally - a ride would start from the same endstation unless rebalanced
    print("concating trips1 and trips 2...")
    paired = pd.concat(
        [
            prev_trips[["bikeid1", "stoptime", "endstationid"]],
            next_trips[["bikeid2", "starttime", "startstationid"]],
        ],
        axis=1,
    )

    # remove temp dataframes from memory
    del prev_trips, next_trips
    gc.collect()

    # filter using rebalance criteria; drop/rename return new frames, so the
    # filtered selection is never mutated in place
    print("filtering....")
    same_bike = paired.bikeid1 == paired.bikeid2
    moved = paired.startstationid != paired.endstationid
    rebalances = paired.loc[same_bike & moved].drop(columns=["bikeid2"])

    print("formatting...")
    return rebalances.rename(
        columns={
            "bikeid1": "bikeid",
            "stoptime": "prevtrip_stoptime",
            "endstationid": "prevtrip_endstationid",
        }
    )


# The guard lets find_rebalances be imported without touching the data files;
# in a notebook or a script run __name__ is "__main__", so the loop still runs.
if __name__ == "__main__":
    for year in years:

        # load data from parquet, discarding the stored (dask) index
        print("starting...", year, "----------------------")
        print("loading...")
        trips = pd.read_parquet(
            f"data/tripdata_parquet/NY/{year}.parquet",
            engine="pyarrow",
            columns=trip_columns,
        ).reset_index(drop=True)

        # order trips sequentially by bike
        print("sorting...")
        trips = trips.sort_values(by=["bikeid", "starttime"])

        trips = find_rebalances(trips, offset)

        print("exporting to parquet...")
        rebal_filepath = f"data/rebalance_parquet/{year}_rebalances.parquet"
        trips.to_parquet(rebal_filepath, engine="pyarrow")

    print("complete!")

starting... 2013 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2014 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2015 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2016 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2017 ----------------------
loading...
sorting...
creating trips1...
creating trips2...
concating trips1 and trips 2...
filtering....
formatting...
exporting to parquet...
starting... 2018 ----------------------
loading...
sorting...
creating trips1...

# Create Master Rebalance Table

In [None]:
# Years folded into the master table: 2014 through 2020 inclusive.
# 2013 and 2021+ are intentionally excluded.
years = list(range(2014, 2021))

# one annual rebalance file per year, in chronological order
rebal_files = [f"data/rebalance_parquet/{y}_rebalances.parquet" for y in years]

# read each annual table (announcing it first), then stack them vertically
rebal_dfs = []
for rebal_file in rebal_files:
    print("appending..." + rebal_file)
    rebal_dfs += [pd.read_parquet(rebal_file)]

print("concating dfs...")
rebal = pd.concat(rebal_dfs)

# write the combined table next to the annual ones
print("exporting...")
rebal.to_parquet(
    "data/rebalance_parquet/all_rebalances.parquet",
    engine="pyarrow",
)
print("complete!")

appending...data/rebalance_parquet/2014_rebalances.parquet
appending...data/rebalance_parquet/2015_rebalances.parquet
appending...data/rebalance_parquet/2016_rebalances.parquet
appending...data/rebalance_parquet/2017_rebalances.parquet
appending...data/rebalance_parquet/2018_rebalances.parquet
appending...data/rebalance_parquet/2019_rebalances.parquet
appending...data/rebalance_parquet/2020_rebalances.parquet
concating dfs...
exporting...
complete!


# Create Featured Rebalance Table
Merge with the stations data to add features such as boro and elevation for EDA.

In [None]:
# stations column -> output column, for the station a rebalanced trip starts from
start_col_rename = {
    "stationid": "startstationid",
    "stationname": "startstationname",
    "capacity": "startcapacity",
    "neighbourhood": "startneighborhood",
    "boro": "startboro",
    "elevation_ft": "startelevation_ft",
}

# stations column -> output column, for the station where the previous trip ended
prevtrip_col_rename = {
    "stationid": "prevtrip_endstationid",
    "stationname": "prevtrip_endstationname",
    "capacity": "prevtrip_capacity",
    "neighbourhood": "prevtrip_neighborhood",
    "boro": "prevtrip_boro",
    "elevation_ft": "prevtrip_elevation_ft",
}


def merge_station_features(
    rebal, stations, col_rename, on, drop_cols=("latitude", "longitude", "zipcode")
):
    """Left-join renamed station attributes onto the rebalance table.

    ``stations`` is not modified: the rename and drop are applied to a derived
    frame, so the same stations table can be reused for several merges.

    Parameters
    ----------
    rebal : pandas.DataFrame
        Rebalance table containing the join column named by ``on``.
    stations : pandas.DataFrame
        Station table; must contain the keys of ``col_rename`` and every
        column listed in ``drop_cols``.
    col_rename : dict
        Mapping from station column names to output column names. It must map
        the station id column to ``on``.
    on : str
        Name of the join column, as it appears in ``rebal`` and in the
        renamed station table.
    drop_cols : sequence of str
        Station columns left out of the merged result.

    Returns
    -------
    pandas.DataFrame
        ``rebal`` with the renamed station columns appended; rows without a
        matching station get missing values in those columns.
    """
    features = stations.rename(columns=col_rename).drop(columns=list(drop_cols))
    return rebal.merge(features, how="left", on=on)


# The guard lets merge_station_features be imported without touching the data
# files; in a notebook or a script run __name__ is "__main__", so this executes.
if __name__ == "__main__":
    # load dataframes (stations is read once and reused for both merges)
    stations = pd.read_parquet("data/stations/stations.parquet")
    rebal = pd.read_parquet("data/rebalance_parquet/all_rebalances.parquet")

    # merge startstation features
    rebal = merge_station_features(
        rebal, stations, start_col_rename, on="startstationid"
    )

    # merge prevtrip_endstation features
    rebal = merge_station_features(
        rebal, stations, prevtrip_col_rename, on="prevtrip_endstationid"
    )

    rebal.to_parquet(
        "data/rebalance_parquet/all_rebalances_features.parquet", engine="pyarrow"
    )