# Bike Rebalances [2019]
Citi Bike does not publish data on bike rebalancing. However, a bike that starts a trip from a station other than the one where its previous trip ended was most likely either rebalanced or taken out of service. We will assume the former.


In [None]:
import pandas as pd
from pandas import to_datetime
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
import gc
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

## Calculate Bike Teleports

In [None]:
# columns needed from the raw trip file to follow each bike from station to station
col_select = [
    "starttime",
    "stoptime",
    "startstationid",
    "startstationname",
    "startstationlatitude",
    "startstationlongitude",
    "endstationid",
    "endstationname",
    "endstationlatitude",
    "endstationlongitude",
    "bikeid",
]

# Load the id/name columns as categoricals to reduce memory usage compared to
# the object dtype.
# TODO: consider float32 for the coordinates and integer ids to save more memory
col_types = dict.fromkeys(
    [
        "startstationid",
        "startstationname",
        "endstationid",
        "endstationname",
        "bikeid",
    ],
    "category",
)

rides_raw = pd.read_csv(
    "data/NY_2019.csv",
    usecols=col_select,
    dtype=col_types,
    parse_dates=["starttime", "stoptime"],
    index_col=False,
)

# per-column overview: number of distinct values, dtype and deep memory usage
pd.DataFrame.from_records(
    [
        (name, values.nunique(), values.dtype, values.memory_usage(deep=True))
        for name, values in rides_raw.items()
    ],
    columns=["Column Name", "Unique", "Data Type", "Memory Usage"],
)

In [None]:
# comparison to when left as objects

# Column  Name	              Unique	    Data Type	      Memory Usage
# 0	      starttime	          20539444	  datetime64[ns]	 164413704
# 1	      stoptime	          20539225	  datetime64[ns]	 164413704
# 2	      startstationid      936	      float64	         164413704
# 3	      startstationname	  938	      object	        1574199724
# 4	      endstationid	      973	      float64	         164413704
# 5	      endstationname	  976	      object	        1573922082
# 6	      bikeid	          19571	      int64	             164413704

In [None]:
# preview the first rows of the loaded trip data
rides_raw.head()

In [None]:
# Sort so that each bike's trips sit next to each other in chronological order.
rides = rides_raw.sort_values(by=["bikeid", "starttime"])

# Single placeholder row used to shift the two copies of the trips by one
# position relative to each other.
offset = pd.DataFrame(
    {
        "starttime": pd.to_datetime("2010-09-01"),
        "startstationid": 0,
        "stoptime": pd.to_datetime("2010-09-01"),
        "endstationid": 0,
        "bikeid": 0,
    },
    index=[0],
)

# Placeholder first: row i + 1 carries the END of trip i.
trip_ends = (
    pd.concat([offset, rides])[["bikeid", "stoptime", "endstationid", "endstationname"]]
    .rename(columns={"bikeid": "bikeid1"})
    .reset_index(drop=True)
)
# Placeholder last: row i carries the START of trip i.
trip_starts = (
    pd.concat([rides, offset])[
        ["bikeid", "starttime", "startstationid", "startstationname"]
    ]
    .rename(columns={"bikeid": "bikeid2"})
    .reset_index(drop=True)
)

# Side by side, every row pairs the end of one trip with the start of the next
# trip in sort order. When both belong to the same bike, the stations match
# unless the bike was moved in between.
rides = pd.concat([trip_ends, trip_starts], axis=1)

# drop the temporary frames and reclaim their memory
del offset, trip_ends, trip_starts
gc.collect()

rides.head(10)

In [None]:
# A rebalance is a row where the same bike (bikeid1 == bikeid2) starts its next
# trip at a station other than the one where the previous trip ended.
rebal = rides.loc[
    (rides["bikeid1"] == rides["bikeid2"])
    & (rides["startstationid"] != rides["endstationid"]),
    [
        "bikeid1",
        "stoptime",
        "endstationid",
        "endstationname",
        "starttime",
        "startstationid",
        "startstationname",
    ],
].reset_index(drop=True)

rebal.head()

In [None]:
# shapes of the paired-trip table and of the detected rebalances, one per line
print(rides.shape, rebal.shape, sep="\n")
# share of paired rows that were flagged as a rebalance
print("The ratio of rebalances to rides in 2019 is: ", len(rebal) / len(rides))

In [None]:
# plot the 20 stations with the most rebalances out of / into them
def top_station_counts(station_names, count_col, n=20):
    """Return the `n` most frequent station names together with their counts.

    Args:
        station_names: Series of station names, one entry per rebalance.
        count_col: Name to give the column holding the counts.
        n: Number of stations to keep.

    Returns:
        DataFrame with the columns "Station" and `count_col`, ordered from the
        most to the least frequent station.
    """
    return (
        station_names.value_counts()
        # Name the index and the values explicitly: the column names produced
        # by value_counts().reset_index() changed in pandas 2.0, so renaming
        # "index" / the original series name only works on older versions.
        .rename_axis("Station")
        .reset_index(name=count_col)
        .head(n)
        # Station names are read as categoricals when the trips are loaded;
        # plain strings keep the plot limited to these rows in count order.
        # NOTE(review): confirm against the seaborn version in use.
        .assign(Station=lambda counts: counts["Station"].astype(str))
    )


# bikes were taken FROM the station where the previous trip ended ...
rebalout = top_station_counts(rebal["endstationname"], "Count_Out")
# ... and moved TO the station where the next trip started
rebalin = top_station_counts(rebal["startstationname"], "Count_In")

plt.figure(figsize=(10, 8))
plt.title("Citi Bike Rebalancing [2019] From Stations")
sns.barplot(y=rebalout.Station, x=rebalout.Count_Out, orient="h")

plt.figure(figsize=(10, 8))
plt.title("Citi Bike Rebalancing [2019] To Stations")
sns.barplot(y=rebalin.Station, x=rebalin.Count_In, orient="h")

In [None]:
# Reverse-geocoding setup, following
# https://towardsdatascience.com/reverse-geocoding-with-nyc-bike-share-data-cdef427987f8
geolocator = Nominatim(user_agent="nsp023@gmail.com")
# wait at least one second between lookups and do not retry failed ones
reverse = RateLimiter(geolocator.reverse, max_retries=0, min_delay_seconds=1)

# raw start-station columns -> generic station column names
station_columns = {
    "startstationid": "stationid",
    "startstationname": "stationname",
    "startstationlatitude": "latitude",
    "startstationlongitude": "longitude",
}

# one row per start station id (first occurrence wins), indexed by station id
stations = (
    rides_raw[list(station_columns)]
    .drop_duplicates(subset=["startstationid"])
    .rename(columns=station_columns)
    .set_index("stationid")
)
stations

In [None]:
# leave commented out - use CSV created below

# locations_lst=[]
# for index, row in stations.iterrows():
#     locations_lst.append(reverse("{}, {}".format(row['latitude'],\
#     row['longitude'])).raw['address'])

# pd.DataFrame(locations_lst[:10])

In [None]:
# NOTE: `locations_lst` is only defined by the geocoding loop in the cell
# above, which is commented out; that loop has to be run before this cell.
# Keep the three address fields of interest, one row per station, and give
# them friendlier names.
locations = (
    pd.DataFrame(
        locations_lst,
        index=stations.index,
        columns=["neighbourhood", "suburb", "postcode"],
    )
    .rename(
        columns={
            "neighbourhood": "neighborhood",
            "suburb": "boro",
            "postcode": "zipcode",
        }
    )
    .astype({"neighborhood": "category", "boro": "category"})
)

# bring over the station name and coordinates (aligned on the station id index)
locations[["stationname", "latitude", "longitude"]] = stations[
    ["stationname", "latitude", "longitude"]
]

# turn the station id index back into a regular column
locations = locations.reset_index()
locations

In [None]:
# print a concise summary (columns, non-null counts, dtypes) of the locations table
locations.info()

In [None]:
# save the station locations so the geocoding lookups do not have to be repeated
locations.to_csv("data/locations_NY_2019.csv")