# Analysis of Citibike bike dock stations

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime

## Read data
* trips
* stations

In [None]:
# Relative root of the data folder; the other directories are derived from it.
DATA_DIR = "../data/"
# Directory with the NY trip parquet files (the loading cell reads "2019.parquet" from it).
YEARLY_TRIPS_DIR = f"{DATA_DIR}tripdata_parquet/NY/"
# Directory with the station CSV file(s).
STATIONS_DIR = f"{DATA_DIR}stations/"
# File suffix appended to a parquet file's base name.
PARQUET_EXTENSION = ".parquet"

# Subset of trip columns to load from the parquet file.
TRIPS_COLUMNS = [
    # trip timing
    "tripduration",
    "starttime",
    "stoptime",
    # stations and bike
    "startstationid",
    "endstationid",
    "bikeid",
    # rider attributes
    "usertype",
    "birthyear",
    "gender",
]

In [None]:
%%time
# Read the 2019 trips parquet (only the columns listed in TRIPS_COLUMNS).
# The index stored in the file (left over from dask) is not needed:
# reset_index(drop=True) discards it and installs a fresh 0..n-1 index,
# without first turning the old index into a column and dropping it by position.
trips = pd.read_parquet(
    YEARLY_TRIPS_DIR + "2019" + PARQUET_EXTENSION,
    columns=TRIPS_COLUMNS,
    engine="pyarrow",
).reset_index(drop=True)

# Cast columns to explicit, compact dtypes (presumably to keep the memory
# footprint of this large frame down). Columns are converted one at a time,
# in the listed order, so only a single column is duplicated at any moment.
TRIPS_DTYPES = {
    "tripduration": "int32",
    "startstationid": "int16",
    "endstationid": "int16",
    "bikeid": "int32",
    "birthyear": "int16",
    "gender": "int8",
    "usertype": "category",
}
for column, dtype in TRIPS_DTYPES.items():
    trips[column] = trips[column].astype(dtype)

# Parse the trip start/stop timestamps into datetime64 columns.
for column in ("starttime", "stoptime"):
    trips[column] = pd.to_datetime(trips[column])

# Last expression of the cell: display the frame in the notebook.
trips

In [None]:
# Load the station table; the first CSV column is used as the row index.
stations = pd.read_csv(f"{STATIONS_DIR}stations.csv", index_col=0)
# Last expression of the cell: display the frame in the notebook.
stations

Unnamed: 0,stationid,stationname,latitude,longitude,capacity,neighbourhood,boro,zipcode,elevation_ft
0,455.0,1 Ave & E 44 St,40.750020,-73.969053,59.0,Turtle Bay,Manhattan,10017-6927,46.80
1,434.0,9 Ave & W 18 St,40.743174,-74.003664,60.0,Chelsea District,Manhattan,10019,15.90
2,491.0,E 24 St & Park Ave S,40.740964,-73.986022,,Manhattan Community Board 5,Manhattan,10010,34.87
3,384.0,Fulton St & Waverly Ave,40.683178,-73.965964,31.0,,Brooklyn,11238,78.10
4,474.0,5 Ave & E 29 St,40.745168,-73.986831,56.0,Midtown South,Manhattan,10035,41.55
...,...,...,...,...,...,...,...,...,...
1425,3685.0,Prospect Park - 5 Year Anniversary Celebration,40.660652,-73.964590,,,Brooklyn,11225,85.71
1426,3695.0,E 5 St & 2 Ave,40.726870,-73.989190,,East Village,Manhattan,10003,36.09
1427,3700.0,E 87 St & 3 Ave,40.779406,-73.953336,,Carnegie Hill,Manhattan,10028,79.35
1428,3805.0,E 80 St & Park Ave,40.776173,-73.959757,,Manhattan Community Board 8,Manhattan,10075,79.28


## Helper Functions

In [None]:
# https://stackoverflow.com/questions/36271302/changing-color-scale-in-seaborn-bar-plot
def colors_from_values(values: pd.Series, palette_name: str, ascending=True):
    """Pick one palette colour per entry of *values*, graded by the entry's rank.

    Args:
        values: Series whose entries should be colour-coded.
        palette_name: Palette name handed to ``sns.color_palette``.
        ascending: Sort direction used for ranking. With ``True`` the smallest
            value receives the first palette colour; with ``False`` the
            largest value does.

    Returns:
        NumPy array holding one palette colour per row. Rows follow the
        entries of *values* ordered by index label, which equals the Series'
        own order when its index is already sorted (e.g. a default
        RangeIndex).
    """
    # After sorting by value, the fresh 0..n-1 index is each entry's rank and
    # the first column holds the entry's original index label.
    ranked = values.sort_values(ascending=ascending).reset_index()
    label_column = ranked.columns[0]
    # Re-ordering by original label leaves the ranks as the resulting index.
    rank_by_label = ranked.sort_values(by=label_column).index
    # One palette shade per entry; select the shade matching each rank.
    shades = np.array(sns.color_palette(palette_name, len(ranked)))
    return shades.take(rank_by_label, axis=0)

## Stations EDA