# Creates a yearly summary table for Citibike trip data
* Also adds MTA subway and bus ridership data from
    * https://new.mta.info/agency/new-york-city-transit/subway-bus-ridership-2019
    * https://new.mta.info/agency/new-york-city-transit/subway-bus-ridership-2020

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import logging
import os

# import our helpers module
import helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Emit DEBUG-level (and above) log records from the root logger.
logging.basicConfig(level=logging.DEBUG)

# Root folder for data files; the trailing slash is part of the value because
# paths are built from these constants by plain string concatenation.
DATA_DIR = "data/"
# Folder, under DATA_DIR, where the summary tables are written.
SUMMARY_DIR = f"{DATA_DIR}summaries/"

In [None]:
# Create the summary output directory (and any missing parents).
# exist_ok=True replaces the separate existence check, which was race-prone,
# and passing SUMMARY_DIR directly means the directory that gets created no
# longer depends on the constant ending in a path separator.
os.makedirs(SUMMARY_DIR, exist_ok=True)

In [None]:
def _summarise_trips(trips: pd.DataFrame) -> dict:
    """
    Computes the summary statistics for one year's trips dataframe
    :param trips: trips dataframe; the columns tripduration, bikeid,
        startstationid, endstationid, usertype and gender are read
    :return: mapping of statistic name to value, in output column order
    """
    # value_counts() orders by descending frequency by default, so the first
    # entry of each result is the most common value (the mode).
    start_station_counts = trips.startstationid.value_counts()
    end_station_counts = trips.endstationid.value_counts()
    usertype_counts = trips.usertype.value_counts()
    gender_counts = trips.gender.value_counts()

    return {
        "trip_count": trips.shape[0],
        # NOTE(review): the original local was named avg_trip_duration_s, so
        # this is presumably in seconds - confirm against the source data.
        "avg_trip_duration": round(trips.tripduration.mean()),
        "num_unique_bikes": trips.bikeid.nunique(),
        # only start stations are counted here, matching the original logic
        "num_unique_stations": trips.startstationid.nunique(),
        "mode_start_station_id": start_station_counts.index[0],
        "mode_start_station_count": start_station_counts.iloc[0],
        "mode_end_station_id": end_station_counts.index[0],
        "mode_end_station_trip_count": end_station_counts.iloc[0],
        # Some years have no "Customer" usertype. Look each label up by name
        # with a default of zero instead of inferring its presence from the
        # number of distinct usertypes.
        "subscribers": usertype_counts.get("Subscriber", 0),
        "customers": usertype_counts.get("Customer", 0),
        # Gender codes 1 and 2 are reported as males and females respectively;
        # a code that is absent for a year counts as zero rather than raising.
        "trips_by_males": gender_counts.get(1, 0),
        "trips_by_females": gender_counts.get(2, 0),
    }


def summarise_years(years: range) -> pd.DataFrame:
    """
    Summarizes yearly statistics for trip data
    Each year's trips are loaded with helpers.get_trips(year, DATA_DIR) and
    reduced to a single row of statistics.
    :param years: range of years to summarize (must not be empty, otherwise
        pd.concat raises ValueError)
    :return: summary dataframe with one row per year (indexed by year) and one
        column per statistic
    """
    summaries = []
    for year in years:
        print(f"Summarizing year {year}...")
        trips = helpers.get_trips(year, DATA_DIR)
        # Building the row from a dict keeps each value next to its label, so
        # the two can no longer drift apart as with parallel lists.
        summaries.append(pd.Series(_summarise_trips(trips), name=year))

    # Each Series becomes a column named after its year; transpose so that
    # years are rows and statistics are columns.
    return pd.concat(summaries, axis=1).T

In [None]:
%%time
# Build the per-year summary table for 2014 through 2020 (the range end is exclusive).
summaries = summarise_years(range(2014, 2021))

DEBUG:root:Reading data/tripdata_parquet/NY/2014.parquet for year 2014...


Summarizing year 2014...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2014!
DEBUG:root:Reading data/tripdata_parquet/NY/2015.parquet for year 2015...


Summarizing year 2015...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2015!
DEBUG:root:Reading data/tripdata_parquet/NY/2016.parquet for year 2016...


Summarizing year 2016...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2016!
DEBUG:root:Reading data/tripdata_parquet/NY/2017.parquet for year 2017...


Summarizing year 2017...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2017!
DEBUG:root:Reading data/tripdata_parquet/NY/2018.parquet for year 2018...


Summarizing year 2018...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2018!
DEBUG:root:Reading data/tripdata_parquet/NY/2019.parquet for year 2019...


Summarizing year 2019...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2019!
DEBUG:root:Reading data/tripdata_parquet/NY/2020.parquet for year 2020...


Summarizing year 2020...


DEBUG:root:Changing dtypes
DEBUG:root:Done with 2020!


CPU times: user 2min 37s, sys: 2min 26s, total: 5min 3s
Wall time: 4min 52s


Add MTA subway and bus data

In [None]:
# Persist the summary table as a CSV file inside the summary directory.
summaries_csv_path = SUMMARY_DIR + "summaries" + helpers.CSV_EXTENSION
summaries.to_csv(summaries_csv_path)

In [None]:
# Annual MTA ridership for the same years as the trip summaries (2014-2020).
# The value lists previously carried -1 placeholders for 2013 and 2021, which
# made them nine entries long against a seven-year index and raised
# "ValueError: Length of values (9) does not match length of index (7)".
# The placeholders are dropped so every value lines up with its year.
mta = pd.DataFrame(
    {
        "subway": [
            1751287621,  # 2014
            1762565419,  # 2015
            1756814800,  # 2016
            1727366607,  # 2017
            1680060402,  # 2018
            1697787002,  # 2019
            639541029,  # 2020
        ],
        # MTA Bus Company + NYC Transit Buses
        "bus": [
            667051170 + 125581237,  # 2014
            776081306,  # 2015
            764030270,  # 2016
            724834684,  # 2017
            690809514,  # 2018
            677588084,  # 2019
            382424444,  # 2020
        ],
    },
    index=[2014, 2015, 2016, 2017, 2018, 2019, 2020],
)
# Drop any subway/bus columns left over from an earlier run of this cell so
# that re-running it does not append duplicate columns.
summaries = pd.concat(
    [summaries.drop(columns=mta.columns, errors="ignore"), mta], axis=1
)
summaries.to_csv(SUMMARY_DIR + "summaries" + helpers.CSV_EXTENSION)
summaries

ValueError: Length of values (9) does not match length of index (7)