# Creates a yearly summary table for Citibike trip data
* Also adds MTA subway and bus ridership data from
    * https://new.mta.info/agency/new-york-city-transit/subway-bus-ridership-2019
    * https://new.mta.info/agency/new-york-city-transit/subway-bus-ridership-2020

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import logging
import os

# import our helpers module
import helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Emit debug-level (and above) log records for the rest of the notebook.
logging.basicConfig(level=logging.DEBUG)

# Root data folder, and the sub-folder that receives the generated summaries.
DATA_DIR = "data/"
SUMMARY_DIR = f"{DATA_DIR}summaries/"

In [None]:
# Create the summaries folder (and any missing parents). exist_ok=True makes
# this safe to re-run and avoids the race between checking and creating.
os.makedirs(SUMMARY_DIR, exist_ok=True)

In [None]:
def summarise_years(years: range) -> pd.DataFrame:
    """
    Summarizes yearly statistics for trip data

    For each year the trips are loaded with ``helpers.get_trips(year, DATA_DIR)``
    and reduced to a single row holding the trip count, the rounded mean trip
    duration, the most frequent start/end station with its trip count, the
    number of trips per user type and the number of trips per gender code.

    Categories that do not occur in a year (a user type or a gender code) are
    reported as 0 instead of raising a ``KeyError``.

    :param years: range of years to summarize
    :return: summary dataframe with one row per year (indexed by year) and
        one column per statistic
    """
    summaries = []
    for year in years:
        print(f"Summarizing year {year}...")
        trips = helpers.get_trips(year, DATA_DIR)

        # value_counts sorted in descending order puts the most frequent
        # station first, so position 0 holds the mode and its trip count.
        start_station_values = trips.startstationid.value_counts(ascending=False)
        end_station_values = trips.endstationid.value_counts(ascending=False)

        # Some years have no "Customer" user type; look each category up by
        # label and fall back to zero when it does not exist.
        usertype_values = trips.usertype.value_counts()

        # Gender is looked up by its code: 1 for males, 2 for females.
        gender_values = trips.gender.value_counts()

        summary = pd.Series(
            {
                "trip_count": trips.shape[0],
                "avg_trip_duration": round(trips.tripduration.mean()),
                "mode_start_station_id": start_station_values.index[0],
                "mode_start_station_count": start_station_values.iloc[0],
                "mode_end_station_id": end_station_values.index[0],
                "mode_end_station_trip_count": end_station_values.iloc[0],
                "subscribers": usertype_values.get("Subscriber", 0),
                "customers": usertype_values.get("Customer", 0),
                "trips_by_males": gender_values.get(1, 0),
                "trips_by_females": gender_values.get(2, 0),
            },
            name=year,
        )
        summaries.append(summary)

    # Each Series becomes a column named after its year; transpose so that
    # years are the rows and statistics are the columns.
    return pd.concat(summaries, axis=1).T

In [None]:
%%time
# Summarize every year from 2013 up to and including 2021 (range end is exclusive).
summaries = summarise_years(range(2013, 2022))

Add MTA subway and bus data

In [None]:
# Yearly MTA ridership, indexed by year so it aligns with the trip summaries.
mta = pd.DataFrame(index=[2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])
# no data for 2013 and 2021; -1 marks those years as missing
mta["subway"] = [
    -1,
    1751287621,
    1762565419,
    1756814800,
    1727366607,
    1680060402,
    1697787002,
    639541029,
    -1,
]
mta["bus"] = [
    -1,
    667051170 + 125581237,  # MTA Bus Company + NYC Transit Buses
    776081306,
    764030270,
    724834684,
    690809514,
    677588084,
    382424444,
    -1,
]
# Drop any MTA columns left over from an earlier run of this cell before
# concatenating, otherwise re-running the cell duplicates "subway" and "bus".
summaries = pd.concat(
    [summaries.drop(columns=mta.columns, errors="ignore"), mta], axis=1
)
summaries.to_csv(SUMMARY_DIR + "summaries" + helpers.CSV_EXTENSION)
summaries

Unnamed: 0,trip_count,avg_trip_duration,mode_start_station_id,mode_start_station_count,mode_end_station_id,mode_end_station_trip_count,subscribers,customers,trips_by_males,trips_by_females,subway,bus,subway.1,bus.1
2013,5364026,900,497,53113,497,57567,4697086,666940,3590728,1105721,-1,-1,-1,-1
2014,7868282,826,521,99245,497,83706,7287687,580595,5635920,1650274,1751287621,792632407,1751287621,792632407
2015,8626591,815,519,96750,293,89073,8626591,0,6608883,2006962,1762565419,776081306,1762565419,776081306
2016,12222976,819,519,141891,519,138989,12188588,34388,9215130,2973111,1756814800,764030270,1756814800,764030270
2017,14718438,895,519,154466,519,153185,14407273,311165,10944297,3709823,1727366607,724834684,1727366607,724834684
2018,17545842,988,519,150257,519,149932,15613370,1932472,11969615,4091945,1680060402,690809514,1680060402,690809514
2019,20551517,978,519,156575,519,155536,17679393,2872124,14052230,4938982,1697787002,677588084,1697787002,677588084
2020,19506857,1311,3141,100753,426,101767,14955766,4551091,11798407,5551873,639541029,382424444,639541029,382424444
2021,1095346,934,3141,7395,3141,7282,945169,150177,714609,289850,-1,-1,-1,-1
