# WIP!! Summarize all the Citibike trip data CSVs
Trying out:
* Dask dataframes
* SQL
* creating summary df from monthly CSVs
* ???

In [None]:
import pandas as pd
import os
import logging

In [None]:
# Directory layout of the trip data CSVs: one sub-directory per state
# underneath data/tripdata_csv/
DATA_DIR = "data/"
CSV_DIR = f"{DATA_DIR}tripdata_csv/"
NY_DIR = f"{CSV_DIR}NY/"
NJ_DIR = f"{CSV_DIR}NJ/"

# Database file path (not referenced by the cells visible here)
DB_FILE = "data/tripdata.db"

# Only WARNING and above are emitted; lower the level to see the info/debug
# messages logged further down
logging.basicConfig(level=logging.WARNING)

In [None]:
# Raw directory listings per region
NYC_DATA = os.listdir(NY_DIR)
JC_DATA = os.listdir(NJ_DIR)  # NOTE: this includes Hoboken and Jersey City

# Report how many files each region has
logging.info(f"{len(JC_DATA)} Jersey City files and {len(NYC_DATA)} New York City files")

In [None]:
# Data ranges for NYC and NJ
# NOTE: the trip data schema changes beginning 2021-02
# See: https://citibikenyc.com/system-data
# This "YYYY-MM" string is compared lexicographically against the data file
# names to separate old-schema files from new-schema files.
SCHEMA_CHANGE_DATE = "2021-02"
# Unused (year, month) notes on the first, schema-change and last months:
# nyc_start = (2013, 6)
# nyc_change = (2021, 2)
# nyc_end = (2022, 2)

# nj_start = (2015, 9)
# nj_change = nyc_change
# nj_end = nyc_end

In [None]:
# CSV paths for NYC, JC (pre and post schema change)
def _split_by_schema(directory: str) -> tuple[list[str], list[str]]:
    """
    Splits the files of `directory` into old-schema and new-schema paths

    The directory is listed once. File names are compared as strings against
    SCHEMA_CHANGE_DATE, so this presumably relies on names starting with the
    month as "YYYY-MM" -- verify against the actual data files.

    :param directory: directory path ending in "/"
    :return: (old schema paths, new schema paths), each sorted by file name
    """
    names = sorted(os.listdir(directory))
    old = [directory + name for name in names if name < SCHEMA_CHANGE_DATE]
    new = [directory + name for name in names if name >= SCHEMA_CHANGE_DATE]
    return old, new


nyc_old, nyc_new = _split_by_schema(NY_DIR)
jc_old, jc_new = _split_by_schema(NJ_DIR)

In [None]:
def _first_mode(values: pd.Series):
    """Returns the most frequent value of `values`, or None if there is none"""
    modes = values.mode()
    # mode() returns every tied value as a Series; keep a single scalar so the
    # summary cell holds a plain value instead of a nested Series
    return modes.iloc[0] if not modes.empty else None


def _value_counts_dict(values: pd.Series) -> dict:
    """Returns {value: number of occurrences} built from plain Python types"""
    counts = values.value_counts()
    return dict(zip(counts.index.tolist(), counts.tolist()))


# TODO only works for old schema at the moment
def summarise_months(outfilename: str, months: list) -> None:
    """
    Writes monthly summary given list of monthly trip data

    One row is produced per CSV. The row is labelled ("datetime") with the
    CSV's file name minus its ".csv" suffix and holds the trip count, the mean
    trip duration, the most frequent start/end station id, name, latitude and
    longitude, and the user type / gender counts (as dicts) when those columns
    exist. The summary is written with "datetime" as its index.

    :param outfilename: where to write the summary csv
    :param months: list of CSVs for the monthly trip data
    :return: None
    """
    summaries = []

    for file in months:
        df = pd.read_csv(file)
        # headers are normalised so that "Start Station ID" and
        # "startstationid" refer to the same column
        df.columns = [col.lower().replace(" ", "") for col in df.columns]

        summary = {
            "datetime": file.split("/")[-1].removesuffix(".csv"),
            "counttrips": df.shape[0],
            "meanduration": df.tripduration.mean(),
        }

        for endpoint in ("start", "end"):
            for field in ("id", "name", "latitude", "longitude"):
                column = f"{endpoint}station{field}"
                summary[f"mode{column}"] = _first_mode(df[column])

        if "usertype" in df.columns:
            summary["usertypevalues"] = _value_counts_dict(df.usertype)
        elif "member_casual" in df.columns:
            summary["usertypevalues"] = _value_counts_dict(df.member_casual)

        if "gender" in df.columns:
            summary["gendervalues"] = _value_counts_dict(df.gender)

        summaries.append(summary)

    # DataFrame.append no longer exists in pandas 2.x; build the frame directly
    summary_df = pd.DataFrame(summaries)
    if summaries:
        # set_index returns a new frame, so the result has to be kept
        summary_df = summary_df.set_index("datetime")
    summary_df.to_csv(outfilename)

In [None]:
# Build one month-by-month summary CSV per region (old-schema files only)
for summary_name, monthly_csvs in (
    ("summary_nyc_old_schema.csv", nyc_old),
    ("summary_jc_old_schema.csv", jc_old),
):
    summarise_months(DATA_DIR + summary_name, monthly_csvs)

In [None]:
# Load the NYC old-schema summary back from disk and display it
nyc_old_schema_summary = pd.read_csv(
    "data/summary_nyc_old_schema.csv",
    index_col=0,
)
nyc_old_schema_summary

In [None]:
def clobber_year(year=2019, state="NY") -> pd.DataFrame:
    """
    Concatenates all monthly trip data CSVs of one `year` into one dataframe

    Nothing is written to disk; the caller decides what to do with the result.
    Column names are lower-cased and stripped of spaces before merging so that
    headers differing only in case or spacing end up in the same column.

    :param year: the year for which to concatenate data files
    :param state: 'NY' or 'NJ'. default 'NY'
    :return: the merged dataframe, re-indexed from 0
    :raises IndexError: if `state` is neither 'NY' nor 'NJ'
    :raises ValueError: if no data file matches `year` for that state
    """
    if state == "NY":
        state_dir = NY_DIR
    elif state == "NJ":
        state_dir = NJ_DIR
    else:
        raise IndexError(f"No data for state: {state}")

    # File names are compared as plain strings (presumably they start with
    # "YYYY-MM" -- verify against the data directory). A December name such as
    # "<year>-12-something.csv" is longer than, and therefore sorts after, the
    # bare string "<year>-12", so the upper bound has to be the non-existent
    # month 13 for December to be included.
    range_start = f"{year}-01"
    range_end = f"{year}-13"
    files = sorted(
        state_dir + name
        for name in os.listdir(state_dir)
        if range_start <= name <= range_end
    )

    logging.debug("Will merge these files: %s, number of files: %d", files, len(files))

    if not files:
        # pd.concat would fail with an unrelated-looking "No objects to
        # concatenate"; fail with the same exception type but a useful message
        raise ValueError(f"No trip data files found for state {state} in year {year}")

    # Concatenate all monthly data in range
    dfs = []
    for file in files:
        df = pd.read_csv(file)
        df.columns = [col.lower().replace(" ", "") for col in df.columns]
        logging.debug("Appending df file: %s...", file)
        dfs.append(df)

    logging.debug("Merging dataframes...")
    return pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Merge every 2019 NY month into one frame for testing and keep a copy on disk
year_2019 = clobber_year(year=2019, state="NY")
year_2019.to_csv("data/NY_2019.csv")
year_2019

In [None]:
# Show the last five rows of the merged 2019 frame
year_2019.tail(n=5)

In [None]:
# read JC summary (written by summarise_months as data/summary_jc_old_schema.csv;
# this cell previously re-read the NYC summary by mistake)
jc_old_schema_summary = pd.read_csv("data/summary_jc_old_schema.csv", index_col=0)
jc_old_schema_summary

In [None]:
# Clobber all old-schema NYC CSVs into one dataframe.
# NOTE: THIS CRASHES THE COMPUTER, so the code below is left commented out.


# nyc_old_dfs = []
# for file in nyc_old:
#     print(f'file {NY_DIR + file}')
#     df = pd.read_csv(NY_DIR + file)
#     nyc_old_dfs.append(df)
#
# nyc_old_df = pd.concat(nyc_old_dfs, axis=0, ignore_index=True)

In [None]:
# import dask.dataframe as dd
# ddf = dd.read_csv(nyc_old,
#                   dtype={'birth year': 'object',
#                          'end station id': 'float64'})
#
# # columns are Sentence Cased for some CSVs and lower cased for others
# ddf = ddf.rename(columns=str.lower)

In [None]:
# ddf.describe().compute()