# Get Citibike trip data
This notebook:
* downloads the Citibike trip data zip files from the S3 bucket to the `data` directory
* unzips the downloaded files to CSVs
* separates NJ and NY data into different directories
* renames the CSVs to the YYYY-MM.csv filename format
* converts the CSVs that use the old schema (before 2021-02) to parquet

## Prerequisites
`boto3` uses AWS credentials from `~/.aws/credentials`
* [Create AWS account](https://aws.amazon.com/resources/create-account/)
* [Install AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
* [Configure AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html)
    * run `aws configure` in a terminal and enter the access key ID and secret access key created for your user

## Citibike AWS S3 bucket
[https://s3.amazonaws.com/tripdata/index.html](https://s3.amazonaws.com/tripdata/index.html)

In [None]:
import boto3
import os, zipfile, logging

import pandas as pd

* NOTE: Set the log level below to logging.DEBUG to see all log messages, or to logging.WARNING to hide INFO/DEBUG messages and show only warnings and errors
* Set directory paths
* Set S3 bucket

In [None]:
# Name of the S3 bucket the trip data is downloaded from
TRIPDATA_BUCKET = "tripdata"

# Local directory layout. Every path keeps its trailing slash because the
# rest of the notebook builds file paths by plain string concatenation.
TRIPDATA_ZIP = "data/tripdata_zip/"
TRIPDATA_CSV = "data/tripdata_csv/"
JC_DIR = f"{TRIPDATA_CSV}NJ/"
NYC_DIR = f"{TRIPDATA_CSV}NY/"
TRIPDATA_PARQUET = "data/tripdata_parquet/"
JC_PQT_DIR = f"{TRIPDATA_PARQUET}NJ/"
NYC_PQT_DIR = f"{TRIPDATA_PARQUET}NY/"

# Only files whose name sorts before this YYYY-MM value are converted to parquet
SCHEMA_CHANGE_DATE = "2021-02"

# Handle on the bucket, used for listing and downloading its objects
s3 = boto3.resource("s3")
tripdata = s3.Bucket(TRIPDATA_BUCKET)

# Set level to WARNING or higher to suppress INFO/DEBUG messages
logging.basicConfig(level=logging.WARNING)

* Download Citibike trip data zips

In [None]:
# create directory to save ZIPs to (exist_ok avoids a separate existence check)
os.makedirs(os.path.dirname(TRIPDATA_ZIP), exist_ok=True)

# download all S3 objects in bucket to directory if empty
if not os.listdir(TRIPDATA_ZIP):
    for s3_object in tripdata.objects.all():
        filename = os.path.basename(s3_object.key)
        if not filename:
            # a key ending in "/" has no file name, so the download target
            # would be the directory itself; skip it
            logging.debug(f"Skipping key without a file name: {s3_object.key}")
            continue
        logging.info(f"Downloading {filename} to {TRIPDATA_ZIP}...")
        tripdata.download_file(s3_object.key, TRIPDATA_ZIP + filename)

In [None]:
# Delete the bucket's index page and the multi-month combo archive, if present
files_to_remove = ["index.html", "201307-201402-citibike-tripdata.zip"]

for unwanted in files_to_remove:
    path = TRIPDATA_ZIP + unwanted
    if not os.path.exists(path):
        continue
    os.remove(path)
    logging.info(f"Removed {path}")

# Report how many entries are left in the zip directory
remaining = len(os.listdir(TRIPDATA_ZIP))
logging.info(f"{remaining} objects downloaded")

* Unzip CSVs into directory

In [None]:
# create directory to save CSVs to (exist_ok avoids a separate existence check)
os.makedirs(os.path.dirname(TRIPDATA_CSV), exist_ok=True)

# unzip all if not unzipped already
if not os.listdir(TRIPDATA_CSV):
    for filename in os.listdir(TRIPDATA_ZIP):
        if not filename.endswith(".zip"):
            logging.debug(f"Skipping non-zip file {filename}")
            continue
        logging.debug(f"Attempting to extract {filename}...")
        filepath = os.path.abspath(TRIPDATA_ZIP + filename)
        # the with-statement closes the archive, so no explicit close() is needed
        with zipfile.ZipFile(filepath) as zipitem:
            logging.debug(f"Extracting {zipitem.filename}...")
            zipitem.extractall(TRIPDATA_CSV)
        logging.debug(f"Extracted {filepath}")

logging.info(
    f"{len(os.listdir(TRIPDATA_CSV))} files in directory (count might include non CSV files)"
)

* Store Jersey City versus NYC trips in separate folders

In [None]:
# create the NJ and NY target directories (no-op when they already exist)
os.makedirs(os.path.dirname(JC_DIR), exist_ok=True)
os.makedirs(os.path.dirname(NYC_DIR), exist_ok=True)

logging.debug("Separating JC and NYC data...")
csv_files = [file for file in os.listdir(TRIPDATA_CSV) if file.endswith(".csv")]
for file in csv_files:
    source_path = TRIPDATA_CSV + file
    # os.listdir returns bare names, so the directory test needs the full path;
    # testing `file` alone would look in the current working directory instead
    if os.path.isdir(source_path):
        logging.debug(f"Skipping directory {file}")
        continue

    logging.debug(f"Moving file {file}...")

    # All Jersey City trip data CSVs begin with "JC"
    target_dir = JC_DIR if file.startswith("JC") else NYC_DIR
    os.replace(source_path, target_dir + file)

logging.info(
    f"{len(os.listdir(JC_DIR))} files in NJ directory (count might include non CSV files)"
)
logging.info(
    f"{len(os.listdir(NYC_DIR))} files in NY directory (count might include non CSV files)"
)

* Rename the files to YYYY-MM.csv format

In [None]:
def rename_files(directory: str) -> None:
    """Rename the files in `directory` to YYYY-MM.csv format.

    The date is taken from the digits found in each file name. Six digits are
    read as YYYYMM; five digits are read as YYYYM and the month is zero-padded.
    Entries that are not regular files, or whose names hold any other number
    of digits, are left untouched because no date can be derived from them.

    Args:
        directory: directory whose files are renamed in place; a trailing
            slash is optional.
    """
    for file in os.listdir(directory):
        old_path = os.path.join(directory, file)
        if not os.path.isfile(old_path):
            logging.debug(f"Skipping non-file entry: {file}")
            continue

        filedigits = "".join(c for c in file if c.isdigit())

        # some CSVs are missing leading zero in month. e.g., 20161 instead of 201601
        if len(filedigits) == 5:
            logging.warning(f"File not in YYYYMM format: {file}")
            filedigits = filedigits[:-1] + "0" + filedigits[-1]
        elif len(filedigits) != 6:
            # padding any other digit count would produce a bogus date, and an
            # empty digit string cannot be indexed at all
            logging.warning(f"Skipping file without a YYYYMM date: {file}")
            continue

        new_file = filedigits[:-2] + "-" + filedigits[-2:] + ".csv"
        logging.debug(f"Renaming file to: {new_file}")
        os.rename(old_path, os.path.join(directory, new_file))


# Apply the YYYY-MM.csv naming to the NJ directory first, then the NY one
logging.info("Renaming NJ and NY files to YYYY-MM.csv format")
for state_dir in (JC_DIR, NYC_DIR):
    rename_files(state_dir)

* Convert CSV files to parquet
    * Only convert files with the old Citibike schema (pre-2021-02)  
* Drop NAs
* Use standardized column names

In [None]:
# convert to parquet
# Create the parquet root first, then the NJ and NY subdirectories, skipping
# any that already exist
for pqt_dir in (TRIPDATA_PARQUET, JC_PQT_DIR, NYC_PQT_DIR):
    if not os.path.exists(pqt_dir):
        os.makedirs(os.path.dirname(pqt_dir))


def convert_to_parquet(directory_from: str, directory_to: str):
    """Convert CSVs in `directory_from` to parquet files in `directory_to`.

    Only `.csv` files whose name sorts before `SCHEMA_CHANGE_DATE` are
    converted. Each one is written as `<name without .csv>.parquet`.
    Column names are lower-cased with spaces removed, rows containing any NA
    are dropped, and `\\N` in the `birthyear` column is replaced by '0'.

    Args:
        directory_from: directory holding the source CSV files.
        directory_to: existing directory the parquet files are written to.
    """
    csv_names = sorted(
        name
        for name in os.listdir(directory_from)
        if name.endswith(".csv") and name < SCHEMA_CHANGE_DATE
    )
    total = len(csv_names)
    for i, csv_name in enumerate(csv_names, start=1):
        stem = os.path.splitext(csv_name)[0]
        newfilepath = os.path.join(directory_to, stem + ".parquet")
        logging.debug(f"parquetifying file {i} of {total}...")

        csv_df = pd.read_csv(os.path.join(directory_from, csv_name))
        csv_df.columns = [col.lower().replace(" ", "") for col in csv_df.columns]
        csv_df.dropna(inplace=True)  # remove NAs for trips to stations in the Bronx
        csv_df["birthyear"] = csv_df["birthyear"].replace(
            r"\\N", "0", regex=True
        )  # replace \N with string '0'
        csv_df.to_parquet(newfilepath)

In [None]:
%%time
# Convert the NY CSVs to parquet; the cell magic above reports the run time
convert_to_parquet(NYC_DIR, NYC_PQT_DIR)

In [None]:
%%time
# Convert the NJ CSVs to parquet; the cell magic above reports the run time
convert_to_parquet(JC_DIR, JC_PQT_DIR)