# Get Citibike trip data

## Prerequisites
`boto3` uses AWS credentials from `~/.aws/credentials`
* [Create AWS account](https://aws.amazon.com/resources/create-account/)
* [Install AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
* [Configure AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html)
    * Run `aws configure` in a terminal and enter the credentials (access key ID and secret access key) from an access key created for your AWS user

## Citibike AWS S3 bucket
[https://s3.amazonaws.com/tripdata/index.html](https://s3.amazonaws.com/tripdata/index.html)

In [None]:
import boto3
import os, zipfile

In [None]:
# Name of the S3 bucket that holds the Citibike trip data.
TRIPDATA_BUCKET = 'tripdata'
# Local directory the bucket objects are downloaded into. Keep the trailing
# slash: later cells append file names directly to this string.
TRIPDATA_ZIP = 'data/tripdata_zip/'
# Local directory the downloaded zip archives are extracted into.
TRIPDATA_CSV = 'data/tripdata_csv/'

# boto3 S3 resource and the Bucket object used to list and download objects.
s3 = boto3.resource('s3')
tripdata = s3.Bucket(TRIPDATA_BUCKET)

In [None]:
# Create the download and extraction directories, including missing parents.
# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race. Passing the path itself (rather than its dirname)
# also creates the right directory if a constant is ever written without its
# trailing slash.
os.makedirs(TRIPDATA_ZIP, exist_ok=True)
os.makedirs(TRIPDATA_CSV, exist_ok=True)

In [None]:
# Download every object in the bucket, but only when the target directory is
# still empty (a non-empty directory is treated as "already downloaded").
if not os.listdir(TRIPDATA_ZIP):
    for s3_object in tripdata.objects.all():
        # Keep only the base name so nested keys land flat in TRIPDATA_ZIP.
        filename = os.path.basename(s3_object.key)
        if not filename:
            # A key ending in '/' has no file name (folder placeholder);
            # the download target would be the directory itself, so skip it.
            continue
        print(f'Downloading {filename} to {TRIPDATA_ZIP}...')
        tripdata.download_file(s3_object.key, os.path.join(TRIPDATA_ZIP, filename))

In [None]:
# Delete downloads that should not be extracted: the bucket's index page and
# the combined 201307-201402 archive.
files_to_remove = ['index.html', '201307-201402-citibike-tripdata.zip']

for unwanted in files_to_remove:
    path = TRIPDATA_ZIP + unwanted
    # Nothing to do when the file was never downloaded or is already gone.
    if not os.path.exists(path):
        continue
    os.remove(path)
    print(f'Removed {path}')

# Report how many files remain in the download directory.
print(f'{len(os.listdir(TRIPDATA_ZIP))} objects downloaded')

In [None]:
# Extract every zip archive into TRIPDATA_CSV, but only when that directory is
# still empty (a non-empty directory is treated as "already extracted").
# NOTE(review): if extraction fails part-way, the directory is left non-empty
# and a re-run will skip the remaining archives - clear it before retrying.
if not os.listdir(TRIPDATA_CSV):
    for filename in os.listdir(TRIPDATA_ZIP):
        print(f'Attempting to extract {filename}...')
        # Anything that is not a zip archive is left alone.
        if not filename.endswith('.zip'):
            continue
        filepath = os.path.abspath(os.path.join(TRIPDATA_ZIP, filename))
        # The with-block closes the archive on exit; no explicit close() needed.
        with zipfile.ZipFile(filepath) as zipitem:
            print(f'Extracting {zipitem.filename}...')
            zipitem.extractall(TRIPDATA_CSV)
        print(f'Extracted {zipitem.filename}')

# Report how many entries the extraction directory now holds.
print(f'{len(os.listdir(TRIPDATA_CSV))} files in directory (count might include non CSV files)')