### Notebook that explores how to read Parquet files and upload them to a GCS bucket

The goal is to download all of the 2022 trip record data listed on https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page and upload each month's file to a GCS bucket.

In [1]:
import io
import os
import requests
import pandas as pd
from google.cloud import storage

In [2]:
# Base URL of the monthly trip-data files; a file name is appended to it to build each download URL.
init_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
# Destination bucket name: read from GCP_GCS_BUCKET when set, otherwise the default below.
BUCKET = os.getenv("GCP_GCS_BUCKET", "terraform-stavros-bucket")

In [3]:
def upload_to_gcs(bucket, object_name, local_file):
    """Upload a local file to a Google Cloud Storage bucket.

    Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python

    Args:
        bucket: Name of the destination bucket.
        object_name: Name (path) the object gets inside the bucket.
        local_file: Path of the local file to upload.
    """
    # NOTE: a known workaround for upload timeouts on slow links (files > 6 MB at
    # ~800 kbps) is to lower storage.blob._MAX_MULTIPART_SIZE and
    # storage.blob._DEFAULT_CHUNKSIZE to 5 MB before uploading; it is not applied here.
    # (Ref: https://github.com/googleapis/python-storage/issues/74)
    gcs_client = storage.Client()
    target_blob = gcs_client.bucket(bucket).blob(object_name)
    target_blob.upload_from_filename(local_file)

In [4]:
def web_to_gcs(year, service):
    """Download the twelve monthly Parquet files of one service/year and upload them to GCS.

    For each month 01..12 the file ``{service}_tripdata_{year}-{month}.parquet``
    is fetched from ``init_url``, saved in the current working directory, and
    then uploaded to ``BUCKET`` under the object name ``{service}/{file_name}``.

    Args:
        year: Year of the data, e.g. '2022' (interpolated into the file name).
        service: Service prefix of the file name, e.g. 'green'.

    Raises:
        requests.HTTPError: If a download answers with an HTTP error status, so
            an error page is never saved or uploaded as a .parquet file.
        requests.Timeout: If the server does not respond within the timeout.
    """
    for month_number in range(1, 13):
        # zero-padded month part of the file name: 01 .. 12
        month = f"{month_number:02d}"

        # parquet file name
        file_name = f"{service}_tripdata_{year}-{month}.parquet"

        # download it using requests; the timeout keeps a stalled connection
        # from hanging the notebook forever
        request_url = f"{init_url}{file_name}"
        r = requests.get(request_url, timeout=60)
        # fail loudly on 4xx/5xx instead of writing the error body to disk
        r.raise_for_status()

        # the context manager guarantees the file is flushed and closed
        # before the upload step reads it back
        with open(file_name, 'wb') as local_file:
            local_file.write(r.content)
        print(f"Local: {file_name}")

        # upload it to gcs
        upload_to_gcs(BUCKET, f"{service}/{file_name}", file_name)
        print(f"GCS: {service}/{file_name}")


In [5]:
# Fetch and upload the twelve monthly 2022 files for the 'green' service.
web_to_gcs(year='2022', service='green')

Local: green_tripdata_2022-01.parquet
GCS: green/green_tripdata_2022-01.parquet
Local: green_tripdata_2022-02.parquet
GCS: green/green_tripdata_2022-02.parquet
Local: green_tripdata_2022-03.parquet
GCS: green/green_tripdata_2022-03.parquet
Local: green_tripdata_2022-04.parquet
GCS: green/green_tripdata_2022-04.parquet
Local: green_tripdata_2022-05.parquet
GCS: green/green_tripdata_2022-05.parquet
Local: green_tripdata_2022-06.parquet
GCS: green/green_tripdata_2022-06.parquet
Local: green_tripdata_2022-07.parquet
GCS: green/green_tripdata_2022-07.parquet
Local: green_tripdata_2022-08.parquet
GCS: green/green_tripdata_2022-08.parquet
Local: green_tripdata_2022-09.parquet
GCS: green/green_tripdata_2022-09.parquet
Local: green_tripdata_2022-10.parquet
GCS: green/green_tripdata_2022-10.parquet
Local: green_tripdata_2022-11.parquet
GCS: green/green_tripdata_2022-11.parquet
Local: green_tripdata_2022-12.parquet
GCS: green/green_tripdata_2022-12.parquet


In [6]:
# Sanity check: load the January file that the download step saved locally and display it.
df = pd.read_parquet(path='green_tripdata_2022-01.parquet')
df


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2022-01-01 00:14:21,2022-01-01 00:15:33,N,1.0,42,42,1.0,0.44,3.50,0.50,0.5,0.00,0.0,,0.3,4.80,2.0,1.0,0.00
1,1,2022-01-01 00:20:55,2022-01-01 00:29:38,N,1.0,116,41,1.0,2.10,9.50,0.50,0.5,0.00,0.0,,0.3,10.80,2.0,1.0,0.00
2,1,2022-01-01 00:57:02,2022-01-01 01:13:14,N,1.0,41,140,1.0,3.70,14.50,3.25,0.5,4.60,0.0,,0.3,23.15,1.0,1.0,2.75
3,2,2022-01-01 00:07:42,2022-01-01 00:15:57,N,1.0,181,181,1.0,1.69,8.00,0.50,0.5,0.00,0.0,,0.3,9.30,2.0,1.0,0.00
4,2,2022-01-01 00:07:50,2022-01-01 00:28:52,N,1.0,33,170,1.0,6.26,22.00,0.50,0.5,5.21,0.0,,0.3,31.26,1.0,1.0,2.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62490,2,2022-01-31 23:25:00,2022-01-31 23:33:00,,,40,65,,1.40,8.38,0.00,0.0,1.93,0.0,,0.3,10.61,,,
62491,2,2022-01-31 23:52:00,2022-02-01 00:10:00,,,36,61,,2.97,14.92,0.00,0.0,0.00,0.0,,0.3,15.22,,,
62492,2,2022-01-31 23:17:00,2022-01-31 23:36:00,,,75,167,,3.70,16.26,0.00,0.0,0.00,0.0,,0.3,16.56,,,
62493,2,2022-01-31 23:45:00,2022-01-31 23:55:00,,,116,166,,1.88,9.48,0.00,0.0,2.17,0.0,,0.3,11.95,,,
