In [5]:
import dlt
import requests
import pandas as pd
from dlt.destinations import filesystem
from io import BytesIO
import json
import os
import duckdb
import time
# from google.colab import userdata

# Report the installed version of every library this notebook relies on,
# so a saved run records the environment it was produced with.
for label, module in (
    ("dlt", dlt),
    ("requests", requests),
    ("pandas", pd),
    ("json", json),
    ("duckdb", duckdb),
):
    print(f"{label} version: {module.__version__}")

dlt version: 1.21.0
requests version: 2.32.5
pandas version: 3.0.0
json version: 2.0.9
duckdb version: 1.4.4


Store the JSON credentials as a secret named `GCP_CREDENTIALS`.

In [None]:
# os.environ["DESTINATION__CREDENTIALS"] = "gcs.json"

# os.environ["BUCKET_URL"] = "gs://sandbox-486719-nyc-taxi-raw"

In [None]:
# Install for production
# %%capture

# !pip install "dlt[bigquery,gs]"

In [None]:
# Install for testing
# %%capture

# !pip install dlt[duckdb]

In [7]:
def standardize_columns(df, taxi_type):
    """Rename the taxi-specific timestamp columns to shared names.

    Green-taxi files use the ``lpep_`` prefix on their pickup/dropoff
    timestamp columns; every other ``taxi_type`` is treated as using the
    ``tpep_`` prefix. Both are renamed to ``pickup_datetime`` and
    ``dropoff_datetime``. Columns that are not present are ignored.

    Args:
        df: Trip DataFrame as read from the source file.
        taxi_type: ``"green"`` selects the ``lpep_`` prefix; any other
            value selects ``tpep_``.

    Returns:
        A new DataFrame with the renamed columns; ``df`` itself is not
        modified.
    """
    prefix = "lpep" if taxi_type == "green" else "tpep"
    rename_map = {
        f"{prefix}_pickup_datetime": "pickup_datetime",
        f"{prefix}_dropoff_datetime": "dropoff_datetime",
    }
    return df.rename(columns=rename_map)

In [8]:
def cast_types(df):
    """Convert the string columns of a trip DataFrame to typed columns.

    The frame is modified in place and also returned, so callers can write
    ``df = cast_types(df)``.

    Conversions (each numeric/flag column is only touched when present):
      * ``pickup_datetime`` / ``dropoff_datetime`` -> datetime; unparseable
        values become ``NaT``. Both columns are required.
      * integer-like columns -> nullable ``Int64``; values that are not
        numeric, not finite, or not whole numbers become ``<NA>``.
      * float columns -> numeric; unparseable values become ``NaN``.
      * ``store_and_fwd_flag`` -> pandas ``string`` dtype.

    Args:
        df: Trip DataFrame whose timestamp columns already use the
            standardized ``pickup_datetime`` / ``dropoff_datetime`` names.

    Returns:
        The same DataFrame object, with converted columns.

    Raises:
        KeyError: If ``pickup_datetime`` or ``dropoff_datetime`` is missing.
    """

    numeric_int_cols = [
        "VendorID","passenger_count","RatecodeID",
        "PULocationID","DOLocationID","payment_type"
    ]

    numeric_float_cols = [
        "trip_distance","fare_amount","extra","mta_tax",
        "tip_amount","tolls_amount","improvement_surcharge",
        "total_amount","congestion_surcharge"
    ]

    # Convert datetimes
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"], errors = "coerce")
    df["dropoff_datetime"] = pd.to_datetime(df["dropoff_datetime"], errors = "coerce")

    # Convert integer-like columns safely
    for col in numeric_int_cols:
        if col in df.columns:
            values = pd.to_numeric(df[col], errors = "coerce")
            # astype("Int64") raises on values such as 1.5 or inf, which would
            # abort the whole frame; treat them like any other unparseable
            # value and turn them into <NA> before the cast.
            is_whole = values.mod(1).eq(0).fillna(False)
            df[col] = values.where(is_whole).astype("Int64")

    # Convert float columns; unparseable values become NaN
    for col in numeric_float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors = "coerce")

    if "store_and_fwd_flag" in df.columns:
        df["store_and_fwd_flag"] = df["store_and_fwd_flag"].astype("string")

    return df

Ingest data into DuckDB.

In [10]:
# Define a dlt resource that reads the monthly gzip CSV files into a single table
@dlt.resource(name = "rides", write_disposition = "replace")
def download_taxi_data():
    """Yield one typed DataFrame per monthly taxi trip file.

    For every taxi type in ``base_urls``, every year in ``years`` and every
    month 1-12, the file ``{taxi_type}_tripdata_{year}-{month}.csv.gz`` is
    read with all columns as strings, passed through ``standardize_columns``
    and ``cast_types``, and tagged with ``taxi_type``, ``year`` and ``month``
    columns before being yielded.

    Reading and transforming a file is attempted up to ``max_retries`` times.
    A file that still fails is reported on stdout and skipped, so one bad
    month does not stop the remaining files.

    Yields:
        pandas.DataFrame: the trips of one (taxi type, year, month) file.
    """

    base_urls = {
        # Remote source (currently disabled in favour of local copies):
        # "yellow": "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow",
        # "green": "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green",
        "yellow": "data",
        "green": "data",
    }

    years = [2019, 2020]

    # retry logic (network hiccups happen)
    max_retries = 3
    retry_delay_seconds = 2

    for taxi_type, base_url in base_urls.items():
        for year in years:
            for month in range(1, 13):

                month_str = f"{month:02d}"
                file_name = f"{taxi_type}_tripdata_{year}-{month_str}.csv.gz"
                url = f"{base_url}/{file_name}"

                print(f"Downloading {url}")

                trips = None

                for attempt in range(max_retries):
                    try:
                        loaded = pd.read_csv(
                            url,
                            compression = "gzip",
                            low_memory = False,   # avoid dtype guessing warning
                            dtype = str           # force schema stability across months
                        )

                        loaded = standardize_columns(loaded, taxi_type)
                        trips = cast_types(loaded)
                        break  # success, exit retry loop

                    except Exception as e:
                        print(f"Failed attempt {attempt+1} for {file_name}: {e}")

                        # Only wait when another attempt will follow
                        if attempt < max_retries - 1:
                            time.sleep(retry_delay_seconds)

                if trips is None:
                    print(f"Skipping file after retries: {file_name}")
                    continue

                # add metadata columns
                trips["taxi_type"] = taxi_type
                trips["year"] = str(year)
                trips["month"] = month_str

                # Yield outside the try block so that only reading and
                # transforming the file is covered by the retry handler.
                yield trips
# Build the pipeline. DuckDB is the local test destination; set
# `destination` to "bigquery" instead for production.
pipeline = dlt.pipeline(pipeline_name = "rides_pipeline", destination = "duckdb", dataset_name = "rides_dataset")

# Run the pipeline: extract the taxi trip CSV files and load them into DuckDB
info = pipeline.run(download_taxi_data)

# Show the load summary returned by the run
print(info)

Downloading data/yellow_tripdata_2019-01.csv.gz
Downloading data/yellow_tripdata_2019-02.csv.gz
Downloading data/yellow_tripdata_2019-03.csv.gz
Downloading data/yellow_tripdata_2019-04.csv.gz
Downloading data/yellow_tripdata_2019-05.csv.gz
Downloading data/yellow_tripdata_2019-06.csv.gz
Downloading data/yellow_tripdata_2019-07.csv.gz
Downloading data/yellow_tripdata_2019-08.csv.gz
Downloading data/yellow_tripdata_2019-09.csv.gz
Downloading data/yellow_tripdata_2019-10.csv.gz


PipelineStepFailed: Pipeline execution failed at `step=extract` when processing package with `load_id=1770828488.110611` with exception:

<class 'KeyboardInterrupt'>


In [None]:
# Open the DuckDB file named after the pipeline. The `with` block closes the
# connection again as soon as the table listing has been read, instead of
# leaving it open for the rest of the kernel session.
with duckdb.connect(f"{pipeline.pipeline_name}.duckdb") as conn:
    # Set search path to the dataset
    conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

    # Describe the dataset to see loaded tables; materialize the result as a
    # DataFrame while the connection is still open
    res = conn.sql("DESCRIBE").df()

print(res)

In [None]:
# provide a resource name to query a table of that name
# Count the rows of the `rides` table (the table carries the resource name)
# through the pipeline's SQL client.
with pipeline.sql_client() as client, client.execute_query("SELECT count(1) FROM rides") as cursor:
    data = cursor.df()

print(data)

Ingest parquet files to GCS.

In [None]:
# Define a dlt source to download and process Parquet files as resources
@dlt.source(name = "rides")
def download_parquet():
    """Yield one dlt resource per monthly yellow-taxi Parquet file of 2024.

    Months 1 through 6 are fetched over HTTP, parsed into a DataFrame, and
    wrapped in a dlt resource named after the file
    (``yellow_tripdata_2024-MM.parquet``). Each file is downloaded and parsed
    before its resource is yielded.

    Yields:
        A dlt resource holding the DataFrame of one month.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If connecting or reading from the server stalls.
    """
    prefix = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata"
    for month in range(1, 7):
        # Zero-pad so the pattern stays correct for two-digit months as well
        month_tag = f"2024-{month:02d}"
        file_name = f"yellow_tripdata_{month_tag}.parquet"
        url = f"{prefix}_{month_tag}.parquet"

        # (connect, read) timeouts: without them a stalled server would block
        # the notebook indefinitely
        response = requests.get(url, timeout = (10, 120))
        # Fail with the HTTP status instead of handing an error page to the
        # Parquet reader
        response.raise_for_status()

        df = pd.read_parquet(BytesIO(response.content))

        # Return the dataframe as a dlt resource for ingestion
        yield dlt.resource(df, name = file_name)


# Read the GCP credentials from the local JSON key file
with open("gcs.json", "r") as credentials_file:
    credentials_dict = json.load(credentials_file)

# Target bucket for the Parquet files
my_bucket_url = "gs://sandbox-486719-nyc-taxi-test"

# Filesystem destination writing to the bucket, one file per table
gcs_destination = filesystem(
    bucket_url = my_bucket_url,
    credentials = credentials_dict,
    layout = "{schema_name}/{table_name}.{ext}",
)

pipeline = dlt.pipeline(
    pipeline_name = "rides_pipeline",
    destination = gcs_destination,
    dataset_name = "rides_dataset",
)

# Alternative kept for reference: pass the bare `filesystem` factory instead
# of a configured instance.
# pipeline = dlt.pipeline(
#    pipeline_name = "rides_pipeline",
#    destination = filesystem,
#    dataset_name = "rides_dataset",
#)

# Run the pipeline to write the monthly data to the bucket as Parquet files
load_info = pipeline.run(download_parquet(), loader_file_format = "parquet")

# Show the load summary returned by the run
print(load_info)