In [7]:
import dlt
import requests
import pandas as pd
from dlt.destinations import filesystem
from io import BytesIO
import json
import os
import duckdb
# from google.colab import userdata

# Report the version of every library the notebook depends on, one per line.
for label, module in (
    ("dlt", dlt),
    ("requests", requests),
    ("pandas", pd),
    ("json", json),
    ("duckdb", duckdb),
):
    print(f"{label} version: {module.__version__}")

dlt version: 1.21.0
requests version: 2.32.5
pandas version: 3.0.0
json version: 2.0.9
duckdb version: 1.4.4


Store the GCP service-account JSON key as a secret named `GCP_CREDENTIALS`.

In [None]:
# os.environ["DESTINATION__CREDENTIALS"] = "gcs.json"

# os.environ["BUCKET_URL"] = "gs://sandbox-486719-nyc-taxi-raw"

In [None]:
# Install for production
# %%capture

# !pip install "dlt[bigquery,gs]"

In [None]:
# Install for testing
# %%capture

# !pip install dlt[duckdb]

Ingest data into DuckDB.

In [3]:
# Define a dlt resource to download and process Parquet files as single table
@dlt.resource(name = "rides", write_disposition = "replace")
def download_parquet():
    """Yield one DataFrame per monthly yellow-taxi Parquet file (Jan-Jun 2024).

    Each month is downloaded over HTTP, parsed in memory with pandas and
    yielded, so all months end up in the single ``rides`` table.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding.
    """
    prefix = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata'

    for month in range(1, 7):
        # Zero-pad via the format spec so the pattern stays valid for any month.
        url = f"{prefix}_2024-{month:02d}.parquet"

        # requests applies no timeout by default; without one a stalled
        # connection would hang the cell indefinitely.
        response = requests.get(url, timeout = 60)

        # Fail here on an HTTP error instead of handing an error page to the
        # Parquet reader, which would surface as a confusing parse failure.
        response.raise_for_status()

        df = pd.read_parquet(BytesIO(response.content))

        yield df

# Create the pipeline object; the destination decides where the data lands.
pipeline = dlt.pipeline(
    pipeline_name="rides_pipeline",
    dataset_name="rides_dataset",
    destination="duckdb",  # Use DuckDB for testing
    # destination="bigquery",  # Use BigQuery for production
)

# Extract the Parquet data with the resource above and load it into DuckDB.
info = pipeline.run(download_parquet)

# Show the load summary returned by the run.
print(info)

Pipeline rides_pipeline load step completed in 2.95 seconds
1 load package(s) were loaded to destination duckdb and into dataset rides_dataset
The duckdb destination used duckdb:////Users/rob/Projects/GitHub/data-engineering-zoomcamp/03-data-warehousing/rides_pipeline.duckdb location to store data
Load package 1770650399.0568619 is LOADED and contains no failed jobs


In [4]:
# Open the DuckDB file named after the pipeline. The context manager closes the
# connection when the block exits, so the kernel does not keep the database
# file open after the metadata has been read.
with duckdb.connect(f"{pipeline.pipeline_name}.duckdb") as conn:
    # Set search path to the dataset
    conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

    # Describe the dataset to see loaded tables; .df() materializes the result
    # as a DataFrame, so it remains usable after the connection is closed.
    res = conn.sql("DESCRIBE").df()

print(res)

         database         schema                 name  \
0  rides_pipeline  rides_dataset           _dlt_loads   
1  rides_pipeline  rides_dataset  _dlt_pipeline_state   
2  rides_pipeline  rides_dataset         _dlt_version   
3  rides_pipeline  rides_dataset                rides   

                                        column_names  \
0  [load_id, schema_name, status, inserted_at, sc...   
1  [version, engine_version, pipeline_name, state...   
2  [version, engine_version, inserted_at, schema_...   
3  [vendor_id, tpep_pickup_datetime, tpep_dropoff...   

                                        column_types  temporary  
0  [VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...      False  
1  [BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...      False  
2  [BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...      False  
3  [INTEGER, TIMESTAMP WITH TIME ZONE, TIMESTAMP ...      False  


In [5]:
# Count the rows of the `rides` table through the pipeline's own SQL client.
row_count_query = "SELECT count(1) FROM rides"

with pipeline.sql_client() as client, client.execute_query(row_count_query) as cursor:
    data = cursor.df()

print(data)

   count(1)
0  20332093


Ingest parquet files to GCS.

In [None]:
# Define a dlt source to download and process Parquet files as resources
# NOTE(review): this reuses the name `download_parquet` from the DuckDB cell
# above, so running this cell replaces that resource in the kernel namespace.
@dlt.source(name = "rides")
def download_parquet():
    """Yield one dlt resource per monthly yellow-taxi Parquet file (Jan-Jun 2024).

    Each month is downloaded over HTTP, parsed in memory with pandas and
    wrapped in a resource named after the source file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding.
    """
    prefix = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata"
    for month in range(1, 7):
        # Zero-pad via the format spec so the pattern stays valid for any month.
        file_name = f"yellow_tripdata_2024-{month:02d}.parquet"
        url = f"{prefix}_2024-{month:02d}.parquet"

        # requests applies no timeout by default; without one a stalled
        # connection would hang the cell indefinitely.
        response = requests.get(url, timeout = 60)

        # Fail here on an HTTP error instead of handing an error page to the
        # Parquet reader, which would surface as a confusing parse failure.
        response.raise_for_status()

        df = pd.read_parquet(BytesIO(response.content))

        # Return the dataframe as a dlt resource for ingestion
        yield dlt.resource(df, name = file_name)


# Read the service-account key file that is passed to the destination below.
with open("gcs.json", "r") as f:
    credentials_dict = json.load(f)

my_bucket_url = "gs://sandbox-486719-nyc-taxi-test"

# Filesystem destination pointing at the bucket. The layout uses only the
# schema name, table name and file extension.
gcs_destination = filesystem(
    bucket_url=my_bucket_url,
    credentials=credentials_dict,
    layout="{schema_name}/{table_name}.{ext}",
)

pipeline = dlt.pipeline(
    pipeline_name="rides_pipeline",
    destination=gcs_destination,
    dataset_name="rides_dataset",
)

# Alternative initialization kept for reference (passes the bare `filesystem`
# factory instead of a configured destination):
# pipeline = dlt.pipeline(
#    pipeline_name = "rides_pipeline",
#    destination = filesystem,
#    dataset_name = "rides_dataset",
#)

# Run the pipeline to write the data to the GCS bucket as Parquet files
load_info = pipeline.run(download_parquet(), loader_file_format="parquet")

# Show the load summary returned by the run.
print(load_info)