In [1]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
import duckdb
import pandas as pd

# dlt resource that streams NYC taxi rides from the paginated demo REST API
@dlt.resource(name="rides")  # the resource name is reused as the destination table name
def ny_taxi():
    """Yield NYC taxi ride data one API page at a time.

    Pages are requested by page number, starting at page 1. No total-page
    field is read from the response (``total_path=None``).
    NOTE(review): presumably pagination then ends on the first empty page;
    confirm against the dlt paginator documentation.

    Yields:
        One page of records per request, exactly as returned by
        ``RESTClient.paginate``.
    """
    page_numbering = PageNumberPaginator(base_page=1, total_path=None)
    api_client = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_numbering,
    )

    # Hand pages over lazily so the whole dataset is never held in memory at once
    yield from api_client.paginate("data_engineering_zoomcamp_api")


# Create the dlt pipeline that loads into a local DuckDB database.
# No pipeline_name is given, so dlt chooses the pipeline name itself.
pipeline = dlt.pipeline(
    destination="duckdb",
)

In [6]:
# Load the taxi resource through the pipeline; write_disposition="replace"
# asks dlt to replace previously loaded table data instead of appending to it.
load_info = pipeline.run(
    ny_taxi,
    write_disposition="replace",
)
print("Pipeline Execution Info:", load_info)

Pipeline Execution Info: Pipeline dlt_ipykernel_launcher load step completed in 1.21 seconds
1 load package(s) were loaded to destination duckdb and into dataset dlt_ipykernel_launcher_dataset
The duckdb destination used duckdb:////Users/yitian66/Documents/DE-Datacamp/Workshop/dlt_ipykernel_launcher.duckdb location to store data
Load package 1739324181.655221 is LOADED and contains no failed jobs


In [7]:
# Pull the loaded "rides" table back out of the destination as a DataFrame
rides_dataset = pipeline.dataset(dataset_type="default")
data = rides_dataset.rides.df()

# Show only the first few rows rather than the full table
print("Data Preview:")
preview = data.head()
print(preview)

Data Preview:
     end_lat    end_lon  fare_amt  passenger_count payment_type  start_lat  \
0  40.742963 -73.980072      45.0                1       Credit  40.641525   
1  40.740187 -74.005698       6.5                1       Credit  40.722065   
2  40.718043 -74.004745      12.5                5       Credit  40.761945   
3  40.739637 -73.985233       4.9                1         CASH  40.749802   
4  40.730032 -73.852693      25.7                1         CASH  40.776825   

   start_lon  tip_amt  tolls_amt  total_amt  trip_distance  \
0 -73.787442      9.0       4.15      58.15          17.52   
1 -74.009767      1.0       0.00       8.50           1.56   
2 -73.983038      2.0       0.00      15.50           3.37   
3 -73.992247      0.0       0.00       5.40           1.11   
4 -73.949233      0.0       4.15      29.85          11.09   

     trip_dropoff_date_time     trip_pickup_date_time  surcharge vendor_name  \
0 2009-06-14 23:48:00+00:00 2009-06-14 23:23:00+00:00        0.0

In [10]:
# Open the DuckDB file named after the pipeline (path is relative to the working directory)
db_file = f"{pipeline.pipeline_name}.duckdb"
conn = duckdb.connect(db_file)

# Point the search path at the pipeline's dataset so its tables can be referenced unqualified
set_schema_sql = f"SET search_path = '{pipeline.dataset_name}'"
conn.sql(set_schema_sql)

# DESCRIBE lists the tables together with their column names and types
desc_df = conn.sql("DESCRIBE").df()

print(desc_df.head())

                 database                          schema  \
0  dlt_ipykernel_launcher  dlt_ipykernel_launcher_dataset   
1  dlt_ipykernel_launcher  dlt_ipykernel_launcher_dataset   
2  dlt_ipykernel_launcher  dlt_ipykernel_launcher_dataset   
3  dlt_ipykernel_launcher  dlt_ipykernel_launcher_dataset   

                  name                                       column_names  \
0           _dlt_loads  [load_id, schema_name, status, inserted_at, sc...   
1  _dlt_pipeline_state  [version, engine_version, pipeline_name, state...   
2         _dlt_version  [version, engine_version, inserted_at, schema_...   
3                rides  [end_lat, end_lon, fare_amt, passenger_count, ...   

                                        column_types  temporary  
0  [VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...      False  
1  [BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...      False  
2  [BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...      False  
3  [DOUBLE, DOUBLE, DOUBLE, BIGINT, VARCHAR,

In [12]:
# Re-read the "rides" table and report how many rows were loaded
rides_dataset = pipeline.dataset(dataset_type="default")
data = rides_dataset.rides.df()
row_count = len(data)
print(row_count)

10000


In [13]:
# Average difference, in minutes, between pickup and drop-off over all rides
avg_trip_minutes_sql = """
            SELECT
            AVG(date_diff('minute', trip_pickup_date_time, trip_dropoff_date_time))
            FROM rides;
            """

# Run the query through the pipeline's own SQL client
with pipeline.sql_client() as client:
    res = client.execute_sql(avg_trip_minutes_sql)

# res holds every result row as a tuple; this aggregate yields one row with one value
print(res)

[(12.3049,)]
