# dlt exploration notebook

In [2]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
import json


In [3]:
# Base URL of the API
BASE_URL = "https://us-central1-dlthub-analytics.cloudfunctions.net"
# Endpoint (relative to BASE_URL) that we want to query
ENDPOINT = "data_engineering_zoomcamp_api"
# Page number at which pagination starts
STARTING_PAGE = 1


def paginated_getter():
    """Yield the API's data one page at a time.

    A ``RESTClient`` is created for ``BASE_URL`` with a ``PageNumberPaginator``
    that starts at ``STARTING_PAGE``. No ``total_path`` is configured, so
    pagination continues until the API has no more pages.

    Yields:
        Each page of data returned by ``client.paginate(ENDPOINT)``.
    """
    page_numbers = PageNumberPaginator(
        base_page=STARTING_PAGE,
        total_path=None,
    )
    api = RESTClient(base_url=BASE_URL, paginator=page_numbers)

    # Delegate to the client's page iterator, passing every page through unchanged
    yield from api.paginate(ENDPOINT)

In [5]:
# Peek at the first page only: that is enough to learn the page size and the
# shape of a record without downloading the whole dataset.
sample_record = None  # remains None when the API returns no pages or an empty first page

for page_num, page_data in enumerate(paginated_getter(), start=STARTING_PAGE):
    if sample_record is None and page_data:
        # Keep the first record, pretty-printed, as a reference sample
        sample_record = json.dumps(page_data[0], indent=2)

    # Report how many records this page holds
    print(f"Page {page_num} - Total records: {len(page_data)}")
    break  # deliberately stop after the first page; remove to report every page

Page 1 - Total records: 1000


In [6]:
# Show the JSON-formatted sample record captured from the first page
print(f"Sample Record: \n {sample_record}")

Sample Record: 
 {
  "End_Lat": 40.742963,
  "End_Lon": -73.980072,
  "Fare_Amt": 45.0,
  "Passenger_Count": 1,
  "Payment_Type": "Credit",
  "Rate_Code": null,
  "Start_Lat": 40.641525,
  "Start_Lon": -73.787442,
  "Tip_Amt": 9.0,
  "Tolls_Amt": 4.15,
  "Total_Amt": 58.15,
  "Trip_Distance": 17.52,
  "Trip_Dropoff_DateTime": "2009-06-14 23:48:00",
  "Trip_Pickup_DateTime": "2009-06-14 23:23:00",
  "mta_tax": null,
  "store_and_forward": null,
  "surcharge": 0.0,
  "vendor_name": "VTS"
}


In [7]:
# Create the dlt pipeline: DuckDB is the destination and the loaded tables are
# grouped under the "ny_taxi_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_pipeline", destination="duckdb", dataset_name="ny_taxi_data"
)


In [8]:
# Feed the paginated generator to the pipeline and load it into the DuckDB
# table "rides". The "replace" disposition swaps out any data already there.
load_info = pipeline.run(
    paginated_getter(),
    table_name="rides",
    write_disposition="replace",
)

# Summary of the load step
print(load_info)

Pipeline ny_taxi_pipeline load step completed in 2.44 seconds
1 load package(s) were loaded to destination duckdb and into dataset ny_taxi_data
The duckdb destination used duckdb:////home/pedro/projects/data-engineering-zoomcamp/workshops/dlt-pipeline/ny_taxi_pipeline.duckdb location to store data
Load package 1739407311.977948 is LOADED and contains no failed jobs


In [None]:
# Open dlt's pipeline inspection app (Streamlit) for `ny_taxi_pipeline`.
# NOTE(review): this presumably keeps the cell busy until the app is stopped — confirm.
!dlt pipeline ny_taxi_pipeline show

### Data in duckdb

As expected, all 10,000 rows were loaded into DuckDB.

![](assets/images/image.png)

The JSON records were loaded into DuckDB with the following schema:

| Index | name                   | data_type | nullable |
|-------|------------------------|-----------|----------|
| 0     | end_lat                | double    | true     |
| 1     | end_lon                | double    | true     |
| 2     | fare_amt               | double    | true     |
| 3     | passenger_count        | bigint    | true     |
| 4     | payment_type           | text      | true     |
| 5     | start_lat              | double    | true     |
| 6     | start_lon              | double    | true     |
| 7     | tip_amt                | double    | true     |
| 8     | tolls_amt              | double    | true     |
| 9     | total_amt              | double    | true     |
| 10    | trip_distance          | double    | true     |
| 11    | trip_dropoff_date_time | timestamp | true     |
| 12    | trip_pickup_date_time  | timestamp | true     |
| 13    | surcharge              | double    | true     |
| 14    | vendor_name            | text      | true     |
| 15    | _dlt_load_id           | text      | false    |
| 16    | _dlt_id                | text      | false    |
| 17    | store_and_forward      | double    | true     |


## dlt decorator


In [9]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

# Same API settings as in the first section, repeated so this section can run on its own
BASE_URL = "https://us-central1-dlthub-analytics.cloudfunctions.net"
ENDPOINT = "data_engineering_zoomcamp_api"
STARTING_PAGE = 1


@dlt.resource(name="rides")  # the resource name is also used as the table name
def ny_taxi():
    """dlt resource that yields NYC taxi ride data from the paginated API.

    Only the first page is yielded: the loop exits right after its first
    iteration.
    """
    page_numbers = PageNumberPaginator(base_page=STARTING_PAGE, total_path=None)
    api = RESTClient(base_url=BASE_URL, paginator=page_numbers)

    for rides_page in api.paginate(ENDPOINT):  # API endpoint serving the taxi ride data
        yield rides_page  # hand over one page at a time to manage memory
        break  # stop after the first page

In [10]:
# Separate pipeline (and dataset) for the decorated resource, so its state and
# tables do not mix with the earlier "ny_taxi_pipeline".
pipeline = dlt.pipeline(
    pipeline_name="decorated_ny_taxi_pipeline",
    destination="duckdb",
    dataset_name="decorated_ny_taxi_data",
)

In [None]:
# Run the pipeline with the decorated resource, replacing existing table data.
# The table takes the resource's name ("rides"); optionally pass
# table_name="decorated_rides" to pipeline.run to load into another table.
load_info = pipeline.run(ny_taxi, write_disposition="replace")

# Summary of the load step
print(load_info)

In [None]:
# Open dlt's pipeline inspection app (Streamlit) for `decorated_ny_taxi_pipeline`.
# NOTE(review): this presumably keeps the cell busy until the app is stopped — confirm.
!dlt pipeline decorated_ny_taxi_pipeline show

In [12]:

# Optionally, explore the loaded data: fetch the "rides" table from the
# pipeline's dataset and display it as a DataFrame.
rides_relation = pipeline.dataset(dataset_type="default").rides
rides_relation.df()

Unnamed: 0,end_lat,end_lon,fare_amt,passenger_count,payment_type,start_lat,start_lon,tip_amt,tolls_amt,total_amt,trip_distance,trip_dropoff_date_time,trip_pickup_date_time,surcharge,vendor_name,_dlt_load_id,_dlt_id,store_and_forward
0,40.742963,-73.980072,45.0,1,Credit,40.641525,-73.787442,9.0,4.15,58.15,17.52,2009-06-14 23:48:00+00:00,2009-06-14 23:23:00+00:00,0.0,VTS,1739407361.0965526,AEJzInshPiaE9A,
1,40.740187,-74.005698,6.5,1,Credit,40.722065,-74.009767,1.0,0.00,8.50,1.56,2009-06-18 17:43:00+00:00,2009-06-18 17:35:00+00:00,1.0,VTS,1739407361.0965526,ev3sLEeCEaWdZQ,
2,40.718043,-74.004745,12.5,5,Credit,40.761945,-73.983038,2.0,0.00,15.50,3.37,2009-06-10 18:27:00+00:00,2009-06-10 18:08:00+00:00,1.0,VTS,1739407361.0965526,hhm/XJ3hsHu74Q,
3,40.739637,-73.985233,4.9,1,CASH,40.749802,-73.992247,0.0,0.00,5.40,1.11,2009-06-14 23:58:00+00:00,2009-06-14 23:54:00+00:00,0.5,VTS,1739407361.0965526,nPv7xvZZbNlGLA,
4,40.730032,-73.852693,25.7,1,CASH,40.776825,-73.949233,0.0,4.15,29.85,11.09,2009-06-13 13:23:00+00:00,2009-06-13 13:01:00+00:00,0.0,VTS,1739407361.0965526,TYWz4xQlrBRXEw,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,40.742998,-73.919065,6.9,1,CASH,40.743523,-73.918735,0.0,0.00,6.90,1.83,2009-06-10 06:23:00+00:00,2009-06-10 06:16:00+00:00,0.0,VTS,1739407361.0965526,GQUvI0so4CSjWA,
996,40.731953,-73.985330,7.3,1,CASH,40.733143,-74.006408,0.0,0.00,7.80,1.59,2009-06-10 05:11:00+00:00,2009-06-10 05:02:00+00:00,0.5,VTS,1739407361.0965526,t+lpDMGk848XgQ,
997,40.712640,-73.998870,5.7,1,CASH,40.711865,-74.010158,0.0,0.00,5.70,0.79,2009-06-13 12:45:00+00:00,2009-06-13 12:37:00+00:00,0.0,VTS,1739407361.0965526,TayEkBPNIE7p+w,
998,40.732998,-74.007113,7.3,2,CASH,40.744658,-73.992063,0.0,0.00,7.80,1.87,2009-06-09 21:34:00+00:00,2009-06-09 21:25:00+00:00,0.5,VTS,1739407361.0965526,Ec3BJfNbiRIauw,


## Incremental load

In [16]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

# Same API settings as in the earlier sections, repeated so this section can run on its own
BASE_URL = "https://us-central1-dlthub-analytics.cloudfunctions.net"
ENDPOINT = "data_engineering_zoomcamp_api"
STARTING_PAGE = 1


@dlt.resource(name="rides", write_disposition="append")  # resource name is also the table name
def ny_taxi(
    cursor_date=dlt.sources.incremental(
        "Trip_Dropoff_DateTime",  # field to track: the drop-off timestamp
        initial_value="2009-06-15",  # start date: June 15, 2009
    ),
):
    """Incremental dlt resource for NYC taxi rides; loaded rows are appended.

    ``cursor_date`` is never read in the body; it is declared so that dlt
    receives the incremental cursor on ``Trip_Dropoff_DateTime``.

    Only the first page is yielded: the loop exits right after its first
    iteration.
    """
    page_numbers = PageNumberPaginator(base_page=STARTING_PAGE, total_path=None)
    api = RESTClient(base_url=BASE_URL, paginator=page_numbers)

    for rides_page in api.paginate(ENDPOINT):  # API endpoint serving the taxi ride data
        yield rides_page  # hand over one page at a time to manage memory
        break  # stop after the first page

In [17]:
# Dedicated pipeline for the incremental-load experiment, writing to DuckDB
# under the "ny_taxi_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_incremental", destination="duckdb", dataset_name="ny_taxi_data"
)

In [18]:
# First run of the incremental resource. No write disposition is passed here;
# the resource itself declares write_disposition="append".
load_info = pipeline.run(ny_taxi)

# Print the trace of the whole run (extract, normalize and load steps)
print(pipeline.last_trace)

Run started at 2025-02-13 01:05:26.625355+00:00 and COMPLETED in 3.40 seconds with 4 steps.
Step extract COMPLETED in 2.85 seconds.

Load package 1739408726.7149644 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.12 seconds.
Normalized data for the following tables:
- _dlt_pipeline_state: 1 row(s)
- rides: 346 row(s)

Load package 1739408726.7149644 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs

Step load COMPLETED in 0.34 seconds.
Pipeline ny_taxi_incremental load step completed in 0.30 seconds
1 load package(s) were loaded to destination duckdb and into dataset ny_taxi_data
The duckdb destination used duckdb:////home/pedro/projects/data-engineering-zoomcamp/workshops/dlt-pipeline/ny_taxi_incremental.duckdb location to store data
Load package 1739408726.7149644 is LOADED and contains no failed jobs

Step run COMPLETED in 3.40 seconds.
Pipeline ny_taxi_incremental load step completed in 0.30

In [None]:
# Open dlt's pipeline inspection app for `ny_taxi_incremental`.
# NOTE(review): this presumably keeps the cell busy until the app is stopped — confirm.
!dlt pipeline ny_taxi_incremental show

In [35]:
# Sanity-check the incremental filter: look up the earliest drop-off time that
# was loaded and compare it by eye with the cursor's initial value (2009-06-15).
with pipeline.sql_client() as client:
    res = client.execute_sql(
            """
            SELECT
            MIN(trip_dropoff_date_time)
            FROM rides;
            """
        )

# The single aggregate value sits in the first column of the first row
earliest_dropoff = res[0][0]
print(earliest_dropoff)

    

2009-06-15 09:10:00+00:00


In [36]:
# Re-create the pipeline object with the same name, destination and dataset as
# the first incremental run, ready for a second run.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_incremental",
    destination="duckdb",
    dataset_name="ny_taxi_data",
)


In [37]:
# Second run of the same incremental resource on the same pipeline name.
# The recorded trace below reports "No data found to normalize" for this run.
load_info = pipeline.run(ny_taxi)
print(pipeline.last_trace)

Run started at 2025-02-13 01:11:57.823780+00:00 and COMPLETED in 3.22 seconds with 4 steps.
Step extract COMPLETED in 2.83 seconds.

Load package 1739409118.1726925 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs

Step normalize COMPLETED in 0.06 seconds.
No data found to normalize

Step load COMPLETED in 0.03 seconds.
Pipeline ny_taxi_incremental load step completed in ---
0 load package(s) were loaded to destination duckdb and into dataset None
The duckdb destination used duckdb:////home/pedro/projects/data-engineering-zoomcamp/workshops/dlt-pipeline/ny_taxi_incremental.duckdb location to store data

Step run COMPLETED in 3.22 seconds.
Pipeline ny_taxi_incremental load step completed in ---
0 load package(s) were loaded to destination duckdb and into dataset None
The duckdb destination used duckdb:////home/pedro/projects/data-engineering-zoomcamp/workshops/dlt-pipeline/ny_taxi_incremental.duckdb location to store data
