# dlt exploration notebook

In [33]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
import json


In [34]:
# Root URL of the API host
BASE_URL = "https://us-central1-dlthub-analytics.cloudfunctions.net"
# Endpoint we want to query (passed to the client together with BASE_URL)
ENDPOINT = "data_engineering_zoomcamp_api"
# Number of the first page to request
STARTING_PAGE = 1

def paginated_getter():
    """Yield the endpoint's data one page at a time.

    Builds a RESTClient rooted at BASE_URL that uses page-number pagination
    starting at STARTING_PAGE, then streams each page returned for ENDPOINT.

    Yields:
        One page of records per iteration, as produced by ``client.paginate``.
    """
    # No total-count path is configured, so the paginator is not told up
    # front how many pages exist.
    page_paginator = PageNumberPaginator(base_page=STARTING_PAGE, total_path=None)
    rest_client = RESTClient(base_url=BASE_URL, paginator=page_paginator)

    # Hand every page from the client straight through to the caller.
    yield from rest_client.paginate(ENDPOINT)

In [37]:
# Walk every page once: report its size and keep one record as a sample.
# sample_record stays None when the API returns no records at all, so the
# cell that prints it never hits an undefined name.
sample_record = None
for page_num, page_data in enumerate(paginated_getter(), start=STARTING_PAGE):
    if sample_record is None and page_data:
        # Store the first available record as a pretty-printed JSON sample;
        # the emptiness check avoids an IndexError on an empty page.
        sample_record = json.dumps(page_data[0], indent=2)

    # Print the total records on each page
    print(f"Page {page_num} - Total records: {len(page_data)}")

Page 1 - Total records: 1000
Page 2 - Total records: 1000
Page 3 - Total records: 1000
Page 4 - Total records: 1000
Page 5 - Total records: 1000
Page 6 - Total records: 1000
Page 7 - Total records: 1000
Page 8 - Total records: 1000
Page 9 - Total records: 1000
Page 10 - Total records: 1000
Page 11 - Total records: 0


In [41]:
# Show the JSON sample captured while iterating over the pages.
# The header and the sample are joined by a single space (print's default
# separator), spelled out here so the leading space in the output is expected.
print("Sample Record: \n", sample_record, sep=" ")

Sample Record: 
 {
  "End_Lat": 40.742963,
  "End_Lon": -73.980072,
  "Fare_Amt": 45.0,
  "Passenger_Count": 1,
  "Payment_Type": "Credit",
  "Rate_Code": null,
  "Start_Lat": 40.641525,
  "Start_Lon": -73.787442,
  "Tip_Amt": 9.0,
  "Tolls_Amt": 4.15,
  "Total_Amt": 58.15,
  "Trip_Distance": 17.52,
  "Trip_Dropoff_DateTime": "2009-06-14 23:48:00",
  "Trip_Pickup_DateTime": "2009-06-14 23:23:00",
  "mta_tax": null,
  "store_and_forward": null,
  "surcharge": 0.0,
  "vendor_name": "VTS"
}


In [39]:
# Configure the dlt pipeline that will load the taxi data into DuckDB.
pipeline = dlt.pipeline(
    pipeline_name="ny_taxi_pipeline",  # name used to refer to this pipeline
    destination="duckdb",              # load into a DuckDB database
    dataset_name="ny_taxi_data",       # dataset that will hold the tables
)


In [42]:
# Run the pipeline: stream every page into the "rides" table in DuckDB,
# replacing any existing data.
load_info = pipeline.run(
    paginated_getter(),           # generator that yields the paginated data
    table_name="rides",           # target table name
    write_disposition="replace",  # replace the existing data in the table
)

# Summarise the outcome of the load step
print(load_info)

Pipeline ny_taxi_pipeline load step completed in 2.38 seconds
1 load package(s) were loaded to destination duckdb and into dataset ny_taxi_data
The duckdb destination used duckdb:////home/pedro/projects/data-engineering-zoomcamp/workshops/dlt-pipeline/ny_taxi_pipeline.duckdb location to store data
Load package 1739389539.3857207 is LOADED and contains no failed jobs


In [None]:
# Inspect the loaded pipeline data through dlt's Streamlit view.
# NOTE(review): this command presumably keeps running until it is interrupted,
# which would stall a "Run All" — confirm before leaving it in the notebook.
!dlt pipeline ny_taxi_pipeline show

### Data in duckdb

As expected, the `rides` table in DuckDB contains 10,000 rows (10 pages × 1,000 records each).

![](assets/images/image.png)

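The JSON records were loaded into DuckDB with the schema below. Note that `Rate_Code` and `mta_tax`, which are `null` in the sample record, do not appear as columns.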

| Index | name                   | data_type | nullable |
|-------|------------------------|-----------|----------|
| 0     | end_lat                | double    | true     |
| 1     | end_lon                | double    | true     |
| 2     | fare_amt               | double    | true     |
| 3     | passenger_count        | bigint    | true     |
| 4     | payment_type           | text      | true     |
| 5     | start_lat              | double    | true     |
| 6     | start_lon              | double    | true     |
| 7     | tip_amt                | double    | true     |
| 8     | tolls_amt              | double    | true     |
| 9     | total_amt              | double    | true     |
| 10    | trip_distance          | double    | true     |
| 11    | trip_dropoff_date_time | timestamp | true     |
| 12    | trip_pickup_date_time  | timestamp | true     |
| 13    | surcharge              | double    | true     |
| 14    | vendor_name            | text      | true     |
| 15    | _dlt_load_id           | text      | false    |
| 16    | _dlt_id                | text      | false    |
| 17    | store_and_forward      | double    | true     |
