In [1]:
# Uncomment the lines below to install the notebook's dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
#%pip install duckdb
#%pip install 'polars[pyarrow]'



## Setup

In [1]:
import os

import duckdb

In [2]:
# Connection to the local DuckDB database file; replace the placeholder path.
duckLocalFileCon = duckdb.connect("/your/file/path/nyc.db")

# Temporary AWS credentials are read from the standard AWS environment
# variables so that secrets are never saved inside the notebook. Each value
# falls back to an empty string (the previous placeholder) when it is not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID", "")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN", "")

# AWS_REGION takes precedence; AWS_DEFAULT_REGION is the fallback.
aws_region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION", "")

# S3 locations: fill in the bucket/prefix after "s3://".
# `target_s3_uri` is the prefix under which the Delta table `table_name` is read;
# `src_s3_uri` is not used by the cells visible in this notebook.
src_s3_uri = "s3://"
target_s3_uri = "s3://"
table_name = "tbl_nyc_taxi_trips"

In [3]:
def create_aws_sso_secrets(duckdb_obj, key_id, secret, token, region):
    """Create (or replace) the DuckDB S3 secret ``my_ephem_sts_token``.

    Args:
        duckdb_obj: DuckDB connection (any object exposing ``sql(query)``).
        key_id: AWS access key id.
        secret: AWS secret access key.
        token: AWS session token.
        region: AWS region of the bucket.

    Returns:
        The rows returned by the ``CREATE SECRET`` statement
        (``fetchall()`` of the executed statement).
    """

    def _sql_literal(value):
        # Inside a single-quoted SQL string a quote is escaped by doubling it;
        # without this a "'" in a credential would break or alter the DDL.
        return str(value).replace("'", "''")

    create_secret_ddl = f"""
    CREATE OR REPLACE SECRET my_ephem_sts_token
        (
            TYPE S3,
            KEY_ID '{_sql_literal(key_id)}',
            SECRET '{_sql_literal(secret)}',
            SESSION_TOKEN '{_sql_literal(token)}',
            REGION '{_sql_literal(region)}'
        );"""

    return duckdb_obj.sql(create_secret_ddl).fetchall()

def init_reader(duckdb_obj, db_url, table_name):
    """Build a reader function bound to one Delta table.

    Args:
        duckdb_obj: DuckDB connection (any object exposing ``execute(query, params)``).
        db_url: Location prefix of the table, e.g. an ``s3://`` URI.
        table_name: Name of the Delta table under ``db_url``.

    Returns:
        A function ``read_nyc_table(cab_type, trip_month, trip_year)`` that
        returns the matching rows via the result's ``.pl()`` method.
    """
    # Double any single quote so the path cannot terminate the SQL string literal.
    delta_path = f"{db_url}/{table_name}".replace("'", "''")

    # The statement text is identical for every call, so it is built only once;
    # the filter values are bound as prepared-statement parameters.
    query = f"""
            SELECT *
            FROM delta_scan('{delta_path}') nyc
            WHERE nyc.cab_type=? and nyc.trip_month=? and nyc.trip_year=?
        """

    def read_nyc_table(cab_type, trip_month, trip_year):
        """Return the trips for one cab type, month and year."""
        return duckdb_obj.execute(query, [cab_type, trip_month, trip_year]).pl()

    return read_nyc_table

In [4]:
# Register the S3 secret on the DuckDB connection with the configured credentials.
create_aws_sso_secrets(
    duckdb_obj=duckLocalFileCon,
    key_id=aws_access_key_id,
    secret=aws_secret_access_key,
    token=aws_session_token,
    region=aws_region,
)

[(True,)]

In [5]:
# Bind a reader to the Delta table located under the target S3 URI.
nycCabTripsReader = init_reader(
    duckdb_obj=duckLocalFileCon,
    db_url=target_s3_uri,
    table_name=table_name,
)

We query the delta table now, for green taxi trips in January 2024

**expected result**:
```
shape: (56_551, 23)
┌──────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬───────────┬───────────┐
│ VendorID ┆ pickup_dat ┆ dropoff_d ┆ RatecodeI ┆ … ┆ airport_f ┆ cab_type ┆ trip_year ┆ trip_mont │
│ ---      ┆ etime      ┆ atetime   ┆ D         ┆   ┆ ee        ┆ ---      ┆ ---       ┆ h         │
│ i32      ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ str      ┆ str       ┆ ---       │
│          ┆ datetime[μ ┆ datetime[ ┆ f64       ┆   ┆ f64       ┆          ┆           ┆ str       │
│          ┆ s]         ┆ μs]       ┆           ┆   ┆           ┆          ┆           ┆           │
╞══════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪═══════════╪═══════════╡
│ 2        ┆ 2024-01-01 ┆ 2024-01-0 ┆ 1.0       ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 00:46:55   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 00:58:25  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-01 ┆ 2024-01-0 ┆ 1.0       ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 00:31:42   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 00:52:34  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-01 ┆ 2024-01-0 ┆ 1.0       ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 00:30:21   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 00:49:23  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 1        ┆ 2024-01-01 ┆ 2024-01-0 ┆ 1.0       ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 00:30:20   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 00:42:12  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-01 ┆ 2024-01-0 ┆ 1.0       ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 00:32:38   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 00:43:37  ┆           ┆   ┆           ┆          ┆           ┆           │
│ …        ┆ …          ┆ …         ┆ …         ┆ … ┆ …         ┆ …        ┆ …         ┆ …         │
│ 2        ┆ 2024-01-31 ┆ 2024-01-3 ┆ null      ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 20:46:00   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 20:55:00  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-31 ┆ 2024-01-3 ┆ null      ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 21:06:00   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 21:11:00  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-31 ┆ 2024-01-3 ┆ null      ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 21:36:00   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 21:40:00  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-31 ┆ 2024-01-3 ┆ null      ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 22:45:00   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 22:51:00  ┆           ┆   ┆           ┆          ┆           ┆           │
│ 2        ┆ 2024-01-31 ┆ 2024-01-3 ┆ null      ┆ … ┆ 0.0       ┆ green    ┆ 2024      ┆ January   │
│          ┆ 22:28:00   ┆ 1         ┆           ┆   ┆           ┆          ┆           ┆           │
│          ┆            ┆ 22:59:00  ┆           ┆   ┆           ┆          ┆           ┆           │
└──────────┴────────────┴───────────┴───────────┴───┴───────────┴──────────┴───────────┴───────────┘
```

In [6]:
# Green cab trips for January 2024.
greenTripsJan2024 = nycCabTripsReader(
    cab_type='green',
    trip_month='January',
    trip_year='2024',
)

`greenTripsJan2024` is a [polars](https://pola.rs/) dataframe

At this point you may execute a sql query on the `greenTripsJan2024` variable to check its data, like so:

```python
duckLocalFileCon.sql("""
    SELECT *
    FROM greenTripsJan2024
""")
```

## Loading data into DuckDB

### Table creation & Initial data load


We are now ready to create our duckdb table `nyc_taxi_trips` with this initial data set - trip data from Jan 2024, for green cabs 

> ***convenience query*** for dropping the existing table (***for demo purposes***)
>
> ```python
> duckLocalFileCon.execute("""
>     DROP TABLE NYC_TAXI_TRIPS
> """).fetchall()
> ```

In [8]:
# Create (or replace) the table from the `greenTripsJan2024` DataFrame; the SQL
# refers to that Python variable by name. CREATE OR REPLACE keeps this cell
# safe to re-run: the table is rebuilt from scratch each time.
duckLocalFileCon.execute("""
    CREATE OR REPLACE TABLE NYC_TAXI_TRIPS AS 
        SELECT *
        FROM greenTripsJan2024
""").fetchall()

[(56551,)]

`CHECKPOINT` writes the changes recorded in the write-ahead log (WAL) to the DuckDB database file

In [9]:
# Persist the initial load. `nyc` is presumably the database name DuckDB derives
# from the `nyc.db` file name — adjust it if the file is renamed.
duckLocalFileCon.execute("CHECKPOINT nyc").fetchall()

[]

#### Checks

Check the number of rows loaded into the table with

```sql
    SELECT count(*) as num_of_trips
    FROM NYC_TAXI_TRIPS
```

**expected result**

```
┌──────────────┐
│ num_of_trips │
│    int64     │
├──────────────┤
│        56551 │
└──────────────┘
```

Also check the columns in the table with:

```sql
DESCRIBE TABLE NYC_TAXI_TRIPS

```

**expected output**

```
┌───────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│      column_name      │ column_type │  null   │   key   │ default │  extra  │
│        varchar        │   varchar   │ varchar │ varchar │ varchar │ varchar │
├───────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ VendorID              │ INTEGER     │ YES     │ NULL    │ NULL    │ NULL    │
│ pickup_datetime       │ TIMESTAMP   │ YES     │ NULL    │ NULL    │ NULL    │
│ dropoff_datetime      │ TIMESTAMP   │ YES     │ NULL    │ NULL    │ NULL    │
│ RatecodeID            │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ pickup_location_id    │ INTEGER     │ YES     │ NULL    │ NULL    │ NULL    │
│ drop_off_location_id  │ INTEGER     │ YES     │ NULL    │ NULL    │ NULL    │
│ passenger_count       │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ trip_distance         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ fare_amount           │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ extra                 │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ mta_tax               │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ tip_amount            │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ tolls_amount          │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ ehail_fee             │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ improvement_surcharge │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ total_amount          │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ payment_type          │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ trip_type             │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ congestion_surcharge  │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ airport_fee           │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ cab_type              │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ trip_year             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ trip_month            │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
├───────────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┤
│ 23 rows                                                           6 columns │
└─────────────────────────────────────────────────────────────────────────────┘
```

### Incremental Data Load

#### Load 1

Let us now load yellow cab data for January and February 2024 into the same DuckDB table

##### Fetch data from delta table

In [6]:
# Yellow cab trips for January and February 2024, one DataFrame per month.
yellowTripsJan2024 = nycCabTripsReader(
    cab_type='yellow',
    trip_month='January',
    trip_year='2024',
)
yellowTripsFeb2024 = nycCabTripsReader(
    cab_type='yellow',
    trip_month='February',
    trip_year='2024',
)

In [7]:
# Append the yellow cab trips for January 2024 to the existing table.
# NOTE: INSERT only appends, so re-running this cell loads the same rows again.
duckLocalFileCon.execute("""
    INSERT INTO NYC_TAXI_TRIPS
        SELECT *
        FROM yellowTripsJan2024
""").fetchall()

[(2964624,)]

In [8]:
# Append the yellow cab trips for February 2024 to the existing table.
# NOTE: INSERT only appends, so re-running this cell loads the same rows again.
duckLocalFileCon.execute("""
    INSERT INTO NYC_TAXI_TRIPS
        SELECT *
        FROM yellowTripsFeb2024
""").fetchall()

[(3007525,)]

In [9]:
# Persist the two yellow cab inserts. `nyc` is presumably the database name
# DuckDB derives from the `nyc.db` file name — adjust it if the file is renamed.
duckLocalFileCon.execute("CHECKPOINT nyc").fetchall()

[]

##### Checks

Check the number of rows loaded into the table at this point,

```sql
    SELECT count(*) as num_of_trips
    FROM NYC_TAXI_TRIPS
```

**expected result**

> Initial, green for jan 2024 (56,551) + delta yellow jan 2024 (2,964,624) + delta yellow feb 2024 (3,007,525) = 6,028,700

```
┌──────────────┐
│ num_of_trips │
│    int64     │
├──────────────┤
│      6028700 │
└──────────────┘
```

#### Load 2

Let us now load green cab data for February 2024 into the same DuckDB table

In [10]:
# Green cab trips for February 2024.
greenTripsFeb2024 = nycCabTripsReader(
    cab_type='green',
    trip_month='February',
    trip_year='2024',
)

In [11]:
# Append the green cab trips for February 2024 to the existing table.
# NOTE: INSERT only appends, so re-running this cell loads the same rows again.
duckLocalFileCon.execute("""
    INSERT INTO NYC_TAXI_TRIPS
        SELECT *
        FROM greenTripsFeb2024
""").fetchall()

[(53577,)]

In [12]:
# Persist the green cab insert. `nyc` is presumably the database name DuckDB
# derives from the `nyc.db` file name — adjust it if the file is renamed.
duckLocalFileCon.execute("CHECKPOINT nyc").fetchall()

[]

The `nyc.db` DuckDB file should be about `352.01 MB` in size at this point  