In [18]:
import pandas as pd

## Reading the Dataset

In [22]:
# Load the trip records from the local Parquet file into a DataFrame.
df = pd.read_parquet(path='data/yellow_tripdata_2024-01.parquet')

In [23]:
# Preview the first ten rows to sanity-check the columns and values.
df.head(n=10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0
5,1,2024-01-01 00:54:08,2024-01-01 01:26:31,1.0,4.7,1.0,N,148,141,1,29.6,3.5,0.5,6.9,0.0,1.0,41.5,2.5,0.0
6,2,2024-01-01 00:49:44,2024-01-01 01:15:47,2.0,10.82,1.0,N,138,181,1,45.7,6.0,0.5,10.0,0.0,1.0,64.95,0.0,1.75
7,1,2024-01-01 00:30:40,2024-01-01 00:58:40,0.0,3.0,1.0,N,246,231,2,25.4,3.5,0.5,0.0,0.0,1.0,30.4,2.5,0.0
8,2,2024-01-01 00:26:01,2024-01-01 00:54:12,1.0,5.44,1.0,N,161,261,2,31.0,1.0,0.5,0.0,0.0,1.0,36.0,2.5,0.0
9,2,2024-01-01 00:28:08,2024-01-01 00:29:16,1.0,0.04,1.0,N,113,113,2,3.0,1.0,0.5,0.0,0.0,1.0,8.0,2.5,0.0


In [25]:
# Coerce both trip timestamp columns (pickup and dropoff) to pandas datetimes.
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

We can preview the SQL schema (the `CREATE TABLE` statement) that pandas would generate for this dataset as follows:

In [26]:
# Print the CREATE TABLE statement pandas derives from the DataFrame's dtypes,
# using "taxi" as the table name.
print(pd.io.sql.get_schema(frame=df, name='taxi'))

CREATE TABLE "taxi" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" REAL,
  "trip_distance" REAL,
  "RatecodeID" REAL,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL,
  "Airport_fee" REAL
)


## Data Ingestion

We can now create the table and insert the data into it.

In [55]:
import os
from time import time

import pyarrow.parquet as pq
from sqlalchemy import create_engine

In [49]:
# Connection URL for the target PostgreSQL database. It is read from the
# NY_TAXI_DB_URL environment variable so that real credentials never have to
# be committed with the notebook; the fallback is the local development
# default this notebook originally hardcoded.
DB_URL = os.environ.get(
    'NY_TAXI_DB_URL',
    'postgresql://root:root@localhost:5432/ny_taxi',
)
engine = create_engine(DB_URL, echo=False)

We will iterate over the dataset in batches and append each batch to the table.

In [57]:
# Open a ParquetFile handle on the trip data; the ingestion loop below reads
# it batch by batch via iter_batches().
pf = pq.ParquetFile(source='data/yellow_tripdata_2024-01.parquet')

In [58]:
# Stream the Parquet file in 100,000-row batches and append each batch to the
# "taxi" table, printing how long every insert took.
# NOTE: if_exists='append' means re-running this cell inserts the rows again.
for i, batch in enumerate(pf.iter_batches(batch_size=100000)):
    start = time()
    # index=False: the per-batch DataFrame index normally restarts at 0 for
    # every batch, so writing it (the to_sql default) would add a repeating,
    # meaningless "index" column that is not part of the schema shown above.
    batch.to_pandas().to_sql('taxi', engine, if_exists='append', index=False)
    print(f'Batch: {i}. Elapsed time: {time() - start:.2f} sec')

Batch: 0. Elapsed time: 4.97 sec
Batch: 1. Elapsed time: 4.74 sec
Batch: 2. Elapsed time: 4.63 sec
Batch: 3. Elapsed time: 4.86 sec
Batch: 4. Elapsed time: 5.44 sec
Batch: 5. Elapsed time: 4.94 sec
Batch: 6. Elapsed time: 4.63 sec
Batch: 7. Elapsed time: 4.72 sec
Batch: 8. Elapsed time: 4.74 sec
Batch: 9. Elapsed time: 4.74 sec
Batch: 10. Elapsed time: 4.91 sec
Batch: 11. Elapsed time: 4.86 sec
Batch: 12. Elapsed time: 4.98 sec
Batch: 13. Elapsed time: 4.72 sec
Batch: 14. Elapsed time: 4.80 sec
Batch: 15. Elapsed time: 4.68 sec
Batch: 16. Elapsed time: 6.65 sec
Batch: 17. Elapsed time: 5.55 sec
Batch: 18. Elapsed time: 4.92 sec
Batch: 19. Elapsed time: 4.77 sec
Batch: 20. Elapsed time: 4.87 sec
Batch: 21. Elapsed time: 5.03 sec
Batch: 22. Elapsed time: 4.72 sec
Batch: 23. Elapsed time: 4.85 sec
Batch: 24. Elapsed time: 5.98 sec
Batch: 25. Elapsed time: 5.76 sec
Batch: 26. Elapsed time: 4.87 sec
Batch: 27. Elapsed time: 4.70 sec
Batch: 28. Elapsed time: 4.75 sec
Batch: 29. Elapsed time: