In [1]:
import pandas as pd

## Reading the Dataset

In [10]:
# Load the `green_tripdata_2019-10` parquet file from the local data/ folder
# into a DataFrame bound to `df`.
df = pd.read_parquet('data/green_tripdata_2019-10.parquet')

In [11]:
# Preview the first 10 rows of the loaded DataFrame.
df.head(10)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1.0,112,196,1.0,5.88,18.0,0.5,0.5,0.0,0.0,,0.3,19.3,2.0,1.0,0.0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1.0,43,263,1.0,0.8,5.0,3.25,0.5,0.0,0.0,,0.3,9.05,2.0,1.0,0.0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1.0,255,228,2.0,7.5,21.5,0.5,0.5,0.0,0.0,,0.3,22.8,2.0,1.0,0.0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1.0,181,181,1.0,0.9,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1.0,97,188,1.0,2.52,10.0,0.5,0.5,2.26,0.0,,0.3,13.56,1.0,1.0,0.0
5,2,2019-10-01 00:35:01,2019-10-01 00:43:40,N,1.0,65,49,1.0,1.47,8.0,0.5,0.5,1.86,0.0,,0.3,11.16,1.0,1.0,0.0
6,1,2019-10-01 00:28:09,2019-10-01 00:30:49,N,1.0,7,179,1.0,0.6,4.0,0.5,0.5,1.0,0.0,,0.3,6.3,1.0,1.0,0.0
7,2,2019-10-01 00:28:26,2019-10-01 00:32:01,N,1.0,41,74,1.0,0.56,4.5,0.5,0.5,0.0,0.0,,0.3,5.8,2.0,1.0,0.0
8,2,2019-10-01 00:14:01,2019-10-01 00:26:16,N,1.0,255,49,1.0,2.42,10.5,0.5,0.5,0.0,0.0,,0.3,11.8,2.0,1.0,0.0
9,1,2019-10-01 00:03:03,2019-10-01 00:17:13,Y,1.0,130,131,1.0,3.4,13.0,0.5,0.5,2.85,0.0,,0.3,17.15,1.0,1.0,0.0


In [14]:
# Parse the pickup/dropoff timestamps with pd.to_datetime and store the result
# in two new `tpep_*` columns; the source `lpep_*` columns are left untouched.
# NOTE(review): the source columns carry the `lpep_` prefix, so the `tpep_`
# names look like a leftover from a different dataset. These columns exist only
# on `df` — confirm whether an in-place conversion of the `lpep_*` columns was
# intended instead.
df['tpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

We can easily inspect the SQL schema that pandas would generate for this dataset:

In [15]:
# Print the CREATE TABLE statement pandas generates for `df` under the table
# name 'taxi_2019'.
# NOTE(review): no `con` argument is passed, so the column types are presumably
# pandas' generic defaults rather than types chosen for the target database —
# confirm if the exact PostgreSQL DDL matters.
print(pd.io.sql.get_schema(df, 'taxi_2019'))

CREATE TABLE "taxi_2019" (
"VendorID" INTEGER,
  "lpep_pickup_datetime" TIMESTAMP,
  "lpep_dropoff_datetime" TIMESTAMP,
  "store_and_fwd_flag" TEXT,
  "RatecodeID" REAL,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "passenger_count" REAL,
  "trip_distance" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "ehail_fee" TEXT,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "payment_type" REAL,
  "trip_type" REAL,
  "congestion_surcharge" REAL,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP
)


## Data Ingestion

We can now create the table and load the data into it.

In [16]:
from sqlalchemy import create_engine
import pyarrow.parquet as pq
from time import time

In [17]:
import os
from urllib.parse import quote_plus

# Build the PostgreSQL connection URL from the PG* environment variables (the
# same names psql/libpq use) so real credentials do not have to be written into
# the notebook. When a variable is unset, the fallback reproduces the previous
# hard-coded local settings, i.e. postgresql://root:root@localhost:5432/ny_taxi.
# User and password are URL-escaped so characters such as '@' or ':' in them
# cannot corrupt the URL.
pg_user = quote_plus(os.environ.get('PGUSER', 'root'))
pg_password = quote_plus(os.environ.get('PGPASSWORD', 'root'))
pg_host = os.environ.get('PGHOST', 'localhost')
pg_port = os.environ.get('PGPORT', '5432')
pg_database = os.environ.get('PGDATABASE', 'ny_taxi')

# `engine` is the SQLAlchemy engine used by the ingestion cells below;
# echo=False is passed through unchanged.
engine = create_engine(
    f'postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_database}',
    echo=False,
)

We will read the Parquet file in batches and insert each batch into the table.

In [18]:
# Open the trips parquet file as a pyarrow ParquetFile; the next cell reads it
# batch by batch through `pf.iter_batches`.
pf = pq.ParquetFile('data/green_tripdata_2019-10.parquet')

In [19]:
# Load the parquet file into the `taxi_2019` table one batch (up to 100,000
# rows) at a time, printing how long each batch took.
for i, batch in enumerate(pf.iter_batches(batch_size=100000)):
    start = time()
    # Recreate the table on the first batch and append on the following ones.
    # With a plain 'append' on every batch, re-running this cell (or the whole
    # notebook) would insert every row a second time.
    # NOTE(review): to_sql also writes the DataFrame index as a column by
    # default; pass index=False if that column is not wanted.
    batch.to_pandas().to_sql(
        'taxi_2019',
        engine,
        if_exists='replace' if i == 0 else 'append',
    )
    print(f'Batch: {i}. Elapsed time: {time() - start:.2f} sec')

Batch: 0. Elapsed time: 6.37 sec
Batch: 1. Elapsed time: 5.76 sec
Batch: 2. Elapsed time: 5.76 sec
Batch: 3. Elapsed time: 5.37 sec
Batch: 4. Elapsed time: 3.90 sec


## Zones

We will now add the zones table, which contains information about New York's taxi zones, so that we can run more complex queries.

In [24]:
# Open the zone lookup parquet file as a pyarrow ParquetFile.
# NOTE: this rebinds `pf`, which previously pointed at the trips file, so the
# ingestion cells must be run in order.
pf = pq.ParquetFile('data/taxi_zone_lookup.parquet')

In [25]:
# Load the zone lookup file into the `zones` table one batch (up to 100,000
# rows) at a time, printing how long each batch took.
for i, batch in enumerate(pf.iter_batches(batch_size=100000)):
    start = time()
    # Recreate the table on the first batch and append on the following ones.
    # With a plain 'append' on every batch, re-running this cell would store
    # each zone row more than once in the lookup table.
    batch.to_pandas().to_sql(
        'zones',
        engine,
        if_exists='replace' if i == 0 else 'append',
    )
    print(f'Batch: {i}. Elapsed time: {time() - start:.2f} sec')

Batch: 0. Elapsed time: 0.09 sec


In [27]:
# Load the zone lookup parquet file into a DataFrame for inspection.
# NOTE(review): this rebinds `df`, which held the trip data earlier in the
# notebook; a distinct name would keep both frames available — confirm no
# later cell expects `df` to still be the trips frame.
df = pd.read_parquet('data/taxi_zone_lookup.parquet')

In [28]:
# Display the zone lookup DataFrame.
df

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,,
