# Ingest NYC Green Taxi data to Postgres database

## Import libraries

In [1]:
import pandas as pd
from sqlalchemy import create_engine  # connect to Postgres database
from time import time

## Load data

In [2]:
df = pd.read_csv("green_tripdata_2019-01.csv.gz", nrows=100)
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.00,3.0,0.5,0.5,0.00,0.0,,0.3,4.30,2,1,
1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,6.0,0.5,0.5,0.00,0.0,,0.3,7.30,2,1,
2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,4.5,0.5,0.5,0.00,0.0,,0.3,5.80,1,1,
3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,13.5,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,18.0,0.5,0.5,0.00,0.0,,0.3,19.30,2,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2019-01-01 00:15:42,2019-01-01 00:41:04,N,1,97,179,1,7.83,26.0,0.5,0.5,5.46,0.0,,0.3,32.76,1,1,
96,2,2019-01-01 00:21:28,2019-01-01 00:48:15,N,1,7,230,1,4.49,20.0,0.5,0.5,4.00,0.0,,0.3,25.30,1,1,
97,2,2019-01-01 00:17:10,2019-01-01 00:23:34,N,1,42,42,1,1.08,6.5,0.5,0.5,0.00,0.0,,0.3,7.80,2,1,
98,2,2019-01-01 00:25:01,2019-01-01 00:43:55,N,1,42,232,2,8.74,25.0,0.5,0.5,0.00,0.0,,0.3,26.30,2,1,


In [3]:
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               100 non-null    int64         
 1   lpep_pickup_datetime   100 non-null    datetime64[ns]
 2   lpep_dropoff_datetime  100 non-null    datetime64[ns]
 3   store_and_fwd_flag     100 non-null    object        
 4   RatecodeID             100 non-null    int64         
 5   PULocationID           100 non-null    int64         
 6   DOLocationID           100 non-null    int64         
 7   passenger_count        100 non-null    int64         
 8   trip_distance          100 non-null    float64       
 9   fare_amount            100 non-null    float64       
 10  extra                  100 non-null    float64       
 11  mta_tax                100 non-null    float64       
 12  tip_amount             100 non-null    float64       
 13  tolls_

## Create a connection to Postgres¶

In [5]:
# specify the database you want to use based on the docker run command we had
# postgresql://username:password@localhost:port/dbname
engine = create_engine("postgresql://root:root@localhost:5432/ny_taxi")

# create the connection to the database engine to see if everything is working properly
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f62b89a3b50>

## Generate the database schema

In [6]:
# pass the engine variable to get_schema function
# pandas will execute the schema SQL statement using the engine connection we have defined
print(pd.io.sql.get_schema(df, name="green_taxi_data", con=engine))


CREATE TABLE green_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




## Write the schema to Postgres database

In [7]:
df_iter = pd.read_csv("green_tripdata_2019-01.csv.gz", iterator=True, chunksize=100000)
df = next(df_iter)
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.00,3.00,0.5,0.5,0.00,0.00,,0.3,4.30,2,1,
1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,6.00,0.5,0.5,0.00,0.00,,0.3,7.30,2,1,
2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,4.50,0.5,0.5,0.00,0.00,,0.3,5.80,1,1,
3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,13.50,0.5,0.5,2.96,0.00,,0.3,19.71,1,1,
4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,18.00,0.5,0.5,0.00,0.00,,0.3,19.30,2,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2,2019-01-06 09:56:09,2019-01-06 10:01:30,N,1,130,216,1,1.23,6.00,0.0,0.5,0.00,0.00,,0.3,6.80,2,1,
99996,2,2019-01-06 09:12:49,2019-01-06 09:52:38,N,5,218,16,1,21.44,47.65,0.0,0.5,0.00,5.76,,0.0,53.91,1,2,
99997,2,2019-01-06 09:02:06,2019-01-06 09:37:42,N,5,139,188,1,14.77,37.84,0.0,0.5,0.00,0.00,,0.0,38.34,1,2,
99998,2,2019-01-06 09:55:01,2019-01-06 10:04:34,N,1,72,188,1,1.80,8.50,0.0,0.5,0.00,0.00,,0.3,9.30,1,1,


In [8]:
df.head(0).to_sql(name="green_taxi_data", con=engine, if_exists="replace")

0

## Insert the data into the database

In [9]:
%time df.to_sql(name="green_taxi_data", con=engine, if_exists="append", chunksize=100000)

CPU times: user 2.14 s, sys: 39.8 ms, total: 2.18 s
Wall time: 4.37 s


1000

In [10]:
while True:
    try:
        t_start = time()

        df = next(df_iter)

        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

        df.to_sql(name="green_taxi_data", con=engine, if_exists="append")

        t_end = time()

        print(f"Inserted another chunk, took {(t_end - t_start):3f} seconds")
    except StopIteration:
        print("Finished ingesting data into the postgres database")
        break

Inserted another chunk, took 4.445141 seconds
Inserted another chunk, took 4.383706 seconds
Inserted another chunk, took 4.402402 seconds
Inserted another chunk, took 4.336893 seconds
Inserted another chunk, took 4.468048 seconds
Inserted another chunk, took 1.375608 seconds
Finished ingesting data into the postgres database


## Insert taxi zone lookup table to Postgres

In [11]:
zones = pd.read_csv("taxi+_zone_lookup.csv")
zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [12]:
zones.to_sql(name="taxi_zones", con=engine, if_exists="replace")

265