In [24]:
import pandas as pd

In [25]:
# checking pandas version
pd.__version__

'2.0.3'

In [26]:
# read the parquet file from local for draft
raw_data = pd.read_parquet(path='./yellow_tripdata_2023-01.parquet')
# check data length
print(len(raw_data))
# preview first 10 rows
raw_data.head()

3066766


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [27]:
# because the original data is too large we need to trim it down for testing purpose
test_data = raw_data.head(100)
test_data.to_csv("test_data_100.csv", index=False)

In [28]:
# reassign raw_data to the test_data

raw_data = test_data
print(len(raw_data))

100


In [29]:
from sqlalchemy import create_engine

In [30]:
# create an engine to connect to PostgreSQL database
# before running this, ensure the PostgreSQL database is up and running in the container
db_username = 'admin'
db_password = 'admin'
db_host = 'localhost'
db_port = 5432
db_name = 'ny_taxi'

engine_path = f"postgresql://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"

engine = create_engine(engine_path)

In [31]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f59458e14f0>

In [32]:
# demo pandas's io.sql.get_schema() function to see how it can help building DDL
# this can also help us validating the data type
# if the timestamp is showing up as STRING, we need to do something about it
# for now, it looks good
# note that this function does not create a table for us
table_name = 'yellow_taxi_data_test'
print(pd.io.sql.get_schema(raw_data, name=table_name, con=engine))


CREATE TABLE yellow_taxi_data_test (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [33]:
# slide the df into smaller chunks
# although here we only have 100 rows, in the actual data, we have 3M rows
df_iter = pd.read_csv('test_data_100.csv', iterator=True, chunksize=50)




In [34]:
df = next(df_iter)

In [35]:
len(df)

50

In [36]:
# create table using to_sql
# can go to pgadmin to validate
# the table should be created in the public schema
df.head(0).to_sql(name=table_name, con=engine, if_exists='replace')

0

In [37]:
from time import time

In [38]:
need_data_insert = True

while need_data_insert:
    try:
        t_start = time()
        df.to_sql(name='yellow_taxi_data_test', con=engine, if_exists='append', index=False)

        t_end = time()
        print('Inserted another chunk..., took %.3f seconds' % (t_end - t_start))
        
        df = next(df_iter)

    except StopIteration:
        need_data_insert = False  # Update the flag to exit the loop

Inserted another chunk..., took 0.020 seconds
Inserted another chunk..., took 0.019 seconds
