In [1]:
import pandas as pd

In [2]:
pd.__version__

'2.1.1'

In [3]:
df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100)

In [6]:
df.columns.to_list()

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge']

In [7]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [9]:
from sqlalchemy import create_engine

In [11]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [7]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [12]:
chunk_size = 50000
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator=True, chunksize=chunk_size)

In [13]:
df = next(df_iter)

In [14]:
len(df)

50000

In [15]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [16]:
df.head

<bound method NDFrame.head of        VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0             1  2021-01-01 00:30:10   2021-01-01 00:36:12                1   
1             1  2021-01-01 00:51:20   2021-01-01 00:52:19                1   
2             1  2021-01-01 00:43:30   2021-01-01 01:11:06                1   
3             1  2021-01-01 00:15:48   2021-01-01 00:31:01                0   
4             2  2021-01-01 00:31:49   2021-01-01 00:48:21                1   
...         ...                  ...                   ...              ...   
49995         2  2021-01-02 19:02:32   2021-01-02 19:17:19                1   
49996         2  2021-01-02 19:36:52   2021-01-02 19:58:42                1   
49997         1  2021-01-02 19:24:23   2021-01-02 19:38:22                2   
49998         2  2021-01-02 18:58:10   2021-01-02 19:04:30                1   
49999         2  2021-01-02 19:06:12   2021-01-02 19:33:24                3   

       trip_distance 

In [17]:
df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

In [18]:
%time df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

CPU times: user 2.97 s, sys: 34.6 ms, total: 3 s
Wall time: 4.91 s


1000

In [19]:
from time import time

In [20]:
while True:
    try:
        t_start = time()

        df = next(df_iter)

        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

        df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

        t_end = time()

        print('inserted another chunk, took %.3f second' % (t_end - t_start))
    except StopIteration:
        break

inserted another chunk, took 5.269 second
inserted another chunk, took 5.126 second
inserted another chunk, took 5.001 second
inserted another chunk, took 5.232 second
inserted another chunk, took 5.169 second
inserted another chunk, took 4.837 second
inserted another chunk, took 5.203 second
inserted another chunk, took 5.136 second
inserted another chunk, took 4.830 second
inserted another chunk, took 4.860 second
inserted another chunk, took 5.223 second
inserted another chunk, took 5.074 second
inserted another chunk, took 4.932 second
inserted another chunk, took 4.855 second
inserted another chunk, took 4.921 second
inserted another chunk, took 4.893 second
inserted another chunk, took 5.349 second
inserted another chunk, took 5.253 second
inserted another chunk, took 4.975 second
inserted another chunk, took 5.098 second
inserted another chunk, took 5.378 second
inserted another chunk, took 5.538 second
inserted another chunk, took 5.526 second
inserted another chunk, took 4.778

  df = next(df_iter)


inserted another chunk, took 4.673 second
inserted another chunk, took 4.547 second
inserted another chunk, took 1.845 second


In [21]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2023-10-25 17:57:20--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.56.176, 52.216.50.216, 3.5.3.10, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.56.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: ‘taxi+_zone_lookup.csv.4’


2023-10-25 17:57:21 (76.3 MB/s) - ‘taxi+_zone_lookup.csv.4’ saved [12322/12322]



In [22]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [23]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [24]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265