### Import libraries

In [3]:
import os
from time import time

import pandas as pd
from sqlalchemy import create_engine

### Read csv file

In [43]:
# Load only the first 100 rows: a small sample for inspecting the columns
# and inferring a table schema without reading the whole file.
df = pd.read_csv(
    'yellow_tripdata_2021-01.csv',
    nrows=100,
)

### Check if it's OK

In [44]:
# Bare expression: the notebook renders the sample frame for a visual check.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.10,1,N,142,43,2,8.0,3.0,0.5,0.00,0.0,0.3,11.80,2.5
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.20,1,N,238,151,2,3.0,0.5,0.5,0.00,0.0,0.3,4.30,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.70,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.60,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2021-01-01 00:12:41,2021-01-01 00:26:47,1,4.13,1,N,161,226,1,14.5,0.5,0.5,3.66,0.0,0.3,21.96,2.5
96,2,2021-01-01 00:23:29,2021-01-01 00:35:03,2,4.12,1,N,162,74,2,13.5,0.5,0.5,0.00,0.0,0.3,17.30,2.5
97,2,2021-01-01 00:46:17,2021-01-01 00:54:25,2,2.22,1,N,144,170,1,9.0,0.5,0.5,2.56,0.0,0.3,15.36,2.5
98,2,2021-01-01 00:28:16,2021-01-01 00:51:44,1,7.11,1,N,264,264,2,23.5,0.5,0.5,0.00,0.0,0.3,24.80,0.0


### Convert the datetime columns to TIMESTAMP (this must be repeated later for every chunk)

In [45]:
# Parse the pickup/dropoff columns into datetimes so the generated schema
# declares them as TIMESTAMP.
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

### Print schema

In [46]:
# DDL inferred from the frame's dtypes; no connection is given here, so the
# column types are not tailored to a particular database.
generic_ddl = pd.io.sql.get_schema(df, name='yellow_taxi_data')
print(generic_ddl)

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


### Create Engine to connect to our DB

In [4]:
# The connection URL is read from the NY_TAXI_DB_URL environment variable so
# that real credentials never have to be committed with the notebook. The
# fallback is the original local connection string, so existing setups keep
# working unchanged.
DB_URL = os.environ.get(
    'NY_TAXI_DB_URL',
    'postgresql://root:root@localhost:5432/ny_taxi',
)
engine = create_engine(DB_URL)

### Print the exact CREATE TABLE statement with the correct datatypes for each column

In [49]:
# Passing the engine makes pandas render the DDL with the column types of the
# connected database.
postgres_ddl = pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine)
print(postgres_ddl)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




### Create an iterator to insert the data chunk by chunk

In [50]:
# Lazy reader: each step yields the next DataFrame of up to 100,000 rows
# instead of loading the whole file into memory at once.
df_iter = pd.read_csv(
    'yellow_tripdata_2021-01.csv',
    iterator=True,
    chunksize=100_000,
)

### Create the table in the DB from the column names (no rows yet)

In [51]:
# Write a zero-row slice so only the table definition is (re)created;
# if_exists='replace' drops any previous yellow_taxi_data table first.
header_only = df.head(n=0)
header_only.to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

### Insert all the data into the DB

In [52]:
# Stream the CSV into the database one chunk at a time, timing each round.
#
# A plain `for` loop replaces the manual next()/StopIteration handling, so a
# StopIteration escaping from anywhere else can no longer be mistaken for a
# normal end of data. The loop variable is `chunk`, so the sample frame held
# in `df` is not overwritten by the last chunk.
t_start = time()  # started before the read so the timing covers parse + insert
for chunk in df_iter:
    # Same conversion as for the sample frame, so the values match the
    # TIMESTAMP columns of the table.
    chunk['tpep_pickup_datetime'] = pd.to_datetime(chunk['tpep_pickup_datetime'])
    chunk['tpep_dropoff_datetime'] = pd.to_datetime(chunk['tpep_dropoff_datetime'])
    chunk.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
    t_end = time()

    print('Insertion done in %.3f seconds' % (t_end - t_start))
    t_start = time()  # restart the clock for the next read + insert
print('Insertion compleat')

Insertion done in 13.299 seconds
Insertion done in 13.639 seconds
Insertion done in 13.185 seconds
Insertion done in 13.251 seconds
Insertion done in 13.190 seconds
Insertion done in 13.120 seconds
Insertion done in 11.736 seconds
Insertion done in 13.499 seconds
Insertion done in 13.090 seconds
Insertion done in 13.179 seconds
Insertion done in 14.684 seconds
Insertion done in 13.772 seconds


  df = next(df_iter)


Insertion done in 14.505 seconds
Insertion done in 7.112 seconds
Insertion compleat


### Add taxi zones file

In [1]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2024-01-20 23:33:51--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.53.168, 52.216.54.128, 52.216.249.30, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.53.168|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: ‘taxi+_zone_lookup.csv’


2024-01-20 23:33:52 (68,5 KB/s) - ‘taxi+_zone_lookup.csv’ saved [12322/12322]



In [5]:
# Load the taxi zone lookup table saved by the wget step above.
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [6]:
# Peek at the first row to confirm the columns were parsed as expected.
df_zones.head(1)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR


In [7]:
# Store the lookup as the "zones" table, replacing any existing table of that name.
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265