## Download the datasets

First, create a data folder: the same dataset is used throughout the course, so it is kept in one shared location.

In [1]:
from pathlib import Path

# Relative location of the shared data folder. Kept as a plain string with a
# trailing slash because other cells build file paths by string concatenation.
data_dir = "../../../data/"

# Create the folder (and any missing parents); do nothing if it already exists.
Path(data_dir).mkdir(exist_ok=True, parents=True)

In [None]:
# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz -P ../../../data

# # decompress the downloaded file
# !gzip -d ../../../data/green_tripdata_2019-09.csv.gz

In [None]:
# !wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv -P ../../../data

## Have a look at the data

In [2]:
!head -n 10 ../../../data/green_tripdata_2019-09.csv

VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
2,2019-09-01 00:10:53,2019-09-01 00:23:46,N,1,65,189,5,2.00,10.5,0.5,0.5,2.36,0,,0.3,14.16,1,1,0
2,2019-09-01 00:31:22,2019-09-01 00:44:37,N,1,97,225,5,3.20,12,0.5,0.5,0,0,,0.3,13.3,2,1,0
2,2019-09-01 00:50:24,2019-09-01 01:03:20,N,1,37,61,5,2.99,12,0.5,0.5,0,0,,0.3,13.3,2,1,0
2,2019-09-01 00:27:06,2019-09-01 00:33:22,N,1,145,112,1,1.73,7.5,0.5,0.5,1.5,0,,0.3,10.3,1,1,0
2,2019-09-01 00:43:23,2019-09-01 00:59:54,N,1,112,198,1,3.42,14,0.5,0.5,3.06,0,,0.3,18.36,1,1,0
2,2019-09-01 00:55:37,2019-09-01 01:17:02,N,1,7,41,1,6.26,21,0.5,0.5,0,6.12,,0.3,28.42,2,1,0
2,2019-09-01 00:28:55,2019-09-01 00:52:09,N,1,33,37,1,4.34,18,0.5,0.5,3.86,0,,0.3,23.16,1,1,0
1,2019-09-01 00:03:13,2019-09-01 00:03:32,N,1,55,55,1,1.20,2.5,0.5,

In [3]:
!head -n 10 ../../../data/taxi+_zone_lookup.csv

"LocationID","Borough","Zone","service_zone"
1,"EWR","Newark Airport","EWR"
2,"Queens","Jamaica Bay","Boro Zone"
3,"Bronx","Allerton/Pelham Gardens","Boro Zone"
4,"Manhattan","Alphabet City","Yellow Zone"
5,"Staten Island","Arden Heights","Boro Zone"
6,"Staten Island","Arrochar/Fort Wadsworth","Boro Zone"
7,"Queens","Astoria","Boro Zone"
8,"Queens","Astoria Park","Boro Zone"
9,"Queens","Auburndale","Boro Zone"


In [4]:
# check how many line sin the csv file
!wc -l ../../../data/green_tripdata_2019-09.csv

449064 ../../../data/green_tripdata_2019-09.csv


## Pandas

In [None]:
import pandas as pd

# Load only the first 100 rows: a small sample is enough to inspect the
# columns without reading the whole file.
df_100 = pd.read_csv(
    data_dir + 'green_tripdata_2019-09.csv',
    nrows=100,
)
df_100.head()

In [None]:
# Parse the pickup and drop-off timestamp columns into pandas datetimes.
# `assign` replaces both existing columns in one step and keeps column order.
df_100 = df_100.assign(
    lpep_pickup_datetime=pd.to_datetime(df_100.lpep_pickup_datetime),
    lpep_dropoff_datetime=pd.to_datetime(df_100.lpep_dropoff_datetime),
)

Generate the SQL schema (the `CREATE TABLE` statement) for the Postgres database.

In [None]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Connection settings for the Postgres instance. Each value can be overridden
# with the usual libpq environment variables so that credentials do not have
# to be edited into the notebook. The defaults are the local development
# values (root/root on localhost:5432, database ny_taxi).
pg_user = os.environ.get("PGUSER", "root")
pg_password = os.environ.get("PGPASSWORD", "root")
pg_host = os.environ.get("PGHOST", "localhost")
pg_port = os.environ.get("PGPORT", "5432")
pg_database = os.environ.get("PGDATABASE", "ny_taxi")

# User and password are URL-quoted so that special characters (e.g. '@', '/')
# cannot break the connection URL; plain values such as "root" are unchanged.
engine = create_engine(
    f'postgresql://{quote_plus(pg_user)}:{quote_plus(pg_password)}'
    f'@{pg_host}:{pg_port}/{pg_database}'
)

In [None]:
# Build the CREATE TABLE statement pandas would use for this frame on the
# connected database, then print it as plain text.
# NOTE(review): the table is called "yellow_taxi_data" although the CSV is the
# green-taxi file — confirm the name is intentional.
schema_ddl = pd.io.sql.get_schema(df_100, name="yellow_taxi_data", con=engine)
print(schema_ddl)

Read the CSV file in chunks so that the whole dataset (about 449,000 rows) is not loaded and uploaded all at once.

In [None]:
# Open the CSV as a lazy reader: each `next()` yields a DataFrame of up to
# 10,000 rows instead of loading the whole file.
df_iter = pd.read_csv(
    data_dir + 'green_tripdata_2019-09.csv',
    chunksize=10000,
    iterator=True,
)

In [None]:
# Pull one chunk from the reader and parse its two timestamp columns into
# datetimes; `assign` replaces the existing columns and keeps their order.
df = next(df_iter)
df = df.assign(
    lpep_pickup_datetime=pd.to_datetime(df.lpep_pickup_datetime),
    lpep_dropoff_datetime=pd.to_datetime(df.lpep_dropoff_datetime),
)

In [None]:
# Write a zero-row slice so that only the table structure is created;
# if_exists='replace' drops any existing table with this name first.
header_only = df.head(n=0)
header_only.to_sql(con=engine, name='yellow_taxi_data', if_exists='replace')

In a terminal, connect to the database with `pgcli -h localhost -p 5432 -u root -d ny_taxi`, then:
- check the table was created successfully using `\dt`
- see how many records were created in the database: `SELECT count(1) FROM yellow_taxi_data`

In [None]:
from time import time

# Re-open the CSV instead of continuing the existing `df_iter`: its first chunk
# was already consumed to create the empty table (only `head(0)` was written),
# so continuing from it would silently skip the first 10,000 rows. A fresh
# reader also lets this cell run again after the previous iterator is exhausted.
# Note: every run appends the full file, so re-create the table (the
# if_exists='replace' cell) before re-running to avoid duplicate rows.
df_iter = pd.read_csv(data_dir + 'green_tripdata_2019-09.csv', chunksize=10000)

# The timer starts before each chunk is read, so the reported time covers
# reading, converting and inserting that chunk.
t_start = time()
for df in df_iter:
    # convert string to datetime
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

    t_end = time()
    print("Inserted another chunk... it took %.3f second(s)" % (t_end - t_start))

    # restart the timer for the next chunk
    t_start = t_end