In [None]:
import os

import pandas as pd
from sqlalchemy import create_engine
from tqdm.auto import tqdm

: 

In [None]:
# Read the connection URL from the environment so real credentials never have
# to be committed with the notebook. The fallback is the original local
# development database, so behaviour is unchanged when NY_TAXI_DB_URL is unset.
engine = create_engine(
    os.environ.get('NY_TAXI_DB_URL', 'postgresql://root:root@localhost:5432/ny_taxi')
)

In [None]:
# Explicit column types for the CSV. Integer-like columns use pandas' nullable
# "Int64" so that missing values do not force them to float.
dtype = dict(
    VendorID="Int64",
    passenger_count="Int64",
    trip_distance="float64",
    RatecodeID="Int64",
    store_and_fwd_flag="string",
    PULocationID="Int64",
    DOLocationID="Int64",
    payment_type="Int64",
    fare_amount="float64",
    extra="float64",
    mta_tax="float64",
    tip_amount="float64",
    tolls_amount="float64",
    improvement_surcharge="float64",
    total_amount="float64",
    congestion_surcharge="float64",
)

# Timestamp columns that read_csv should parse into datetimes.
parse_dates = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

# Base URL of the release that hosts the yellow-taxi CSV archives.
prefix = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/'

# Load only the first 100 rows: enough to inspect columns and dtypes without
# reading the whole file.
sample_url = prefix + 'yellow_tripdata_2021-01.csv.gz'
df = pd.read_csv(sample_url, nrows=100, dtype=dtype, parse_dates=parse_dates)

In [None]:
# Preview the first rows of the 100-row sample loaded above
# (head() shows five rows by default).
df.head()

In [None]:
# Check that the dtype overrides and the parse_dates columns were applied
# as requested in the read_csv call.
df.dtypes

In [None]:
# (rows, columns) of the sample; the row count is capped by nrows=100
# in the read_csv call that produced `df`.
df.shape

In [None]:
# Build the CREATE TABLE statement pandas derives from the frame's columns for
# the database behind `engine`, and print it for inspection.
schema_ddl = pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine)
print(schema_ddl)

## Create table

In [None]:
# Write a zero-row slice of the sample so only the table definition is created;
# if_exists='replace' drops any existing table of the same name first.
empty_frame = df.iloc[:0]
empty_frame.to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

## Ingesting Data in Chunks


In [None]:
# Stream the whole file as an iterator of DataFrames with up to 100,000 rows
# each, rather than loading it into memory at once.
# No `nrows` cap here: with nrows=100 the reader would stop after 100 rows and
# the chunked ingestion would only ever see that small sample.
df_iter = pd.read_csv(
    prefix + 'yellow_tripdata_2021-01.csv.gz',
    dtype=dtype,
    parse_dates=parse_dates,
    iterator=True,
    chunksize=100000
)

In [None]:
# Print the number of rows in every chunk the reader yields.
# NOTE: this loop consumes `df_iter`. A reader that has been iterated to the
# end yields nothing further, so any later loop over the same `df_iter` object
# will see no chunks. `df_chunk` stays bound to the last chunk afterwards.
for df_chunk in df_iter:
    print(len(df_chunk))

In [None]:
# Build a fresh reader so this cell does not depend on how much of `df_iter`
# earlier cells have already consumed, and can be re-run on its own.
# There is no `nrows` cap, so the whole file is ingested chunk by chunk.
df_iter = pd.read_csv(
    prefix + 'yellow_tripdata_2021-01.csv.gz',
    dtype=dtype,
    parse_dates=parse_dates,
    iterator=True,
    chunksize=100000
)

first = True

for df_chunk in tqdm(df_iter):

    if first:
        # Create table schema (no data). "replace" drops any existing table,
        # so re-running this cell starts from an empty table instead of
        # stacking duplicate rows on top of a previous load.
        df_chunk.head(0).to_sql(
            name="yellow_taxi_data",
            con=engine,
            if_exists="replace"
        )
        first = False
        print("Table created")

    # Insert chunk
    df_chunk.to_sql(
        name="yellow_taxi_data",
        con=engine,
        if_exists="append"
    )

    print("Inserted:", len(df_chunk))

In [None]:
# Sanity-check the load by counting the rows now in the target table.
# The ingestion loop above already appends every chunk, so appending
# `df_chunk` again here would duplicate the final chunk's rows.
pd.read_sql('SELECT COUNT(*) AS row_count FROM yellow_taxi_data', con=engine)

## Ingestion Loop

### To delete records

Connect to the database with pgcli:

```bash
uv run pgcli -h localhost -p 5432 -u root -d ny_taxi
```

Then remove all rows from the table:

```sql
DELETE FROM yellow_taxi_data;
```