In [1]:
import pandas as pd

In [2]:
# Read a sample of the data
prefix = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv/'
df = pd.read_csv(prefix, nrows=100)

In [3]:
# Display first rows
df.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [4]:
# Check data types
df.dtypes

LocationID       int64
Borough         object
Zone            object
service_zone    object
dtype: object

In [5]:
# Check data shape
df.shape

(100, 4)

In [6]:
df.describe()

Unnamed: 0,LocationID
count,100.0
mean,50.5
std,29.011492
min,1.0
25%,25.75
50%,50.5
75%,75.25
max,100.0


# <b>Create database connection </b>

In [7]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

## Get DDL Schema

In [8]:
print(pd.io.sql.get_schema(df, name='zones', con=engine))


CREATE TABLE zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




# Create table structure

In [10]:
df.head(n=0).to_sql(name='zones', con=engine, if_exists='replace')

0

# Feed the table -> Prepare -> optimize -> Feed

## Prepare the pipeline & flow optimization

The process of preparing the data pipeline by reading the data in chunks (100,000 rows at a time) and optimizing the flow for efficient processing.

In [None]:
df_iter = pd.read_csv(
    prefix,
    iterator=True, #read the first row and wait -> enable reading the file in chuncks
    chunksize=100000 # Process 100,000 rows at a time

## Iterate over chunks

In [14]:
for df_chunk in df_iter:
    print(len(df_chunk))

265


## Inserting data in the table 'zones'

In [16]:
df_chunk.to_sql(name='zones', con=engine, if_exists='append')

265

## Add a progression bar

In [17]:
from tqdm.auto import tqdm

for df_chunk in tqdm(df_iter):
    ...

0it [00:00, ?it/s]

# Verify the data

Bash : uv run pgcli -h localhost -p 5432 -u root -d ny_taxi

or

pgadmin