In [1]:
import os
import requests
import pandas as pd
from io import BytesIO
import dlt
from dlt.destinations import filesystem
import duckdb

# Report the versions of the libraries this notebook relies on, one per line
print(
    "\n".join(
        f"{label} version: {module.__version__}"
        for label, module in (
            ("requests", requests),
            ("pandas", pd),
            ("dlt", dlt),
            ("duckdb", duckdb),
        )
    )
)

requests version: 2.32.5
pandas version: 2.3.3
dlt version: 1.21.0
duckdb version: 1.4.4


In [2]:
# Location of the downloaded service-account JSON key, exported through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS = "gcs.json")

# Name of the project
PROJECT_ID = "sandbox-486719"

In [None]:
# -------------------------------
# SCHEMAS
# -------------------------------

# Source CSV header -> output column name for yellow taxi files.
# Listed as ordered (source, target) pairs; most columns keep their name.
YELLOW_RENAME = dict((
    ("VendorID", "vendor_id"),
    ("tpep_pickup_datetime", "pickup_datetime"),
    ("tpep_dropoff_datetime", "dropoff_datetime"),
    ("passenger_count", "passenger_count"),
    ("trip_distance", "trip_distance"),
    ("RatecodeID", "rate_code"),
    ("store_and_fwd_flag", "store_and_fwd_flag"),
    ("payment_type", "payment_type"),
    ("fare_amount", "fare_amount"),
    ("extra", "extra"),
    ("mta_tax", "mta_tax"),
    ("tip_amount", "tip_amount"),
    ("tolls_amount", "tolls_amount"),
    ("improvement_surcharge", "imp_surcharge"),
    ("airport_fee", "airport_fee"),
    ("total_amount", "total_amount"),
    ("PULocationID", "pickup_location_id"),
    ("DOLocationID", "dropoff_location_id"),
))

# Source CSV header -> output column name for green taxi files.
# Listed as ordered (source, target) pairs; most columns keep their name.
GREEN_RENAME = dict((
    ("VendorID", "vendor_id"),
    ("lpep_pickup_datetime", "pickup_datetime"),
    ("lpep_dropoff_datetime", "dropoff_datetime"),
    ("store_and_fwd_flag", "store_and_fwd_flag"),
    ("RatecodeID", "rate_code"),
    ("passenger_count", "passenger_count"),
    ("trip_distance", "trip_distance"),
    ("fare_amount", "fare_amount"),
    ("extra", "extra"),
    ("mta_tax", "mta_tax"),
    ("tip_amount", "tip_amount"),
    ("tolls_amount", "tolls_amount"),
    ("ehail_fee", "ehail_fee"),
    ("airport_fee", "airport_fee"),
    ("total_amount", "total_amount"),
    ("payment_type", "payment_type"),
    ("trip_type", "trip_type"),
    ("improvement_surcharge", "imp_surcharge"),
    ("PULocationID", "pickup_location_id"),
    ("DOLocationID", "dropoff_location_id"),
))

# Full schema objects including dtypes, rename map, column order, datetime columns
YELLOW_SCHEMA = dict(
    # Target pandas dtype of every output column
    dtypes = dict(
        vendor_id = "string",
        pickup_datetime = "datetime64[ns]",
        dropoff_datetime = "datetime64[ns]",
        passenger_count = "Int64",
        trip_distance = "float64",
        rate_code = "string",
        store_and_fwd_flag = "string",
        payment_type = "string",
        fare_amount = "float64",
        extra = "float64",
        mta_tax = "float64",
        tip_amount = "float64",
        tolls_amount = "float64",
        imp_surcharge = "float64",
        airport_fee = "float64",
        total_amount = "float64",
        pickup_location_id = "string",
        dropoff_location_id = "string",
        data_file_year = "Int64",
        data_file_month = "Int64",
    ),
    # Source header -> output column name
    rename_map = YELLOW_RENAME,
    # Output column order
    columns = [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "rate_code",
        "store_and_fwd_flag",
        "payment_type",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "imp_surcharge",
        "airport_fee",
        "total_amount",
        "pickup_location_id",
        "dropoff_location_id",
        "data_file_year",
        "data_file_month",
    ],
    # Columns parsed as timestamps
    datetime_cols = ["pickup_datetime", "dropoff_datetime"],
)

GREEN_SCHEMA = dict(
    # Target pandas dtype of every output column
    dtypes = dict(
        vendor_id = "string",
        pickup_datetime = "datetime64[ns]",
        dropoff_datetime = "datetime64[ns]",
        store_and_fwd_flag = "string",
        rate_code = "string",
        passenger_count = "Int64",
        trip_distance = "float64",
        fare_amount = "float64",
        extra = "float64",
        mta_tax = "float64",
        tip_amount = "float64",
        tolls_amount = "float64",
        ehail_fee = "float64",
        airport_fee = "float64",
        total_amount = "float64",
        payment_type = "string",
        distance_between_service = "float64",
        time_between_service = "Int64",
        trip_type = "string",
        imp_surcharge = "float64",
        pickup_location_id = "string",
        dropoff_location_id = "string",
        data_file_year = "Int64",
        data_file_month = "Int64",
    ),
    # Source header -> output column name
    rename_map = GREEN_RENAME,
    # Output column order
    columns = [
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "store_and_fwd_flag",
        "rate_code",
        "passenger_count",
        "trip_distance",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "ehail_fee",
        "airport_fee",
        "total_amount",
        "payment_type",
        "distance_between_service",
        "time_between_service",
        "trip_type",
        "imp_surcharge",
        "pickup_location_id",
        "dropoff_location_id",
        "data_file_year",
        "data_file_month",
    ],
    # Columns parsed as timestamps
    datetime_cols = ["pickup_datetime", "dropoff_datetime"],
)

In [4]:
# -------------------------------
# HELPER FUNCTIONS
# -------------------------------

def generate_file_urls(data_type = "yellow", start_year = 2019, end_year = 2020):
    """List the monthly trip-data archives for a taxi type.

    Args:
        data_type: Taxi type; used both as the release tag in the URL and as
            the file-name prefix.
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A list of ``(file_name, url)`` tuples, one per month (01-12) of every
        year in the range, ordered by year and then month.
    """
    release_root = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download"
    archive_names = (
        f"{data_type}_tripdata_{year}-{month:02d}.csv.gz"
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
    )
    return [(name, f"{release_root}/{data_type}/{name}") for name in archive_names]

def _float_ids_to_string(values: pd.Series) -> pd.Series:
    """Cast a column to the pandas "string" dtype without a spurious ".0" suffix.

    A float column whose non-missing values are all whole numbers is routed
    through the nullable "Int64" dtype first, so 1.0 becomes "1" rather than
    "1.0". Any other column is cast to "string" directly.
    """
    if pd.api.types.is_float_dtype(values):
        present = values.dropna()
        if (present % 1 == 0).all():
            try:
                values = values.astype("Int64")
            except (TypeError, ValueError, OverflowError):
                # Value cannot be represented as int64; keep the float rendering
                pass
    return values.astype("string")


def apply_schema(df: pd.DataFrame, schema: dict, year: int, month: int) -> pd.DataFrame:
    """Normalise one raw monthly frame to the column set and dtypes of ``schema``.

    Args:
        df: Raw frame as read from the CSV. It is not modified.
        schema: Dict with the keys ``"rename_map"`` (source -> output column
            name), ``"columns"`` (output column order), ``"dtypes"`` (output
            column -> pandas dtype string) and ``"datetime_cols"``.
        year: Stored in the ``data_file_year`` column.
        month: Stored in the ``data_file_month`` column.

    Returns:
        A new frame holding exactly ``schema["columns"]``, in that order, cast
        to ``schema["dtypes"]``. Columns absent from the input are filled with
        missing values; unparseable dates and numbers become missing values.
    """
    # rename() returns a new frame, so the assignments below never touch the caller's df
    df = df.rename(columns = schema["rename_map"])

    # Add missing columns
    for col in schema["columns"]:
        if col not in df.columns:
            df[col] = pd.NA

    # Add year/month
    df["data_file_year"] = year
    df["data_file_month"] = month

    # Datetime conversion
    datetime_cols = schema["datetime_cols"]
    for col in datetime_cols:
        df[col] = pd.to_datetime(df[col], errors = "coerce")

    for col, dtype in schema["dtypes"].items():
        if col in datetime_cols:
            continue
        # Compare case-insensitively: the nullable integer dtype is spelled
        # "Int64", which a plain `"int" in dtype` check does not match.
        dtype_key = dtype.lower()
        if "float" in dtype_key or "int" in dtype_key:
            df[col] = pd.to_numeric(df[col], errors = "coerce")
        elif dtype == "string":
            # Identifier columns are read as float when the file has missing
            # values; avoid turning 1.0 into the label "1.0".
            df[col] = _float_ids_to_string(df[col])

    # Final dtype enforcement
    df = df.astype(schema["dtypes"])

    # Reorder columns (also drops anything not listed in the schema)
    return df[schema["columns"]]

In [5]:
# -------------------------------
# DLT SOURCE
# -------------------------------

def make_taxi_source(data_type = "yellow"):
    """Build a dlt source function for one taxi type.

    Args:
        data_type: ``"yellow"`` or ``"green"``. Selects the schema and the
            files returned by ``generate_file_urls``.

    Returns:
        A ``@dlt.source``-decorated function named ``<data_type>_tripdata``.
        Calling it yields one resource with the same name that produces one
        schema-normalised DataFrame per monthly file.

    Raises:
        ValueError: If ``data_type`` has no schema defined.

    Notes:
        Files are downloaded lazily, one at a time, while the resource is
        iterated, so only the month in flight is held in memory. A file that
        still fails after three attempts is reported and skipped.
    """
    schemas = {"yellow": YELLOW_SCHEMA, "green": GREEN_SCHEMA}
    if data_type not in schemas:
        raise ValueError(
            f"Unsupported data_type {data_type!r}; expected one of {sorted(schemas)}"
        )
    schema = schemas[data_type]

    max_attempts = 3
    # (connect, read) timeouts in seconds so a stalled connection cannot hang the run
    request_timeout = (10, 300)

    def fetch_month(file_name, url):
        """Download and normalise one monthly file; return None if every attempt fails."""
        # Extract year/month from e.g. "yellow_tripdata_2019-01.csv.gz"
        year_month = file_name.split("_")[-1].replace(".csv.gz","")
        year, month = map(int, year_month.split("-"))

        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout = request_timeout)
                if response.status_code != 200:
                    raise ValueError(f"File not found (status {response.status_code})")

                df = pd.read_csv(
                    BytesIO(response.content),
                    compression = "gzip",
                    dtype = {"store_and_fwd_flag": "string"},
                    low_memory = False
                )
                return apply_schema(df, schema, year, month)

            except Exception as e:
                # Best effort: report the failure, retry, and finally skip the file
                print(f"Attempt {attempt} failed for {file_name}: {e}")
                if attempt == max_attempts:
                    print(f"Skipping {file_name} after {max_attempts} failed attempts.")
                else:
                    print("Retrying...")
        return None

    def iter_monthly_frames():
        """Yield one normalised DataFrame per successfully downloaded month."""
        for file_name, url in generate_file_urls(data_type):
            print(f"Downloading {file_name}...")
            frame = fetch_month(file_name, url)
            if frame is not None:
                yield frame

    @dlt.source(name = f"{data_type}_tripdata")
    def taxi_source():
        # Calling the generator function only creates the generator; the
        # downloads run when dlt iterates the resource.
        yield dlt.resource(iter_monthly_frames(), name = f"{data_type}_tripdata")

    return taxi_source

In [None]:
# -------------------------------
# PIPELINE
# -------------------------------

# Source factories for both taxi types; each is called later to build the actual source
yellow_source, green_source = (
    make_taxi_source(taxi_type) for taxi_type in ("yellow", "green")
)

# Single pipeline shared by both loads, targeting the "nytaxi" dataset in BigQuery
pipeline = dlt.pipeline(
    pipeline_name = "taxi_data_pipeline", destination = "bigquery",
    dataset_name = "nytaxi", dev_mode = True,
)

  full_refresh_argument_deprecated("pipeline", full_refresh)


In [7]:
# Run the yellow source through the pipeline, then print the load summary
yellow_info = pipeline.run(yellow_source())
print("\nYellow taxi data load info:", yellow_info, sep = "\n")

Downloading yellow_tripdata_2019-01.csv.gz...
Downloading yellow_tripdata_2019-02.csv.gz...
Downloading yellow_tripdata_2019-03.csv.gz...
Downloading yellow_tripdata_2019-04.csv.gz...
Downloading yellow_tripdata_2019-05.csv.gz...
Downloading yellow_tripdata_2019-06.csv.gz...
Downloading yellow_tripdata_2019-07.csv.gz...
Downloading yellow_tripdata_2019-08.csv.gz...
Downloading yellow_tripdata_2019-09.csv.gz...
Downloading yellow_tripdata_2019-10.csv.gz...
Downloading yellow_tripdata_2019-11.csv.gz...
Downloading yellow_tripdata_2019-12.csv.gz...
Downloading yellow_tripdata_2020-01.csv.gz...
Downloading yellow_tripdata_2020-02.csv.gz...
Downloading yellow_tripdata_2020-03.csv.gz...
Downloading yellow_tripdata_2020-04.csv.gz...
Downloading yellow_tripdata_2020-05.csv.gz...
Downloading yellow_tripdata_2020-06.csv.gz...
Downloading yellow_tripdata_2020-07.csv.gz...
Downloading yellow_tripdata_2020-08.csv.gz...
Downloading yellow_tripdata_2020-09.csv.gz...
Downloading yellow_tripdata_2020-1

In [8]:
# Run the green source through the pipeline, then print the load summary
green_info = pipeline.run(green_source())
print("Green taxi data load info:", green_info, sep = "\n")

Downloading green_tripdata_2019-01.csv.gz...
Downloading green_tripdata_2019-02.csv.gz...
Downloading green_tripdata_2019-03.csv.gz...
Downloading green_tripdata_2019-04.csv.gz...
Downloading green_tripdata_2019-05.csv.gz...
Downloading green_tripdata_2019-06.csv.gz...
Downloading green_tripdata_2019-07.csv.gz...
Downloading green_tripdata_2019-08.csv.gz...
Downloading green_tripdata_2019-09.csv.gz...
Downloading green_tripdata_2019-10.csv.gz...
Downloading green_tripdata_2019-11.csv.gz...
Downloading green_tripdata_2019-12.csv.gz...
Downloading green_tripdata_2020-01.csv.gz...
Downloading green_tripdata_2020-02.csv.gz...
Downloading green_tripdata_2020-03.csv.gz...
Downloading green_tripdata_2020-04.csv.gz...
Downloading green_tripdata_2020-05.csv.gz...
Downloading green_tripdata_2020-06.csv.gz...
Downloading green_tripdata_2020-07.csv.gz...
Downloading green_tripdata_2020-08.csv.gz...
Downloading green_tripdata_2020-09.csv.gz...
Downloading green_tripdata_2020-10.csv.gz...
Downloadin