# Import Yellow Taxi Data - Part 1

Import yellow taxi data into Vast S3 and VastDB.

The schema changes over time, so we need to evolve the schema:
- currently only new fields are added during loading
- datatype changes are handled before loading the parquet into VastDB
- column renames are handled before loading the parquet into VastDB

In [1]:
! pip3 install --quiet boto3 polars-lts-cpu

In [2]:
# URL template for the monthly yellow-taxi Parquet files; "{}" is filled
# with the month in YYYY-MM form.
base_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    "yellow_tripdata_{}.parquet"
)

# First and last month to import (YYYY-MM)
start_date, end_date = "2009-01", "2024-08"

In [3]:
import pandas as pd

# One "YYYY-MM" label per month-start between start_date and end_date
dates = [
    month_start.strftime("%Y-%m")
    for month_start in pd.date_range(start=start_date, end=end_date, freq="MS")
]
first_month, last_month = min(dates), max(dates)
print(f"{first_month}..{last_month} - len: {len(dates)}")
del first_month, last_month

2009-01..2024-08 - len: 188


In [4]:
import os
from io import StringIO
from urllib.parse import urlparse

import boto3
from botocore.exceptions import NoCredentialsError, ClientError
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
from pyarrow import csv as pa_csv
import requests

In [5]:
# S3 connection settings are taken from the environment; each value is
# None when the corresponding variable is not set.
S3_ENDPOINT = os.environ.get("S3A_ENDPOINT")
S3_ACCESS_KEY = os.environ.get("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.environ.get("S3A_SECRET_KEY")
S3_BUCKET = os.environ.get("S3A_BUCKET")

In [6]:
# Fail with a clear message when configuration is missing; otherwise the
# slices below raise "TypeError: 'NoneType' object is not subscriptable".
if not all([S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY, S3_BUCKET]):
    raise RuntimeError(
        "S3A_ENDPOINT, S3A_ACCESS_KEY, S3A_SECRET_KEY and S3A_BUCKET "
        "must all be set in the environment"
    )

# Echo the effective configuration. Only the last four characters of each
# credential are shown, and both are prefixed with **** so the output cannot
# be mistaken for the full value.
print(f"""
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY=****{S3_ACCESS_KEY[-4:]}
S3_SECRET_KEY=****{S3_SECRET_KEY[-4:]}
S3_BUCKET={S3_BUCKET}
""")


---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3_BUCKET=csnow-bucket



In [7]:
# S3 client pointed at the endpoint and credentials read from the environment
s3_client = boto3.client(
    service_name="s3",
    region_name="vast",
    endpoint_url=S3_ENDPOINT,
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY,
)

In [8]:
# Define a function to download files
import os
import requests
from urllib.parse import urlparse

def download_file(url, timeout=60):
    """
    Download `url` into the current working directory.

    The local file name is the last path component of the URL. Data is
    streamed into "<name>.tmp" and moved into place only after the body has
    been written completely, so a file under the final name is never partial.

    Args:
        url (str): The URL to download.
        timeout (float): Seconds passed to `requests.get` as its timeout so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        bool: True if the file is available locally afterwards (downloaded now
        or already present), False if the server answered with a non-200 status.

    Raises:
        requests.RequestException / OSError: network or file-system errors are
        propagated after any partial temporary file has been removed.
    """
    file_name = os.path.basename(urlparse(url).path)
    temp_file = file_name + ".tmp"

    # Check if file exists, skip if so
    if os.path.exists(file_name):
        print(f"{file_name} already exists. Skipping download.")
        return True

    try:
        # The context manager releases the connection on every exit path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                # Nothing has been written yet, so there is no temp file to
                # remove here (removing it unconditionally raised
                # FileNotFoundError and masked the real failure).
                print(f"Failed to download {file_name}. Status code: {response.status_code}")
                return False

            with open(temp_file, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Atomically move the completed temporary file to the final filename
        os.replace(temp_file, file_name)
    finally:
        # After a successful move the temp file no longer exists; on any
        # failure or interruption this removes the incomplete download.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    print(f"Downloaded {file_name}")
    return True


# Define a function to upload the file to S3
def upload_to_s3(s3_client, file_path, bucket_name, s3_key):
    """
    Upload a local file to S3, reporting the outcome on stdout.

    Errors are reported rather than raised so a batch run can continue, but
    the result is returned so callers can tell a failed upload from a
    successful one.

    Args:
        s3_client: Object exposing `upload_file(file_path, bucket, key)`.
        file_path (str): Path of the local file to upload.
        bucket_name (str): Destination bucket.
        s3_key (str): Destination object key.

    Returns:
        bool: True if the upload call completed, False if it raised.
    """
    try:
        s3_client.upload_file(file_path, bucket_name, s3_key)
        print(f"File uploaded to s3://{bucket_name}/{s3_key}")
        return True
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
    except NoCredentialsError:
        print("Credentials not available.")
    except Exception as e:
        print(f"Error uploading file to S3: {e}")
    return False

def file_exists_in_s3(s3_client, bucket_name, s3_key):
    """
    Report whether an object is present in the bucket.

    Issues a HEAD request for the key. A ClientError whose error code is
    "404" means the object is absent; any other ClientError is propagated.

    Returns:
        bool: True when the HEAD request succeeds, False on a "404" error.
    """
    try:
        s3_client.head_object(Bucket=bucket_name, Key=s3_key)
    except ClientError as err:
        error_code = err.response['Error']['Code']
        if error_code != "404":
            # Anything other than "not found" is not ours to interpret
            raise
        return False
    return True


# Define a function to delete the file after processing
def delete_local_file(file_path):
    """
    Remove a local file, printing the outcome instead of raising.

    Args:
        file_path (str): Path of the file to remove.
    """
    try:
        os.remove(file_path)
    except Exception as err:
        print(f"Error deleting {file_path}: {err}")
    else:
        print(f"Deleted local {file_path}")


In [9]:
# Target schema for the normalised files.
#
# Each entry gives the target column name, the type it is cast to, and the
# (lower-cased) source column names it may appear under; the first source
# found in a file is used. A target that has no matching source is filled
# with a -999 default by transform_and_save_parquet, REPLACING any column
# that already carries the target name. A column that keeps its name across
# schema versions must therefore list that name as a source as well.
column_name_mapping = [
    {'name': 'airport_fee',            'mapping': {'type': 'float64',  'source': ['airport_fee']}},
    {'name': 'congestion_surcharge',   'mapping': {'type': 'float64',  'source': ['congestion_surcharge']}},
    {'name': 'dolocationid',           'mapping': {'type': 'int32',    'source': ['dolocationid']}},
    {'name': 'dropoff_latitude',       'mapping': {'type': 'float64',  'source': ['dropoff_latitude', 'end_lat']}},
    {'name': 'dropoff_longitude',      'mapping': {'type': 'float64',  'source': ['dropoff_longitude', 'end_lon']}},
    {'name': 'extra',                  'mapping': {'type': 'float64',  'source': ['extra']}},
    {'name': 'fare_amount',            'mapping': {'type': 'float64',  'source': ['fare_amount', 'fare_amt']}},
    {'name': 'improvement_surcharge',  'mapping': {'type': 'float64',  'source': ['improvement_surcharge']}},
    {'name': 'mta_tax',                'mapping': {'type': 'float64',  'source': ['mta_tax']}},
    {'name': 'passenger_count',        'mapping': {'type': 'int64',    'source': ['passenger_count']}},
    {'name': 'payment_type',           'mapping': {'type': 'str',      'source': ['payment_type']}},
    {'name': 'pulocationid',           'mapping': {'type': 'int32',    'source': ['pulocationid']}},
    {'name': 'pickup_latitude',        'mapping': {'type': 'float64',  'source': ['pickup_latitude', 'start_lat']}},
    {'name': 'pickup_longitude',       'mapping': {'type': 'float64',  'source': ['pickup_longitude', 'start_lon']}},
    {'name': 'ratecodeid',             'mapping': {'type': 'int64',    'source': ['ratecodeid', 'rate_code']}},
    # NOTE(review): only the legacy 'store_and_forward' name is listed, so a
    # file that already has a 'store_and_fwd_flag' column gets it replaced by
    # the -999.0 default. Newer files appear to store this flag as text, which
    # would not cast to float64 — confirm the intended type before adding the
    # current name as a source.
    {'name': 'store_and_fwd_flag',     'mapping': {'type': 'float64',  'source': ['store_and_forward']}},
    {'name': 'surcharge',              'mapping': {'type': 'float64',  'source': ['surcharge']}},
    {'name': 'tip_amount',             'mapping': {'type': 'float64',  'source': ['tip_amount', 'tip_amt']}},
    {'name': 'tolls_amount',           'mapping': {'type': 'float64',  'source': ['tolls_amount', 'tolls_amt']}},
    {'name': 'total_amount',           'mapping': {'type': 'float64',  'source': ['total_amount', 'total_amt']}},
    # The current column names are listed first so files that already use
    # them keep their data instead of receiving the "-999" default.
    {'name': 'tpep_dropoff_datetime',  'mapping': {'type': 'str',      'source': ['tpep_dropoff_datetime', 'trip_dropoff_datetime', 'dropoff_datetime']}},
    {'name': 'tpep_pickup_datetime',   'mapping': {'type': 'str',      'source': ['tpep_pickup_datetime', 'trip_pickup_datetime', 'pickup_datetime']}},
    {'name': 'trip_distance',          'mapping': {'type': 'float64',  'source': ['trip_distance']}},
    {'name': 'vendorid',               'mapping': {'type': 'str',      'source': ['vendorid', 'vendor_name', 'vendor_id']}},
]

In [10]:
# use polars to process the parquet files - it is more cpu and memory efficient than pandas

import polars as pl
import logging

def transform_and_save_parquet(date_str, parquet_file, column_name_mapping):
    """
    Reads a Parquet file, renames and casts columns based on the provided mapping,
    and saves the transformed data back to the same Parquet file.

    Column names are lower-cased first. For every mapping entry the first
    listed source column present in the file is renamed to the target name and
    cast to the target type. When none of the listed sources is present, the
    target column is created with a default of -999 (-999.0 for float64),
    replacing any existing column of that name; that case is logged as an
    extra warning because it discards data. Only the mapped columns are kept.
    The file is then rewritten once more with pyarrow so that `large_string`
    columns are stored as `string`.

    Args:
        date_str (str): Label of the file being processed; used only as a
            prefix in log messages.
        parquet_file (str): The path to the Parquet file (overwritten in place).
        column_name_mapping (list): A list of dictionaries, each containing:
            - 'name': The desired new column name.
            - 'mapping': A dictionary with keys:
                - 'type': The desired data type for the column; one of
                  'float64', 'int64', 'int32', 'str', 'datetime'. Any other
                  value is treated as a string type.
                - 'source': A list of source column names to use.

    Returns:
        None. The result is written to `parquet_file`; progress goes to
        stdout and to 'transformation_log.log'.
    """
    logging.basicConfig(filename='transformation_log.log', level=logging.INFO)
    logging.info(f"Starting transformation for {parquet_file}")

    # Map string data type to Polars data type (built once, not per column)
    type_mapping = {
        'float64': pl.Float64,
        'int64': pl.Int64,
        'int32': pl.Int32,
        'str': pl.String,
        'datetime': pl.Datetime,
    }

    print(f"Reading {parquet_file}")
    df = pl.read_parquet(parquet_file)

    # Lowercase the column names
    df = df.rename({col: col.lower() for col in df.columns})

    for mapping in column_name_mapping:
        target_name = mapping['name']
        source_columns = mapping['mapping']['source']
        target_type = type_mapping.get(mapping['mapping']['type'], pl.Utf8)

        # Find the first valid source column
        source_column = next((col for col in source_columns if col in df.columns), None)

        if source_column:
            # Rename and cast the source column to the target name and type
            df = df.rename({source_column: target_name}).with_columns(
                pl.col(target_name).cast(target_type)
            )
            logging.info(f"{date_str}: Transformed column '{source_column}' to '{target_name}' as {target_type}")
        else:
            if target_name in df.columns:
                # The file already has a column with the target name, but the
                # mapping does not list it as a source, so the assignment
                # below discards its values. Make that visible in the log.
                logging.warning(
                    f"{date_str}: Existing column '{target_name}' is not listed as a source "
                    f"and is being replaced with the default value"
                )
            # Create the target column with a default value if no source column exists
            default_value = -999.0 if target_type == pl.Float64 else -999
            df = df.with_columns(pl.lit(default_value).cast(target_type).alias(target_name))
            logging.warning(f"{date_str}: Created column '{target_name}' with default value due to missing source")

    # Retain only the mapped columns
    mapped_names = [mapping['name'] for mapping in column_name_mapping]
    df = df.select(mapped_names)

    print(f"Writing {parquet_file}")
    df.write_parquet(parquet_file)

    # make available to garbage collect
    del df

    #######
    # hack to convert large_string to string

    print(f"Converting large_string to string in {parquet_file}")

    # Read the Parquet file
    table = pq.read_table(parquet_file)

    # Modify the schema to replace `large_string` with `string`
    new_fields = [
        pa.field(field.name, pa.string() if field.type == pa.large_string() else field.type)
        for field in table.schema
    ]
    new_schema = pa.schema(new_fields)

    # Cast columns with `large_string` type to `string`
    new_columns = [
        table[column.name].cast(pa.string()) if column.type == pa.large_string() else table[column.name]
        for column in table.schema
    ]

    # Create a new table with updated schema and columns
    converted_table = pa.table(new_columns, schema=new_schema)

    # Write the converted table back over the same Parquet file
    pq.write_table(converted_table, parquet_file)

    #######

    logging.info(f"{date_str}: Transformation completed and saved to {parquet_file}")



In [11]:
# Download, transform and upload every monthly file in `dates`
# (start_date..end_date). Months whose object already exists in S3 are skipped,
# so the cell can be re-run to resume an interrupted import.
for date in dates:

    file_url = base_url.format(date)

    file_name = f"yellow_tripdata_{date}.parquet"
    s3_key = f"yellow_tripdata/{file_name}"

    if file_exists_in_s3(s3_client, S3_BUCKET, s3_key):
        print(f"{s3_key} exists in S3, skipping ...")
        continue

    download_file(file_url)
    if not os.path.exists(file_name):
        # The download did not produce a file (e.g. the month is not
        # published); there is nothing to transform or upload.
        print(f"{file_name} was not downloaded, skipping ...")
        continue

    try:
        # The transform rewrites the file in place and returns nothing.
        transform_and_save_parquet(date, file_name, column_name_mapping)
        upload_to_s3(s3_client, file_name, S3_BUCKET, s3_key)
    finally:
        # Always delete the local copy, even when a step above raised:
        # download_file skips files that already exist locally, so a leftover
        # file would be transformed a second time on the next run.
        delete_local_file(file_name)


Downloaded yellow_tripdata_2009-01.parquet
Reading yellow_tripdata_2009-01.parquet
Writing yellow_tripdata_2009-01.parquet
Converting large_string to string in yellow_tripdata_2009-01.parquet
File uploaded to s3://csnow-bucket/yellow_tripdata/yellow_tripdata_2009-01.parquet
Deleted local yellow_tripdata_2009-01.parquet
Downloaded yellow_tripdata_2009-02.parquet
Reading yellow_tripdata_2009-02.parquet
Writing yellow_tripdata_2009-02.parquet
Converting large_string to string in yellow_tripdata_2009-02.parquet
File uploaded to s3://csnow-bucket/yellow_tripdata/yellow_tripdata_2009-02.parquet
Deleted local yellow_tripdata_2009-02.parquet
Downloaded yellow_tripdata_2009-03.parquet
Reading yellow_tripdata_2009-03.parquet
Writing yellow_tripdata_2009-03.parquet
Converting large_string to string in yellow_tripdata_2009-03.parquet
File uploaded to s3://csnow-bucket/yellow_tripdata/yellow_tripdata_2009-03.parquet
Deleted local yellow_tripdata_2009-03.parquet
Downloaded yellow_tripdata_2009-04.pa