# Bulk Load Yellow Taxi Data into VastDB - Part 2

This notebook uses the VastDB bulk import functionality to ingest Parquet files from Vast S3 into VastDB.

The data was uploaded to S3 by the [previous notebook](./yellow_taxi_data_pt2_ingest_to_vastdb.ipynb).

- The yellow trip data was saved to S3 as defined in `.env-local` by:
  - S3A_ACCESS_KEY
  - S3A_SECRET_KEY
  - S3A_ENDPOINT
  - S3A_BUCKET
  - S3A_HIVE_TAXI_URI
- The parquet files will be imported into Vast DB as defined in `.env-local` by:
  - VASTDB_ACCESS_KEY
  - VASTDB_SECRET_KEY
  - VASTDB_ENDPOINT
  - VASTDB_NYT_BUCKET
  - VASTDB_NYT_SCHEMA
  - VASTDB_NYT_TABLE

In [1]:
# ------------------------------------------------------------------------------
# Run-time switches
# ------------------------------------------------------------------------------

# When True, the existing VastDB table is dropped so the load starts from an
# empty table.
DROP_TABLE = True

# Number of monthly parquet files imported per batch (12 = one year at a time).
BATCH_SIZE = 12

In [2]:
! pip3 install --quiet vastdb

In [3]:
# First and last month ("YYYY-MM", both included) of trip data to load
start_date, end_date = "2009-01", "2024-08"

In [4]:
import pandas as pd

# One "YYYY-MM" label per month start between start_date and end_date
dates = [month.strftime("%Y-%m") for month in pd.date_range(start_date, end_date, freq="MS")]
print(f"{min(dates)}..{max(dates)} - len: {len(dates)}")

2009-01..2024-08 - len: 188


In [5]:
import os
from io import StringIO
from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError, NoCredentialsError

import vastdb

In [6]:
# Connection settings are read from the process environment. The notebook intro
# says they are defined in `.env-local`; this cell does not load that file, so
# the values are assumed to be exported into the kernel's environment beforehand
# (TODO confirm).
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_NYT_BUCKET = os.getenv("VASTDB_NYT_BUCKET")
VASTDB_NYT_SCHEMA = os.getenv("VASTDB_NYT_SCHEMA")
VASTDB_NYT_TABLE = os.getenv("VASTDB_NYT_TABLE")

S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")
S3_BUCKET = os.getenv("S3A_BUCKET")
S3_HIVE_TAXI_URI = os.getenv("S3A_HIVE_TAXI_URI")

# Fail fast with a readable message instead of an AttributeError/TypeError in a
# later cell when a variable is unset or empty.
_missing_settings = [
    env_name
    for env_name, env_value in {
        "VASTDB_ENDPOINT": VASTDB_ENDPOINT,
        "VASTDB_ACCESS_KEY": VASTDB_ACCESS_KEY,
        "VASTDB_SECRET_KEY": VASTDB_SECRET_KEY,
        "VASTDB_NYT_BUCKET": VASTDB_NYT_BUCKET,
        "VASTDB_NYT_SCHEMA": VASTDB_NYT_SCHEMA,
        "VASTDB_NYT_TABLE": VASTDB_NYT_TABLE,
        "S3A_ENDPOINT": S3_ENDPOINT,
        "S3A_ACCESS_KEY": S3_ACCESS_KEY,
        "S3A_SECRET_KEY": S3_SECRET_KEY,
        "S3A_BUCKET": S3_BUCKET,
        "S3A_HIVE_TAXI_URI": S3_HIVE_TAXI_URI,
    }.items()
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Folder (key prefix) inside the bucket: the URI path without its surrounding
# slashes, e.g. "s3a://csnow-bucket/nyt/" -> "nyt". Parsing the URI also accepts
# a missing trailing slash or a nested prefix, which a fixed three-way split
# on "/" would reject.
S3_FOLDER = urlparse(S3_HIVE_TAXI_URI).path.strip("/")
if not S3_FOLDER:
    raise ValueError(
        f"S3A_HIVE_TAXI_URI must include a folder after the bucket: {S3_HIVE_TAXI_URI!r}"
    )

In [7]:
# Echo the effective configuration. Only the last four characters of the
# access and secret keys are shown.
print("\n".join([
    "",
    "---",
    f"VASTDB_ENDPOINT={VASTDB_ENDPOINT}",
    f"VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}",
    f"VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}",
    f"VASTDB_NYT_BUCKET={VASTDB_NYT_BUCKET}",
    f"VASTDB_NYT_SCHEMA={VASTDB_NYT_SCHEMA}",
    f"VASTDB_NYT_TABLE={VASTDB_NYT_TABLE}",
    "---",
    f"S3_ENDPOINT={S3_ENDPOINT}",
    f"S3_ACCESS_KEY={S3_ACCESS_KEY[-4:]}",
    f"S3_SECRET_KEY=****{S3_SECRET_KEY[-4:]}",
    f"S3_BUCKET={S3_BUCKET}",
    f"S3_HIVE_TAXI_URI={S3_HIVE_TAXI_URI}",
    f"S3_FOLDER={S3_FOLDER}",
    "",
]))


---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_NYT_BUCKET=csnow-db
VASTDB_NYT_SCHEMA=nyt
VASTDB_NYT_TABLE=taxi
---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3_BUCKET=csnow-bucket
S3_HIVE_TAXI_URI=s3a://csnow-bucket/nyt/
S3_FOLDER=nyt



In [8]:
# Optionally drop the target table so the bulk load starts from an empty table.

session = vastdb.connect(endpoint=VASTDB_ENDPOINT, access=VASTDB_ACCESS_KEY, secret=VASTDB_SECRET_KEY)

# Only open a transaction when a drop was actually requested.
if DROP_TABLE:
    with session.transaction() as tx:
        try:
            bucket = tx.bucket(VASTDB_NYT_BUCKET)
            # fail_if_missing=False: a schema/table that does not exist yet (e.g. on
            # a first run) is an expected situation here, not an error. The lookups
            # are treated as returning a falsy value in that case, the same way
            # import_to_vastdb relies on them.
            schema = bucket.schema(VASTDB_NYT_SCHEMA, fail_if_missing=False)
            table = schema.table(VASTDB_NYT_TABLE, fail_if_missing=False) if schema else None
            if table:
                table.drop()
                print(f"Dropped table {VASTDB_NYT_SCHEMA}.{VASTDB_NYT_TABLE}")
            else:
                print(f"Table {VASTDB_NYT_SCHEMA}.{VASTDB_NYT_TABLE} not found - nothing to drop")
        except Exception as e:
            # Best-effort cleanup: report the problem and let the notebook continue.
            print("*******", e)

In [9]:
def connect_to_vastdb(endpoint, access_key, secret_key):
    """Open a VastDB session and return it.

    Args:
        endpoint: VastDB endpoint URL.
        access_key: Access key passed to ``vastdb.connect`` as ``access``.
        secret_key: Secret key passed to ``vastdb.connect`` as ``secret``.

    Returns:
        The session object returned by ``vastdb.connect``.

    Raises:
        RuntimeError: If connecting fails for any reason; the original
            exception is attached as the cause.
    """
    try:
        connection = vastdb.connect(endpoint=endpoint, access=access_key, secret=secret_key)
        print("Connected to VastDB")
    except Exception as err:
        raise RuntimeError(f"Failed to connect to VastDB: {err}") from err
    return connection

def import_to_vastdb(session, bucket_name, schema_name, table_name, files_to_import):
    """Bulk-import parquet files into a VastDB table within one transaction.

    The schema is created when it does not exist. If the table already exists
    the files are imported into it; otherwise the table is created from the
    files via ``vastdb.util.create_table_from_files``.

    Args:
        session: VastDB session providing ``transaction()``.
        bucket_name: Name of the VastDB bucket.
        schema_name: Name of the schema inside the bucket.
        table_name: Name of the target table.
        files_to_import: Parquet file paths to import.
    """
    with session.transaction() as tx:
        db_bucket = tx.bucket(bucket_name)

        db_schema = db_bucket.schema(schema_name, fail_if_missing=False)
        if not db_schema:
            db_schema = db_bucket.create_schema(schema_name)

        existing_table = db_schema.table(table_name, fail_if_missing=False)
        if not existing_table:
            # No table yet: let the SDK create it from the parquet files.
            vastdb.util.create_table_from_files(
                schema=db_schema,
                table_name=table_name,
                parquet_files=files_to_import,
            )
        else:
            existing_table.import_files(files_to_import=files_to_import)

In [10]:
def file_exists_in_s3(s3_client, bucket_name, s3_key):
    """Return whether an object exists in S3.

    Args:
        s3_client: Client exposing ``head_object(Bucket=..., Key=...)``.
        bucket_name: Name of the bucket to look in.
        s3_key: Key of the object to check.

    Returns:
        True if the HEAD request succeeds, False if the service reports the
        object as missing.

    Raises:
        ClientError: For any other S3 error (e.g. access denied), re-raised
            unchanged.
    """
    try:
        s3_client.head_object(Bucket=bucket_name, Key=s3_key)
    except ClientError as e:
        # A missing key is normally reported as "404" on HEAD requests; the
        # named codes are accepted as well in case the endpoint returns them.
        error_code = str(e.response.get("Error", {}).get("Code", ""))
        if error_code in ("404", "NoSuchKey", "NotFound"):
            return False  # File does not exist
        raise  # Re-raise other exceptions
    return True  # File exists

In [11]:
# S3 client pointed at the configured S3 endpoint.
# NOTE(review): presumably meant for file_exists_in_s3; it is not used by the
# cells visible here - confirm.
s3_client = boto3.client(
    service_name="s3",
    region_name="vast",
    endpoint_url=S3_ENDPOINT,
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY,
)

In [12]:
# Session used by the bulk-import loop below
session = connect_to_vastdb(
    endpoint=VASTDB_ENDPOINT,
    access_key=VASTDB_ACCESS_KEY,
    secret_key=VASTDB_SECRET_KEY,
)

Connected to VastDB


In [13]:
from math import ceil

# One object path per month: /<bucket>/<folder>/yellow_tripdata_<YYYY-MM>.parquet
s3_keys = [
    f"/{S3_BUCKET}/{S3_FOLDER}/yellow_tripdata_{month}.parquet"
    for month in dates
]

def chunk_list(lst, chunk_size):
    """Generate consecutive slices of ``lst`` holding up to ``chunk_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``.
    """
    offsets = range(0, len(lst), chunk_size)
    yield from (lst[offset:offset + chunk_size] for offset in offsets)

# Import the files batch by batch; each import_to_vastdb call handles one batch.
batch_size = BATCH_SIZE

for batch in chunk_list(s3_keys, batch_size):
    print("Processing batch:")
    print(*(f"\t{path}" for path in batch), sep="\n")
    import_to_vastdb(
        session,
        VASTDB_NYT_BUCKET,
        VASTDB_NYT_SCHEMA,
        VASTDB_NYT_TABLE,
        batch,
    )

Processing batch:
	/csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-02.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-03.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-04.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-05.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-06.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-07.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-08.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-09.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-10.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-11.parquet
	/csnow-bucket/nyt/yellow_tripdata_2009-12.parquet
Processing batch:
	/csnow-bucket/nyt/yellow_tripdata_2010-01.parquet
	/csnow-bucket/nyt/yellow_tripdata_2010-02.parquet
	/csnow-bucket/nyt/yellow_tripdata_2010-03.parquet
	/csnow-bucket/nyt/yellow_tripdata_2010-04.parquet
	/csnow-bucket/nyt/yellow_tripdata_2010-05.parquet
	/csnow-bucket/nyt/yellow_tripdata_2010-06.parquet
	/csnow-bucket/nyt/yellow_tripdata_2010-07.par