# Bulk Load Yellow Taxi Data

In [1]:
! pip3 install --quiet vastdb

In [2]:
# First and last month to load, written as "YYYY-MM" (both ends are included).
# A later cell expands these with pd.date_range(..., freq="MS") into one entry per month.
start_date = "2009-01"
end_date = "2024-08"

In [3]:
import pandas as pd

# One "YYYY-MM" label for every month start between start_date and end_date.
dates = [
    month_start.strftime("%Y-%m")
    for month_start in pd.date_range(start=start_date, end=end_date, freq="MS")
]
print(f"{min(dates)}..{max(dates)} - len: {len(dates)}")

2009-01..2024-08 - len: 188


In [4]:
import os
from io import StringIO
from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError, NoCredentialsError

import vastdb

In [5]:
# --- VAST DB connection settings (read from the environment; None when unset) ---
VASTDB_ENDPOINT = os.environ.get("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.environ.get("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.environ.get("VASTDB_SECRET_KEY")

# Database bucket that receives the imported table.
VASTDB_TWITTER_INGEST_BUCKET = os.environ.get("VASTDB_TWITTER_INGEST_BUCKET")

# --- S3 settings for the bucket holding the source parquet files ---
# Note: the environment variables use the "S3A_" prefix.
S3_ENDPOINT = os.environ.get("S3A_ENDPOINT")
S3_ACCESS_KEY = os.environ.get("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.environ.get("S3A_SECRET_KEY")
S3_BUCKET = os.environ.get("S3A_BUCKET")

###### SET THIS ######
# Destination schema and table inside the database bucket.
VASTDB_TWITTER_INGEST_SCHEMA = "taxi_data"
VASTDB_TWITTER_INGEST_TABLE = "yellow_tripdata"
###### SET THIS ######

In [6]:
def _tail(value, visible=4):
    """Return the last ``visible`` characters of ``value``, or "<unset>" if it is None or empty."""
    return value[-visible:] if value else "<unset>"


# Echo the effective configuration. Only the tail of each credential is shown,
# and a missing environment variable prints "<unset>" instead of raising TypeError.
print(f"""
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={_tail(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY=****{_tail(VASTDB_SECRET_KEY)}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={_tail(S3_ACCESS_KEY)}
S3_SECRET_KEY=****{_tail(S3_SECRET_KEY)}
S3_BUCKET={S3_BUCKET}
""")


---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=taxi_data
VASTDB_TWITTER_INGEST_TABLE=yellow_tripdata
---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3_BUCKET=csnow-bucket



In [7]:
def connect_to_vastdb(endpoint, access_key, secret_key):
    """Open a VastDB session.

    Args:
        endpoint: Endpoint URL handed to ``vastdb.connect``.
        access_key: Access key for the connection.
        secret_key: Secret key for the connection.

    Returns:
        The session object returned by ``vastdb.connect``.

    Raises:
        RuntimeError: If connecting fails for any reason; the original
            exception is attached as the cause.
    """
    try:
        vast_session = vastdb.connect(
            endpoint=endpoint,
            access=access_key,
            secret=secret_key,
        )
        print("Connected to VastDB")
    except Exception as err:
        raise RuntimeError(f"Failed to connect to VastDB: {err}") from err
    return vast_session

def import_to_vastdb(session, bucket_name, schema_name, table_name, files_to_import):
    """Import parquet files into a VastDB table inside a single transaction.

    The schema is created when it is missing. When the table is missing it is
    created from the files via ``vastdb.util.create_table_from_files``;
    otherwise the files are imported into the existing table.

    Args:
        session: Object providing ``transaction()`` as a context manager.
        bucket_name: Name of the database bucket.
        schema_name: Name of the schema inside the bucket.
        table_name: Name of the destination table.
        files_to_import: List of file paths to import.
    """
    with session.transaction() as tx:
        bucket = tx.bucket(bucket_name)

        # Look the schema up first; create it only when the lookup comes back empty.
        target_schema = bucket.schema(schema_name, fail_if_missing=False)
        if not target_schema:
            target_schema = bucket.create_schema(schema_name)

        existing_table = target_schema.table(table_name, fail_if_missing=False)

        if not existing_table:
            # No table yet: create it from the given files.
            vastdb.util.create_table_from_files(
                schema=target_schema,
                table_name=table_name,
                parquet_files=files_to_import,
            )
        else:
            existing_table.import_files(files_to_import=files_to_import)

In [8]:
def file_exists_in_s3(s3_client, bucket_name, s3_key):
    """Report whether an object exists in an S3 bucket.

    Issues a ``head_object`` call, so only metadata is requested.

    Args:
        s3_client: Client object exposing ``head_object(Bucket=..., Key=...)``.
        bucket_name: Name of the bucket to look in.
        s3_key: Key of the object to check.

    Returns:
        True when ``head_object`` succeeds, False when it fails with error
        code "404".

    Raises:
        ClientError: For any client error whose code is not "404"; these are
            re-raised unchanged.
    """
    # ClientError must be imported from botocore.exceptions at the top of the
    # notebook; without it this handler fails with NameError instead of
    # returning False for a missing object.
    try:
        s3_client.head_object(Bucket=bucket_name, Key=s3_key)
    except ClientError as e:
        if e.response['Error']['Code'] == "404":
            return False  # Object does not exist
        raise  # Anything else is a real failure
    return True

In [9]:
# S3 client bound to the configured endpoint and credentials.
# The ingest loop uses it to check which parquet objects exist.
s3_client = boto3.client(
    service_name="s3",
    endpoint_url=S3_ENDPOINT,
    region_name="vast",
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY,
)

In [None]:
session = connect_to_vastdb(VASTDB_ENDPOINT, VASTDB_ACCESS_KEY, VASTDB_SECRET_KEY)

for date in dates:
    # Key of this month's parquet object inside S3_BUCKET.
    s3_key = f"yellow_tripdata/yellow_tripdata_{date}.parquet"

    # Months with no object in the bucket are skipped without output.
    if not file_exists_in_s3(s3_client, S3_BUCKET, s3_key):
        continue

    # Progress marker: one line per file handed to the importer.
    print(s3_key)

    # Each file is imported in its own call; the path is given as "/<bucket>/<key>".
    import_to_vastdb(
        session=session,
        bucket_name=VASTDB_TWITTER_INGEST_BUCKET,
        schema_name=VASTDB_TWITTER_INGEST_SCHEMA,
        table_name=VASTDB_TWITTER_INGEST_TABLE,
        files_to_import=[f"/{S3_BUCKET}/{s3_key}"],
    )


Connected to VastDB
yellow_tripdata/yellow_tripdata_2009-01.parquet
