In [None]:
!pip install boto3 tqdm dotenv

In [None]:
import boto3
import os
from boto3.s3.transfer import TransferConfig
from tqdm import tqdm
from dotenv import load_dotenv

In [None]:
# ---------------------------------------------------------------------
# LOAD CONFIGURATION
# ---------------------------------------------------------------------
# Pull the settings stored in config.env into the process environment,
# then read the project namespace that is used to build the S3 key prefix.
load_dotenv("config.env")

DATASCIENCE_PROJECT_NAMESPACE = os.environ.get('DATASCIENCE_PROJECT_NAMESPACE')

In [None]:
# Source and destination of the upload: the target bucket comes from the
# environment, the key prefix is derived from the project namespace, and
# the local dataset directory is fixed.
bucket_name = os.getenv("AWS_S3_BUCKET")
key_base = f'{DATASCIENCE_PROJECT_NAMESPACE}/dataset'
directory = '/opt/app-root/src/raft-workshop/dataset'

# Show the upload plan before any transfer starts. The leading and trailing
# empty entries give the banner a blank line above and below when printed.
summary_lines = [
    "",
    "Upload Configuration",
    "--------------------",
    f"Local directory  : {directory}",
    f"S3 bucket        : {bucket_name}",
    f"S3 key base      : {key_base}",
    "",
    "Result:",
    "Files from the local dataset directory will be uploaded to:",
    f"s3://{bucket_name}/{key_base}/",
    "",
]
print("\n".join(summary_lines))

In [None]:
# Configure S3 transfer settings for efficient multi-part uploads.
# TransferConfig sizes are expressed in bytes. Files larger than 25 MiB are
# split into 25 MiB parts that are uploaded by up to 10 worker threads;
# smaller files go up in a single request. The part size must stay at or
# above S3's 5 MiB minimum for non-final parts.
config = TransferConfig(
    multipart_threshold=25 * 1024 * 1024,
    max_concurrency=10,
    multipart_chunksize=25 * 1024 * 1024,
    use_threads=True,
)

In [None]:
# Walk the local dataset directory and upload each file, with a per-file
# progress bar, to the configured bucket on the S3-compatible endpoint
# (e.g. MinIO) named by AWS_S3_ENDPOINT.
#
# A single client is created up front and reused for every file instead of
# being rebuilt on each iteration.
s3_client = boto3.client('s3', endpoint_url=os.getenv("AWS_S3_ENDPOINT"))

for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)

        # Object keys always use "/" as the separator, regardless of the
        # local OS, so normalize the relative path before appending it.
        rel_path = os.path.relpath(file_path, directory)
        key_name = f"{key_base}/{rel_path.replace(os.sep, '/')}"

        print(key_name)
        try:
            with tqdm(
                total=os.path.getsize(file_path),
                unit='B',
                unit_scale=True,
                desc=file_path
            ) as pbar:
                # Upload to the bucket announced in the configuration
                # summary; the callback receives the number of bytes sent
                # since the previous call and advances the progress bar.
                s3_client.upload_file(
                    file_path,
                    bucket_name,
                    key_name,
                    Config=config,
                    Callback=pbar.update
                )
            print(f'File {file_path} uploaded to {bucket_name}/{key_name}')
        except Exception as e:
            # Best effort: report the failure and continue with the
            # remaining files rather than aborting the whole upload.
            print(f'Error occurred while uploading {file_path}: {e}')