In [1]:
#Import Library
import sys
import os 
from glob import glob
from minio import Minio


In [2]:
# ---------------- load_cfg from helpers.py ----------------
import yaml

def load_cfg(cfg_file):
    """
    Load configuration from a YAML config file.

    Args:
        cfg_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the content is not valid YAML (the parse error is printed).

    Raises:
        OSError: If the file cannot be opened.
    """
    cfg = None
    # Decode explicitly as UTF-8 so the parsed values do not depend on the
    # platform's default text encoding.
    with open(cfg_file, "r", encoding="utf-8") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            # Best-effort: report the parse error and fall through to
            # return None; callers must handle a None result.
            print(exc)

    return cfg

In [3]:
# ---------------- MinIOClient from minio_utils.py ----------------

class MinIOClient:
    """
    Small convenience wrapper around the ``minio.Minio`` client.

    The wrapper only stores connection settings; every helper method opens
    its own ``Minio`` client through :meth:`create_conn`.
    """

    def __init__(self, endpoint_url, access_key, secret_key, secure=False):
        """
        Store the connection settings.

        Args:
            endpoint_url: Endpoint passed to ``Minio(endpoint=...)``.
            access_key: Access key passed to ``Minio``.
            secret_key: Secret key passed to ``Minio``.
            secure: Passed through as ``Minio(secure=...)``. Defaults to
                ``False``, the value that was previously hard-coded.
        """
        self.endpoint_url = endpoint_url
        self.access_key = access_key
        self.secret_key = secret_key
        self.secure = secure

    def create_conn(self):
        """Build and return a new ``Minio`` client from the stored settings."""
        client = Minio(
            endpoint=self.endpoint_url,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=self.secure,
        )
        return client

    def create_bucket(self, bucket_name):
        """
        Create ``bucket_name`` unless it already exists.

        Prints a message saying whether the bucket was created or skipped.
        """
        client = self.create_conn()

        # Create bucket if not exist
        found = client.bucket_exists(bucket_name=bucket_name)
        if not found:
            client.make_bucket(bucket_name=bucket_name)
            print(f"Bucket {bucket_name} created successfully!")
        else:
            print(f"Bucket {bucket_name} already exists, skip creating!")

    def list_parquet_files(self, bucket_name, prefix=""):
        """
        Return the names of all ``.parquet`` objects in a bucket.

        Args:
            bucket_name: Bucket to list.
            prefix: Only objects whose name starts with this prefix are
                listed; the listing is recursive.

        Returns:
            A list of object names ending in ``.parquet``.
        """
        client = self.create_conn()

        # List all objects in the bucket with the given prefix
        objects = client.list_objects(bucket_name, prefix=prefix, recursive=True)
        # Filter and collect Parquet file names
        parquet_files = [obj.object_name for obj in objects if obj.object_name.endswith('.parquet')]

        return parquet_files

In [4]:
# Relative path of the datalake YAML configuration file.
CFG_FILE = "../config/datalake.yaml"
# Year labels 2020..2024 and zero-padded month labels 01..12, as strings.
YEARS = [str(year_number) for year_number in range(2020, 2025)]
MONTHS = [f"{month_number:02d}" for month_number in range(1, 13)]

In [6]:
cfg = load_cfg(CFG_FILE)
# Display the loaded config with credential values masked, so secrets are
# not written into the notebook's saved output. `cfg` itself is unchanged.
{
    section: (
        {
            key: "<redacted>" if key in ("access_key", "secret_key") else value
            for key, value in settings.items()
        }
        if isinstance(settings, dict)
        else settings
    )
    for section, settings in (cfg or {}).items()
}

{'nyc_data': {'folder_path': 'data'},
 'datalake': {'endpoint': 'localhost:9000',
  'bucket_name_1': 'raw',
  'bucket_name_2': 'processed',
  'bucket_name_3': 'sandbox',
  'folder_name': 'batch',
  'access_key': '<redacted>',
  'secret_key': '<redacted>'}}

In [14]:
def extract_load(cfg, years=("2024",)):
    """
    Upload local Parquet files to the first configured MinIO bucket.

    The bucket named by ``cfg["datalake"]["bucket_name_1"]`` is created if
    missing, then every ``*.parquet`` file found under
    ``<folder_path>/<year>/`` is uploaded for each requested year.

    Args:
        cfg: Parsed configuration. Reads ``cfg["datalake"]`` (``endpoint``,
            ``access_key``, ``secret_key``, ``bucket_name_1``) and
            ``cfg["nyc_data"]["folder_path"]``.
        years: Year sub-folder names to upload. A single string is accepted
            and treated as one year. Defaults to ``("2024",)``, the year
            that was previously hard-coded.

    Note:
        Objects are named by the file's base name only, so files with the
        same name in different year folders would overwrite each other.
    """
    datalake_cfg = cfg["datalake"]
    nyc_data_cfg = cfg["nyc_data"]
    bucket_name = datalake_cfg["bucket_name_1"]

    # A bare string would otherwise be iterated character by character.
    if isinstance(years, str):
        years = (years,)

    # Create MinIO client
    client = MinIOClient(
        endpoint_url=datalake_cfg["endpoint"],
        access_key=datalake_cfg["access_key"],
        secret_key=datalake_cfg["secret_key"],
    )

    client.create_bucket(bucket_name)

    # One connection serves every upload; there is no need to rebuild it
    # for each file.
    client_minio = client.create_conn()

    for year in years:
        # Sorted so the upload order (and the printed log) is deterministic.
        all_fps = sorted(
            glob(os.path.join(nyc_data_cfg["folder_path"], year, "*.parquet"))
        )
        for fp in all_fps:
            print(f"Uploading {fp} to MinIO...")
            client_minio.fput_object(
                bucket_name=bucket_name,
                object_name=os.path.basename(fp),
                file_path=fp,
            )
    

In [15]:
if __name__ == "__main__":
    print("Extracting and loading data to MinIO...")
    cfg = load_cfg(CFG_FILE)
    # load_cfg returns None when the YAML cannot be parsed; stop here with a
    # clear message instead of failing later on cfg["datalake"].
    if cfg is None:
        raise RuntimeError(f"Could not load a usable configuration from {CFG_FILE}")
    extract_load(cfg)

Extracting and loading data to MinIO...
Bucket raw created successfully!
Uploading ../data\2024\green_tripdata_2024-01.parquet to MinIO...
Uploading ../data\2024\yellow_tripdata_2024-01.parquet to MinIO...
