In [1]:
import sys
import os
import warnings
import logging


In [2]:
# from utils.helpers import load_cfg 
import yaml


def load_cfg(cfg_file):
    """
    Load configuration from a YAML config file
    """
    cfg = None
    with open(cfg_file, "r") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            print(exc)

    return cfg


In [3]:
# from utils.minio_utils import MinIOClient

from minio import Minio


class MinIOClient:
    def __init__(self, endpoint_url, access_key, secret_key):
        self.endpoint_url = endpoint_url
        self.access_key = access_key
        self.secret_key = secret_key

    def create_conn(self):
        client = Minio(
            endpoint=self.endpoint_url,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=False,
        )
        return client

    def create_bucket(self, bucket_name):
        client = self.create_conn()

        # Create bucket if not exist
        found = client.bucket_exists(bucket_name=bucket_name)
        if not found:
            client.make_bucket(bucket_name=bucket_name)
            print(f"Bucket {bucket_name} created successfully!")
        else:
            print(f"Bucket {bucket_name} already exists, skip creating!")

    def list_parquet_files(self, bucket_name, prefix=""):
        client = self.create_conn()

        # List all objects in the bucket with the given prefix
        objects = client.list_objects(bucket_name, prefix=prefix, recursive=True)
        # Filter and collect Parquet file names
        parquet_files = [
            obj.object_name for obj in objects if obj.object_name.endswith(".parquet")
        ]

        return parquet_files


In [4]:
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s:%(funcName)s:%(levelname)s:%(message)s')
warnings.filterwarnings('ignore')

In [5]:
__file__ = os.getcwd()
print(__file__) 

d:\BigData\MyProject\src\batch_processing


In [6]:
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
project_root

'd:\\BigData'

In [7]:
CFG_FILE = os.path.join(project_root, "MyProject/config", "datalake.yaml")

In [8]:
cfg = load_cfg(CFG_FILE)
datalake_cfg = cfg["datalake"]
datalake_cfg

{'endpoint': 'localhost:9000',
 'bucket_name_1': 'raw',
 'bucket_name_2': 'processed',
 'bucket_name_3': 'sandbox',
 'folder_name': 'batch',
 'access_key': '0VQBMtMhycuIrat2ivLH',
 'secret_key': 'xozBRG1AkxBkEnwN3JePy1BhhvHQGtE1sCAEmZeI'}

In [9]:
MINIO_ENDPOINT = datalake_cfg["endpoint"]
MINIO_ACCESS_KEY = datalake_cfg["access_key"]
MINIO_SECRET_KEY = datalake_cfg["secret_key"]
BUCKET_NAME_2 = datalake_cfg['bucket_name_2']
BUCKET_NAME_3 = datalake_cfg['bucket_name_3']

In [10]:
MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, BUCKET_NAME_2, BUCKET_NAME_3

('localhost:9000',
 '0VQBMtMhycuIrat2ivLH',
 'xozBRG1AkxBkEnwN3JePy1BhhvHQGtE1sCAEmZeI',
 'processed',
 'sandbox')

In [53]:
###############################################
# PySpark
###############################################

def delta_convert(endpoint_url, access_key, secret_key):
    """
        Convert parquet file to delta format
    """
    from pyspark.sql import SparkSession
    from delta.pip_utils import configure_spark_with_delta_pip
    
    jars = "../../../jars/hadoop-aws-3.3.4.jar,../../../jars/aws-java-sdk-bundle-1.12.262.jar"
    
    builder = SparkSession.builder \
            .appName("DeltaConvert") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .config("spark.hadoop.fs.s3a.access.key", access_key) \
            .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
            .config("spark.hadoop.fs.s3a.endpoint", endpoint_url) \
            .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
            .config("spark.hadoop.fs.s3a.path.style.access", "true") \
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
            .config("spark.jars", jars)
    
    spark = configure_spark_with_delta_pip(
        builder,
        extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
    ).getOrCreate()
    
    logging.info('Spark session successfully created!')
    
    client = MinIOClient(
        endpoint_url=endpoint_url,
        access_key=access_key,
        secret_key=secret_key
    )
    client.create_bucket(BUCKET_NAME_3)
    
    # Convert to delta format
    for file in client.list_parquet_files(bucket_name=BUCKET_NAME_2):
        df = spark.read.parquet(f"s3a://{BUCKET_NAME_2}/{file}")
        df.write.format("delta") \
                .mode("overwrite") \
                .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")
        logging.info(f"File {file} converted to delta format!")

In [None]:
delta_convert(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY)

2024-12-16 23:41:16,127:delta_convert:INFO:Spark session successfully created!


Bucket sandbox created successfully!


In [12]:
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip
    
jars = "../../../jars/hadoop-aws-3.3.4.jar,../../../jars/aws-java-sdk-bundle-1.12.262.jar"

builder = SparkSession.builder \
        .appName("DeltaConvert") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY) \
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY) \
        .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT) \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.jars", jars)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
).getOrCreate()

logging.info('Spark session successfully created!')

client = MinIOClient(
    endpoint_url=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY
)
client.create_bucket(BUCKET_NAME_3)

2024-12-17 09:23:39,310:<module>:INFO:Spark session successfully created!


Bucket sandbox already exists, skip creating!


In [None]:
for file in client.list_parquet_files(bucket_name=BUCKET_NAME_2):
    df = spark.read.parquet(f"s3a://{BUCKET_NAME_2}/{file}")
    df.write.format("delta") \
            .mode("overwrite") \
            .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")
    logging.info(f"File {file} converted to delta format!")

In [13]:
list_files = client.list_parquet_files(bucket_name=BUCKET_NAME_2)

In [54]:
file = list_files[1]

In [55]:
file

'2023/yellow_tripdata_2023-01.parquet'

In [56]:
f"s3://{BUCKET_NAME_2}"

's3://processed'

In [57]:
f"s3://{BUCKET_NAME_2}/{file}"

's3://processed/2023/yellow_tripdata_2023-01.parquet'

In [58]:
file_path = f"s3a://{BUCKET_NAME_2}/" + file

In [59]:
df = spark.read.parquet(file_path)

In [60]:
df.show(10)

+--------------------+------------+-------------------+------------------+-------------------+-----+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----------+------------------+----------+------------+------------+-------------+--------+
|congestion_surcharge|dolocationid|   dropoff_datetime|  dropoff_latitude|  dropoff_longitude|extra|fare_amount|improvement_surcharge|mta_tax|passenger_count|payment_type|    pickup_datetime|pickup_latitude|  pickup_longitude|pulocationid|ratecodeid|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|vendorid|
+--------------------+------------+-------------------+------------------+-------------------+-----+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----------+------------------+----------+------------+------------+-------------+--------+
|         

In [65]:
df.count()

2733522

In [66]:
df.write.format("delta") \
        .mode("overwrite") \
        .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")

In [47]:
import pandas as pd

In [63]:
test_df = pd.read_parquet('./part-00002-73de2297-9823-488e-90d4-61b970e49e28-c000.snappy.parquet')

In [64]:
test_df

Unnamed: 0,congestion_surcharge,dolocationid,dropoff_datetime,dropoff_latitude,dropoff_longitude,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_latitude,pickup_longitude,pulocationid,ratecodeid,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,vendorid
0,2.5,141,2023-01-01 00:40:36,37.742295,-122.465106,1.00,9.3,1.0,0.5,1.0,...,30.302121,-81.619652,161,1.0,N,0.00,0.0,14.30,0.97,2
1,2.5,237,2023-01-01 01:01:27,-35.023639,138.676741,1.00,7.9,1.0,0.5,1.0,...,40.782773,-73.965363,43,1.0,N,4.00,0.0,16.90,1.10,2
2,2.5,238,2023-01-01 00:37:49,-35.023508,138.676646,1.00,14.9,1.0,0.5,1.0,...,36.103413,-84.131863,48,1.0,N,15.00,0.0,34.90,2.51,2
3,0.0,7,2023-01-01 00:13:25,46.188201,-123.831980,7.25,12.1,1.0,0.5,0.0,...,40.775714,-73.873364,138,1.0,N,0.00,0.0,20.85,1.90,1
4,2.5,79,2023-01-01 00:21:19,40.729269,-73.987361,1.00,11.4,1.0,0.5,1.0,...,30.047424,-90.689813,107,1.0,N,3.28,0.0,19.68,1.43,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048571,2.5,261,2023-01-13 17:07:31,40.711900,-74.012527,2.50,51.3,1.0,0.5,1.0,...,-35.015300,138.635570,236,1.0,N,17.34,0.0,75.14,7.83,2
1048572,2.5,161,2023-01-13 16:36:12,30.302121,-81.619652,2.50,11.4,1.0,0.5,1.0,...,-35.023639,138.676741,237,1.0,N,5.37,0.0,23.27,1.07,2
1048573,2.5,164,2023-01-13 16:56:01,40.749842,-73.984251,2.50,7.2,1.0,0.5,4.0,...,30.302121,-81.619652,161,1.0,N,2.74,0.0,16.44,0.69,2
1048574,2.5,100,2023-01-13 16:22:43,40.753694,-73.990517,2.50,13.5,1.0,0.5,1.0,...,40.734186,-74.005580,249,1.0,N,4.00,0.0,24.00,1.28,2


In [1]:
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip
    
jars = "../../../jars/hadoop-aws-3.3.4.jar,../../../jars/aws-java-sdk-bundle-1.12.262.jar"

builder = SparkSession.builder \
        .appName("DeltaConvert") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provide+r", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.jars", jars)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
).getOrCreate()