In [1]:
import sys
import os
import warnings
import logging


In [2]:
# from utils.helpers import load_cfg 
import yaml


def load_cfg(cfg_file):
    """
    Load configuration from a YAML config file
    """
    cfg = None
    with open(cfg_file, "r") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            print(exc)

    return cfg


In [3]:
# from utils.minio_utils import MinIOClient

from minio import Minio


class MinIOClient:
    def __init__(self, endpoint_url, access_key, secret_key):
        self.endpoint_url = endpoint_url
        self.access_key = access_key
        self.secret_key = secret_key

    def create_conn(self):
        client = Minio(
            endpoint=self.endpoint_url,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=False,
        )
        return client

    def create_bucket(self, bucket_name):
        client = self.create_conn()

        # Create bucket if not exist
        found = client.bucket_exists(bucket_name=bucket_name)
        if not found:
            client.make_bucket(bucket_name=bucket_name)
            print(f"Bucket {bucket_name} created successfully!")
        else:
            print(f"Bucket {bucket_name} already exists, skip creating!")

    def list_parquet_files(self, bucket_name, prefix=""):
        client = self.create_conn()

        # List all objects in the bucket with the given prefix
        objects = client.list_objects(bucket_name, prefix=prefix, recursive=True)
        # Filter and collect Parquet file names
        parquet_files = [
            obj.object_name for obj in objects if obj.object_name.endswith(".parquet")
        ]

        return parquet_files


In [4]:
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s:%(funcName)s:%(levelname)s:%(message)s')
warnings.filterwarnings('ignore')

In [5]:
__file__ = os.getcwd()
print(__file__) 

e:\BigData_2\MyProject\src\batch_processing


In [6]:
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
project_root

'e:\\BigData_2'

In [7]:
CFG_FILE = os.path.join(project_root, "MyProject/config", "datalake.yaml")

In [8]:
cfg = load_cfg(CFG_FILE)
datalake_cfg = cfg["datalake"]
datalake_cfg

{'endpoint': 'localhost:9000',
 'bucket_name_1': 'raw',
 'bucket_name_2': 'processed',
 'bucket_name_3': 'sandbox',
 'folder_name': 'batch',
 'access_key': 'Xs27nx9M4HgPQ5PXZiUE',
 'secret_key': '8iifKZlUZh1NRbepsISUMdg1CxlaIC6OSPQk5X59'}

In [9]:
MINIO_ENDPOINT = datalake_cfg["endpoint"]
MINIO_ACCESS_KEY = datalake_cfg["access_key"]
MINIO_SECRET_KEY = datalake_cfg["secret_key"]
BUCKET_NAME_2 = datalake_cfg['bucket_name_2']
BUCKET_NAME_3 = datalake_cfg['bucket_name_3']

In [10]:
MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, BUCKET_NAME_2, BUCKET_NAME_3

('localhost:9000',
 'Xs27nx9M4HgPQ5PXZiUE',
 '8iifKZlUZh1NRbepsISUMdg1CxlaIC6OSPQk5X59',
 'processed',
 'sandbox')

In [11]:
JARS_DIR = os.path.join(project_root, "MyProject/jars")
JARS_DIR

'e:\\BigData_2\\MyProject/jars'

In [12]:
jars =  [ JARS_DIR + "/hadoop-aws-3.3.4.jar", JARS_DIR + "/aws-java-sdk-bundle-1.12.262.jar", 
             ]

In [None]:
','.join(jars)

In [15]:
###############################################
# PySpark
###############################################

def delta_convert(endpoint_url, access_key, secret_key):
    """
        Convert parquet file to delta format
    """
    from pyspark.sql import SparkSession
    from delta.pip_utils import configure_spark_with_delta_pip
    
    # jars =  [ JARS_DIR + "/hadoop-aws-3.3.4.jar", JARS_DIR + "/aws-java-sdk-bundle-1.12.262.jar", 
    #          ]
    jars = "../../jars/hadoop-aws-3.3.4.jar,../../jars/aws-java-sdk-bundle-1.12.262.jar"
            
    builder = SparkSession.builder \
            .appName("DeltaConvert") \
            .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
            .config("spark.hadoop.fs.s3a.access.key", access_key) \
            .config("spark.hadoop.fs.s3a.secret.key", secret_key) \
            .config("spark.hadoop.fs.s3a.endpoint", endpoint_url) \
            .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
            .config("spark.hadoop.fs.s3a.path.style.access", "true") \
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
            .config("spark.jars", jars)
    
    spark = configure_spark_with_delta_pip(
        builder,
    ).getOrCreate()
    
    logging.info('Spark session successfully created!')
    
    client = MinIOClient(
        endpoint_url=endpoint_url,
        access_key=access_key,
        secret_key=secret_key
    )
    client.create_bucket(BUCKET_NAME_3)
    
    # Convert to delta format
    for file in client.list_parquet_files(bucket_name=BUCKET_NAME_2):
        if "yellow" in file:
            df = spark.read.parquet(f"s3a://{BUCKET_NAME_2}/{file}")
            df.write.format("delta") \
                    .mode("overwrite") \
                    .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")
            logging.info(f"File {file} converted to delta format!")

In [None]:
delta_convert(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY)

In [13]:
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip
jars = "../../jars/hadoop-aws-3.3.4.jar,../../jars/aws-java-sdk-bundle-1.12.262.jar"

builder = SparkSession.builder \
        .appName("DeltaConvert") \
        .config("spark.executor.memory", '2g') \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY) \
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY) \
        .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT) \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.jars", jars)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
).getOrCreate()

logging.info('Spark session successfully created!')

client = MinIOClient(
    endpoint_url=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY
)
client.create_bucket(BUCKET_NAME_3)

2024-12-22 15:57:39,559:<module>:INFO:Spark session successfully created!


Bucket sandbox already exists, skip creating!


In [14]:
# for file in client.list_parquet_files(bucket_name=BUCKET_NAME_2):
#     df = spark.read.parquet(f"s3a://{BUCKET_NAME_2}/{file}")
#     df.write.format("delta") \
#             .mode("overwrite") \
#             .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")
#     logging.info(f"File {file} converted to delta format!")

In [14]:
list_files = client.list_parquet_files(bucket_name=BUCKET_NAME_2)

In [15]:
file = list_files[1]

In [16]:
file

'2023/yellow_tripdata_2023-01.parquet'

In [17]:
f"s3://{BUCKET_NAME_2}"

's3://processed'

In [18]:
f"s3://{BUCKET_NAME_2}/{file}"

's3://processed/2023/yellow_tripdata_2023-01.parquet'

In [19]:
file_path = f"s3a://{BUCKET_NAME_2}/" + file

In [20]:
df = spark.read.parquet(file_path)

In [21]:
df.show(10)

+--------------------+------------+-------------------+------------------+-------------------+-----+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----------+------------------+----------+------------+------------+-------------+--------+
|congestion_surcharge|dolocationid|   dropoff_datetime|  dropoff_latitude|  dropoff_longitude|extra|fare_amount|improvement_surcharge|mta_tax|passenger_count|payment_type|    pickup_datetime|pickup_latitude|  pickup_longitude|pulocationid|ratecodeid|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|vendorid|
+--------------------+------------+-------------------+------------------+-------------------+-----+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----------+------------------+----------+------------+------------+-------------+--------+
|         

In [22]:
df.count()

2733522

In [23]:
df.write.format("delta") \
        .mode("overwrite") \
        .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")

In [47]:
import pandas as pd

In [63]:
test_df = pd.read_parquet('./part-00002-73de2297-9823-488e-90d4-61b970e49e28-c000.snappy.parquet')

In [None]:
test_df

In [None]:
DATE = '2024-09-09'
YEAR = DATE.split('-')[0]
MONTH = DATE.split('-')[1]
DAY = DATE.split('-')[2]

path = f"s3a://{BUCKET_NAME}/{YEAR}/{TAXI_TYPE}/{MONTH}/{DAY}.parquet"
root_folder = 
for root, dirs, files in os.walk(root_folder):
        print(f'Đang duyệt thư mục: {root}')
        
        # In ra các thư mục con
        if dirs:
            print("Các thư mục con:")
            for dir in dirs:
                print(f"- {dir}")

In [1]:
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip
    
jars = "../../../jars/hadoop-aws-3.3.4.jar,../../../jars/aws-java-sdk-bundle-1.12.262.jar"

builder = SparkSession.builder \
        .appName("DeltaConvert") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provide+r", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.jars", jars)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
).getOrCreate()