In [1]:
import sys
import os
import warnings
import logging


In [2]:
# from utils.helpers import load_cfg 
import yaml


def load_cfg(cfg_file):
    """
    Load configuration from a YAML config file
    """
    cfg = None
    with open(cfg_file, "r") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            print(exc)

    return cfg


In [3]:
# from utils.minio_utils import MinIOClient

from minio import Minio


class MinIOClient:
    def __init__(self, endpoint_url, access_key, secret_key):
        self.endpoint_url = endpoint_url
        self.access_key = access_key
        self.secret_key = secret_key

    def create_conn(self):
        client = Minio(
            endpoint=self.endpoint_url,
            access_key=self.access_key,
            secret_key=self.secret_key,
            secure=False,
        )
        return client

    def create_bucket(self, bucket_name):
        client = self.create_conn()

        # Create bucket if not exist
        found = client.bucket_exists(bucket_name=bucket_name)
        if not found:
            client.make_bucket(bucket_name=bucket_name)
            print(f"Bucket {bucket_name} created successfully!")
        else:
            print(f"Bucket {bucket_name} already exists, skip creating!")

    def list_parquet_files(self, bucket_name, prefix=""):
        client = self.create_conn()

        # List all objects in the bucket with the given prefix
        objects = client.list_objects(bucket_name, prefix=prefix, recursive=True)
        # Filter and collect Parquet file names
        parquet_files = [
            obj.object_name for obj in objects if obj.object_name.endswith(".parquet")
        ]

        return parquet_files


In [4]:
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s:%(funcName)s:%(levelname)s:%(message)s')
warnings.filterwarnings('ignore')

In [5]:
__file__ = os.getcwd()
print(__file__) 

e:\BigData_2\MyProject\src\batch_processing


In [6]:
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
project_root

'e:\\BigData_2'

In [7]:
CFG_FILE = os.path.join(project_root, "MyProject/config", "datalake.yaml")

In [8]:
cfg = load_cfg(CFG_FILE)
datalake_cfg = cfg["datalake"]
datalake_cfg

{'endpoint': 'localhost:9000',
 'bucket_name_1': 'raw',
 'bucket_name_2': 'processed',
 'bucket_name_3': 'sandbox',
 'folder_name': 'batch',
 'access_key': 'Xs27nx9M4HgPQ5PXZiUE',
 'secret_key': '8iifKZlUZh1NRbepsISUMdg1CxlaIC6OSPQk5X59'}

In [9]:
MINIO_ENDPOINT = datalake_cfg["endpoint"]
MINIO_ACCESS_KEY = datalake_cfg["access_key"]
MINIO_SECRET_KEY = datalake_cfg["secret_key"]
BUCKET_NAME_2 = datalake_cfg['bucket_name_2']
BUCKET_NAME_3 = datalake_cfg['bucket_name_3']

In [10]:
MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY, BUCKET_NAME_2, BUCKET_NAME_3

('localhost:9000',
 'Xs27nx9M4HgPQ5PXZiUE',
 '8iifKZlUZh1NRbepsISUMdg1CxlaIC6OSPQk5X59',
 'processed',
 'sandbox')

In [11]:
JARS_DIR = os.path.join(project_root, "MyProject/jars")
JARS_DIR

'e:\\BigData_2\\MyProject/jars'

In [12]:
jars =  [ JARS_DIR + "/hadoop-aws-3.3.4.jar", 
          JARS_DIR + "/aws-java-sdk-bundle-1.12.262.jar", 
          # JARS_DIR + "/gcs-connector-hadoop3-latest.jar",
          # JARS_DIR + "/spark-bigquery-latest_2.12.jar"
        ]

In [13]:
','.join(jars)

'e:\\BigData_2\\MyProject/jars/hadoop-aws-3.3.4.jar,e:\\BigData_2\\MyProject/jars/aws-java-sdk-bundle-1.12.262.jar'

In [14]:
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip
jars = "../../jars/hadoop-aws-3.3.4.jar,../../jars/aws-java-sdk-bundle-1.12.262.jar"

builder = SparkSession.builder \
        .appName("DeltaConvert") \
        .config("spark.executor.memory", '2g') \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY) \
        .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY) \
        .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT) \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.jars", jars)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
).getOrCreate()

logging.info('Spark session successfully created!')

client = MinIOClient(
    endpoint_url=MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY
)
client.create_bucket(BUCKET_NAME_3)

2024-12-24 21:34:35,705:<module>:INFO:Spark session successfully created!


Bucket sandbox already exists, skip creating!


In [15]:
# for file in client.list_parquet_files(bucket_name=BUCKET_NAME_2):
#     df = spark.read.parquet(f"s3a://{BUCKET_NAME_2}/{file}")
#     df.write.format("delta") \
#             .mode("overwrite") \
#             .save(f"s3a://{BUCKET_NAME_3}/{datalake_cfg["folder_name"]}")
#     logging.info(f"File {file} converted to delta format!")

In [15]:
list_files = client.list_parquet_files(bucket_name=BUCKET_NAME_2)

In [16]:
file = list_files[1]

In [17]:
file

'2023/yellow_tripdata_2023-01.parquet'

In [20]:
f"s3://{BUCKET_NAME_2}"

's3://processed'

In [21]:
f"s3://{BUCKET_NAME_2}/{file}"

's3://processed/2023/yellow_tripdata_2023-01.parquet'

In [22]:
file_path = f"s3a://{BUCKET_NAME_2}/" + file

In [23]:
file_path

's3a://processed/2023/yellow_tripdata_2023-01.parquet'

In [24]:
df = spark.read.parquet(file_path)

In [25]:
df.show(5)

+--------------------+------------+-------------------+----------------+-----------------+-----+-----------+--------------+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----+----------+------------------+----------+------------+------------+-------------+--------+
|congestion_surcharge|dolocationid|   dropoff_datetime|dropoff_latitude|dropoff_longitude|extra|fare_amount|            id|id_customer|improvement_surcharge|mta_tax|passenger_count|payment_type|    pickup_datetime|pickup_latitude|  pickup_longitude|pulocationid|rate|ratecodeid|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|vendorid|
+--------------------+------------+-------------------+----------------+-----------------+-----+-----------+--------------+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----+----------+------

In [27]:
df_t_2 = spark.read.csv("s3a://raw/taxi_lookup.csv", header=True)

In [28]:
df_t_2.show(10)

+----------+-------------+--------------+------------+-----------+------------------+
|LocationID|      Borough|          zone|service_zone|   latitude|         longitude|
+----------+-------------+--------------+------------+-----------+------------------+
|         1|          EWR|Newark Airport|         EWR|40.68906405|-74.17725485035348|
|         2|       Queens|   Jamaica Bay|   Boro Zone| 40.6039936|       -73.8354124|
|         3|        Bronx|      Allerton|   Boro Zone|  39.915319|-87.93321507559128|
|         4|    Manhattan| Alphabet City| Yellow Zone| 40.7251022|       -73.9795833|
|         5|Staten Island| Arden Heights|   Boro Zone| 53.2843196|         -7.492801|
|         6|Staten Island|      Arrochar|   Boro Zone| 56.1954653|        -4.7480746|
|         7|       Queens|       Astoria|   Boro Zone| 46.1882007|      -123.8319802|
|         8|       Queens|  Astoria Park|   Boro Zone| 33.8241622|       -78.8918751|
|         9|       Queens|    Auburndale|   Boro Zone|

In [28]:
df.count()

2733522

In [None]:
df.write.format("delta") \
        .mode("append") \
        .save(f"s3a://{BUCKET_NAME_3}/test")

In [26]:
test_df = spark.read.format("delta") \
            .load(f"s3a://raw/cdc_db/2024/01/02/data")

In [27]:
test_df.show(10)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+---------+--------------------+--------------+-----------+----+
|vendorid|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|ratecodeid|pulocationid|dolocationid|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|            id|id_customer|rate|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------------------+------------+------------+---------+--------------------+--------------+-----------+----+
|       2| 2023-12-01 07:46:55|  2023-12-01 07:58:25|                 N|       1.0|         236| 

In [18]:
test_df_3 = spark.read.format("delta").load(f"s3a://processed/data/yellow/2024/01/02")


In [19]:
test_df_3.show(10)

+--------------------+------------+-------------------+------------------+------------------+-----+-----------+----+---------------------+-------+---------------+------------+-------------------+------------------+------------------+------------+----------+----------+------------+------------+-------------+--------+
|congestion_surcharge|dolocationid|   dropoff_datetime|  dropoff_latitude| dropoff_longitude|extra|fare_amount| fee|improvement_surcharge|mta_tax|passenger_count|payment_type|    pickup_datetime|   pickup_latitude|  pickup_longitude|pulocationid|ratecodeid|tip_amount|tolls_amount|total_amount|trip_distance|vendorid|
+--------------------+------------+-------------------+------------------+------------------+-----+-----------+----+---------------------+-------+---------------+------------+-------------------+------------------+------------------+------------+----------+----------+------------+------------+-------------+--------+
|                 2.5|         170|2024-01-02 

In [20]:
test_df_4 = spark.read.format("delta").load(f"s3a://processed/data/green/2024/01/02")

In [21]:
test_df_4.show(10)

+--------------------+------------+-------------------+----------------+-----------------+-----+-----------+--------------+-----------+---------------------+-------+---------------+------------+-------------------+---------------+----------------+------------+----+----------+----------+------------+------------+-------------+--------+
|congestion_surcharge|dolocationid|   dropoff_datetime|dropoff_latitude|dropoff_longitude|extra|fare_amount|            id|id_customer|improvement_surcharge|mta_tax|passenger_count|payment_type|    pickup_datetime|pickup_latitude|pickup_longitude|pulocationid|rate|ratecodeid|tip_amount|tolls_amount|total_amount|trip_distance|vendorid|
+--------------------+------------+-------------------+----------------+-----------------+-----+-----------+--------------+-----------+---------------------+-------+---------------+------------+-------------------+---------------+----------------+------------+----+----------+----------+------------+------------+-------------

In [24]:
spark.stop()

In [22]:
test_df_3.printSchema()

root
 |-- congestion_surcharge: double (nullable = true)
 |-- dolocationid: integer (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- extra: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- ratecodeid: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- vendorid: integer (nullable = true)



In [23]:
test_df_4.printSchema()

root
 |-- congestion_surcharge: double (nullable = true)
 |-- dolocationid: integer (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- extra: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- id: string (nullable = true)
 |-- id_customer: long (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- rate: double (nullable = true)
 |-- ratecodeid: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- trip_dis

In [36]:
df.write.format("parquet") \
    .mode("append") \
    .save(f"s3a://{BUCKET_NAME_3}/test_2/test_2.parquet")

In [38]:
test_df_2 = spark.read.format("parquet") \
                .load(f"s3a://{BUCKET_NAME_3}/test_2/test_2.parquet")

In [40]:
test_df_2.show(10)

+--------------------+------------+-------------------+------------------+-------------------+-----+-----------+--------------+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----+----------+------------------+----------+------------+------------+-------------+--------+
|congestion_surcharge|dolocationid|   dropoff_datetime|  dropoff_latitude|  dropoff_longitude|extra|fare_amount|            id|id_customer|improvement_surcharge|mta_tax|passenger_count|payment_type|    pickup_datetime|pickup_latitude|  pickup_longitude|pulocationid|rate|ratecodeid|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|vendorid|
+--------------------+------------+-------------------+------------------+-------------------+-----+-----------+--------------+-----------+---------------------+-------+---------------+------------+-------------------+---------------+------------------+------------+----+-----

In [42]:
test_df_2.count()


2733522

In [None]:
test_df

In [1]:
from pyspark.sql import SparkSession
from delta.pip_utils import configure_spark_with_delta_pip
    
jars = "../../../jars/hadoop-aws-3.3.4.jar,../../../jars/aws-java-sdk-bundle-1.12.262.jar"

builder = SparkSession.builder \
        .appName("DeltaConvert") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provide+r", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.jars", jars)

spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["org.apache.hadoop:hadoop-aws:3.3.4"]
).getOrCreate()