# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [12]:

%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

%%configure
{
    "--conf": "spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false",
    "--enable-glue-datacatalog" :"true",
    "--datalake-formats":"hudi"
}

You are already connected to a glueetl session c78fb162-241e-4e1b-b6e0-3a332def3078.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 2880 minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session c78fb162-241e-4e1b-b6e0-3a332def3078.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 4.0


You are already connected to a glueetl session c78fb162-241e-4e1b-b6e0-3a332def3078.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: G.1X
Setting new worker type to: G.1X


You are already connected to a glueetl session c78fb162-241e-4e1b-b6e0-3a332def3078.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: 5
Setting new number of workers to: 5


You are already connected to a glueetl session c78fb162-241e-4e1b-b6e0-3a332def3078.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


The following configurations have been updated: {'--conf': 'spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false', '--enable-glue-datacatalog': 'true', '--datalake-formats': 'hudi'}


####  Run this cell to set up and start your interactive session.


In [5]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Obtain the SparkContext for this session (existing one is reused if present).
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession handle taken from the GlueContext; used by the spark.sql(...) cells below.
spark = glueContext.spark_session
# Job object built on the GlueContext.
# NOTE(review): job.init(...) / job.commit() are not called in the visible cells — confirm whether they are needed.
job = Job(glueContext)




In [6]:
# List the databases visible to the Spark session before any Hudi write happens.
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
|  default|
|  hudidb1|
|  hudidb2|
|  hudidb3|
+---------+


In [7]:
# Best-effort import cell: a missing module is reported with a printed message
# instead of stopping the notebook. Only ImportError (which includes
# ModuleNotFoundError) is handled, so any other failure raised while importing
# surfaces with its real traceback instead of being reported as a missing module.
try:
    import os
    import sys


    import pyspark
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, asc, desc
    from awsglue.utils import getResolvedOptions
    from awsglue.dynamicframe import DynamicFrame
    from awsglue.context import GlueContext

    # Imported last so that, when it is absent, every name above is already bound.
    from faker import Faker

except ImportError as e:
    print("Some modules are missing {} ".format(e))

else:
    # Reached only when every import above succeeded.
    print("All modules are loaded .....")

Some modules are missing No module named 'faker'


In [20]:
# Database the Hudi table is synced into (passed as hive_sync.database).
database_name1 = "hudidb7"
# Hudi table name; also the last path segment of the table location.
table_name = "hudi_table"
# Bucket root under which the table directory lives.
base_s3_path = "s3a://test-ramneek-2"
# Full table location: <base_s3_path>/<table_name>.
final_base_path = "/".join((base_s3_path, table_name))




In [21]:
class DataGenerator:
    """Source of the hand-written employee rows used to seed the table."""

    @staticmethod
    def get_data():
        """Return the fixed sample rows as a new list of 8-element tuples.

        Each tuple is laid out to match the column list used when the
        DataFrame is built:
        (emp_id, employee_name, department, state, salary, age, bonus, ts).
        """
        sample_rows = (
            (1, "Alice Johnson", "IT", "CA", 120000, 30, 5000, 1677624870),
            (2, "Bob Smith", "HR", "NY", 90000, 40, 7000, 1677624871),
            (3, "Charlie Lee", "Sales", "TX", 110000, 35, 8000, 1677624872),
            (4, "David Brown", "Marketing", "FL", 95000, 29, 4000, 1677624873),
            (5, "Eve Davis", "IT", "IL", 105000, 32, 6000, 1677624874),
        )
        # Hand back a list so callers get the same container type as before.
        return list(sample_rows)




In [22]:
def create_spark_session():
    """Return a SparkSession obtained from a builder that sets the Kryo serializer.

    The session comes from ``getOrCreate()`` on the builder.
    """
    builder = SparkSession.builder.config(
        'spark.serializer', 'org.apache.spark.serializer.KryoSerializer'
    )
    return builder.getOrCreate()


# Rebind the notebook-level handles from the session returned by create_spark_session().
# These replace the spark / sc / glueContext names assigned in the earlier setup cell.
spark = create_spark_session()
sc = spark.sparkContext
glueContext = GlueContext(sc)




In [23]:
# Options passed to every Hudi write in this notebook (initial write, append, update).
hudi_options = {
    # --- table identity and write behaviour ---
    'hoodie.table.name': table_name,
    # Canonical key for the table type; 'hoodie.datasource.write.storage.type'
    # is its deprecated alias.
    'hoodie.datasource.write.table.type': 'MERGE_ON_READ',
    # emp_id identifies a record; ts is the precombine (ordering) field.
    'hoodie.datasource.write.recordkey.field': 'emp_id',
    'hoodie.datasource.write.table.name': table_name,
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.precombine.field': 'ts',

    # --- catalog sync: register the table in database_name1 via the metastore (hms) mode ---
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.sync_as_datasource': 'false',
    'hoodie.datasource.hive_sync.database': database_name1,
    'hoodie.datasource.hive_sync.table': table_name,
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    # NOTE(review): no partition path field is configured here, yet a multi-part
    # partition extractor and hive-style partitioning are set — confirm these are intended.
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
    'hoodie.datasource.write.hive_style_partitioning': 'true',

}




In [24]:
# Fetch the hand-written sample rows.
data = DataGenerator.get_data()

# Column names, in the same order as the fields of each row tuple.
columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts"]
# Build the DataFrame for the initial Hudi write; only column names are supplied as the schema.
df = spark.createDataFrame(data=data, schema=columns)




In [25]:
# Preview the rows before they are written to the Hudi table.
df.show()

+------+-------------+----------+-----+------+---+-----+----------+
|emp_id|employee_name|department|state|salary|age|bonus|        ts|
+------+-------------+----------+-----+------+---+-----+----------+
|     1|Alice Johnson|        IT|   CA|120000| 30| 5000|1677624870|
|     2|    Bob Smith|        HR|   NY| 90000| 40| 7000|1677624871|
|     3|  Charlie Lee|     Sales|   TX|110000| 35| 8000|1677624872|
|     4|  David Brown| Marketing|   FL| 95000| 29| 4000|1677624873|
|     5|    Eve Davis|        IT|   IL|105000| 32| 6000|1677624874|
+------+-------------+----------+-----+------+---+-----+----------+


In [26]:
# Initial write of the table to final_base_path using hudi_options.
# mode("overwrite") is used only for this first write; the later cells write with mode("append").
df.write.format("hudi").options(**hudi_options).mode("overwrite").save(final_base_path)




In [27]:
# List the databases again after the write, to check that the sync database is present.
spark.sql("show databases;").show()


+---------+
|namespace|
+---------+
|  default|
|  hudidb1|
|  hudidb2|
|  hudidb3|
|  hudidb7|
+---------+


In [28]:
# Make the database the Hudi table was synced into the current database.
# The name comes from database_name1 (the same value passed as hive_sync.database)
# rather than a hard-coded copy, so this cell follows the configuration cell.
spark.sql("use {};".format(database_name1)).show()

++
||
++
++


In [29]:
# List the tables in the current database; the output shows a "_ro" and a "_rt" table for the Hudi table.
spark.sql("show tables;").show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  hudidb7|hudi_table_ro|      false|
|  hudidb7|hudi_table_rt|      false|
+---------+-------------+-----------+


In [30]:
# Query the "_rt" table that the sync registered for this table.
# The table name is derived from table_name instead of being hard-coded, so the
# query follows the configuration cell. It relies on the current database having
# been set by the preceding "use" cell.
spark.sql("select * from {}_rt;".format(table_name)).show()

+-------------------+--------------------+------------------+----------------------+--------------------+------+-------------+----------+-----+------+---+-----+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|emp_id|employee_name|department|state|salary|age|bonus|        ts|
+-------------------+--------------------+------------------+----------------------+--------------------+------+-------------+----------+-----+------+---+-----+----------+
|  20240927075321357|20240927075321357...|                 1|                      |25309ccf-dec5-4ae...|     1|Alice Johnson|        IT|   CA|120000| 30| 5000|1677624870|
|  20240927075321357|20240927075321357...|                 5|                      |25309ccf-dec5-4ae...|     5|    Eve Davis|        IT|   IL|105000| 32| 6000|1677624874|
|  20240927075321357|20240927075321357...|                 3|                      |25309ccf-dec5-4ae...|     3|  Charlie Lee|     Sales|   

In [32]:
# Hard-coded path of one base parquet file under the table location.
# The file name embeds the commit time of this particular run (20240927075321357),
# so it must be updated by hand after a fresh run.
# NOTE(review): this uses the "s3://" scheme while base_s3_path uses "s3a://" — confirm that is intended.
s3_parquet_path = "s3://test-ramneek-2/hudi_table/25309ccf-dec5-4ae1-8f39-f6391e564bcb-0_0-84-348_20240927075321357.parquet"

# Read the parquet file directly with the plain parquet reader (not through the Hudi format).
df1 = spark.read.parquet(s3_parquet_path)




In [33]:
# Show the rows stored in that parquet file, including the _hoodie_* metadata columns.
df1.show()

+-------------------+--------------------+------------------+----------------------+--------------------+------+-------------+----------+-----+------+---+-----+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|emp_id|employee_name|department|state|salary|age|bonus|        ts|
+-------------------+--------------------+------------------+----------------------+--------------------+------+-------------+----------+-----+------+---+-----+----------+
|  20240927075321357|20240927075321357...|                 1|                      |25309ccf-dec5-4ae...|     1|Alice Johnson|        IT|   CA|120000| 30| 5000|1677624870|
|  20240927075321357|20240927075321357...|                 5|                      |25309ccf-dec5-4ae...|     5|    Eve Davis|        IT|   IL|105000| 32| 6000|1677624874|
|  20240927075321357|20240927075321357...|                 3|                      |25309ccf-dec5-4ae...|     3|  Charlie Lee|     Sales|   

In [34]:
# Two rows with record keys (emp_id 6 and 7) that are not in the initial data set.
impleDataUpd = [
    (6, "This is APPEND", "Sales", "RJ", 81000, 30, 23000, 827307999),
    (7, "This is APPEND", "Engineering", "RJ", 79000, 53, 15000, 1627694678),
]

# Same column list as the initial DataFrame.
columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts"]
usr_up_df = spark.createDataFrame(data=impleDataUpd, schema=columns)
# Write to the existing table: mode("append") with the 'upsert' operation set in hudi_options.
usr_up_df.write.format("hudi").options(**hudi_options).mode("append").save(final_base_path)




In [35]:
# After the append, one more parquet file appeared in S3. Because that write only added
# new records to the dataset, no delta log file has been generated up to this step.

# New base file: 25309ccf-dec5-4ae1-8f39-f6391e564bcb-0_0-122-497_20240927080316210.parquet

# Hard-coded path of that new file; the name is specific to this run.
s3_parquet_path = "s3://test-ramneek-2/hudi_table/25309ccf-dec5-4ae1-8f39-f6391e564bcb-0_0-122-497_20240927080316210.parquet"

# Read the parquet file directly with the plain parquet reader.
df2 = spark.read.parquet(s3_parquet_path)

# Show the rows stored in the new parquet file.
df2.show()

+-------------------+--------------------+------------------+----------------------+--------------------+------+--------------+-----------+-----+------+---+-----+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|emp_id| employee_name| department|state|salary|age|bonus|        ts|
+-------------------+--------------------+------------------+----------------------+--------------------+------+--------------+-----------+-----+------+---+-----+----------+
|  20240927075321357|20240927075321357...|                 1|                      |25309ccf-dec5-4ae...|     1| Alice Johnson|         IT|   CA|120000| 30| 5000|1677624870|
|  20240927075321357|20240927075321357...|                 5|                      |25309ccf-dec5-4ae...|     5|     Eve Davis|         IT|   IL|105000| 32| 6000|1677624874|
|  20240927075321357|20240927075321357...|                 3|                      |25309ccf-dec5-4ae...|     3|   Charlie Lee|   

In [36]:

# Update: write a row whose record key (emp_id = 3) already exists in the table.

# NOTE(review): this row's ts (827307999) is lower than the ts of the original emp_id = 3 row
# (1677624872); the query output further down still shows the new values — confirm the
# precombine behaviour if ordering by ts is meant to matter.
impleDataUpd = [
    (3, "this is update on data lake", "Sales", "RJ", 81000, 30, 23000, 827307999),
]
# Same column list as the initial DataFrame.
columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts"]
usr_up_df = spark.createDataFrame(data=impleDataUpd, schema=columns)
# Same write path as the append cell: mode("append") with the 'upsert' operation from hudi_options.
usr_up_df.write.format("hudi").options(**hudi_options).mode("append").save(final_base_path)




In [13]:
## After an update operation, no new parquet file is generated; only one delta log file is written. Let's view the real-time table.

In [39]:
# Query the "_rt" table again; the output shows the updated values for the emp_id = 3 record.
# The table name is derived from table_name instead of being hard-coded, so the
# query follows the configuration cell. It relies on the current database having
# been set by the earlier "use" cell.
spark.sql("select * from {}_rt;".format(table_name)).show()

+-------------------+--------------------+------------------+----------------------+--------------------+------+--------------------+-----------+-----+------+---+-----+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|emp_id|       employee_name| department|state|salary|age|bonus|        ts|
+-------------------+--------------------+------------------+----------------------+--------------------+------+--------------------+-----------+-----+------+---+-----+----------+
|  20240927075321357|20240927075321357...|                 1|                      |25309ccf-dec5-4ae...|     1|       Alice Johnson|         IT|   CA|120000| 30| 5000|1677624870|
|  20240927075321357|20240927075321357...|                 5|                      |25309ccf-dec5-4ae...|     5|           Eve Davis|         IT|   IL|105000| 32| 6000|1677624874|
|  20240927080655269|20240927080655269...|                 3|                      |25309ccf-dec5-4a