# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [6]:

%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

%%configure
{
    "--conf": "spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false",
    "--enable-glue-datacatalog" :"true",
    "--datalake-formats":"hudi"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
The following configurations have been updated: {'--conf': 'spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false', '--enable-glue-datacatalog': 'true', '--datalake-formats': 'hudi'}


In [2]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Obtain the SparkContext via getOrCreate() rather than constructing a new one.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext and take the SparkSession it exposes;
# `spark` is the handle the following cells use for SQL and DataFrame creation.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Job handle bound to this GlueContext (not referenced again in the visible cells).
job = Job(glueContext)




In [3]:
# List the databases visible to this session, to confirm the target database exists.
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
|  default|
|  hudidb1|
|  hudidb2|
|  hudidb3|
|  hudidb7|
|  hudidb8|
+---------+


In [4]:
# Import the modules the remaining cells rely on and report a missing package
# by name. Only ImportError is handled: any other exception raised while a
# module is loading is not a "missing module" and now propagates with its full
# traceback instead of being reported under that message.
# NOTE: when an import fails this cell still only prints the message, so later
# cells that use the missing names will fail with NameError.
try:
    import os
    import sys

    import pyspark
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, asc, desc
    from awsglue.utils import getResolvedOptions
    from awsglue.dynamicframe import DynamicFrame
    from awsglue.context import GlueContext

    # from faker import Faker  (disabled)

    print("All modules are loaded .....")

except ImportError as e:
    print("Some modules are missing {} ".format(e))

All modules are loaded .....


In [5]:
# Catalog database the table is synced into (passed as the hive_sync database).
database_name1 = "hudidb8"
# Name used for the Hudi table and for its hive_sync table entry.
table_name = "hudi_table"
# Bucket root under which the table directory lives.
base_s3_path = "s3a://test-ramneek-3"
# Full table location: <bucket root>/<table name>.
final_base_path = "/".join([base_s3_path, table_name])




In [6]:
class DataGenerator:
    """Source of the hand-written sample rows used to seed the table."""

    # One tuple per employee. Values are positional; the caller assigns the
    # column names (emp_id, employee_name, department, state, salary, age,
    # bonus, ts) when it builds the DataFrame.
    _ROWS = (
        (1, "Alice Johnson", "IT", "CA", 120000, 30, 5000, 1677624870),
        (2, "Bob Smith", "HR", "NY", 90000, 40, 7000, 1677624871),
        (3, "Charlie Lee", "Sales", "TX", 110000, 35, 8000, 1677624872),
        (4, "David Brown", "Marketing", "FL", 95000, 29, 4000, 1677624873),
        (5, "Eve Davis", "IT", "IL", 105000, 32, 6000, 1677624874),
    )

    @staticmethod
    def get_data():
        """Return the five sample rows as a new list of 8-tuples.

        A fresh list is built on every call, so callers may modify the
        returned list without affecting later calls.
        """
        return list(DataGenerator._ROWS)




In [7]:
def create_spark_session():
    """Return a SparkSession obtained from a builder that requests Kryo serialization.

    The session comes from ``getOrCreate()`` on a builder whose
    ``spark.serializer`` option is set to the Kryo serializer class.
    """
    builder = SparkSession.builder.config(
        'spark.serializer', 'org.apache.spark.serializer.KryoSerializer'
    )
    return builder.getOrCreate()


# Rebind the session handles: `spark`, `sc` and `glueContext` were already
# assigned in the earlier setup cell and are replaced here by the objects
# derived from create_spark_session().
spark = create_spark_session()
sc = spark.sparkContext
glueContext = GlueContext(sc)




In [8]:
# Options passed to every Hudi write in this notebook: table identity and
# write behaviour first, then the catalog (hive_sync) settings.
hudi_options = {
    # --- Table identity and write behaviour ---
    'hoodie.table.name': table_name,
    # Canonical table-type key; 'hoodie.datasource.write.storage.type' is the
    # deprecated alias for the same setting.
    'hoodie.datasource.write.table.type': 'MERGE_ON_READ',
    'hoodie.datasource.write.recordkey.field': 'emp_id',
    'hoodie.datasource.write.table.name': table_name,
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.precombine.field': 'ts',

    # --- Catalog sync (metastore mode, no JDBC) ---
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.sync_as_datasource': 'false',
    'hoodie.datasource.hive_sync.database': database_name1,
    'hoodie.datasource.hive_sync.table': table_name,
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
    'hoodie.datasource.write.hive_style_partitioning': 'true',
}




In [9]:
# Seed rows for the initial load.
data = DataGenerator.get_data()

# Column names, in the same order as the values inside each row tuple.
columns = [
    "emp_id", "employee_name", "department", "state",
    "salary", "age", "bonus", "ts",
]
df = spark.createDataFrame(data, columns)




In [10]:
# Print the seed DataFrame to check its contents before it is written.
df.show()

+------+-------------+----------+-----+------+---+-----+----------+
|emp_id|employee_name|department|state|salary|age|bonus|        ts|
+------+-------------+----------+-----+------+---+-----+----------+
|     1|Alice Johnson|        IT|   CA|120000| 30| 5000|1677624870|
|     2|    Bob Smith|        HR|   NY| 90000| 40| 7000|1677624871|
|     3|  Charlie Lee|     Sales|   TX|110000| 35| 8000|1677624872|
|     4|  David Brown| Marketing|   FL| 95000| 29| 4000|1677624873|
|     5|    Eve Davis|        IT|   IL|105000| 32| 6000|1677624874|
+------+-------------+----------+-----+------+---+-----+----------+


In [11]:
# Initial load: write the seed DataFrame to final_base_path in Hudi format with
# hudi_options, using Spark save mode "overwrite".
df.write.format("hudi").options(**hudi_options).mode("overwrite").save(final_base_path)




In [18]:
# Two additional employee rows (emp_id 6 and 7) for the existing table.
impleDataUpd = [
    (6, "This is APPEND", "Sales", "RJ", 81000, 30, 23000, 827307999),
    (7, "This is APPEND", "Engineering", "RJ", 79000, 53, 15000, 1627694678),
]

columns = [
    "emp_id", "employee_name", "department", "state",
    "salary", "age", "bonus", "ts",
]
usr_up_df = spark.createDataFrame(impleDataUpd, columns)

# Save mode "append" targets the existing table path; the Hudi write operation
# itself comes from hudi_options.
(
    usr_up_df.write.format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(final_base_path)
)




In [19]:
# Two additional employee rows (emp_id 8 and 9) for the existing table.
impleDataUpd = [
    (8, "This is APPEND1", "Sales", "RJ", 81000, 30, 23000, 827307999),
    (9, "This is APPEND1", "Engineering", "RJ", 79000, 53, 15000, 1627694678),
]

columns = [
    "emp_id", "employee_name", "department", "state",
    "salary", "age", "bonus", "ts",
]
usr_up_df = spark.createDataFrame(impleDataUpd, columns)

# Save mode "append" targets the existing table path; the Hudi write operation
# itself comes from hudi_options.
(
    usr_up_df.write.format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(final_base_path)
)




In [20]:
# Two additional employee rows (emp_id 10 and 11) for the existing table.
impleDataUpd = [
    (10, "This is APPEND1", "Sales", "RJ", 81000, 30, 23000, 827307999),
    (11, "This is APPEND1", "Engineering", "RJ", 79000, 53, 15000, 1627694678),
]

columns = [
    "emp_id", "employee_name", "department", "state",
    "salary", "age", "bonus", "ts",
]
usr_up_df = spark.createDataFrame(impleDataUpd, columns)

# Save mode "append" targets the existing table path; the Hudi write operation
# itself comes from hudi_options.
(
    usr_up_df.write.format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(final_base_path)
)




In [21]:
# Two additional employee rows (emp_id 12 and 13) for the existing table.
impleDataUpd = [
    (12, "This is APPEND1", "Sales", "RJ", 81000, 30, 23000, 827307999),
    (13, "This is APPEND1", "Engineering", "RJ", 79000, 53, 15000, 1627694678),
]

columns = [
    "emp_id", "employee_name", "department", "state",
    "salary", "age", "bonus", "ts",
]
usr_up_df = spark.createDataFrame(impleDataUpd, columns)

# Save mode "append" targets the existing table path; the Hudi write operation
# itself comes from hudi_options.
(
    usr_up_df.write.format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(final_base_path)
)




In [26]:
# Same write and catalog-sync settings as hudi_options, plus cleaner settings.
# Building on hudi_options keeps the two dictionaries from drifting apart.
hudi_options_cleaner = {
    **hudi_options,

    # Cleaner configurations
    'hoodie.cleaner.policy': 'KEEP_LATEST_COMMITS',  # Retain file versions by commit count
    # Keep the latest 3 commits. 'hoodie.cleaner.commits.retained' is the
    # setting for this; the previous key 'hoodie.cleaner.max.commits' is not a
    # Hudi option, so the retention limit was never applied.
    'hoodie.cleaner.commits.retained': '3',
    'hoodie.cleaner.parallelism': '4',  # Parallelism used for the clean operation
}





In [27]:
# Write the most recently built usr_up_df once more, this time passing
# hudi_options_cleaner so the cleaner settings accompany the write.
usr_up_df.write.format("hudi").options(**hudi_options_cleaner).mode("append").save(final_base_path)



