In [0]:
# Load the segmentation file from the S3 landing bucket.
#
# The schema is declared explicitly instead of using inferSchema=True:
#   * inference needs an extra pass over the file, and
#   * inferred types depend on the data of the day (e.g. a file whose IDs
#     happen to be all digits would be read as integers), which would make
#     the downstream staging schema unstable.
# Both columns are read as strings; the column names match the file header.
# enforceSchema=False makes Spark validate the header against this schema,
# so a change in the file layout fails loudly instead of being mis-mapped.
df_load_seg = spark.read.csv(
    "s3://amazon-l0-landing-prod/landing/segmentation/segmentation.csv",
    header=True,
    schema="`Customer ID` STRING, `Segment` STRING",
    enforceSchema=False,
)

# Expose the raw data to Spark SQL under the same name as the DataFrame.
df_load_seg.createOrReplaceTempView("df_load_seg")

# Preview the first rows of the raw data.
spark.sql("select * from df_load_seg limit 10").show()


+-----------+----------+
|Customer ID|   Segment|
+-----------+----------+
|   ID135600|Priority 1|
|   ID135601|Priority 2|
|   ID135602|Priority 3|
|   ID135603|Priority 3|
|   ID135604|Priority 3|
|   ID135605|Priority 3|
|   ID135606|Priority 2|
|   ID135607|Priority 1|
|   ID135608|Priority 1|
|   ID135609|      NULL|
+-----------+----------+



In [0]:
# List every distinct value present in the raw Segment column.
(
    df_load_seg
    .select("Segment")
    .distinct()
    .show()
)

# The column holds dirty values, so it has to be cleaned before use.

+----------+
|   Segment|
+----------+
|       ???|
|Priority 3|
|      NULL|
|Priority 2|
|       xyz|
|Priority 1|
|       342|
+----------+



In [0]:
# Restructure the raw segmentation data.

# Rename the raw headers to the staging column names.
df_restructured_seg = (
    df_load_seg
    .withColumnRenamed("Customer ID", "cust_id")
    .withColumnRenamed("Segment", "seg_nm")
)

df_restructured_seg.createOrReplaceTempView("df_restructured_seg")

# Map the raw segment values to their staging names:
#   Priority 1 -> High, Priority 2 -> Medium, Priority 3 -> Low.
# Anything else (including NULL) is bucketed into "Unknown".
# A load timestamp is added as load_dt.
#
# String literals use single quotes (standard SQL). Double-quoted literals
# are parsed as identifiers when ANSI mode with
# spark.sql.ansi.doubleQuotedIdentifiers is enabled, which would break the
# query.
df_renamed_seg = spark.sql("""
    select
        cust_id,
        case
            when seg_nm = 'Priority 1' then 'High'
            when seg_nm = 'Priority 2' then 'Medium'
            when seg_nm = 'Priority 3' then 'Low'
            else 'Unknown'
        end as seg_nm,
        now() as load_dt
    from df_restructured_seg
""")

df_renamed_seg.createOrReplaceTempView("df_renamed_seg")

# Sanity check: list the distinct segment names after the mapping.
spark.sql("select distinct seg_nm from df_renamed_seg").show()


+-------+
| seg_nm|
+-------+
| Medium|
|   High|
|Unknown|
|    Low|
+-------+



In [0]:
# Target location of the segmentation data in the S3 staging bucket.
s3_path = "s3://amazon-l1-staging-prod/staging/segmentation/"

# Write the cleaned data to S3 as parquet.
# The save mode is set explicitly: the default ("errorifexists") makes this
# cell fail on every re-run once the target path exists. "overwrite" replaces
# the previous output with the current full load.
# NOTE(review): assumes staging should hold only the latest snapshot; switch
# to "append" if load history must be kept - confirm with the pipeline owner.
df_renamed_seg.write.mode("overwrite").parquet(s3_path)