In [0]:
# Ingest the raw customer extract from the L0 landing bucket on S3.
# The first CSV row supplies the column names and column types are inferred
# from the data; the "Segment" column is discarded straight after the read.
df_load_cust = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://amazon-l0-landing-prod/landing/customer/customer.csv")
    .drop("Segment")
)

# Register the frame for Spark SQL under the same name as the Python variable.
df_load_cust.createOrReplaceTempView("df_load_cust")

# Preview the first 10 rows through the temp view.
spark.sql("select * from df_load_cust limit 10").show()


+-----------+----------------+
|Customer ID|   Customer Name|
+-----------+----------------+
|   ID135600| Susan Hernandez|
|   ID135601|   Jackie Wagner|
|   ID135602|    Mariah Mccoy|
|   ID135603|  Courtney Doyle|
|   ID135604|     Dawn Torres|
|   ID135605|Jessica Faulkner|
|   ID135606|   Alicia Howard|
|   ID135607|      David Hill|
|   ID135608| Matthew Johnson|
|   ID135609| Kimberly Murphy|
+-----------+----------------+



In [0]:
# Restructure the customer data: staging-style headers plus a load timestamp.

# Rename the raw CSV headers to the short snake_case column names.
df_restructured_cust = (
    df_load_cust
    .withColumnRenamed("Customer ID", "cust_id")
    .withColumnRenamed("Customer Name", "cust_nm")
)
df_restructured_cust.createOrReplaceTempView("df_restructured_cust")

# Select the two renamed columns and stamp every row with the load time
# (now() evaluated by Spark SQL). No segment handling happens here: the
# "Segment" column is dropped immediately after the CSV is read.
df_renamed_cust = spark.sql("""
                        select
                        cust_id, 
                        cust_nm,  
                        now() as load_dt
                        from df_restructured_cust
                        """)
df_renamed_cust.createOrReplaceTempView("df_renamed_cust")

# Preview the result (show() prints the first 20 rows by default).
df_renamed_cust.show()


+--------+------------------+--------------------+
| cust_id|           cust_nm|             load_dt|
+--------+------------------+--------------------+
|ID135600|   Susan Hernandez|2025-11-10 02:19:...|
|ID135601|     Jackie Wagner|2025-11-10 02:19:...|
|ID135602|      Mariah Mccoy|2025-11-10 02:19:...|
|ID135603|    Courtney Doyle|2025-11-10 02:19:...|
|ID135604|       Dawn Torres|2025-11-10 02:19:...|
|ID135605|  Jessica Faulkner|2025-11-10 02:19:...|
|ID135606|     Alicia Howard|2025-11-10 02:19:...|
|ID135607|        David Hill|2025-11-10 02:19:...|
|ID135608|   Matthew Johnson|2025-11-10 02:19:...|
|ID135609|   Kimberly Murphy|2025-11-10 02:19:...|
|ID135610|       Tyler Brown|2025-11-10 02:19:...|
|ID135611|    Sheila Coleman|2025-11-10 02:19:...|
|ID135612| Douglas Rodriguez|2025-11-10 02:19:...|
|ID135613|       Brandi King|2025-11-10 02:19:...|
|ID135614|      Susan Brewer|2025-11-10 02:19:...|
|ID135615|     Latoya Wright|2025-11-10 02:19:...|
|ID135616|   Richard Edwards|20

In [0]:
# Defining s3 write path
# Target prefix in the L1 staging bucket that receives the customer dataset.
s3_path = "s3://amazon-l1-staging-prod/staging/customer/"

# Write parquet file to s3
# DataFrameWriter's default save mode is "errorifexists", so without an
# explicit mode this cell fails on every run after the first (the prefix
# already exists). Each run rebuilds the full customer set with a fresh
# load_dt, so the previous snapshot is replaced instead.
# NOTE(review): "overwrite" removes whatever is already under s3_path; if
# earlier loads must be retained, write to a per-run (e.g. dated) prefix.
df_renamed_cust.write.mode("overwrite").parquet(s3_path)