In [0]:
# Load the raw product CSV from the S3 landing bucket.
# The first row is treated as the header and column types are inferred.
df_load_prod = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://amazon-l0-landing-prod/landing/product/product.csv")
)

# Register the frame as a temp view so it can be queried with Spark SQL.
df_load_prod.createOrReplaceTempView("df_load_prod")

# Preview the first 10 rows as a quick sanity check.
spark.sql("select * from df_load_prod limit 10").show()


+-----------+-----------+---------+---------+----------+---------+------------+
|Dosage Form|     Market|Market ID|  Product|Product ID| Strength|Package Size|
+-----------+-----------+---------+---------+----------+---------+------------+
|     Tablet|Respiratory|     4001|Product C|      1002|100 mg/ml|        50 g|
|      Cream|Respiratory|     4001|Product A|      1000|   500 mg|      100 ml|
|    Capsule|   Oncology|     4004|Product A|      1000|    50 mg|          60|
|  Injection|   Oncology|     4004|Product A|      1000|    10 mg|      100 ml|
|     Tablet|   Diabetes|     4002|Product D|      1003|100 mg/ml|        50 g|
|    Capsule|Respiratory|     4001|Product D|      1003|100 mg/ml|       10 ml|
|      Cream|   Diabetes|     4002|Product A|      1000|   500 mg|          30|
|     Tablet|Respiratory|     4001|Product A|      1000|   500 mg|       10 ml|
|    Capsule|      Renal|     4000|Product D|      1003|   500 mg|        50 g|
|      Cream|   Diabetes|     4002|Produ

In [0]:
# Restructure the product data: standardise the column names, keep only the
# product/market identifying columns and stamp each row with a load timestamp.

# Map the source headers onto the short snake_case names used by the query below.
df_restructured_prod = (
    df_load_prod
    .withColumnRenamed("Market ID", "mkt_id")
    .withColumnRenamed("Market", "mkt_nm")
    .withColumnRenamed("Product ID", "prod_id")
    .withColumnRenamed("Product", "prod_nm")
)

# Make the renamed frame queryable from Spark SQL.
df_restructured_prod.createOrReplaceTempView("df_restructured_prod")

# Keep the four product/market columns (all other source columns are dropped
# as unnecessary) and add load_dt = now().
# NOTE(review): the projection does not de-duplicate, and the displayed output
# contains repeated (prod_id, mkt_id) rows - confirm that is intended.
# NOTE(review): the frame is lazy, so now() is presumably re-evaluated on every
# action; load_dt shown here may differ from the value written later - confirm.
df_renamed_prod = spark.sql("""
                        select
                        prod_id, 
                        prod_nm,  
                        mkt_id, 
                        mkt_nm,
                        now() as load_dt
                        from df_restructured_prod
                        """)

# Register the final frame as a temp view as well.
df_renamed_prod.createOrReplaceTempView("df_renamed_prod")

# Display the result (show() prints the first 20 rows by default).
df_renamed_prod.show()


+-------+---------+------+-----------+--------------------+
|prod_id|  prod_nm|mkt_id|     mkt_nm|             load_dt|
+-------+---------+------+-----------+--------------------+
|   1002|Product C|  4001|Respiratory|2025-11-10 02:26:...|
|   1000|Product A|  4001|Respiratory|2025-11-10 02:26:...|
|   1000|Product A|  4004|   Oncology|2025-11-10 02:26:...|
|   1000|Product A|  4004|   Oncology|2025-11-10 02:26:...|
|   1003|Product D|  4002|   Diabetes|2025-11-10 02:26:...|
|   1003|Product D|  4001|Respiratory|2025-11-10 02:26:...|
|   1000|Product A|  4002|   Diabetes|2025-11-10 02:26:...|
|   1000|Product A|  4001|Respiratory|2025-11-10 02:26:...|
|   1003|Product D|  4000|      Renal|2025-11-10 02:26:...|
|   1000|Product A|  4002|   Diabetes|2025-11-10 02:26:...|
|   1001|Product B|  4002|   Diabetes|2025-11-10 02:26:...|
|   1003|Product D|  4000|      Renal|2025-11-10 02:26:...|
|   1003|Product D|  4002|   Diabetes|2025-11-10 02:26:...|
|   1003|Product D|  4004|   Oncology|20

In [0]:
# Target location for the product dataset in the L1 staging bucket.
s3_path = "s3://amazon-l1-staging-prod/staging/product/"

# Write the restructured product data to S3 as parquet.
# The save mode is set explicitly: the writer's default mode raises an error
# when the target path already exists, which made this cell fail on every
# re-run of the notebook. "overwrite" replaces the previous snapshot so the
# notebook can be run end-to-end repeatedly.
# NOTE(review): assumes this is a full-snapshot load; switch to "append" if
# history must be retained under this path.
df_renamed_prod.write.mode("overwrite").parquet(s3_path)