In [0]:
# Load the raw geography CSV from the S3 landing bucket.
# The first row is treated as the header and column types are inferred
# from the data rather than declared up front.
df_load_geography = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("s3://amazon-l0-landing-prod/landing/geography/geography.csv")
)

# Register the frame as a temp view so it can be queried with Spark SQL.
df_load_geography.createOrReplaceTempView("df_load_geography")

# Preview ten rows through the SQL view as a quick sanity check of the load.
spark.sql("select * from df_load_geography limit 10").show()


+-------------+-------+---------+--------+---------+-------+
|      Region0|   Area|  Region2|District|Territory|Terr ID|
+-------------+-------+---------+--------+---------+-------+
|South America|  South|  South 1|  Dist 2|   Terr 1| 134314|
|South America|  South|  South 1|  Dist 2|   Terr 2| 134303|
|       Africa|   West|   West 2|  Dist 2|   Terr 1| 134307|
|         Asia|   East|   East 1|  Dist 2|   Terr 2| 134293|
|South America|  South|  South 2|  Dist 2|   Terr 1| 134310|
|       Europe|Central|Central 2|  Dist 2|   Terr 2| 134296|
|North America|  North|  North 2|  Dist 2|   Terr 1| 134320|
|         Asia|   East|   East 2|  Dist 1|   Terr 2| 134323|
|       Europe|Central|Central 2|  Dist 1|   Terr 2| 134298|
|North America|  North|  North 1|  Dist 1|   Terr 2| 134308|
+-------------+-------+---------+--------+---------+-------+



In [0]:
# Restructure the landed geography data into the staging layout.

# Step 1: rename the only header that contains a space ("Terr ID") so it can
# be referenced in SQL without quoting. Note that, despite its name,
# df_restructured_geography only carries this rename.
df_restructured_geography = df_load_geography.withColumnRenamed(
    "Terr ID",
    "terr_id",
)
df_restructured_geography.createOrReplaceTempView("df_restructured_geography")

# Step 2: project the staging columns in a single SQL pass. This
#   * drops the unnecessary column (Region0 is not selected),
#   * reorders the remaining columns with the territory id first,
#   * renames them to the *_nm convention, and
#   * adds load_dt, populated from now().
df_renamed_geography = spark.sql(
    """
                        select
                        terr_id, 
                        Territory as terr_nm, 
                        District as dist_nm, 
                        Region2 as reg_nm, 
                        Area as area_nm, 
                        now() as load_dt
                        from df_restructured_geography
                        """
)
df_renamed_geography.createOrReplaceTempView("df_renamed_geography")

# Display the restructured rows (show() prints its default number of rows).
df_renamed_geography.show()


+-------+-------+-------+---------+-------+--------------------+
|terr_id|terr_nm|dist_nm|   reg_nm|area_nm|             load_dt|
+-------+-------+-------+---------+-------+--------------------+
| 134314| Terr 1| Dist 2|  South 1|  South|2025-11-10 01:41:...|
| 134303| Terr 2| Dist 2|  South 1|  South|2025-11-10 01:41:...|
| 134307| Terr 1| Dist 2|   West 2|   West|2025-11-10 01:41:...|
| 134293| Terr 2| Dist 2|   East 1|   East|2025-11-10 01:41:...|
| 134310| Terr 1| Dist 2|  South 2|  South|2025-11-10 01:41:...|
| 134296| Terr 2| Dist 2|Central 2|Central|2025-11-10 01:41:...|
| 134320| Terr 1| Dist 2|  North 2|  North|2025-11-10 01:41:...|
| 134323| Terr 2| Dist 1|   East 2|   East|2025-11-10 01:41:...|
| 134298| Terr 2| Dist 1|Central 2|Central|2025-11-10 01:41:...|
| 134308| Terr 2| Dist 1|  North 1|  North|2025-11-10 01:41:...|
| 134312| Terr 2| Dist 1|Central 1|Central|2025-11-10 01:41:...|
| 134300| Terr 1| Dist 2|Central 2|Central|2025-11-10 01:41:...|
| 134311| Terr 2| Dist 1|

In [0]:
# Destination prefix for the geography data set in the staging bucket.
s3_path = "s3://amazon-l1-staging-prod/staging/geography/"

# Write the restructured frame to S3 as Parquet.
#
# The writer's default save mode raises an error when the destination already
# exists, so re-running this notebook would fail at this cell once the first
# run has written its output. The save mode is therefore set explicitly to
# "overwrite" so the cell is re-runnable and each run replaces the previous
# output with the freshly loaded data.
# NOTE(review): "overwrite" replaces whatever currently sits under s3_path --
# confirm nothing else writes to this prefix and that history is not expected
# to accumulate here (that would call for "append" or a partitioned layout).
df_renamed_geography.write.mode("overwrite").parquet(s3_path)