In [1]:
import os

import yaml
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

from utils.spark_path import get_latest_year_month_path, get_current_year_month_path

In [2]:
# ------------------------------------------------------------
# Spark session
# ------------------------------------------------------------
# SQL options applied to the session builder, in insertion order.
spark_sql_options = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
}

# Name the application and point it at the standalone master.
session_builder = (
    SparkSession.builder
    .appName("silver_clean_coord")
    .master("spark://spark-master:7077")
)
for option_key, option_value in spark_sql_options.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 10:53:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# ------------------------------------------------------------
# Configuration: resolve the bronze input and silver output paths
# ------------------------------------------------------------
CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# Input (bronze): base parquet directory of the "coord" domain, narrowed to a
# year=/month= sub-path by the project helper.
# NOTE(review): presumably the helper picks the newest partition that exists --
# confirm against utils.spark_path.
bronze_coord_cfg = LAYERS["bronze"]["domains"]["coord"]
coord_src_base = os.path.join(ROOT, bronze_coord_cfg["paths"]["parquet"])
coord_src_path = get_latest_year_month_path(spark, coord_src_base)

# Output (silver/clean): base parquet directory of the "coord_clean" domain,
# extended with a year=/month= sub-path by the project helper.
silver_coord_cfg = LAYERS["silver"]["clean"]["domains"]["coord_clean"]
coord_clean_base = os.path.join(ROOT, silver_coord_cfg["paths"]["parquet"])
coord_clean_path = get_current_year_month_path(coord_clean_base)

# Partition columns for the write; falls back to ["region"] when the config
# does not declare a "partition" entry.
partition_cols = silver_coord_cfg.get("partition", ["region"])

for label, resolved in (
    ("[PATH] source =", coord_src_path),
    ("[PATH] target =", coord_clean_path),
    ("[CONF] partition =", partition_cols),
):
    print(label, resolved)

[PATH] source = /opt/spark/data/bronze/coord/parquet/year=2026/month=01
[PATH] target = /opt/spark/data/silver/clean/coord/year=2026/month=02
[CONF] partition = ['region']


In [4]:
# ------------------------------------------------------------
# Load the bronze coord parquet
# ------------------------------------------------------------
coord_df = spark.read.parquet(coord_src_path)

print("=== raw schema ===")
coord_df.printSchema()

# count() is a Spark action, so this line triggers a scan of the input.
raw_row_count = coord_df.count()
print("row count:", raw_row_count)

=== raw schema ===
root
 |-- 시군구코드: string (nullable = true)
 |-- 출입구일련번호: string (nullable = true)
 |-- 법정동코드: string (nullable = true)
 |-- 시도명: string (nullable = true)
 |-- 시군구명: string (nullable = true)
 |-- 읍면동명: string (nullable = true)
 |-- 도로명코드: string (nullable = true)
 |-- 도로명: string (nullable = true)
 |-- 지하여부: string (nullable = true)
 |-- 건물본번: string (nullable = true)
 |-- 건물부번: string (nullable = true)
 |-- 건물명: string (nullable = true)
 |-- 우편번호: string (nullable = true)
 |-- 건물용도분류: string (nullable = true)
 |-- 건물군여부: string (nullable = true)
 |-- 관할행정동: string (nullable = true)
 |-- X좌표: string (nullable = true)
 |-- Y좌표: string (nullable = true)
 |-- region: string (nullable = true)





row count: 6416525


                                                                                

In [5]:
# ============================================================
# CLEANING
# ============================================================
# Builds the address key `도로명주소` ("<시도명> <시군구명> <도로명> <본번>[-<부번>]"),
# parses the X/Y coordinate strings into doubles, and keeps exactly one row per
# address. Output columns (in order): 도로명주소, x_utmk, y_utmk, region.

# Sub building number, compared purely as a trimmed string. Comparing the string
# column against the integer literal 0 would force an implicit string->number
# cast: a non-numeric value turns the whole condition NULL (silently dropping
# the suffix) and can raise under ANSI mode. Blank values are treated as
# "no sub number" so the address never ends in a dangling "-".
sub_no = F.trim(F.col("건물부번").cast("string"))
has_sub_no = sub_no.isNotNull() & (sub_no != "") & (sub_no != "0")

# "<main>" or "<main>-<sub>". A NULL main number makes concat() return NULL,
# which concat_ws() below skips (same behaviour as before).
building_no = F.concat(
    F.col("건물본번").cast("string"),
    F.when(has_sub_no, F.concat(F.lit("-"), sub_no)).otherwise(F.lit("")),
)

coord_parsed_df = (
    coord_df
    .withColumn(
        "도로명주소",
        F.concat_ws(
            " ",
            F.col("시도명"),
            F.col("시군구명"),
            F.col("도로명"),
            building_no,
        ),
    )
    .select(
        F.col("도로명주소"),
        # Blank strings become NULL; unparsable values become NULL via try_cast.
        F.expr("try_cast(nullif(trim(`X좌표`), '') as double)").alias("x_utmk"),
        F.expr("try_cast(nullif(trim(`Y좌표`), '') as double)").alias("y_utmk"),
        F.col("region"),
    )
)

# Deterministic de-duplication. dropDuplicates() keeps an arbitrary row per key,
# so count(), show() and the later write could each see a different survivor,
# and a row with missing coordinates could win over a valid one. The ordering
# below is total over the output columns: rows with both coordinates come first
# (false sorts before true), then ties are broken by the values themselves.
dedup_window = Window.partitionBy("도로명주소").orderBy(
    (F.col("x_utmk").isNull() | F.col("y_utmk").isNull()).asc(),
    F.col("x_utmk").asc_nulls_last(),
    F.col("y_utmk").asc_nulls_last(),
    F.col("region").asc_nulls_last(),
)

coord_clean_df = (
    coord_parsed_df
    .withColumn("_dedup_rank", F.row_number().over(dedup_window))
    .filter(F.col("_dedup_rank") == 1)
    .drop("_dedup_rank")
)

print("=== clean schema ===")
coord_clean_df.printSchema()
print("row count:", coord_clean_df.count())
coord_clean_df.show(5, truncate=False)

=== clean schema ===
root
 |-- 도로명주소: string (nullable = false)
 |-- x_utmk: double (nullable = true)
 |-- y_utmk: double (nullable = true)
 |-- region: string (nullable = true)



                                                                                

row count: 6416295


[Stage 12:>                                                         (0 + 1) / 1]

+------------------------------------+--------------+--------------+------+
|도로명주소                          |x_utmk        |y_utmk        |region|
+------------------------------------+--------------+--------------+------+
|강원특별자치도 강릉시 가둔지길 12-1 |1121299.652555|1981809.732449|강원  |
|강원특별자치도 강릉시 가둔지길 12-10|1121274.948184|1981763.490249|강원  |
|강원특별자치도 강릉시 가둔지길 12-16|1121295.95508 |1981721.133156|강원  |
|강원특별자치도 강릉시 가둔지길 20-2 |1121329.150923|1981736.365337|강원  |
|강원특별자치도 강릉시 가둔지길 20-8 |1121289.0481  |1981682.489337|강원  |
+------------------------------------+--------------+--------------+------+
only showing top 5 rows


                                                                                

In [6]:
# ------------------------------------------------------------
# Save to silver/clean/coord
# ------------------------------------------------------------
# Configure the writer first: replace whatever already exists at the target
# path and lay the files out by the configured partition columns.
clean_writer = coord_clean_df.write.mode("overwrite").partitionBy(*partition_cols)
clean_writer.parquet(coord_clean_path)

print("✅ saved:", coord_clean_path)

                                                                                

✅ saved: /opt/spark/data/silver/clean/coord/year=2026/month=02


In [7]:
# Stop the Spark session created at the top of the notebook.
spark.stop()