In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import yaml, os

from utils.spark_path import get_latest_year_month_path, get_current_year_month_path

In [2]:
# ------------------------------------------------------------
# Spark session
# ------------------------------------------------------------
# SQL settings applied to the session builder, in this order.
_spark_conf = (
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.shuffle.partitions", "200"),
)

_builder = (
    SparkSession.builder
    .appName("silver_clean_address")
    .master("spark://spark-master:7077")
)
for _conf_key, _conf_value in _spark_conf:
    _builder = _builder.config(_conf_key, _conf_value)

# getOrCreate() hands back an existing session when one is already active.
spark = _builder.getOrCreate()

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 10:52:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# ------------------------------------------------------------
# Configuration: resolve source / target locations from config.yaml
# ------------------------------------------------------------
CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

_data_lake_cfg = cfg["data_lake"]
ROOT = _data_lake_cfg["root"]
LAYERS = _data_lake_cfg["layers"]

# Input (bronze): base directory from the config, then the concrete
# year/month directory chosen by get_latest_year_month_path.
_bronze_address_cfg = LAYERS["bronze"]["domains"]["address"]
address_src_base = os.path.join(ROOT, _bronze_address_cfg["paths"]["parquet"])
address_src_path = get_latest_year_month_path(spark, address_src_base)

# Output (silver/clean): base directory from the config, then the
# year/month directory chosen by get_current_year_month_path.
_silver_address_cfg = LAYERS["silver"]["clean"]["domains"]["address_clean"]
address_clean_base = os.path.join(ROOT, _silver_address_cfg["paths"]["parquet"])
address_clean_path = get_current_year_month_path(address_clean_base)

# Partition columns for the output; falls back to ["region"] when the
# config has no "partition" entry.
partition_cols = _silver_address_cfg.get("partition", ["region"])

for _label, _value in (
    ("[PATH] source =", address_src_path),
    ("[PATH] target =", address_clean_path),
    ("[CONF] partition =", partition_cols),
):
    print(_label, _value)

[PATH] source = /opt/spark/data/bronze/address/parquet/year=2026/month=01
[PATH] target = /opt/spark/data/silver/clean/address/year=2026/month=02
[CONF] partition = ['region']


In [4]:
# ------------------------------------------------------------
# Load the bronze address data
# ------------------------------------------------------------
# Read every parquet file under the resolved source directory.
addr_df = spark.read.format("parquet").load(address_src_path)

print("=== raw schema ===")
addr_df.printSchema()

=== raw schema ===
root
 |-- 도로명주소관리번호: string (nullable = true)
 |-- 법정동코드: string (nullable = true)
 |-- 시도명: string (nullable = true)
 |-- 시군구명: string (nullable = true)
 |-- 법정읍면동명: string (nullable = true)
 |-- 법정리명: string (nullable = true)
 |-- 산여부: string (nullable = true)
 |-- 지번본번(번지): string (nullable = true)
 |-- 지번부번(호): string (nullable = true)
 |-- 도로명코드: string (nullable = true)
 |-- 도로명: string (nullable = true)
 |-- 지하여부: string (nullable = true)
 |-- 건물본번: string (nullable = true)
 |-- 건물부번: string (nullable = true)
 |-- 행정동코드: string (nullable = true)
 |-- 행정동명: string (nullable = true)
 |-- 기초구역번호(우편번호): string (nullable = true)
 |-- 이전도로명주소: string (nullable = true)
 |-- 효력발생일: string (nullable = true)
 |-- 공동주택구분: string (nullable = true)
 |-- 이동사유코드: string (nullable = true)
 |-- 건축물대장건물명: string (nullable = true)
 |-- 시군구용건물명: string (nullable = true)
 |-- 비고: string (nullable = true)
 |-- region: string (nullable = true)



                                                                                

In [5]:
# ============================================================
# CLEANING
# ============================================================
# Produces one row per road-name address (도로명주소) with the PNU code and
# region taken from the same source row.

# Land-type digit of the PNU code: "2" when 산여부 is "1", otherwise "1".
_pnu_land_gb = F.when(
    F.col("산여부").cast("string") == F.lit("1"), F.lit("2")
).otherwise(F.lit("1"))

# PNU code = 법정동코드 + land-type digit + main lot number (left-padded to 4)
# + sub lot number (left-padded to 4, defaulting to "0" when null).
# F.concat yields null when any part is null, so a row without 법정동코드 or
# a main lot number gets a null PNU code.
_pnu_code = F.concat(
    F.col("법정동코드").cast("string"),
    _pnu_land_gb,
    F.lpad(F.col("지번본번(번지)").cast("string"), 4, "0"),
    F.lpad(F.coalesce(F.col("지번부번(호)"), F.lit("0")).cast("string"), 4, "0"),
)

# Building number, with "-<sub number>" appended only for a non-zero sub number.
# 건물부번 is a string column, so it is cast explicitly instead of being compared
# to an integer literal through implicit coercion. A null or zero value (and,
# with ANSI mode off, a non-numeric value, which casts to null) adds no suffix.
_building_sub_no = F.col("건물부번").cast("int")
_building_no = F.concat(
    F.col("건물본번").cast("string"),
    F.when(
        _building_sub_no != 0,
        F.concat(F.lit("-"), F.col("건물부번").cast("string")),
    ).otherwise(F.lit("")),
)

# concat_ws skips null parts, so the address itself is never null.
_road_address = F.concat_ws(
    " ",
    F.col("시도명"),
    F.col("시군구명"),
    F.col("도로명"),
    _building_no,
)

addr_clean_df = (
    addr_df
    .select(
        _pnu_code.alias("PNU코드"),
        _road_address.alias("도로명주소"),
        F.col("region"),
    )
    # Deterministic de-duplication: dropDuplicates() keeps an arbitrary row per
    # address, so the surviving PNU code could change between actions and runs.
    # min() over a struct compares its fields left to right, which here prefers
    # a row with a non-null PNU code, then the smallest PNU code, then the
    # smallest region. PNU code and region always come from the same source row.
    # NOTE(review): an address that maps to several PNU codes still keeps only one.
    .groupBy("도로명주소")
    .agg(
        F.min(
            F.struct(
                F.col("PNU코드").isNull().alias("pnu_missing"),
                F.col("PNU코드"),
                F.col("region"),
            )
        ).alias("_picked")
    )
    .select(
        F.col("_picked").getField("PNU코드").alias("PNU코드"),
        F.col("도로명주소"),
        F.col("_picked").getField("region").alias("region"),
    )
)

print("=== clean schema ===")
addr_clean_df.printSchema()
print("row count:", addr_clean_df.count())

=== clean schema ===
root
 |-- PNU코드: string (nullable = true)
 |-- 도로명주소: string (nullable = false)
 |-- region: string (nullable = true)



[Stage 3:>                                                          (0 + 4) / 4]

row count: 6416296


                                                                                

In [6]:
# ------------------------------------------------------------
# Persist the cleaned addresses (silver/clean/address)
# ------------------------------------------------------------
# Overwrite mode replaces whatever already exists at the target path;
# the output is split into directories by the configured partition columns.
_writer = addr_clean_df.write.mode("overwrite").partitionBy(*partition_cols)
_writer.parquet(address_clean_path)

print("✅ saved:", address_clean_path)

                                                                                

✅ saved: /opt/spark/data/silver/clean/address/year=2026/month=02


In [7]:
# Stop the Spark session created at the top of the notebook.
spark.stop()