In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import yaml, os

from utils.spark_path import get_latest_year_month_path, get_current_year_month_path

In [3]:
# ============================================================
# Spark
# ============================================================
# SQL settings passed to the session builder, applied in this order.
spark_sql_conf = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
}

# Name the application and point it at the standalone master first,
# then layer the SQL settings on top before creating (or reusing) the session.
session_builder = (
    SparkSession.builder
    .appName("silver_clean_building")
    .master("spark://spark-master:7077")
)
for conf_key, conf_value in spark_sql_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

# Config

In [4]:
# ============================================================
# Config
# ============================================================
# Resolved relative to the kernel's working directory.
CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

# Input (bronze): base parquet directory of the buildingLeader domain.
building_src_base = os.path.join(
    ROOT, LAYERS["bronze"]["domains"]["buildingLeader"]["paths"]["parquet"]
)
# Per the recorded output this resolves to a year=YYYY/month=MM sub-directory
# of the base path; how "latest" is chosen lives in utils.spark_path.
building_src_path = get_latest_year_month_path(spark, building_src_base)

# Output (silver clean): the domain node is read once and reused below.
building_clean_domain = LAYERS["silver"]["clean"]["domains"]["building_clean"]
building_clean_base = os.path.join(ROOT, building_clean_domain["paths"]["parquet"])
building_clean_path = get_current_year_month_path(building_clean_base)

# Partition columns for the write step; defaults to ["region"] when the key is
# absent. The value is later unpacked with `*partition_cols`, so it must be a
# sequence of column names:
#   - a null YAML value would fail on unpacking -> fall back to the default
#   - a scalar YAML string would be unpacked character by character -> wrap it
partition_cols = building_clean_domain.get("partition", ["region"])
if partition_cols is None:
    partition_cols = ["region"]
elif isinstance(partition_cols, str):
    partition_cols = [partition_cols]

print("[PATH] source =", building_src_path)
print("[PATH] target =", building_clean_path)
print("[CONF] partition =", partition_cols)

[PATH] source = /opt/spark/data/bronze/buildingLeader/parquet/year=2025/month=12
[PATH] target = /opt/spark/data/silver/clean/building/year=2026/month=02
[CONF] partition = ['region']


# 브론즈(Raw) 데이터 로드

In [9]:
# ============================================================
# LOAD
# ============================================================
# Read the bronze parquet snapshot whose path was resolved in the config cell.
building_df = spark.read.format("parquet").load(building_src_path)

# Show the raw column layout before any cleaning is applied.
print("=== raw schema ===")
building_df.printSchema()

=== raw schema ===
root
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 대장_구분_코드_명: string (nullable = true)
 |-- 대장_종류_코드: string (nullable = true)
 |-- 대장_종류_코드_명: string (nullable = true)
 |-- 대지_위치: string (nullable = true)
 |-- 도로명_대지_위치: string (nullable = true)
 |-- 건물_명: string (nullable = true)
 |-- 시군구_코드: string (nullable = true)
 |-- 법정동_코드: string (nullable = true)
 |-- 대지_구분_코드: string (nullable = true)
 |-- 번: string (nullable = true)
 |-- 지: string (nullable = true)
 |-- 특수지_명: string (nullable = true)
 |-- 블록: string (nullable = true)
 |-- 로트: string (nullable = true)
 |-- 외필지_수: string (nullable = true)
 |-- 새주소_도로_코드: string (nullable = true)
 |-- 새주소_법정동_코드: string (nullable = true)
 |-- 새주소_지상지하_코드: string (nullable = true)
 |-- 새주소_본_번: string (nullable = true)
 |-- 새주소_부_번: string (nullable = true)
 |-- 동_명: string (nullable = true)
 |-- 주_부속_구분_코드: string (nullable = true)
 |-- 주_부속_구분_코드_명: string (nullable = true)
 |-- 대

# 데이터 정제

In [12]:
# ============================================================
# CLEANING
# ============================================================
# Builds a parcel key column ("고유번호") from the address-code columns and
# keeps only the columns needed downstream.

building_clean_df = (
    building_df

    # Parcel key = district code + legal-dong code + land-category digit
    #              + main lot number + sub lot number.
    # concat() yields NULL when any of its parts is NULL, so rows with a
    # missing component end up with a NULL key.
    .withColumn(
        "고유번호",
        F.concat(
            F.col("시군구_코드"),        # district code (5 digits per the original note)
            F.col("법정동_코드"),        # legal-dong code (5 digits per the original note)
            # Land-category remap: "0" (land) -> "1", "1" (mountain) -> "2";
            # every other code is passed through unchanged.
            # NOTE(review): a pass-through source code of "2" would collide with
            # the remapped mountain value "2" — confirm whether such codes occur.
            F.when(F.col("대지_구분_코드") == "0", F.lit("1"))
             .when(F.col("대지_구분_코드") == "1", F.lit("2"))
             .otherwise(F.col("대지_구분_코드")),
            # lpad left-pads with "0" to width 4 (and truncates longer values to 4).
            F.lpad(F.col("번"), 4, "0"), # main lot number
            F.lpad(F.col("지"), 4, "0")  # sub lot number
        )
    )

    .select(
        F.col("관리_건축물대장_PK"),
        F.col("고유번호"),
        F.col("대장_구분_코드"),
        # Cast first, then alias: the alias is the outermost expression, so the
        # output column name does not depend on Spark carrying an inner alias
        # through the cast.
        F.col("건축_면적(㎡)").cast("double").alias("건축면적"),
        F.col("region")
    )
)

print("=== clean schema ===")
building_clean_df.printSchema()
# count() runs a full job over the input; it is only a sanity check.
print("row count:", building_clean_df.count())

=== clean schema ===
root
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 건축면적: double (nullable = true)
 |-- region: string (nullable = true)

row count: 8034327


# Silver(clean) 데이터 저장

In [13]:
# ============================================================
# SAVE (silver/clean/building)
# ============================================================
# Overwrite mode replaces whatever already exists at the target path; the
# output is split into one sub-directory per partition-column value.
building_writer = building_clean_df.write.mode("overwrite").partitionBy(*partition_cols)
building_writer.parquet(building_clean_path)

print("✅ saved:", building_clean_path)



✅ saved: /opt/spark/data/silver/clean/building/year=2026/month=02


                                                                                

In [14]:
# Shut down the Spark session now that the cleaned data has been written.
spark.stop()