In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
import yaml, os

from utils.spark_path import get_latest_year_month_path, get_current_year_month_path

In [2]:
# ============================================================
# Spark
# ============================================================
# Build the session for the silver "toji" cleaning job step by step:
# identify the application and cluster master first, then apply each
# SQL setting, and finally create the session (or reuse a running one).
_session_builder = (
    SparkSession.builder
    .appName("silver_clean_toji")
    .master("spark://spark-master:7077")
)

for _conf_key, _conf_value in (
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.shuffle.partitions", "200"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.getOrCreate()

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 10:55:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Config

In [3]:
# Pipeline settings file, resolved relative to the working directory.
CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# Input (bronze): base parquet directory of the land-ownership domain,
# narrowed to a year=/month= sub-path by the project helper.
# NOTE(review): presumably the helper picks the most recent partition that
# exists on storage — confirm in utils.spark_path.
bronze_toji_cfg = LAYERS["bronze"]["domains"]["tojiSoyuJeongbo"]
bronze_toji_base = os.path.join(ROOT, bronze_toji_cfg["paths"]["parquet"])
toji_src_path = get_latest_year_month_path(spark, bronze_toji_base)

# Output (silver clean): base parquet directory of the cleaned domain,
# narrowed to a year=/month= sub-path by the project helper.
silver_toji_cfg = LAYERS["silver"]["clean"]["domains"]["toji_clean"]
silver_toji_base = os.path.join(ROOT, silver_toji_cfg["paths"]["parquet"])
toji_clean_path = get_current_year_month_path(silver_toji_base)

# Partition columns for the output; falls back to ["region"] when the
# config does not define a "partition" entry.
partition_cols = silver_toji_cfg.get("partition", ["region"])

# Echo the resolved settings so the run is traceable from the cell output.
for _label, _value in (
    ("[PATH] source =", toji_src_path),
    ("[PATH] target =", toji_clean_path),
    ("[CONF] partition =", partition_cols),
):
    print(_label, _value)

[PATH] source = /opt/spark/data/bronze/tojiSoyuJeongbo/parquet/year=2026/month=01
[PATH] target = /opt/spark/data/silver/clean/toji/year=2026/month=02
[CONF] partition = ['region']


# 브론즈(Raw) 데이터 로드

In [4]:
# ============================================================
# LOAD
# ============================================================
# Read the bronze parquet data from the source path resolved in the
# config cell, then show the raw schema for reference.
toji_df = spark.read.format("parquet").load(toji_src_path)

print("=== raw schema ===")
toji_df.printSchema()

=== raw schema ===
root
 |-- 고유번호: long (nullable = true)
 |-- 법정동코드: long (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 대장구분코드: long (nullable = true)
 |-- 대장구분명: string (nullable = true)
 |-- 지번: string (nullable = true)
 |-- 집합건물일련번호: long (nullable = true)
 |-- 건물동명: string (nullable = true)
 |-- 건물층명: string (nullable = true)
 |-- 건물호명: string (nullable = true)
 |-- 건물실명: string (nullable = true)
 |-- 공유인일련번호: long (nullable = true)
 |-- 기준연월: string (nullable = true)
 |-- 지목코드: long (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 토지면적: double (nullable = true)
 |-- 공시지가: long (nullable = true)
 |-- 소유구분코드: string (nullable = true)
 |-- 소유구분: string (nullable = true)
 |-- 국가기관구분코드: string (nullable = true)
 |-- 국가기관구분: string (nullable = true)
 |-- 소유권변동원인코드: string (nullable = true)
 |-- 소유권변동원인: string (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 데이터기준일자: string (nullable = true)
 |-- 원천시도시군구코드: double (nul

                                                                                

# 데이터 정제

In [5]:
# ============================================================
# CLEANING
# ============================================================
# Pipeline: keep rows with ownership-type code "01", project the needed
# columns, keep one row per parcel id (고유번호) — the one with the latest
# ownership-change date — and split the lot number (지번) into main / sub
# numbers.

# The lot number split on "-"; built once and reused for both parts.
jibun_parts = F.split(F.col("지번"), "-")

# Latest-first ordering inside each parcel id.
# The parsed date alone is not a unique sort key: rows that share a date, or
# whose date is null / unparseable, would be ranked arbitrarily by
# row_number(), so the surviving row could change between runs (and between
# the count() job below and the later write job). The remaining projected
# columns are therefore added as tie-breakers, which makes the values of the
# kept row deterministic.
latest_first = (
    Window.partitionBy("고유번호")
    .orderBy(
        F.col("소유권변동일자_dt").desc_nulls_last(),
        F.col("소유권변동일자").desc_nulls_last(),
        F.col("법정동명"),
        F.col("지번"),
        F.col("지목"),
        F.col("토지면적"),
        F.col("공유인수"),
        F.col("region"),
    )
)

toji_clean_df = (
    toji_df

    # Only rows whose ownership-type code is "01".
    .filter(F.col("소유구분코드") == "01")

    # -----------------------------
    # 1. Select only the required columns
    # -----------------------------
    .select(
        F.col("고유번호").cast("string"),
        F.col("법정동명"),
        F.col("지번"),
        F.col("지목"),
        F.col("토지면적").cast("double"),
        F.col("공유인수"),
        F.col("소유권변동일자"),
        F.col("region"),
    )

    # -----------------------------
    # 2. Parse the ownership-change date
    # -----------------------------
    # NOTE(review): assumes the raw string is formatted "yyyy-MM-dd"; values
    # in any other layout will not parse to a usable date and then sort last
    # — confirm against the bronze data.
    .withColumn(
        "소유권변동일자_dt",
        F.to_date("소유권변동일자", "yyyy-MM-dd"),
    )

    # -----------------------------
    # 3. Keep only the latest row per parcel id
    # -----------------------------
    .withColumn("rn", F.row_number().over(latest_first))
    .filter(F.col("rn") == 1)

    # -----------------------------
    # 4. Split the lot number into main (본번) and sub (부번) numbers
    # -----------------------------
    .withColumn("본번", jibun_parts.getItem(0))
    .withColumn(
        "부번",
        # Left null when the lot number has no "-" separated second part.
        F.when(F.size(jibun_parts) > 1, jibun_parts.getItem(1)),
    )

    # Helper columns and the unsplit lot number are not part of the output.
    .drop("rn", "소유권변동일자_dt", "지번")
)

print("=== clean schema ===")
toji_clean_df.printSchema()
# count() is an action: it executes the whole pipeline above (including the
# window shuffle) just to report the number of rows.
print("row count:", toji_clean_df.count())

=== clean schema ===
root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 토지면적: double (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)





row count: 24943989


                                                                                

# S_clean 데이터 저장

In [6]:
# ============================================================
# SAVE (silver/clean/toji)
# ============================================================

# Configure the writer first (replace any existing output at the target,
# one sub-directory per partition column value), then trigger the write.
toji_writer = toji_clean_df.write.mode("overwrite").partitionBy(*partition_cols)
toji_writer.parquet(toji_clean_path)

print("✅ saved:", toji_clean_path)

26/02/21 10:55:58 ERROR TaskSchedulerImpl: Lost executor 0 on 172.23.0.6: Command exited with code 137
26/02/21 10:55:58 WARN TaskSetManager: Lost task 0.0 in stage 7.0 (TID 20) (172.23.0.6 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Command exited with code 137
26/02/21 10:55:58 WARN TaskSetManager: Lost task 2.0 in stage 7.0 (TID 22) (172.23.0.6 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Command exited with code 137
26/02/21 10:56:08 ERROR TaskSchedulerImpl: Lost executor 2 on 172.23.0.6: Command exited with code 137
26/02/21 10:56:08 WARN TaskSetManager: Lost task 2.1 in stage 7.0 (TID 24) (172.23.0.6 executor 2): ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: Command exited with code 137
26/02/21 10:56:08 WARN TaskSetManager: Lost task 0.1 in stage 7.0 (TID 25) (172.23.0.6 executor 2): ExecutorLostFailure (executor 2 exited caused by one of th

✅ saved: /opt/spark/data/silver/clean/toji/year=2026/month=02


In [7]:
# Stop the Spark session created at the top of the notebook; this is the
# last step, after the cleaned data has been written.
spark.stop()