In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
import datetime as dt
from pyspark.sql import functions as F
from pyspark.sql import Window
import yaml

import os

# Config

In [2]:
CONFIG_PATH = "./config.yaml"

# Read the pipeline configuration from YAML.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

# Bronze-layer inputs: each configured parquet path is joined onto ROOT.
building_src_path = os.path.join(
    ROOT,
    LAYERS["bronze"]["domains"]["buildingLeader"]["paths"]["parquet"]
)

toji_src_path = os.path.join(
    ROOT,
    LAYERS["bronze"]["domains"]["tojiSoyuJeongbo"]["paths"]["parquet"]
)

# Silver-layer (stage s0) output location for the joined land/building table.
s0_toji_building_path = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s0"]["domains"]["toji_building"]["paths"]["parquet"]
)

# Columns passed to partitionBy() when the result is written.
s0_partition_cols = ["region"]

# Log each value under the name of the variable that actually holds it.
print("[PATH] building_src_path     =", building_src_path)
print("[PATH] toji_src_path         =", toji_src_path)
print("[PATH] s0_toji_building_path =", s0_toji_building_path)
print("[CONF] s0_partition_cols =", s0_partition_cols)

[PATH] address_src_path = /opt/spark/data/bronze/buildingLeader/parquet
[PATH] coord_src_path   = /opt/spark/data/bronze/tojiSoyuJeongbo/parquet
[PATH] s0_address_path  = /opt/spark/data/silver/s0/toji_building
[CONF] s0_partition_cols = ['region']


In [3]:
# The run date selects the year=/month= sub-directory of the output.
today = dt.date.today()
year = today.year
month = today.month

# Re-running this cell must not stack a second "/year=.../month=..." suffix onto
# the path, so drop any suffix left by an earlier run before appending a new one.
# Assumes the configured base path itself never contains "/year=".
s0_base_path = s0_toji_building_path.split("/year=", 1)[0]
s0_toji_building_path = f"{s0_base_path}/year={year}/month={month}"
s0_toji_building_path

'/opt/spark/data/silver/s0/toji_building/year=2026/month=2'

In [4]:
# Spark session setup.
# NOTE(review): the app name spells "transfrom" (likely meant "transform"); it is
# kept unchanged here in case dashboards or log filters key on the existing name.
spark = (
    SparkSession.builder
    .appName("transfrom_s0_ground_building")
    .master("spark://spark-master:7077")
    # Adaptive query execution, including coalescing of shuffle partitions.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", "400")
    .getOrCreate()
)

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/19 12:51:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 원천 데이터 로드

In [5]:
# Read both bronze-layer parquet sources into DataFrames.

# 1) Building
building_df = spark.read.format("parquet").load(building_src_path)

# 2) Toji
toji_df = spark.read.format("parquet").load(toji_src_path)

In [6]:
# Print the schema of each loaded source, building first and then toji.
for source_df in (building_df, toji_df):
    source_df.printSchema()

root
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 대장_구분_코드_명: string (nullable = true)
 |-- 대장_종류_코드: string (nullable = true)
 |-- 대장_종류_코드_명: string (nullable = true)
 |-- 대지_위치: string (nullable = true)
 |-- 도로명_대지_위치: string (nullable = true)
 |-- 건물_명: string (nullable = true)
 |-- 시군구_코드: string (nullable = true)
 |-- 법정동_코드: string (nullable = true)
 |-- 대지_구분_코드: string (nullable = true)
 |-- 번: string (nullable = true)
 |-- 지: string (nullable = true)
 |-- 특수지_명: string (nullable = true)
 |-- 블록: string (nullable = true)
 |-- 로트: string (nullable = true)
 |-- 외필지_수: string (nullable = true)
 |-- 새주소_도로_코드: string (nullable = true)
 |-- 새주소_법정동_코드: string (nullable = true)
 |-- 새주소_지상지하_코드: string (nullable = true)
 |-- 새주소_본_번: string (nullable = true)
 |-- 새주소_부_번: string (nullable = true)
 |-- 동_명: string (nullable = true)
 |-- 주_부속_구분_코드: string (nullable = true)
 |-- 주_부속_구분_코드_명: string (nullable = true)
 |-- 대지_면적(㎡): string (nu

# Building Pruning (PNU 고유번호 생성 + 필요한 컬럼만 select)

In [7]:
# Lot-type digit of the key: register code "0" (land, 대지) -> "1",
# "1" (mountain, 산) -> "2"; any other code is passed through unchanged.
lot_type_digit = (
    F.when(F.col("대지_구분_코드") == "0", F.lit("1"))
     .when(F.col("대지_구분_코드") == "1", F.lit("2"))
     .otherwise(F.col("대지_구분_코드"))
)

# Lot key ("고유번호"): district code + legal-dong code + lot-type digit
# + main number + sub number, the last two padded to width 4 with "0".
pnu_expr = F.concat(
    F.col("시군구_코드"),
    F.col("법정동_코드"),
    lot_type_digit,
    F.lpad(F.col("번"), 4, "0"),
    F.lpad(F.col("지"), 4, "0"),
)

# Keep only the columns the later steps use; the two area columns become doubles.
building_df = building_df.select(
    "관리_건축물대장_PK",
    pnu_expr.alias("고유번호"),
    "대장_구분_코드",
    F.col("옥외_자주식_면적(㎡)").cast("double").alias("옥외자주식면적"),
    F.col("건축_면적(㎡)").cast("double").alias("건축면적"),
)

building_df.printSchema()

root
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 옥외자주식면적: double (nullable = true)
 |-- 건축면적: double (nullable = true)



# Toji Pruning (개인 소유, 가장 최근 소유권 변동 정보만 필터링)

In [8]:

# "지번" split on "-" once; both number parts below reuse this expression.
jibun_parts = F.split(F.col("지번"), "-")

# Within each lot, order rows by parsed ownership-change date, newest first;
# rows whose date does not parse sort last.
# NOTE(review): rows sharing the same newest date have no tie-breaker, so which
# one receives rn == 1 is not determined by this ordering alone.
latest_first = Window.partitionBy("고유번호").orderBy(
    F.to_date(F.col("소유권변동일자"), "yyyy-MM-dd").desc_nulls_last()
)

toji_df = (
    toji_df
    # 1) Keep rows with ownership-type code "01" only.
    .where(F.col("소유구분코드") == "01")
    # 2) Narrow to the columns needed from here on.
    .select(
        F.col("고유번호").cast("string"),
        "법정동명",
        "지번",
        "소유권변동일자",
        "토지면적",
        "지목",
        "region",
    )
    # 3) One row per lot: the first row under the ordering above.
    .withColumn("rn", F.row_number().over(latest_first))
    .where(F.col("rn") == 1)
    # 4) Replace "지번" with its main / sub number parts and drop the helper column.
    .select(
        "고유번호",
        "법정동명",
        "소유권변동일자",
        "토지면적",
        "지목",
        "region",
        jibun_parts.getItem(0).alias("본번"),
        # No otherwise(): rows without a second part get NULL.
        F.when(F.size(jibun_parts) > 1, jibun_parts.getItem(1)).alias("부번"),
    )
)

# Check
toji_df.printSchema()

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 토지면적: double (nullable = true)
 |-- 지목: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)



# 토지 필터링
다음 두 조건 중 하나를 만족하는 토지만 남긴다.
1. 건물이 없는 토지
2. 일반 건물이 1개만 있는 토지

In [9]:
# Left join: every land row is kept; lots with no matching building row get
# NULL in the building columns.
land_side = toji_df.alias("t")
building_side = building_df.alias("b")
same_lot = F.col("t.고유번호") == F.col("b.고유번호")

toji_building_df = land_side.join(building_side, on=same_lot, how="left").drop(
    F.col("b.고유번호")  # keep only the land-side copy of the key
)

toji_building_df.printSchema()
print(toji_building_df.count())

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 토지면적: double (nullable = true)
 |-- 지목: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 옥외자주식면적: double (nullable = true)
 |-- 건축면적: double (nullable = true)





26341571


                                                                                

### 건물이 1개 이하인 토지 필터링

In [10]:
# Per lot: number of joined building rows (count() skips NULL PKs, so lots
# with no building get 0) and a 0/1 flag for "a general building exists".
is_general = F.col("대장_구분_코드").cast("int") == 1
pk_cnt_df = toji_building_df.groupBy("고유번호").agg(
    F.count("관리_건축물대장_PK").alias("pk_cnt"),
    F.max(F.when(is_general, 1).otherwise(0)).alias("has_general_building"),
)

# Lots to keep: no building at all, or exactly one building that is general.
no_building = F.col("pk_cnt") == 0
single_general = (F.col("pk_cnt") == 1) & (F.col("has_general_building") == 1)
eligible_lots_df = pk_cnt_df.where(no_building | single_general)

# Restrict the joined table to those lots, then drop the helper columns.
toji_binary_building_gg_df = toji_building_df.join(
    eligible_lots_df,
    on="고유번호",
    how="inner",
).drop("pk_cnt", "has_general_building", "대장_구분_코드")

print(toji_binary_building_gg_df.count())
toji_binary_building_gg_df.show(2)

                                                                                

23910526


[Stage 31:>                                                         (0 + 1) / 1]

+-------------------+------------------------+--------------+--------+----+------+----+----+------------------+--------------+--------+
|           고유번호|                법정동명|소유권변동일자|토지면적|지목|region|본번|부번|관리_건축물대장_PK|옥외자주식면적|건축면적|
+-------------------+------------------------+--------------+--------+----+------+----+----+------------------+--------------+--------+
|1111010100100040005|서울특별시 종로구 청운동|    2022-03-25|   25.51|  대|  서울|   4|   5|              NULL|          NULL|    NULL|
|1111010100100040009|서울특별시 종로구 청운동|    2024-02-21|   28.83|  대|  서울|   4|   9|              NULL|          NULL|    NULL|
+-------------------+------------------------+--------------+--------+----+------+----+----+------------------+--------------+--------+
only showing top 2 rows


                                                                                

In [11]:
# Debug check: display the Python type of the output path before it is used for the write.
type(s0_toji_building_path)

str

In [12]:
# Write the filtered table as parquet, replacing whatever is already at the
# target path and partitioning the files by s0_partition_cols.
toji_binary_building_gg_df.write.parquet(
    s0_toji_building_path,
    mode="overwrite",
    partitionBy=s0_partition_cols,
)
print("✅ saved:", s0_toji_building_path)

                                                                                

✅ saved: /opt/spark/data/silver/s0/toji_building/year=2026/month=2


In [13]:
# Stop the Spark session now that the output has been written.
spark.stop()