In [91]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import functions as F
import yaml

import os

from utils.spark_path import get_latest_year_month_path, get_current_year_month_path

In [92]:
# Spark session setup: connect to the standalone master, enable adaptive query
# execution with shuffle-partition coalescing, and start from 400 shuffle partitions.
# getOrCreate() returns the already-running session when one exists.
# The application name (including its spelling) is kept exactly as registered.
spark = (
    SparkSession.builder
    .appName('transfrom_s0_ground_building')
    .master("spark://spark-master:7077")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", "400")
    .getOrCreate()
)

26/02/20 22:40:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Config

In [93]:
# Pipeline configuration file, resolved relative to the notebook's working directory.
CONFIG_PATH = "./config.yaml"

# Parse the YAML config with the canonical codec name. A misspelled name such as
# "|utf-8" only resolves through Python's lenient codec-name normalization, so the
# exact spelling is used here.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Data-lake root directory and the per-layer definitions used to build every path below.
ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

# Every path used by this notebook lives under the silver layer of the config.
silver_layer = LAYERS["silver"]
clean_domains = silver_layer["clean"]["domains"]
s0_toji_building_cfg = silver_layer["stages"]["s0"]["domains"]["toji_building"]

# Inputs: cleaned building and toji (land) parquet data.
# get_latest_year_month_path presumably picks the newest year=/month= directory
# under the base path -- TODO confirm against utils.spark_path.
building_clean_src_base = os.path.join(
    ROOT, clean_domains["building_clean"]["paths"]["parquet"]
)
building_clean_src_path = get_latest_year_month_path(spark, building_clean_src_base)

toji_clean_src_base = os.path.join(
    ROOT, clean_domains["toji_clean"]["paths"]["parquet"]
)
toji_clean_src_path = get_latest_year_month_path(spark, toji_clean_src_base)

# Output: s0 stage location for the joined toji/building data.
s0_toji_building_base = os.path.join(
    ROOT, s0_toji_building_cfg["paths"]["parquet"]
)
s0_toji_building_path = get_current_year_month_path(s0_toji_building_base)

# Optional in the config: .get() yields None when no "partition" entry is present.
s0_partition_cols = s0_toji_building_cfg.get("partition")

# NOTE(review): the label text below (address / coord) does not match the variables
# being printed (building / toji); the strings are left unchanged.
print("[PATH] address_src_path =", building_clean_src_path)
print("[PATH] coord_src_path   =", toji_clean_src_path)
print("[PATH] s0_address_path  =", s0_toji_building_path)
print("[CONF] s0_partition_cols =", s0_partition_cols)

[PATH] address_src_path = /opt/spark/data/silver/clean/building/year=2026/month=02
[PATH] coord_src_path   = /opt/spark/data/silver/clean/toji/year=2026/month=02
[PATH] s0_address_path  = /opt/spark/data/silver/s0/toji_building/year=2026/month=02
[CONF] s0_partition_cols = ['region']


# 원천 데이터 로드

In [98]:
# Load both cleaned inputs as parquet: building first, then toji.
building_clean_df, toji_clean_df = (
    spark.read.parquet(src_path)
    for src_path in (building_clean_src_path, toji_clean_src_path)
)

In [99]:
# Show the schema of each input frame (building first, then toji).
for source_df in (building_clean_df, toji_clean_df):
    source_df.printSchema()

root
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 건축면적: double (nullable = true)
 |-- region: string (nullable = true)

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 토지면적: double (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- region: string (nullable = true)



# 토지 필터링
1. 토지에 건물이 없거나
2. 일반 건물 1개만 있는 토지만 필터링

In [100]:
# Alias both sides so the shared 고유번호 column can be referenced unambiguously.
t = toji_clean_df.alias("t")
# region is removed from the building side, so the joined frame keeps only the toji region.
b = building_clean_df.drop("region").alias("b")

# Rows match when both sides carry the same 고유번호.
same_parcel = F.col("t.고유번호") == F.col("b.고유번호")

# Left join: every toji row is kept; building columns are null where nothing matched.
# The building-side copy of the key is dropped afterwards.
toji_building_df = t.join(b, on=same_parcel, how="left").drop(F.col("b.고유번호"))

toji_building_df.printSchema()

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 토지면적: double (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 건축면적: double (nullable = true)



### 건물이 없거나 일반 건물이 1개뿐인 토지 필터링

In [101]:
# Per-parcel building statistics, one row per 고유번호.
pk_cnt_df = toji_building_df.groupBy("고유번호").agg(
    # count() ignores nulls, so parcels whose left join found no building get 0.
    F.count("관리_건축물대장_PK").alias("pk_cnt"),
    # Whether a general building exists: 1 if any joined row has 대장_구분_코드 == 1, else 0.
    F.max(
        F.when(F.col("대장_구분_코드").cast("int") == 1, 1).otherwise(0)
    ).alias("has_general_building"),
)

# Keep parcels with no building at all, or with exactly one building that is a general one.
no_building = F.col("pk_cnt") == 0
single_general_building = (F.col("pk_cnt") == 1) & (F.col("has_general_building") == 1)
eligible_parcels_df = pk_cnt_df.filter(no_building | single_general_building)

# Restrict the joined rows to the eligible parcels and discard the helper columns.
toji_binary_building_df = (
    toji_building_df
    .join(eligible_parcels_df, on="고유번호", how="inner")
    .drop("pk_cnt", "has_general_building", "대장_구분_코드")
)

In [None]:
# Sanity check on 토지면적: count null rows and zero rows.
# Both counts come from one aggregation, so the upstream join/filter lineage is
# evaluated once instead of once per count() action.
land_area = F.col("토지면적")
area_check = toji_binary_building_df.agg(
    # when() without otherwise() yields null for non-matching rows, and count() skips
    # nulls, so each expression counts only the rows satisfying its condition
    # (and returns 0, not null, when nothing matches).
    F.count(F.when(land_area.isNull(), 1)).alias("null_cnt"),
    F.count(F.when(land_area == 0, 1)).alias("zero_cnt"),
).first()

print("null:", area_check["null_cnt"])

# Number of rows whose land area is exactly 0
print("zero:", area_check["zero_cnt"])

                                                                                

null: 0




zero: 180


                                                                                

In [97]:
# Idle-site area = land area minus building area; a null 건축면적 (no joined building)
# contributes 0, so the result is simply the land area for such parcels.
# NOTE(review): no lower bound is applied, so the value is negative whenever
# 건축면적 exceeds 토지면적.
idle_area = F.col("토지면적") - F.coalesce(F.col("건축면적"), F.lit(0.0))

# NOTE(review): this rebinds toji_binary_building_df and drops both input columns,
# so re-running the cell without re-running the previous ones fails on the missing columns.
toji_binary_building_df = (
    toji_binary_building_df
    .withColumn("유휴부지_면적", idle_area)
    .drop("토지면적", "건축면적")
)

toji_binary_building_df.printSchema()

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)



In [24]:
# Normalize the optional partition setting before using it:
#  - the config lookup uses .get(), so the value is None when the key is absent;
#  - a single column written as a plain string must not be unpacked character by character.
partition_cols = s0_partition_cols or []
if isinstance(partition_cols, str):
    partition_cols = [partition_cols]

# Overwrite the s0 output for the current year/month path; partition only when
# partition columns are actually configured.
s0_writer = toji_binary_building_df.write.mode("overwrite")
if partition_cols:
    s0_writer = s0_writer.partitionBy(*partition_cols)
s0_writer.parquet(s0_toji_building_path)
print("✅ saved:", s0_toji_building_path)

                                                                                

✅ saved: /opt/spark/data/silver/s0/toji_building/year=2026/month=02


In [13]:
# Stop the Spark session; this is the last cell, run after the output has been written.
spark.stop()