In [107]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import yaml, os

from utils.spark_path import (
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [108]:
# ============================================================
# Spark
# ============================================================
# SQL settings applied to the session: adaptive execution with partition
# coalescing, and the shuffle partition count.
spark_sql_settings = (
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
    ("spark.sql.shuffle.partitions", "200"),
)

session_builder = SparkSession.builder.appName("silver_s1_to_s2")
session_builder = session_builder.master("spark://spark-master:7077")
for setting_key, setting_value in spark_sql_settings:
    session_builder = session_builder.config(setting_key, setting_value)

# Reuses an already-running session if one exists, otherwise creates one.
spark = session_builder.getOrCreate()

# Config

In [109]:
# ============================================================
# Config
# ============================================================

CONFIG_PATH = "./config.yaml"

# Parse the YAML configuration that describes the data-lake layout.
with open(CONFIG_PATH, "r", encoding="utf-8") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# All inputs and the output of this notebook live under the silver layer.
silver_stages = LAYERS["silver"]["stages"]

# Input Path
# The year/month/week partition directory is resolved by the project helper
# (NOTE(review): "latest" selection logic lives in utils.spark_path).
toji_list_cfg = silver_stages["s1"]["domains"]["toji_list"]
s1_toji_list_base = os.path.join(ROOT, toji_list_cfg["paths"]["parquet"])
s1_toji_list_path = get_latest_year_month_week_path(spark, s1_toji_list_base)

ownership_cfg = silver_stages["s2"]["domains"]["ownership_inference"]
s2_ownership_base = os.path.join(ROOT, ownership_cfg["paths"]["parquet"])
s2_ownership_path = get_latest_year_month_week_path(spark, s2_ownership_base)

# Output Path
owner_match_cfg = silver_stages["s2"]["domains"]["toji_owner_match"]
s2_toji_owner_match_base = os.path.join(ROOT, owner_match_cfg["paths"]["parquet"])
s2_toji_owner_match_path = get_current_year_month_week_path(s2_toji_owner_match_base)

# Optional key: this is None when the domain config declares no "partition".
s2_toji_owner_match_partition_cols = owner_match_cfg.get("partition")

# Echo the resolved locations so a run can be audited from the cell output.
resolved_paths = (
    ("[PATH] s1_toji_list_path   =", s1_toji_list_path),
    ("[PATH] s2_ownership_path   =", s2_ownership_path),
    ("[PATH] s2_toji_owner_match_path   =", s2_toji_owner_match_path),
)
for path_label, path_value in resolved_paths:
    print(path_label, path_value)

[PATH] s1_toji_list_path   = /opt/spark/data/silver/s1/toji_list/year=2026/month=02/week=03
[PATH] s2_ownership_path   = /opt/spark/data/silver/s2/ownership_inference/year=2026/month=02/week=03
[PATH] s2_toji_owner_match_path   = /opt/spark/data/silver/s2/toji_owner_match/year=2026/month=02/week=03


# 데이터 로드 (Silver S1 토지 목록 + Silver S2 소유권 추론)

In [110]:
# Read the two parquet inputs resolved in the config cell.
toji_df = spark.read.parquet(s1_toji_list_path)
owner_df = spark.read.parquet(s2_ownership_path)

# Preview each input: name, schema, and the first two rows.
for frame_label, frame in (("toji_df", toji_df), ("owner_df", owner_df)):
    print(frame_label)
    frame.printSchema()
    frame.show(2)

                                                                                

toji_df
root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- region: string (nullable = true)

+---------------------------+----+-------------------+----+--------+--------------+----+------------------+--------------+------------------+--------------+----------+------+---------+--------------------------------+------------------+-----------------+------+
|                   법정동명|본번|           고유번호|지목|공유인수|소유권변동

In [111]:
# Alias both sides so identically named columns can be told apart in the join.
t = toji_df.alias("t")
o = owner_df.alias("o")

# A land row is paired with an owner row only when all four keys agree.
same_address = F.col("t.법정동명") == F.col("o.주소")
same_bonbun = F.col("t.본번") == F.col("o.본번")
same_change_date = F.col("t.소유권변동일자") == F.col("o.소유권변동일자")
same_region = F.col("t.region") == F.col("o.region")
join_condition = same_address & same_bonbun & same_change_date & same_region

# Inner join: land rows without a matching owner row are dropped.
matched_df = t.join(o, on=join_condition, how="inner")

# Keep every column of the land side and add only "지주" from the owner side.
toji_with_owner_df = matched_df.select(
    "t.*",
    F.col("o.지주").alias("지주"),
)

# Sanity check: row count, then the ten rows with the smallest "유휴부지_면적".
print("row 수:", toji_with_owner_df.count())
(
    toji_with_owner_df
    .orderBy(F.col("유휴부지_면적").asc())
    .show(10, truncate=False)
)

row 수: 3609
+-----------------------------+----+-------------------+----+--------+--------------+----+------------------+--------------+-------------+-------------------------------+----------+----------+---------+-------------------------------------+------------------+------------------+------+------+
|법정동명                     |본번|고유번호           |지목|공유인수|소유권변동일자|부번|관리_건축물대장_PK|대장_구분_코드|유휴부지_면적|업체명                         |업종      |대표자    |대표자_수|도로명주소                           |longitude         |latitude          |region|지주  |
+-----------------------------+----+-------------------+----+--------+--------------+----+------------------+--------------+-------------+-------------------------------+----------+----------+---------+-------------------------------------+------------------+------------------+------+------+
|경기도 용인시 처인구 고림동  |748 |4146110600107480007|대  |2       |2019-08-02    |7   |1116131981        |1             |0.0          |지에스25처인고림점             |휴게음식점|이*용     |1       

In [112]:
# Print the schema of the joined result to verify the columns before writing.
toji_with_owner_df.printSchema()

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- region: string (nullable = true)
 |-- 지주: string (nullable = true)



In [113]:

# Persist the joined result as parquet, replacing any existing output at the
# target path.
#
# The partition columns are read from the config with .get(), so they may be
# missing (None) or given as a single scalar string. Normalize to a list so a
# None is not unpacked (TypeError) and a string is not splatted into
# one-character "column names".
partition_cols = s2_toji_owner_match_partition_cols or []
if isinstance(partition_cols, str):
    partition_cols = [partition_cols]

result_writer = toji_with_owner_df.write.mode("overwrite")
if partition_cols:
    # Only partition when the config actually declares partition columns.
    result_writer = result_writer.partitionBy(*partition_cols)
result_writer.parquet(s2_toji_owner_match_path)
print("✅ saved:", s2_toji_owner_match_path)

                                                                                

✅ saved: /opt/spark/data/silver/s2/toji_owner_match/year=2026/month=02/week=03


In [114]:
# Stop the Spark session created at the top of the notebook.
spark.stop()