In [104]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import yaml, os

from utils.spark_path import (
    get_latest_year_month_path,
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [105]:
# ============================================================
# Spark
# ============================================================
spark = (
    SparkSession.builder
    .appName("silver_s0_to_s1")
    .master("spark://spark-master:7077")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

26/02/20 22:49:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Config

In [106]:
# ============================================================
# Config
# ============================================================

CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

# Input Path
restaurant_clean_src_base = os.path.join(
    ROOT,
    LAYERS["silver"]["clean"]["domains"]["restaurant_clean"]["paths"]["parquet"]
)
restaurant_clean_src_path = get_latest_year_month_week_path(spark, restaurant_clean_src_base)

s0_address_base = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s0"]["domains"]["address"]["paths"]["parquet"]
)
s0_address_path = get_latest_year_month_path(spark, s0_address_base)

s0_toji_building_base = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s0"]["domains"]["toji_building"]["paths"]["parquet"]
)
s0_toji_building_path = get_latest_year_month_path(spark, s0_toji_building_base)

# Output Path
s1_crawling_list_base = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s1"]["domains"]["crawling_list"]["paths"]["parquet"]
)
s1_crawling_list_path = get_current_year_month_week_path(s1_crawling_list_base)

s1_toji_list_base = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s1"]["domains"]["toji_list"]["paths"]["parquet"]
)
s1_toji_list_path = get_current_year_month_week_path(s1_toji_list_base)

s1_crawling_list_partition_cols = LAYERS["silver"]["stages"]["s1"]["domains"]["crawling_list"].get("partition")
s1_toji_list_partition_cols = LAYERS["silver"]["stages"]["s1"]["domains"]["toji_list"].get("partition")

print("[PATH] restaurant_src_path  =", restaurant_clean_src_path)
print("[PATH] s0_address_path      =", s0_address_path)
print("[PATH] s0_toji_building_path      =", s0_toji_building_path)
print("[PATH] s1_crawling_list_path   =", s1_crawling_list_path)
print("[PATH] s1_toji_list_path   =", s1_toji_list_path)

[PATH] restaurant_src_path  = /opt/spark/data/silver/clean/restaurant/year=2026/month=02/week=03
[PATH] s0_address_path      = /opt/spark/data/silver/s0/address/year=2026/month=02
[PATH] s0_toji_building_path      = /opt/spark/data/silver/s0/toji_building/year=2026/month=02
[PATH] s1_crawling_list_path   = /opt/spark/data/silver/s1/crawling_list/year=2026/month=02/week=03
[PATH] s1_toji_list_path   = /opt/spark/data/silver/s1/toji_list/year=2026/month=02/week=03


# 데이터 로드 (Bronze 식당 정보 + S0 주소, 토지_건축물)

In [107]:
rest_clean_df = (
    spark.read.parquet(restaurant_clean_src_path)
    .select("업체명", "대표자", "소재지", "업종", "대표자수")
)

addr_df = (
    spark.read.parquet(s0_address_path)
    .select("PNU코드", "도로명주소", "longitude", "latitude")
)

toji_building_df = (
    spark.read.parquet(s0_toji_building_path)
    .drop("year", "month")
)


print("rest_df")
rest_clean_df.printSchema()

print("addr_df")
addr_df.printSchema()

print("toji_building_df")
toji_building_df.printSchema()


rest_df
root
 |-- 업체명: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 소재지: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자수: integer (nullable = true)

addr_df
root
 |-- PNU코드: string (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)

toji_building_df
root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- region: string (nullable = true)



# 식당 + 주소 join

In [None]:
# ============================================================
# 식당 + 주소 join (matched + unmatched 같이 만들기)
# ============================================================

joined_df = (
    rest_clean_df.alias("r")
    .join(
        addr_df.alias("a"),
        F.col("r.소재지") == F.col("a.도로명주소"),
        "left"
    )
)

joined_rest_df = (
    joined_df
    .filter(F.col("PNU코드").isNotNull())
    .select(
        F.col("r.업체명").alias("업체명"),
        F.col("r.업종").alias("업종"),
        F.col("r.대표자").alias("대표자"),
        F.col("r.대표자수").alias("대표자_수"),
        F.col("r.소재지").alias("도로명주소"),
        F.col("a.PNU코드").alias("PNU코드"),
        F.col("a.longitude").alias("longitude"),
        F.col("a.latitude").alias("latitude"),
    )
)

joined_rest_df.printSchema()

root
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- PNU코드: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



# 토지 그룹 / 필터 리스트 추출

In [109]:
# 건물이 1개 있는 토지만 필터링
toji_with_1_building_df = (
    toji_building_df
    .filter(F.col("관리_건축물대장_PK").isNotNull())
)


In [110]:
t = toji_with_1_building_df.alias("t")
r = joined_rest_df.alias("r")

toji_building_restaurant_df = (
    t.join(
        r,
        F.col("t.고유번호") == F.col("r.PNU코드"),
        how="left"
    )
    .drop(F.col("r.PNU코드"))
    .filter(F.col("업체명").isNotNull())
)

toji_building_restaurant_df.printSchema()

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- region: string (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



In [111]:
# 아무 건물도 없는 필지
toji_with_0_building_df = (
    toji_building_df
    .filter(F.col("관리_건축물대장_PK").isNull())
)
toji_with_0_building_df.printSchema()

root
 |-- 고유번호: string (nullable = true)
 |-- 법정동명: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- region: string (nullable = true)



In [112]:
# 위에서 만든 join table과 concat을 위해 column 추가
toji_with_0_building_df = (
    toji_with_0_building_df
    .withColumn("업체명", F.lit(None).cast("string"))
    .withColumn("업종", F.lit(None).cast("string"))
    .withColumn("대표자", F.lit(None).cast("string"))
    .withColumn("도로명주소", F.lit(None).cast("string"))
    .withColumn("longitude", F.lit(None).cast("double"))
    .withColumn("latitude", F.lit(None).cast("double"))
)

final_toji_df = (
    toji_building_restaurant_df
    .unionByName(toji_with_0_building_df)
)

In [113]:
group_has_restaurant_df = (
    final_toji_df
    .groupBy("법정동명", "본번")
    .agg(
        F.max(
            F.when(F.col("업체명").isNotNull(), 1).otherwise(0)
        ).alias("has_restaurant")
    )
    .filter(F.col("has_restaurant") == 1)
    .select("법정동명", "본번")
)

filtered_final_toji_df = (
    final_toji_df
    .join(group_has_restaurant_df, on=["법정동명", "본번"], how="inner")
)

In [114]:
toji_group_df = (
    filtered_final_toji_df
    .groupBy("법정동명", "본번", "소유권변동일자", "region")
    .agg(
        F.min("부번").alias("부번")
    )
    .distinct()
)


In [115]:

print(toji_group_df.filter(F.col("법정동명").startswith("경기도 용인시 처인구")).count())

                                                                                

4359


In [116]:
filtered_final_toji_df.printSchema()

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- region: string (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



In [117]:
toji_group_df.printSchema()

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 부번: string (nullable = true)



In [118]:

(
    filtered_final_toji_df
    .write.mode("overwrite")
    .partitionBy(*s1_toji_list_partition_cols)
    .parquet(s1_toji_list_path)
)
print("✅ saved:", s1_toji_list_path)

(
    toji_group_df
    .write.mode("overwrite")
    .partitionBy(*s1_crawling_list_partition_cols)
    .parquet(s1_crawling_list_path)
)
print("✅ saved:", s1_toji_list_path)



                                                                                

✅ saved: /opt/spark/data/silver/s1/toji_list/year=2026/month=02/week=03




✅ saved: /opt/spark/data/silver/s1/toji_list/year=2026/month=02/week=03


                                                                                

In [None]:
spark.stop()