In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import yaml, os

from utils.spark_path import (
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [2]:
# ------------------------------------------------------------
# Spark session
# ------------------------------------------------------------
# SQL settings applied to this job's session: adaptive query execution,
# adaptive partition coalescing, and the shuffle partition count.
spark_sql_conf = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
}

session_builder = (
    SparkSession.builder
    .appName("silver_s2_to_gold")
    .master("spark://spark-master:7077")
)
for conf_key, conf_value in spark_sql_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# getOrCreate() returns an existing session if one is active, otherwise
# it starts a new one with the settings above.
spark = session_builder.getOrCreate()

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 12:40:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Config

In [3]:
# ------------------------------------------------------------
# Configuration: resolve input / output locations in the data lake
# ------------------------------------------------------------

CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# Input: silver / s2 / toji_owner_match parquet location.
# NOTE(review): the helper presumably picks the most recent
# year=/month=/week= partition under the base path - confirm in utils.spark_path.
toji_owner_match_cfg = LAYERS["silver"]["stages"]["s2"]["domains"]["toji_owner_match"]
s2_toji_owner_match_base = os.path.join(ROOT, toji_owner_match_cfg["paths"]["parquet"])
s2_toji_owner_match_path = get_latest_year_month_week_path(spark, s2_toji_owner_match_base)

# Output: gold / restaurant parquet location.
# NOTE(review): the helper presumably appends the current
# year=/month=/week= partition to the base path - confirm in utils.spark_path.
gold_restaurant_cfg = LAYERS["gold"]["domains"]["restaurant"]
gold_restaurant_base = os.path.join(ROOT, gold_restaurant_cfg["paths"]["parquet"])
gold_restaurant_path = get_current_year_month_week_path(gold_restaurant_base)

# Optional config key: this is None when "partition" is not configured.
gold_restaurant_partition_cols = gold_restaurant_cfg.get("partition")

print("[PATH] s2_toji_owner_match_path   =", s2_toji_owner_match_path)
print("[PATH] gold_restaurant_path   =", gold_restaurant_path)

[PATH] s2_toji_owner_match_path   = /opt/spark/data/silver/s2/toji_owner_match/year=2026/month=02/week=03
[PATH] gold_restaurant_path   = /opt/spark/data/gold/restaurant/year=2026/month=02/week=03


In [4]:
# Load the silver-s2 land/owner match dataset resolved in the config cell.
toji_owner_df = spark.read.parquet(s2_toji_owner_match_path)

# Remove these three columns first, then collapse rows that are identical
# on every remaining column. The idle-area column is re-attached later
# from a separate aggregate of toji_owner_df.
dedup_excluded_cols = ("부번", "고유번호", "유휴부지_면적")
toji_owner_deduplicate_df = toji_owner_df.drop(*dedup_excluded_cols).dropDuplicates()

# Schema of the raw input (before the drop / de-duplication above).
toji_owner_df.printSchema()



                                                                                

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- 지주: string (nullable = true)
 |-- region: string (nullable = true)



In [5]:
# Two-step aggregation of the idle-area column:
#   1) within each (법정동명, 본번, 지주, 부번, region) group keep the largest value;
#   2) add those maxima up across 부번 for each (법정동명, 본번, 지주, region).
per_bubun_keys = ["법정동명", "본번", "지주", "부번", "region"]
per_owner_keys = ["법정동명", "본번", "지주", "region"]

max_idle_area = F.max("유휴부지_면적").alias("유휴부지_면적(max)")
parking_max_df = toji_owner_df.groupBy(*per_bubun_keys).agg(max_idle_area)

total_idle_area = F.sum("유휴부지_면적(max)").alias("유휴부지_면적")
parking_sum_df = parking_max_df.groupBy(*per_owner_keys).agg(total_idle_area)

# Row count followed by a 10-row preview with untruncated cell values.
print(parking_sum_df.count())
parking_sum_df.show(10, truncate=False)

                                                                                

1953
+----------------------------------+----+------+------+------------------+
|법정동명                          |본번|지주  |region|유휴부지_면적     |
+----------------------------------+----+------+------+------------------+
|경기도 용인시 처인구 백암면 백암리|421 |이성희|경기  |34.5              |
|경기도 용인시 처인구 백암면 근창리|122 |이옥희|경기  |166.75            |
|경기도 용인시 처인구 포곡읍 삼계리|59  |버    |경기  |1060.0            |
|경기도 용인시 처인구 포곡읍 전대리|370 |이혜경|경기  |242.02            |
|경기도 용인시 처인구 남사읍 방아리|844 |백영미|경기  |697.0             |
|경기도 용인시 처인구 김량장동     |333 |배균태|경기  |145.04000000000002|
|경기도 용인시 처인구 원삼면 문촌리|6   |버    |경기  |1180.6            |
|경기도 용인시 처인구 모현읍 초부리|808 |이보용|경기  |67.42             |
|경기도 용인시 처인구 김량장동     |323 |이충희|경기  |22.19999999999999 |
|경기도 용인시 처인구 마평동       |601 |황붕연|경기  |183.0             |
+----------------------------------+----+------+------+------------------+
only showing top 10 rows


In [6]:
# Keys shared by the de-duplicated rows and the summed idle-area table.
owner_join_keys = ["법정동명", "본번", "지주", "region"]

# Only rows that carry a business name are kept. The filter column comes
# solely from the left side, so filtering before the left join yields the
# same rows as filtering after it.
named_rows = (
    toji_owner_deduplicate_df
    .filter(F.col("업체명").isNotNull())
    .alias("r")
)

# Left join: every kept row survives; 유휴부지_면적 is null when no aggregate
# row has equal keys.
# NOTE(review): this is a plain equi-join, so a row whose key contains a
# null (e.g. a null 지주) never matches and gets a null area - confirm
# that this is the intended treatment.
final_df = named_rows.join(
    parking_sum_df.alias("p"),
    on=owner_join_keys,
    how="left",
)

final_df.printSchema()

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 지주: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)



In [7]:
# ============================================================
# 1. Business success probability (0~1)
#    Sum of five weighted components; the weights add up to 1.0.
# ============================================================

# 1-1. Business-type score (weight 0.4)
업종_score = (
    F.when(F.col("업종").isin("일반음식점", "제과점영업"), 0.4)
     .when(F.col("업종") == "휴게음식점", 0.1)
     # Listed explicitly for readability; same value as the fallback.
     .when(F.col("업종").isin("집단급식소", "위탁급식영업"), 0.0)
     .otherwise(0.0)
)

# 1-2. Land owner == business representative (weight 0.3)
# Both names are compared with all whitespace removed.
rep_clean = F.regexp_replace(F.col("대표자"), r"\s+", "")
owner_clean = F.regexp_replace(F.col("지주"), r"\s+", "")

# The names match when they have the same length (>= 2), the same first
# character and - for names longer than two characters - the same last
# character. Characters in between are not compared.
# substring(..., -1, 1) takes the last character; a negative start
# position counts from the end of the string.
name_match = (
    F.col("대표자").isNotNull() &
    F.col("지주").isNotNull() &
    (F.length(rep_clean) >= 2) &
    (F.length(rep_clean) == F.length(owner_clean)) &
    (F.substring(rep_clean, 1, 1) == F.substring(owner_clean, 1, 1)) &
    (
        (F.length(owner_clean) == 2) |  # two characters: first character only
        (F.substring(rep_clean, -1, 1) == F.substring(owner_clean, -1, 1))
    )
)
지주대표자_score = F.when(name_match, 0.3).otherwise(0.0)

# 1-3. Register type code (weight 0.15): only code "1" scores.
대장_score = F.when(F.col("대장_구분_코드") == "1", 0.15).otherwise(0.0)

# 1-4. Number of land co-owners (weight 0.1): 0.1 * 0.5^n.
# A missing count contributes 0.0, like every other component's fallback,
# instead of turning the whole probability (and the total score) into null.
공유인_score = (
    F.when(
        F.col("공유인수").isNotNull(),
        F.lit(0.1) * F.pow(F.lit(0.5), F.col("공유인수")),
    )
    .otherwise(0.0)
)

# 1-5. Number of business representatives (weight 0.05): 0.05 when exactly one.
대표자수_score = F.when(F.col("대표자_수") == 1, 0.05).otherwise(0.0)

# Sum of all components
영업성공확률 = 업종_score + 지주대표자_score + 대장_score + 공유인_score + 대표자수_score


# ============================================================
# 2. Maximum profitability
# ============================================================
# (idle area / 40) * 10 = idle area * 0.25, capped at 100.
# F.least() ignores null arguments, so without the guard a row with an
# unknown idle area would receive the cap (100.0), i.e. the best possible
# score. With the guard the profitability - and therefore the total
# score - stays null for such rows.
idle_area = F.col("유휴부지_면적")
수익성 = F.when(
    idle_area.isNotNull(),
    F.least(idle_area * 0.25, F.lit(100.0)),
)

# ============================================================
# 3. Truck accessibility
# ============================================================
# Constant for every row in this notebook.
화물차_접근성 = F.lit(3)

# ============================================================
# 4. Total score
# ============================================================
총점 = 영업성공확률 * 수익성 * 화물차_접근성 / 5

# ============================================================
# Apply
# ============================================================
# Each expression reads the base columns only, so the rounding of one
# output column does not feed into another.
final_df = (
    final_df
    .withColumn("영업성공확률", F.round(영업성공확률, 2))
    .withColumn("화물차_접근성", 화물차_접근성)
    .withColumn("수익성", F.round(수익성, 2))
    .withColumn("총점", F.round(총점, 2))
)

In [8]:
# Print the schema of final_df to confirm the columns added by the scoring cell.
final_df.printSchema()

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 지주: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 대장_구분_코드: string (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 영업성공확률: double (nullable = true)
 |-- 화물차_접근성: integer (nullable = false)
 |-- 수익성: double (nullable = true)
 |-- 총점: double (nullable = true)



In [9]:
# Derive "sigungu" from 법정동명 and keep only the output columns.
#
# Both patterns end with a lookahead for whitespace or end of string, so
# the 시 / 군 / 구 suffix must close a whole address token. Without it a
# name that merely contains 구 (e.g. "원주시 단구동") was read as a
# "시 + 구" pair and produced "원주시 단구".
SI_GU_PATTERN = r"([가-힣]+시)\s+([가-힣]+구)(?=\s|$)"
SI_OR_GUN_PATTERN = r"([가-힣]+(?:시|군))(?=\s|$)"

# NOTE: the select() below drops 법정동명, so re-running this cell on its
# own output fails; re-run from the join cell instead.
final_df = final_df.withColumn(
    "sigungu",
    F.when(
        # "<...시> <...구>" present: join both parts with a single space.
        F.col("법정동명").rlike(SI_GU_PATTERN),
        F.concat_ws(" ",
            F.regexp_extract(F.col("법정동명"), SI_GU_PATTERN, 1),
            F.regexp_extract(F.col("법정동명"), SI_GU_PATTERN, 2)
        )
    )
    .otherwise(
        # Otherwise: the first token ending in 시 or 군 (empty string if none).
        F.regexp_extract(F.col("법정동명"), SI_OR_GUN_PATTERN, 1)
    )
).select("sigungu", "총점", "영업성공확률", "수익성", "화물차_접근성", "업체명", "도로명주소", "유휴부지_면적", "longitude", "latitude", "region")

In [10]:
# Write the result as parquet, replacing whatever already exists at the
# target path.
#
# The partition setting is read with .get("partition"), so it can be
# missing (None); it could also be a single column name given as a plain
# string. Normalise both cases: unpacking None raises TypeError, and
# unpacking a string would pass its characters as column names.
partition_cols = gold_restaurant_partition_cols or []
if isinstance(partition_cols, str):
    partition_cols = [partition_cols]

gold_writer = final_df.write.mode("overwrite")
if partition_cols:
    gold_writer = gold_writer.partitionBy(*partition_cols)
gold_writer.parquet(gold_restaurant_path)

print("✅ saved:", gold_restaurant_path)

✅ saved: /opt/spark/data/gold/restaurant/year=2026/month=02/week=03


                                                                                

In [11]:
# Stop the Spark session created at the top of the notebook.
spark.stop()