In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import yaml, os

from utils.spark_path import (
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [23]:
# ============================================================
# Spark
# ============================================================
spark = (
    SparkSession.builder
    .appName("silver_s1_to_s2")
    .master("spark://spark-master:7077")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

26/02/21 03:34:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Config

In [24]:
# ============================================================
# Config
# ============================================================

CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

# Input Path
s2_toji_owner_match_base = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s2"]["domains"]["toji_owner_match"]["paths"]["parquet"]
)
s2_toji_owner_match_path = get_latest_year_month_week_path(spark, s2_toji_owner_match_base)


# Output Path
gold_restaurant_master_base = os.path.join(
    ROOT,
    LAYERS["gold"]["domains"]["restaurant_master"]["paths"]["parquet"]
)
gold_restaurant_master_path = get_current_year_month_week_path(gold_restaurant_master_base)

gold_restaurant_master_partition_cols = LAYERS["gold"]["domains"]["restaurant_master"].get("partition")


print("[PATH] s2_toji_owner_match_path   =", s2_toji_owner_match_path)
print("[PATH] gold_restaurant_master_path   =", gold_restaurant_master_path)

[PATH] s2_toji_owner_match_path   = /opt/spark/data/silver/s2/toji_owner_match/year=2026/month=02/week=03
[PATH] gold_restaurant_master_path   = /opt/spark/data/gold/restaurant_master/year=2026/month=02/week=03


In [25]:
toji_owner_df = (
    spark.read.parquet(s2_toji_owner_match_path)
)

toji_owner_df.printSchema()



root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- 지주: string (nullable = true)
 |-- region: string (nullable = true)



In [26]:
parking_max_df = (
    toji_owner_df
    .groupBy("법정동명", "본번", "지주", "부번", "region")
    .agg(F.max("유휴부지_면적").alias("유휴부지_면적(max)"))
)

parking_sum_df = (
    parking_max_df
    .groupBy("법정동명", "본번", "지주", "region")
    .agg(
        F.sum("유휴부지_면적(max)").alias("유휴부지_면적")
    )
)

print(parking_sum_df.count())
parking_sum_df.show(10, truncate=False)

3112
+----------------------------------+----+------------+------+------------------+
|법정동명                          |본번|지주        |region|유휴부지_면적     |
+----------------------------------+----+------------+------+------------------+
|경기도 용인시 처인구 남사읍 방아리|844 |백영미      |경기  |2565.0            |
|경기도 용인시 처인구 모현읍 초부리|75  |이철희      |경기  |34.0              |
|경기도 용인시 처인구 포곡읍 삼계리|59  |버          |경기  |1327.0            |
|경기도 용인시 처인구 남사읍 봉무리|658 |김태홍      |경기  |1446.0            |
|경기도 용인시 처인구 원삼면 문촌리|6   |버          |경기  |1478.6            |
|경기도 용인시 처인구 역북동       |605 |최경서      |경기  |248.0             |
|경기도 용인시 처인구 유방동       |750 |버          |경기  |314.0             |
|경기도 용인시 처인구 역북동       |418 |사해환      |경기  |10.47             |
|경기도 용인시 처인구 원삼면 사암리|140 |버          |경기  |565.15            |
|경기도 용인시 처인구 원삼면 두창리|85  |주식회사라진|경기  |2386.6499999999996|
+----------------------------------+----+------------+------+------------------+
only showing top 10 rows


In [27]:
final_df = (
    toji_owner_df.alias("r")
    .join(
        parking_sum_df.alias("p"),
        on=["법정동명", "본번", "지주", "region"],
        how="left"
    )
)

final_df.printSchema()

root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 지주: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)



In [None]:
final_df.show()

+-----------------------------+----+------+------+-------------------+----+--------+--------------+----+------------------+-------------------+-------------------------+----------+------+---------+---------------------------------+------------------+------------------+-------------------+
|                     법정동명|본번|  지주|region|           고유번호|지목|공유인수|소유권변동일자|부번|관리_건축물대장_PK|      유휴부지_면적|                   업체명|      업종|대표자|대표자_수|                       도로명주소|         longitude|          latitude|      유휴부지_면적|
+-----------------------------+----+------+------+-------------------+----+--------+--------------+----+------------------+-------------------+-------------------------+----------+------+---------+---------------------------------+------------------+------------------+-------------------+
|  경기도 용인시 처인구 고림동| 984|박연화|  경기|4146110600109840000|  대|       8|    2017-03-30|NULL|    11161100387728|-116.33000000000001|         역시풍천민물장어|일반음식점| 이*희|        1|경기도 용인시 처인구 경안천로 230|127.2189

26/02/21 04:04:02 ERROR TaskSchedulerImpl: Lost executor 0 on 172.18.0.6: worker lost: Not receiving heartbeat for 60 seconds
26/02/21 04:04:02 ERROR TaskSchedulerImpl: Lost executor 1 on 172.18.0.5: worker lost: Not receiving heartbeat for 60 seconds


In [None]:
def name_match(col_rep, col_owner):
    return (
        col_rep.isNotNull() &
        col_owner.isNotNull() &
        (F.length(col_rep) >= 2) &
        (F.length(col_rep) == F.length(col_owner)) &
        (F.substring(col_rep, 1, 1) == F.substring(col_owner, 1, 1)) &
        (
            # 2글자: 첫 글자만 비교
            (F.length(col_owner) == 2) |
            # 3글자 이상: 첫 + 끝 글자 비교
            (F.substring(col_rep, F.length(col_rep), 1) ==
            F.substring(col_owner, F.length(col_owner), 1))
        )
    )

In [None]:
spark.stop()