In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import yaml, os

from utils.spark_path import (
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [26]:
# ============================================================
# Spark
# ============================================================
# Build the session in two steps: identity/cluster settings first, then the
# SQL runtime options. getOrCreate() hands back an already-running session
# if one exists.
session_builder = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("silver_clean_restaurant")
)
session_builder = session_builder.config("spark.sql.adaptive.enabled", "true")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "200")

spark = session_builder.getOrCreate()

26/02/20 20:54:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [33]:
# ============================================================
# Config
# ============================================================

CONFIG_PATH = "./config.yaml"

# Parse the data-lake layout once; everything below is derived from it.
with open(CONFIG_PATH, encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# -------------------------
# INPUT
# -------------------------
# Bronze parquet location of the restaurant-owner domain.
bronze_restaurant_cfg = LAYERS["bronze"]["domains"]["restaurant_owner"]
restaurant_src_base = os.path.join(ROOT, bronze_restaurant_cfg["paths"]["parquet"])

# Presumably resolves the newest year=/month=/week= directory under the base
# path (helper lives in utils.spark_path) — confirm against its implementation.
restaurant_src_path = get_latest_year_month_week_path(spark, restaurant_src_base)

# -------------------------
# OUTPUT
# -------------------------
# Silver/clean parquet location of the cleaned restaurant domain.
silver_restaurant_cfg = LAYERS["silver"]["clean"]["domains"]["restaurant_clean"]
restaurant_clean_base = os.path.join(ROOT, silver_restaurant_cfg["paths"]["parquet"])

# Presumably appends the current year=/month=/week= suffix to the base path —
# confirm against utils.spark_path.
restaurant_clean_path = get_current_year_month_week_path(restaurant_clean_base)

# Optional setting: None when the domain declares no "partition" key.
partition_cols = silver_restaurant_cfg.get("partition")

print("[SOURCE] restaurant =", restaurant_src_path)
print("[TARGET] restaurant =", restaurant_clean_path)
print("[CONF] partition    =", partition_cols)

[SOURCE] restaurant = /opt/spark/data/bronze/restaurant_owner/parquet/year=2026/month=02/week=01
[TARGET] restaurant = /opt/spark/data/silver/clean/restaurant/year=2026/month=02/week=08
[CONF] partition    = ['region']


In [37]:
# ============================================================
# LOAD
# ============================================================

# Columns carried forward from the bronze parquet; everything else is dropped.
SOURCE_COLUMNS = ["업체명", "대표자", "소재지", "업종", "region"]

raw_reader = spark.read
rest_df = raw_reader.parquet(restaurant_src_path).select(*SOURCE_COLUMNS)

print("=== raw schema ===")
rest_df.printSchema()

=== raw schema ===
root
 |-- 업체명: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 소재지: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- region: string (nullable = true)



In [38]:
# ============================================================
# CLEANING
# ============================================================

# Characters that separate co-representatives inside a single 대표자 value.
# Both the head count and the "keep only the first name" truncation below are
# built from this one class, so a value counted as N people is always reduced
# to its first name (e.g. "A·B" -> count 2, name "A").
OWNER_SEPARATOR = r"[,/·]"

rest_clean_df = (
    rest_df

    # Drop rows with no business name
    .filter(F.col("업체명").isNotNull())
    # Drop rows with no representative
    .filter(F.col("대표자").isNotNull())
    # Drop rows with no address
    .filter(F.col("소재지").isNotNull())

    # Strip parenthesised fragments (and the whitespace before them)
    .withColumn("소재지", F.regexp_replace(F.col("소재지"), r"\s*\([^)]*\)", ""))

    # Drop everything from the first comma onward
    .withColumn("소재지", F.regexp_replace(F.col("소재지"), r",.*$", ""))

    # Remove a token ending in 동/읍/면/리 when the token that follows it
    # contains 로 or 길 (i.e. the administrative unit written in front of a
    # road name); the lookahead leaves the road-name token itself untouched.
    .withColumn(
        "소재지",
        F.regexp_replace(
            F.col("소재지"),
            r"\s+\S+(동|읍|면|리)\s+(?=\S+(로|길))",
            " "
        )
    )

    # Collapse whitespace runs to a single space and trim both ends
    .withColumn("소재지", F.trim(F.regexp_replace(F.col("소재지"), r"\s+", " ")))
)

rest_clean_df = (
    rest_clean_df
    # Representative head count, computed from the value as loaded (before
    # the name itself is normalised below):
    #   "... 외 N"            -> 1 + N
    #   separator-delimited   -> number of delimited parts
    #   anything else         -> 1
    .withColumn(
        "대표자수",
         F.when(
             F.col("대표자").rlike(r"외\s*\d+"),
             1 + F.regexp_extract(F.col("대표자"), r"외\s*(\d+)", 1).cast("int")
         )
         .when(
             F.col("대표자").rlike(OWNER_SEPARATOR),
             F.size(F.split(F.col("대표자"), r"\s*" + OWNER_SEPARATOR + r"\s*"))
         )
         .otherwise(F.lit(1))
    )

    # Normalise the representative name down to a single person
    .withColumn("대표자", F.trim(F.col("대표자")))
    # Strip parenthesised fragments
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*\([^)]*\)\s*", ""))
    # Keep only the first name: cut at the first separator (same class as the count)
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*" + OWNER_SEPARATOR + r".*$", ""))
    # Strip a trailing "외 N인" / "외 N명" suffix
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*외\s*\d+\s*(인|명)\s*$", ""))
    # Strip a trailing bare "외N" suffix
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*외\s*\d+\s*$", ""))
    # Collapse whitespace runs and trim
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s+", " "))
    .withColumn("대표자", F.trim(F.col("대표자")))

    # One row per (business name, representative, address) after normalisation
    .dropDuplicates(["업체명", "대표자", "소재지"])
)

rest_clean_df.printSchema()

root
 |-- 업체명: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 소재지: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 대표자수: integer (nullable = true)



In [39]:
# ============================================================
# SAVE (silver/clean/restaurant)
# ============================================================

# The "partition" setting is read with .get(), so it may be absent (None), and
# a YAML scalar would arrive as a plain string. Normalise both cases to a list
# so that partitionBy never receives None or a string splatted into characters.
if partition_cols is None:
    write_partition_cols = []
elif isinstance(partition_cols, str):
    write_partition_cols = [partition_cols]
else:
    write_partition_cols = list(partition_cols)

# "overwrite" replaces whatever already exists at the target path.
clean_writer = rest_clean_df.write.mode("overwrite")

# Only partition the output when columns are actually configured.
if write_partition_cols:
    clean_writer = clean_writer.partitionBy(*write_partition_cols)

clean_writer.parquet(restaurant_clean_path)

print("✅ saved:", restaurant_clean_path)



✅ saved: /opt/spark/data/silver/clean/restaurant/year=2026/month=02/week=08


                                                                                

In [40]:
# Stop the Spark session at the end of the job.
spark.stop()