In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import yaml, os

from utils.spark_path import (
    get_current_year_month_week_path,
    get_latest_year_month_path,
    get_latest_year_month_week_path
)

In [2]:
# ============================================================
# Spark
# ============================================================
spark = (
    SparkSession.builder
    .appName("silver_clean_restaurant")
    .master("spark://spark-master:7077")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 05:56:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# ============================================================
# Config
# ============================================================

CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

REGION = cfg["target"]["REGION"]

# -------------------------
# INPUT
# -------------------------

restaurant_src_base = os.path.join(
    ROOT,
    LAYERS["bronze"]["domains"]["restaurant_owner"]["paths"]["parquet"]
)

restaurant_src_path = get_latest_year_month_week_path(
    spark, restaurant_src_base
)

s0_address_base = os.path.join(
    ROOT,
    LAYERS["silver"]["stages"]["s0"]["domains"]["address"]["paths"]["parquet"]
)

s0_address_path = get_latest_year_month_path(
    spark, s0_address_base
)

# -------------------------
# OUTPUT
# -------------------------

restaurant_clean_base = os.path.join(
    ROOT,
    LAYERS["silver"]["clean"]["domains"]["restaurant_clean"]["paths"]["parquet"]
)

restaurant_clean_path = get_current_year_month_week_path(
    restaurant_clean_base
)

partition_cols = LAYERS["silver"]["clean"]["domains"]["restaurant_clean"].get(
    "partition", ["region"]
)

print("[SOURCE] restaurant =", restaurant_src_path)
print("[SOURCE] address    =", s0_address_path)
print("[TARGET] restaurant =", restaurant_clean_path)
print("[CONF] partition    =", partition_cols)

[SOURCE] restaurant = /opt/spark/data/bronze/restaurant_owner/parquet/year=2026/month=02/week=03
[SOURCE] address    = /opt/spark/data/silver/s0/address/year=2026/month=02
[TARGET] restaurant = /opt/spark/data/silver/clean/restaurant/year=2026/month=02/week=03
[CONF] partition    = ['region']


In [4]:
# ============================================================
# LOAD
# ============================================================

rest_df = (
    spark.read.parquet(restaurant_src_path)
    .filter(F.col("region") == REGION)
    .select("업체명", "대표자", "소재지", "업종", "region")
)

addr_df = (
    spark.read.parquet(s0_address_path)
    .filter(F.col("region") == REGION)
    .select("PNU코드", "도로명주소", "longitude", "latitude", "region")
)


print("=== raw schema ===")
rest_df.printSchema()
print("restaurant :", rest_df.count())
print("address   :", addr_df.count())

                                                                                

=== raw schema ===
root
 |-- 업체명: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 소재지: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- region: string (nullable = true)

restaurant : 212286
address   : 1029733


In [5]:
# ============================================================
# CLEANING
# ============================================================

clean_rest_addr_df = (
    rest_df

    # 업체명 NULL 제거
    .filter(F.col("업체명").isNotNull())
    # 대표자 NULL 제거
    .filter(F.col("대표자").isNotNull())
    # 소재지 NULL 제거
    .filter(F.col("소재지").isNotNull())

    # 괄호 제거
    .withColumn("소재지", F.regexp_replace(F.col("소재지"), r"\s*\([^)]*\)", ""))

    # 쉼표 뒤 제거
    .withColumn("소재지", F.regexp_replace(F.col("소재지"), r",.*$", ""))

    # 도로명 앞 행정동 제거
    .withColumn(
        "소재지",
        F.regexp_replace(
            F.col("소재지"),
            r"\s+\S+(동|읍|면|리)\s+(?=\S+(로|길))",
            " "
        )
    )

    # 공백 정리
    .withColumn("소재지", F.trim(F.regexp_replace(F.col("소재지"), r"\s+", " ")))
)

clean_rest_df = (
    clean_rest_addr_df
    # 대표자수 계산
    .withColumn(
        "대표자수",
         F.when(
             F.col("대표자").rlike(r"외\s*\d+"),
             1 + F.regexp_extract(F.col("대표자"), r"외\s*(\d+)", 1).cast("int")
         )
         .when(
             F.col("대표자").rlike(r"[,/·]"),
             F.size(F.split(F.col("대표자"), r"\s*[,/·]\s*"))
         )
         .otherwise(F.lit(1))
    )

    # 대표자 정규화
    .withColumn("대표자", F.trim(F.col("대표자")))
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*\([^)]*\)\s*", ""))   # 괄호 제거
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*[,/].*$", ""))       # 첫 토큰만
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*외\s*\d+\s*(인|명)\s*$", ""))
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s*외\s*\d+\s*$", ""))  # "외3"
    .withColumn("대표자", F.regexp_replace(F.col("대표자"), r"\s+", " "))
    .withColumn("대표자", F.trim(F.col("대표자")))

    .dropDuplicates(["업체명", "소재지"])
)

print("restaurant :", clean_rest_df.count())
clean_rest_df.show(5, truncate=False)

                                                                                

restaurant : 189622


                                                                                

+-------------------------+------+----------------------------------------+----------+------+--------+
|업체명                   |대표자|소재지                                  |업종      |region|대표자수|
+-------------------------+------+----------------------------------------+----------+------+--------+
|#깡패소곱창              |공*균 |경기도 동두천시 거북마루로 4            |일반음식점|경기  |1       |
|#먹빵                    |진*   |경기도 수원시 팔달구 권광로180번길 41   |일반음식점|경기  |1       |
|#호프광장                |이*원 |경기도 화성시 만세구 발안공단로5길 117-1|일반음식점|경기  |1       |
|(ATTO)아토커피 양주옥정점|최*영 |경기도 양주시 옥정동로5가길 15          |휴게음식점|경기  |1       |
|(BBQ)풍덕천중앙점        |문*곤 |경기도 용인시 수지구 포은대로 319       |일반음식점|경기  |1       |
+-------------------------+------+----------------------------------------+----------+------+--------+
only showing top 5 rows


                                                                                

도로명주소 중복 개수: 0




+------+------+-----+
|소재지|업체명|count|
+------+------+-----+
+------+------+-----+



                                                                                

In [13]:
# ============================================================
# Address Matching
# ============================================================

joined_df = (
    clean_rest_df.alias("r")
    .join(
        addr_df.alias("a"),
        F.col("r.소재지") == F.col("a.도로명주소"),
        "left"
    )
    .drop("도로명주소", F.col("a.region"))
)

total = joined_df.count()
matched = joined_df.filter(F.col("PNU코드").isNotNull()).count()

print("total     :", total)
print("matched   :", matched)
print("unmatched :", total - matched)
print("match rate:", matched / total if total else 0)

                                                                                

total     : 189622
matched   : 186566
unmatched : 3056
match rate: 0.9838837265718113


In [14]:
restaurant_clean_df = (
    joined_df
    .filter(F.col("PNU코드").isNotNull())
)

print("=== clean schema ===")
restaurant_clean_df.printSchema()
restaurant_clean_df.show(5, truncate=False)
print("clean count:", restaurant_clean_df.count())

=== clean schema ===
root
 |-- 업체명: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 소재지: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- region: string (nullable = true)
 |-- 대표자수: integer (nullable = true)
 |-- PNU코드: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



                                                                                

+----------------+------+-----------------------------+----------+------+--------+-------------------+------------------+-----------------+
|업체명          |대표자|소재지                       |업종      |region|대표자수|PNU코드            |longitude         |latitude         |
+----------------+------+-----------------------------+----------+------+--------+-------------------+------------------+-----------------+
|멜로우스팟      |왕*령 |경기도 가평군 가평제방길 33-1|휴게음식점|경기  |1       |4182025022100540001|127.51864897300084|37.81903743972822|
|백암순대국      |공*희 |경기도 가평군 가화로 1030-62 |일반음식점|경기  |1       |4182035021108780004|127.5507082601009 |37.88896563541379|
|롤링파스타가평점|정*민 |경기도 가평군 가화로 114     |일반음식점|경기  |1       |4182025021104730016|127.51383060303066|37.82990535141203|
|피자랑쥬(가평점)|장*란 |경기도 가평군 가화로 114     |일반음식점|경기  |1       |4182025021104730016|127.51383060303066|37.82990535141203|
|파리바게뜨      |이*진 |경기도 가평군 가화로 114     |제과점영업|경기  |1       |4182025021104730016|127.51383060303066|37.82990535141203|
+----------------+-

                                                                                

clean count: 186566


                                                                                

도로명주소 중복 개수: 0


                                                                                

+------+------+-----+
|소재지|업체명|count|
+------+------+-----+
+------+------+-----+



In [8]:
# ============================================================
# SAVE (silver/clean/restaurant)
# ============================================================
(
    restaurant_clean_df
    .write
    .mode("overwrite")
    .partitionBy(*partition_cols)
    .parquet(restaurant_clean_path)
)

print("✅ saved:", restaurant_clean_path)

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `region` is ambiguous, could be: [`a`.`region`, `r`.`region`]. SQLSTATE: 42704

In [None]:
spark.stop()