In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import yaml, os

from utils.spark_path import (
    get_latest_year_month_path,
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [2]:
# ============================================================
# Spark
# ============================================================
# Session-level SQL settings, applied to the builder in the order listed.
_spark_sql_settings = (
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.shuffle.partitions", "200"),
)

_session_builder = SparkSession.builder.appName("silver_s0_to_s1_chk")
_session_builder = _session_builder.master("spark://spark-master:7077")
for _conf_key, _conf_value in _spark_sql_settings:
    _session_builder = _session_builder.config(_conf_key, _conf_value)

# getOrCreate() hands back an already-active session if one exists.
spark = _session_builder.getOrCreate()

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 11:32:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# ============================================================
# Config
# ============================================================
# Reads the data-lake layout from the YAML config and resolves the concrete
# input/output parquet locations used by the rest of the notebook.

CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ROOT = cfg["data_lake"]["root"]
LAYERS = cfg["data_lake"]["layers"]

# Every path below lives under the "silver" layer.
_silver_cfg = LAYERS["silver"]

# -------------------------
# INPUT
# -------------------------

# Cleaned restaurant data; the helper picks a year/month/week sub-path
# (presumably the most recent one -- see utils.spark_path).
_restaurant_clean_rel = _silver_cfg["clean"]["domains"]["restaurant_clean"]["paths"]["parquet"]
restaurant_clean_base = os.path.join(ROOT, _restaurant_clean_rel)
restaurant_clean_path = get_latest_year_month_week_path(spark, restaurant_clean_base)

# Stage s0 address data; resolved at year/month granularity.
_s0_address_rel = _silver_cfg["stages"]["s0"]["domains"]["address"]["paths"]["parquet"]
s0_address_base = os.path.join(ROOT, _s0_address_rel)
s0_address_path = get_latest_year_month_path(spark, s0_address_base)

# -------------------------
# OUTPUT
# -------------------------

# Stage s1 restaurant_coord domain: output location plus optional partitioning.
_restaurant_coord_cfg = _silver_cfg["stages"]["s1"]["domains"]["restaurant_coord"]
restaurant_coord_src_base = os.path.join(ROOT, _restaurant_coord_cfg["paths"]["parquet"])
restaurant_coord_src_path = get_current_year_month_week_path(restaurant_coord_src_base)

# "partition" is optional in the config, so this is None when the key is absent.
partition_cols = _restaurant_coord_cfg.get("partition")

print(" restaurant clean =", restaurant_clean_path)
print("restaurant =", restaurant_coord_src_path)

 restaurant clean = /opt/spark/data/silver/clean/restaurant/year=2026/month=02/week=03
restaurant = /opt/spark/data/silver/s1/restaurant_coord/year=2026/month=02/week=03


In [4]:
# ============================================================
# LOAD
# ============================================================

# Restaurant records: every column of the resolved parquet path is kept.
rest_df = spark.read.parquet(restaurant_clean_path)

# Address records: only the PNU code, the road-name address and the
# coordinates are carried forward.
addr_df = spark.read.parquet(s0_address_path).select(
    "PNU코드",
    "도로명주소",
    "longitude",
    "latitude",
)

# Show both schemas, restaurants first.
for _loaded_df in (rest_df, addr_df):
    _loaded_df.printSchema()

root
 |-- 업체명: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 소재지: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자수: integer (nullable = true)
 |-- region: string (nullable = true)

root
 |-- PNU코드: string (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



In [5]:
# Left-join restaurants ("r") to addresses ("a") on exact equality between
# the restaurant location string and the road-name address.
_address_match = F.col("r.소재지") == F.col("a.도로명주소")
joined_df = rest_df.alias("r").join(addr_df.alias("a"), on=_address_match, how="left")

# (qualified source column, output column name), in output order.
_output_columns = [
    ("r.업체명", "업체명"),
    ("r.업종", "업종"),
    ("r.대표자", "대표자"),
    ("r.대표자수", "대표자_수"),
    ("r.소재지", "도로명주소"),
    ("r.region", "region"),
    ("a.PNU코드", "PNU코드"),
    ("a.longitude", "longitude"),
    ("a.latitude", "latitude"),
]

# NOTE(review): filtering on a non-null PNU code discards every restaurant the
# left join failed to match, so unmatched rows never reach the output.
_matched_df = joined_df.filter(F.col("PNU코드").isNotNull())
joined_rest_df = _matched_df.select(
    *[F.col(_src).alias(_dst) for _src, _dst in _output_columns]
)

In [6]:
# Print the schema of the joined result to check column names and types.
joined_rest_df.printSchema()

root
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- region: string (nullable = true)
 |-- PNU코드: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)



In [7]:
# Write the joined result as parquet, replacing whatever is at the target path.
#
# partition_cols is read with `.get("partition")`, so it can be None (key
# missing), a single string (YAML scalar) or a list. Unpacking None raises
# TypeError and unpacking a string yields one "column" per character, so the
# value is normalized to a list of column names first.
if partition_cols is None:
    _partition_by = []
elif isinstance(partition_cols, str):
    _partition_by = [partition_cols]
else:
    _partition_by = list(partition_cols)

_writer = joined_rest_df.write.mode("overwrite")
if _partition_by:
    # Only partition when the config actually names columns.
    _writer = _writer.partitionBy(*_partition_by)
_writer.parquet(restaurant_coord_src_path)

print("✅ saved:", restaurant_coord_src_path)



✅ saved: /opt/spark/data/silver/s1/restaurant_coord/year=2026/month=02/week=03


                                                                                