In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import yaml, os

from utils.spark_path import (
    get_current_year_month_week_path,
    get_latest_year_month_week_path
)

In [78]:
# ============================================================
# Spark
# ============================================================
# This notebook reads the silver/s2 `toji_owner_match` data set and targets the
# gold `restaurant_master` path, so the application name identifies that stage
# (it previously carried the name of the silver s1 -> s2 job, which mislabels
# this run in the Spark UI / history server).
#
# NOTE: getOrCreate() returns the already-running session when one exists in
# this kernel. In that case Spark warns that only runtime SQL configurations
# take effect, i.e. appName/master are applied only when a new session is
# actually created.
spark = (
    SparkSession.builder
    .appName("silver_s2_to_gold_restaurant_master")
    .master("spark://spark-master:7077")
    # Adaptive Query Execution: re-optimise query plans using runtime statistics.
    .config("spark.sql.adaptive.enabled", "true")
    # Let AQE merge small shuffle partitions after each shuffle stage.
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Number of shuffle partitions (the starting point AQE coalesces from).
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

26/02/20 21:58:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Config

In [148]:
# ============================================================
# Config
# ============================================================

CONFIG_PATH = "./config.yaml"

# Parse the data-lake layout from YAML (safe_load builds plain data only).
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# ---- Input: silver / s2 / toji_owner_match ----
toji_owner_match_cfg = LAYERS["silver"]["stages"]["s2"]["domains"]["toji_owner_match"]
s2_toji_owner_match_base = os.path.join(ROOT, toji_owner_match_cfg["paths"]["parquet"])
# Project helper; presumably resolves the most recent year=/month=/week=
# partition under the base path -- confirm in utils.spark_path.
s2_toji_owner_match_path = get_latest_year_month_week_path(spark, s2_toji_owner_match_base)

# ---- Output: gold / restaurant_master ----
restaurant_master_cfg = LAYERS["gold"]["domains"]["restaurant_master"]
gold_restaurant_master_base = os.path.join(ROOT, restaurant_master_cfg["paths"]["parquet"])
# Project helper; presumably builds the partition path for the current
# year/month/week -- confirm in utils.spark_path.
gold_restaurant_master_path = get_current_year_month_week_path(gold_restaurant_master_base)

# Optional key: None when the config declares no "partition" entry.
gold_restaurant_master_partition_cols = restaurant_master_cfg.get("partition")

# Echo the resolved paths so the run can be audited from the cell output.
print("[PATH] s2_toji_owner_match_path   =", s2_toji_owner_match_path)
print("[PATH] gold_restaurant_master_path   =", gold_restaurant_master_path)

[PATH] s2_toji_owner_match_path   = /opt/spark/data/silver/s2/toji_owner_match/year=2026/month=02/week=03
[PATH] gold_restaurant_master_path   = /opt/spark/data/gold/restaurant_master/year=2026/month=02/week=03


In [149]:
# Load the silver/s2 toji_owner_match parquet data from the resolved input path.
toji_owner_df = spark.read.parquet(s2_toji_owner_match_path)

# Print column names, types and nullability as a quick sanity check of the input.
toji_owner_df.printSchema()



root
 |-- 법정동명: string (nullable = true)
 |-- 본번: string (nullable = true)
 |-- 고유번호: string (nullable = true)
 |-- 지목: string (nullable = true)
 |-- 공유인수: long (nullable = true)
 |-- 소유권변동일자: string (nullable = true)
 |-- 부번: string (nullable = true)
 |-- 관리_건축물대장_PK: string (nullable = true)
 |-- 유휴부지_면적: double (nullable = true)
 |-- 업체명: string (nullable = true)
 |-- 업종: string (nullable = true)
 |-- 대표자: string (nullable = true)
 |-- 대표자_수: integer (nullable = true)
 |-- 도로명주소: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- 지주: string (nullable = true)
 |-- region: string (nullable = true)



In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` cannot be re-run afterwards without building a new session.
spark.stop()