In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType
from pyproj import Transformer
from pyspark.sql.functions import pandas_udf

import datetime as dt
import pandas as pd
import yaml
import os

# Config

In [2]:
# Pipeline configuration file, resolved relative to the current working directory.
CONFIG_PATH = "./config.yaml"

with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)

data_lake_cfg = cfg["data_lake"]
ROOT = data_lake_cfg["root"]
LAYERS = data_lake_cfg["layers"]

# Bronze inputs: the address and coordinate parquet datasets.
bronze_domains = LAYERS["bronze"]["domains"]
address_src_path = os.path.join(ROOT, bronze_domains["address"]["paths"]["parquet"])
coord_src_path = os.path.join(ROOT, bronze_domains["coord"]["paths"]["parquet"])

# Silver s0 output: target path plus partition columns
# (falls back to region/sigungu when the config has no "partition" entry).
s0_address_cfg = LAYERS["silver"]["stages"]["s0"]["domains"]["address"]
s0_address_path = os.path.join(ROOT, s0_address_cfg["paths"]["parquet"])
s0_partition_cols = s0_address_cfg.get("partition", ["region", "sigungu"])

# Echo the resolved settings so the run records what it read and where it writes.
for label, value in (
    ("[PATH] address_src_path =", address_src_path),
    ("[PATH] coord_src_path   =", coord_src_path),
    ("[PATH] s0_address_path  =", s0_address_path),
    ("[CONF] s0_partition_cols =", s0_partition_cols),
):
    print(label, value)

[PATH] address_src_path = /opt/spark/data/bronze/address/parquet
[PATH] coord_src_path   = /opt/spark/data/bronze/coord/parquet
[PATH] s0_address_path  = /opt/spark/data/silver/s0/address
[CONF] s0_partition_cols = ['region', 'sigungu']


In [3]:
# Session-level SQL settings applied to this job.
spark_sql_conf = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
}

# Connect to the standalone master and apply the settings above before
# creating (or reusing) the session.
session_builder = (
    SparkSession.builder
    .appName("silver_s0_address_upsert")
    .master("spark://spark-master:7077")
)
for conf_key, conf_value in spark_sql_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/19 10:05:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Bronze 데이터 로드 (도로명주소, 위치정보)

In [4]:
# ============================================================
# LOAD
# ============================================================
# Read the two bronze parquet datasets (address first, coordinates second).
addr_df, coord_df = (
    spark.read.parquet(src_path)
    for src_path in (address_src_path, coord_src_path)
)

# Print both schemas so the columns used by the join below are visible.
for bronze_df in (addr_df, coord_df):
    bronze_df.printSchema()

                                                                                

root
 |-- 도로명주소관리번호: string (nullable = true)
 |-- 법정동코드: string (nullable = true)
 |-- 시도명: string (nullable = true)
 |-- 시군구명: string (nullable = true)
 |-- 법정읍면동명: string (nullable = true)
 |-- 법정리명: string (nullable = true)
 |-- 산여부: string (nullable = true)
 |-- 지번본번(번지): string (nullable = true)
 |-- 지번부번(호): string (nullable = true)
 |-- 도로명코드: string (nullable = true)
 |-- 도로명: string (nullable = true)
 |-- 지하여부: string (nullable = true)
 |-- 건물본번: string (nullable = true)
 |-- 건물부번: string (nullable = true)
 |-- 행정동코드: string (nullable = true)
 |-- 행정동명: string (nullable = true)
 |-- 기초구역번호(우편번호): string (nullable = true)
 |-- 이전도로명주소: string (nullable = true)
 |-- 효력발생일: string (nullable = true)
 |-- 공동주택구분: string (nullable = true)
 |-- 이동사유코드: string (nullable = true)
 |-- 건축물대장건물명: string (nullable = true)
 |-- 시군구용건물명: string (nullable = true)
 |-- 비고: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- region: string (

# 도로명주소, 위치정보 join

In [5]:
# 1) Build the PNU code and the road-name address string.

# Land-category digit inside the PNU: "2" when 산여부 is "1", otherwise "1".
pnu_land_gb = (
    F.when(F.col("산여부").cast("string") == F.lit("1"), F.lit("2"))
    .otherwise(F.lit("1"))
)

# PNU = 법정동코드 + land-category digit + main lot number + sub lot number,
# the two lot numbers left-padded with zeros to 4 characters (a missing sub
# lot number becomes "0000"). F.concat returns NULL when any part is NULL,
# so rows without 법정동코드 or 지번본번(번지) end up with a NULL PNU.
pnu_code = F.concat(
    F.col("법정동코드").cast("string"),
    pnu_land_gb,
    F.lpad(F.col("지번본번(번지)").cast("string"), 4, "0"),
    F.lpad(F.coalesce(F.col("지번부번(호)"), F.lit("0")).cast("string"), 4, "0"),
)

# Building sub-number suffix ("-N"). The value is a string column, so decide
# with an explicit digit pattern instead of comparing against the number 0:
# a string-vs-integer comparison relies on an implicit cast whose behaviour
# for blank / non-numeric values depends on the session's cast (ANSI) settings.
# The pattern accepts only non-zero digit strings; NULL, "", "0", "00" and
# non-numeric values all produce no suffix.
bld_sub_no = F.trim(F.col("건물부번").cast("string"))
has_bld_sub_no = bld_sub_no.rlike(r"^0*[1-9][0-9]*$")
building_no = F.concat(
    F.col("건물본번").cast("string"),
    F.when(has_bld_sub_no, F.concat(F.lit("-"), bld_sub_no)).otherwise(F.lit("")),
)

# concat_ws skips NULL parts, so a missing component does not leave a gap.
road_address = F.concat_ws(
    " ",
    F.col("시도명"),
    F.col("시군구명"),
    F.col("도로명"),
    building_no,
)

addr_with_pnu_df = (
    addr_df
    .withColumn("PNU코드", pnu_code)
    .withColumn("도로명주소", road_address)
    # Partition column: whitespace runs in 시군구명 replaced with "_" so the
    # value is stable as a partition directory name.
    .withColumn("sigungu", F.regexp_replace(F.col("시군구명"), r"\s+", "_"))
)

# 2) Attach coordinates (left join keeps every address row).
# NOTE(review): assumes coord_df has at most one row per join key; if it has
# more (e.g. several snapshots), address rows are multiplied — TODO confirm.
# NOTE(review): 지하여부 is not part of the key; confirm that above- and
# below-ground buildings cannot share these four values.
join_keys = ["법정동코드", "도로명코드", "건물본번", "건물부번"]

joined_df = (
    addr_with_pnu_df.alias("a")
    .join(coord_df.alias("c"), on=join_keys, how="left")
    .select(
        F.col("a.region").alias("region"),
        F.col("a.sigungu").alias("sigungu"),
        F.col("a.PNU코드").alias("PNU코드"),
        F.col("a.도로명주소").alias("도로명주소"),
        # Coordinates arrive as text: blank or unparsable values become NULL.
        F.expr("try_cast(nullif(trim(c.`X좌표`), '') as double)").alias("x_utmk"),
        F.expr("try_cast(nullif(trim(c.`Y좌표`), '') as double)").alias("y_utmk"),
    )
)

print("joined_df schema")
joined_df.printSchema()
joined_df.show(5, truncate=False)

joined_df schema
root
 |-- region: string (nullable = true)
 |-- sigungu: string (nullable = true)
 |-- PNU코드: string (nullable = true)
 |-- 도로명주소: string (nullable = false)
 |-- x_utmk: double (nullable = true)
 |-- y_utmk: double (nullable = true)



[Stage 6:>                                                          (0 + 1) / 1]

+------+-------+-------------------+---------------------------------+-------------+--------------+
|region|sigungu|PNU코드            |도로명주소                       |x_utmk       |y_utmk        |
+------+-------+-------------------+---------------------------------+-------------+--------------+
|서울  |종로구 |1111010100101150001|서울특별시 종로구 자하문로 106-3 |953247.446389|1954143.302189|
|서울  |종로구 |1111010100100650000|서울특별시 종로구 자하문로 124   |953190.915688|1954318.21969 |
|서울  |종로구 |1111010100100390006|서울특별시 종로구 자하문로 131   |953145.030158|1954371.265195|
|서울  |종로구 |1111010100100530006|서울특별시 종로구 자하문로 131-13|953093.015008|1954391.93666 |
|서울  |종로구 |1111010100100520123|서울특별시 종로구 자하문로 131-23|953040.030637|1954398.335601|
+------+-------+-------------------+---------------------------------+-------------+--------------+
only showing top 5 rows


                                                                                

# 좌표 변환 (EPSG:5179 -> EPSG:4326)

In [6]:
# Struct returned by the UDF: one nullable (latitude, longitude) pair per row.
schema = StructType([
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
])

@pandas_udf(schema)
def utmk5179_to_wgs84(x: pd.Series, y: pd.Series) -> pd.DataFrame:
    """Convert EPSG:5179 coordinates to EPSG:4326 latitude/longitude.

    Args:
        x: X (easting) values in EPSG:5179; may contain nulls.
        y: Y (northing) values in EPSG:5179; may contain nulls.

    Returns:
        A DataFrame with float columns ``latitude`` and ``longitude``, one row
        per input row. A row is NaN in both columns when either input is
        missing or non-finite, or when the transform result is non-finite.
    """
    # Imported and built inside the function so it happens where the UDF runs.
    import numpy as np
    from pyproj import Transformer

    # always_xy=True: input is (x, y) and output is (longitude, latitude).
    transformer = Transformer.from_crs("EPSG:5179", "EPSG:4326", always_xy=True)

    easting = x.astype("float64").to_numpy()
    northing = y.astype("float64").to_numpy()

    lon, lat = transformer.transform(easting, northing)
    lon = np.asarray(lon, dtype="float64")
    lat = np.asarray(lat, dtype="float64")

    # Per the pyproj docs, transform() with the default errcheck=False returns
    # inf (no exception) for points it cannot convert, so checking the inputs
    # alone is not enough: keep a row only when inputs and outputs are finite.
    valid = (
        np.isfinite(easting)
        & np.isfinite(northing)
        & np.isfinite(lon)
        & np.isfinite(lat)
    )

    return pd.DataFrame({
        "latitude": np.where(valid, lat, np.nan),
        "longitude": np.where(valid, lon, np.nan),
    })

# Apply the conversion: compute the struct once as a column, then lift its
# two fields to top-level columns and discard the struct.
wgs84_struct = utmk5179_to_wgs84(F.col("x_utmk"), F.col("y_utmk"))

converted_df = (
    joined_df
    .withColumn("wgs84", wgs84_struct)
    .withColumn("latitude", F.col("wgs84").getField("latitude"))
    .withColumn("longitude", F.col("wgs84").getField("longitude"))
    .drop("wgs84")
)

# Keep only the output columns; the projection also leaves out x_utmk / y_utmk.
# NOTE(review): this rebinds joined_df to a frame without x_utmk / y_utmk, so
# re-running this cell without first re-running the join cell will fail.
output_columns = ["region", "sigungu", "PNU코드", "도로명주소", "longitude", "latitude"]
joined_df = converted_df.select(*output_columns)

joined_df.show(5, truncate=False)

[Stage 11:>                                                         (0 + 1) / 1]

+------+-------+-------------------+---------------------------------+------------------+------------------+
|region|sigungu|PNU코드            |도로명주소                       |longitude         |latitude          |
+------+-------+-------------------+---------------------------------+------------------+------------------+
|서울  |종로구 |1111010100101150001|서울특별시 종로구 자하문로 106-3 |126.9704631686789 |37.585495823181596|
|서울  |종로구 |1111010100100650000|서울특별시 종로구 자하문로 124   |126.96981172387076|37.58706950637937 |
|서울  |종로구 |1111010100100390006|서울특별시 종로구 자하문로 131   |126.9692886221629 |37.587545277671495|
|서울  |종로구 |1111010100100530006|서울특별시 종로구 자하문로 131-13|126.96869816192319|37.58772894219645 |
|서울  |종로구 |1111010100100520123|서울특별시 종로구 자하문로 131-23|126.96809763656023|37.587783913973205|
+------+-------+-------------------+---------------------------------+------------------+------------------+
only showing top 5 rows


                                                                                

# 결과 저장

In [7]:
# Write the result as parquet, replacing whatever is at the target path and
# partitioning by the configured columns. joined_df here is the frame rebound
# in the coordinate-conversion cell (longitude / latitude columns).
s0_writer = joined_df.write.mode("overwrite").partitionBy(*s0_partition_cols)
s0_writer.parquet(s0_address_path)

print("✅ saved:", s0_address_path)

                                                                                

✅ saved: /opt/spark/data/silver/s0/address


In [8]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without re-creating the session.
spark.stop()