# Overview

In [0]:
%sql
-- Make sure the gold schema exists before anything is written into it.
CREATE SCHEMA IF NOT EXISTS geodata.gold;

-- Drop any previous copy of the combined well-header table; it is rebuilt
-- from the silver tables further down in this notebook.
DROP TABLE IF EXISTS geodata.gold.well_header;

In [0]:
# Build the gold well-header table by stacking the three silver vendor tables.
occ_df = spark.table("geodata.silver.well_header_occ")
sp_df = spark.table("geodata.silver.well_header_sp")
env_df = spark.table("geodata.silver.well_header_env")

# unionByName lines columns up by name rather than by position, so the three
# inputs must expose the same column names.
wh_df = occ_df.unionByName(sp_df).unionByName(env_df)

# Write with an explicit overwrite mode so this cell is idempotent: with the
# default save mode, re-running the cell on its own fails once the table exists.
# overwriteSchema keeps the result equivalent to "drop and recreate" if the
# silver schemas have changed since the last run.
(
    wh_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("geodata.gold.well_header")
)

In [0]:
%sql
-- Sanity check: list the tables currently present in the gold schema.
SHOW TABLES IN geodata.gold;


In [0]:
%sql
-- Row count of the combined gold table, tagged with a constant label.
SELECT
  'Three-vendor',
  COUNT(*) AS row_count
FROM geodata.gold.well_header

In [0]:
%sql
-- List each well's geometry per vendor; the ordering keeps all vendor rows
-- for the same uwi_10 next to each other.
SELECT
  uwi_10,
  vendor,
  geom
FROM geodata.gold.well_header
ORDER BY
  uwi_10,
  vendor

In [0]:
from pyspark.sql.functions import col, when, expr

# Compare, per well (uwi_10), the geometry reported by each vendor.
well_header_df = spark.table("geodata.gold.well_header")

# Vendor codes expected in the `vendor` column, and the pairs to compare.
vendor_codes = ["ENV", "OCC", "SP"]
vendor_pairs = [("ENV", "OCC"), ("ENV", "SP"), ("OCC", "SP")]

# One row per well, one geometry column per vendor (geom_ENV, geom_OCC, geom_SP).
pivoted_df = (
    well_header_df
    .groupBy("uwi_10")
    .pivot("vendor", vendor_codes)
    .agg(expr("first(geom)"))
)
for vendor_code in vendor_codes:
    pivoted_df = pivoted_df.withColumnRenamed(vendor_code, f"geom_{vendor_code}")

# Add one distance column per vendor pair (distance_env_occ, distance_env_sp,
# distance_occ_sp), in that order.
with_distances_df = pivoted_df
for left_vendor, right_vendor in vendor_pairs:
    with_distances_df = with_distances_df.withColumn(
        f"distance_{left_vendor.lower()}_{right_vendor.lower()}",
        expr(f"ST_DISTANCESPHEROID(geom_{left_vendor}, geom_{right_vendor})"),
    )

# Keep only wells where at least one vendor pair has both geometries present.
has_complete_pair = None
for left_vendor, right_vendor in vendor_pairs:
    both_present = (
        col(f"geom_{left_vendor}").isNotNull()
        & col(f"geom_{right_vendor}").isNotNull()
    )
    has_complete_pair = (
        both_present if has_complete_pair is None else (has_complete_pair | both_present)
    )

wide_df = with_distances_df.filter(has_complete_pair)

display(wide_df)

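SQL sketch of the pivot step (`your_table` is a placeholder for the source table, e.g. `geodata.gold.well_header`):

```sql
-- Pivot wide
SELECT
  uwi_10,
  MAX(CASE WHEN vendor = 'ENV' THEN geom END) AS geom_env,
  MAX(CASE WHEN vendor = 'OCC' THEN geom END) AS geom_occ,
  MAX(CASE WHEN vendor = 'SP'  THEN geom END) AS geom_sp
FROM your_table
GROUP BY uwi_10
```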


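SQL sketch of the pairwise distances (`pivoted_table` is a placeholder for the result of the pivot query above). Note that the PySpark cell above calls `ST_DISTANCESPHEROID`, whereas this sketch calls `ST_DISTANCE`; confirm which function is intended before running it.

```sql
SELECT
  *,
  ST_DISTANCE(geom_env, geom_occ) AS distance_env_occ,
  ST_DISTANCE(geom_env, geom_sp)  AS distance_env_sp,
  ST_DISTANCE(geom_occ, geom_sp)  AS distance_occ_sp
FROM pivoted_table
WHERE (geom_env IS NOT NULL AND geom_occ IS NOT NULL)
   OR (geom_env IS NOT NULL AND geom_sp  IS NOT NULL)
   OR (geom_occ IS NOT NULL AND geom_sp  IS NOT NULL)
```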


In [0]:
%sql
-- Scratch check of ST_DISTANCESPHEROID on two literal points.
-- Alternatives left over from exploration (not executed):
--   SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'));
--   show functions like 'ST_DISTANCE%'
SELECT
  ST_DISTANCESPHEROID(
    ST_GEOMFROMTEXT('POINT(0 0)'),
    ST_GEOMFROMTEXT('POINT(1 1)')
  );