```
bryan@ichabod mac_bucket % for file in *_formation.json; do
  databricks fs cp "$file" dbfs:/Volumes/geodata/petra/formation/
done
```


In [None]:
# Read everything under the formation volume directory as JSON into a single
# DataFrame, then render it in the notebook for a quick visual check.
formation_dir = "/Volumes/geodata/petra/formation/"
json_reader = spark.read.format("json")
df = json_reader.load(formation_dir)

display(df)

In [None]:
from common.transforms import string_to_iso_date, generate_hash, replace_10e30_with_null
import pyspark.sql.functions as F

# Flatten the nested source records into one column per attribute.
# Each pair is (path inside the source record, output column name); the order
# of the pairs is the order of the columns in the resulting DataFrame.
nested_to_flat = [
    ("well.uwi", "uwi"),
    ("well.wsn", "wsn"),
    ("zdata.z", "depth"),
    ("zflddef.adddate", "app_row_created"),
    ("zflddef.chgdate", "app_row_changed"),
    ("zflddef.desc", "desc"),
    ("zflddef.kind", "kind"),
    ("zflddef.name", "name"),
    ("zflddef.ndec", "ndec"),
    ("zflddef.remarks", "remarks"),
    ("zflddef.source", "source"),
    ("zflddef.units", "units"),
    ("zflddef.unitstype", "unitstype"),
]

df_flat = df.select(
    F.col("repo_id"),
    *[F.col(source_path).alias(flat_name) for source_path, flat_name in nested_to_flat],
)


# Normalise the two audit date columns in place (input and output column names
# are the same). Presumably string_to_iso_date parses the raw strings into
# timestamps -- confirm against common.transforms.
df_formation = string_to_iso_date(df_flat, "app_row_created", "app_row_created")
df_formation = string_to_iso_date(df_formation, "app_row_changed", "app_row_changed")


# Clean the depth column in place. Judging by the helper's name, it swaps a
# 1e30-style "no value" sentinel for a real null -- confirm against
# common.transforms.
df_formation = replace_10e30_with_null(df_formation, "depth", "depth")


# Add an "id" column derived from the key columns below. The "formation"
# argument is presumably a namespace/prefix for the hash -- confirm against
# common.transforms.
# NOTE(review): the key is repo_id + uwi only. If a single well can carry
# several formation rows (e.g. different `name` values), these ids will not be
# unique per row -- confirm whether `name` belongs in the key.
id_columns = ["repo_id", "uwi"]
df_formation = generate_hash(df_formation, "id", "formation", *id_columns)


display(df_formation)