```
# Upload each local file matching *_survey.json to the survey_raw volume (one `databricks fs cp` per file).
bryan@ichabod mac_bucket % for file in *_survey.json; do
  databricks fs cp "$file" dbfs:/Volumes/geodata/petra/survey_raw/
done
```


In [None]:
# Load everything under the survey_raw volume as JSON into a single DataFrame.
# No explicit schema is supplied, so Spark infers one from the files.
df = spark.read.json("/Volumes/geodata/petra/survey_raw/")

# Preview the raw records before any reshaping.
display(df)

In [None]:
from common.transforms import (
    string_to_iso_date,
    generate_hash,
    int_to_boolean,
    replace_10e30_with_null,
)
import pyspark.sql.functions as F

# Nested source fields to lift to top-level columns, as
# (path inside the JSON record, output column name), in output order.
# "dirsurvdata.data" is not selected here.
survey_field_map = [
    ("uwi.uwi", "uwi"),
    ("uwi.wsn", "wsn"),
    ("dirsurvdata.active", "active"),
    ("dirsurvdata.adddate", "app_row_created"),
    ("dirsurvdata.chgdate", "app_row_changed"),
    ("dirsurvdata.datasize", "datasize"),
    ("dirsurvdata.depunits", "depunits"),
    ("dirsurvdata.dippresent", "dippresent"),
    ("dirsurvdata.md1", "md1"),
    ("dirsurvdata.md2", "md2"),
    ("dirsurvdata.numrecs", "numrecs"),
    ("dirsurvdata.remarks", "remarks"),
    ("dirsurvdata.survrecid", "survrecid"),
    ("dirsurvdata.tvd1", "tvd1"),
    ("dirsurvdata.tvd2", "tvd2"),
    ("dirsurvdata.vs_1", "vs_1"),
    ("dirsurvdata.vs_2", "vs_2"),
    ("dirsurvdata.vs_3", "vs_3"),
    ("dirsurvdata.xoff1", "xoff1"),
    ("dirsurvdata.xoff2", "xoff2"),
    ("dirsurvdata.xyunits", "xyunits"),
    ("dirsurvdata.yoff1", "yoff1"),
    ("dirsurvdata.yoff2", "yoff2"),
    ("dirsurvdef.survey_type", "survey_type"),
]

# repo_id is already a top-level column; everything else is pulled out of
# the nested structs and renamed.
df_flat = df.select(
    F.col("repo_id"),
    *[F.col(source).alias(target) for source, target in survey_field_map],
)

# Column clean-ups, applied in this order. Each helper from common.transforms
# is called as helper(df, column, column), so the result is written back under
# the same column name.
column_cleanups = (
    # enforce timestamp for dates
    (string_to_iso_date, ("app_row_created", "app_row_changed")),
    # define booleans
    (int_to_boolean, ("active", "dippresent")),
    # ensure real nulls
    (
        replace_10e30_with_null,
        (
            "md1",
            "md2",
            "tvd1",
            "tvd2",
            "vs_1",
            "vs_2",
            "vs_3",
            "xoff1",
            "xoff2",
            "yoff1",
            "yoff2",
        ),
    ),
)

df_survey = df_flat
for cleanup, target_columns in column_cleanups:
    for target_column in target_columns:
        df_survey = cleanup(df_survey, target_column, target_column)

# add id hash: generate_hash receives the output column name "id", the
# literal "survey", and the identifying columns listed below.
id_columns = ["repo_id", "uwi"]
df_survey = generate_hash(df_survey, "id", "survey", *id_columns)

# Preview the flattened, cleaned result.
display(df_survey)

In [None]:
from common.transforms import upsert_dataframe_to_table

# Upsert the cleaned survey rows into the bronze table, then show whatever
# the helper returns.
survey_bronze_table = "geodata.petra.survey_bronze"
result = upsert_dataframe_to_table(df_survey, survey_bronze_table)

display(result)