```
# Copy each *_perforation.json file in the current directory to dbfs:/Volumes/geodata/petra/perforation/, one `databricks fs cp` call per file.
bryan@ichabod mac_bucket % for file in *_perforation.json; do
  databricks fs cp "$file" dbfs:/Volumes/geodata/petra/perforation/
done
```


In [None]:
# Read the JSON data stored under the perforation volume directory into a
# single DataFrame (schema is inferred by the JSON reader).
perforation_dir = "/Volumes/geodata/petra/perforation/"
df = spark.read.json(perforation_dir)

# Show the raw, still-nested frame in the notebook output.
display(df)

In [None]:
from common.transforms import string_to_iso_date, generate_hash, replace_10e30_with_null
import pyspark.sql.functions as F

# Flatten the nested perforation records into one top-level column per
# attribute. Each pair is (path in the raw frame, output column name); the
# output columns follow `repo_id` in exactly the order listed here.
# perfs.recid is currently left out of the projection.
nested_field_aliases = [
    ("uwi.uwi", "uwi"),
    ("uwi.wsn", "wsn"),
    ("perfs.base", "base"),
    ("perfs.chgdate", "app_row_changed"),
    ("perfs.comptype", "comptype"),
    ("perfs.date", "date"),
    ("perfs.diameter", "diameter"),
    ("perfs.enddate", "enddate"),
    ("perfs.fmname", "formation"),
    ("perfs.method", "method"),
    ("perfs.numshots", "numshots"),
    ("perfs.perftype", "perftype"),
    ("perfs.remark", "remark"),
    ("perfs.source", "source"),
    ("perfs.top", "top"),
]

df_flat = df.select(
    F.col("repo_id"),
    *[F.col(source_path).alias(alias) for source_path, alias in nested_field_aliases],
)

# Every cleaning step below rebinds df_perforation; df_flat keeps the
# untouched projection.
df_perforation = df_flat

# Run the date columns through string_to_iso_date, overwriting each column
# in place (input and output column names are the same).
# NOTE(review): "enddate" is selected above but is not normalized here --
# confirm whether it should be included.
for date_column in ("app_row_changed", "date"):
    df_perforation = string_to_iso_date(df_perforation, date_column, date_column)

# Run the numeric columns through replace_10e30_with_null, again in place
# (presumably turning the 10e30 sentinel into real nulls -- see
# common.transforms).
# NOTE(review): "top" is selected alongside "base" but is not cleaned here --
# confirm whether it can carry the same sentinel.
for numeric_column in ("base", "diameter"):
    df_perforation = replace_10e30_with_null(
        df_perforation, numeric_column, numeric_column
    )

# Add an "id" column via generate_hash, tagged "perforation" and built from
# the columns in id_columns.
# NOTE(review): only repo_id and uwi feed the hash; if one well has several
# perforation rows they would presumably share the same id -- confirm against
# generate_hash and the intended key.
id_columns = ["repo_id", "uwi"]
df_perforation = generate_hash(df_perforation, "id", "perforation", *id_columns)

# Show the flattened, cleaned result.
display(df_perforation)