In [1]:
import pyspark
import delta
from delta.tables import DeltaTable

In [2]:
builder = (
    pyspark.sql.SparkSession.builder.appName("nested-delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

In [3]:
spark = delta.configure_spark_with_delta_pip(builder).getOrCreate()

24/05/07 06:02:54 WARN Utils: Your hostname, anders-silo-MB-air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.114 instead (on interface en0)
24/05/07 06:02:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/anders/.ivy2/cache
The jars for the packages stored in: /Users/anders/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8219aca1-36bb-4346-a968-074bdfacd124;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 79ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	--------------------------------------------------------------------

:: loading settings :: url = jar:file:/Users/anders/Library/Caches/pypoetry/virtualenvs/nested-delta-bqIhkT2k-py3.12/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	0 artifacts copied, 3 already retrieved (0kB/2ms)
24/05/07 06:02:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Data Structure

In [5]:
import pyspark.sql.functions as F
from pyspark.sql import types as t
import uuid
from delta.tables import *
from enum import Enum
from dataclasses import dataclass, field

In [13]:

@dataclass
class SIUnitDataMixin:
    sign: str
    unit: int
    dimension: str

class SIUnit(SIUnitDataMixin, Enum):
    AMPERE = ("A", "ampere", "electric current")
    AMPERE_HOUR = ("Ah", "ampere-hour", "electric capacity")
    CELCIUS = ("c", "celcius", "temperature")
    VOLT = ("V", "volt", "electric potential")
    MILLISECOND = ("ms", "millisecond", "time")

    def to_dict(self):
        return {
            "sign": self.sign,
            "name": self.unit,
            "dimension": self.dimension
        }

In [9]:
SIUnit.AMPERE_HOUR.to_dict()

{'sign': 'Ah', 'name': 'ampere-hour', 'dimension': 'electric capacity'}

In [20]:
data_schema = StructType([
    StructField(
        "test_id",
        t.StringType(),
        nullable=False,
        metadata={"comment": "uuid identifier for the test"}
    ),
    StructField(
        name="cycler_info",
        nullable=False,
        metadata={"comment": "Details about the cycler used to run the test"},
        dataType=t.StructType([
            StructField(
                "channel_number",
                t.IntegerType(),
                nullable=False,
                metadata={"comment": "Which equipment channel number (or similar identifier) was used to run this test"}
            ),
            StructField(
                "cycler_id",
                t.StringType(),
                nullable=False,
                metadata={"comment": "An optional identifying string unique to the cycler across the organization"}
            ),
            StructField(
                "server_version",
                t.StringType(),
                nullable=True,
                metadata={"comment": "The version of the server software used to run the test"}
            ),
            StructField(
                "client_version",
                t.StringType(),
                nullable=True,
                metadata={"comment": "The version of the client software used to run the test"}
            ),
        ]),
    ),
    StructField(
        "device_info",
        t.StructType([
            StructField(
                "device_id",
                t.StringType(),
                nullable=False,
                metadata={"comment": "Identifying string unique to the device/cell/pack across the organization"}
            ),
            StructField(
                "device_name",
                t.StringType(),
                nullable=False,
                metadata={"comment": "Descriptive name the test device"}
            ),
            StructField(
                "nominal_capacity",
                t.FloatType(),
                nullable=True,
                metadata={
                    "comment": "The nominal capacity of the test device in Ampere Hours",
                    "unit": SIUnit.AMPERE_HOUR.to_dict()
                }
            ),
            StructField(
                "calibration_date",
                t.DateType(),
                nullable=True,
                metadata={"comment": "The date the test device was calibrated"}
            ),
            StructField(
                "cell_type",
                t.StringType(),
                nullable=True,
                metadata={"comment": "The cell type of the test device"}
            ),
            StructField(
                "cell_formulation_id",
                t.StringType(),
                nullable=True,
                metadata={"comment": "An id that identifies what chemical formulation was used to make the cell."}
            )
        ]),
        nullable=False,
        metadata={"comment": "Info about the device/cell/pack that was tested"}
    ),
    StructField(
        "procedure",
        t.StructType([
            StructField(
                "procedure_id",
                t.StringType(),
                nullable=False,
                metadata={"comment": "uuid identifier for the test procedure"}
            ),
            StructField(
                "procedure_name",
                t.StringType(),
                nullable=False,
                metadata={"comment": "Descriptive name the test procedure"}
            ),
        ]),
        nullable=False,
        metadata={"comment": "Info about the test procedure"}
    ),
    StructField(
        "project",
        t.StringType(),
        nullable=True,
        metadata={"comment": "The name of the project the device belongs to"}
    ),
    StructField(
        name="test_data",
        nullable=False,
        metadata={"comment": "The measurements made by the cycler during the test"},
        dataType=StructType([
            StructField(
              "start_datetime",
              t.TimestampType(),
              nullable=False,
              metadata={"comment": "The absolute datetime the test started. ISO 8601 format. Must include timezone. If timezone is not specified, UTC is assumed"}  
            ),
            StructField(
            "measurements",
                t.ArrayType(
                    StructType([
                        StructField(
                            "current",
                            t.FloatType(),
                            nullable=False,
                            metadata={
                                "comment": "The current in Ampere. The sign convention is positive for charge current and negative for discharge current.",
                                "unit": SIUnit.AMPERE.to_dict()
                            }
                        ),
                        StructField(
                            "voltage",
                            t.FloatType(),
                            nullable=False,
                            metadata={
                                "comment": "The voltage in Volts",
                                "unit": SIUnit.VOLT.to_dict()
                            }
                        ),
                        StructField(
                            "elapsed_time",
                            t.IntegerType(),
                            nullable=False,
                            metadata={"comment": "Elapsed time in milliseconds since `start_datetime`"}
                        ),
                        StructField(
                            "measurement_datetime",
                            t.TimestampType(),
                            nullable=False,
                            metadata={"comment": """The absolute datetime of the measurement for the individual data point. 
                                    ISO 8601 format. Must include timezone. If timezone is not specified, UTC is assumed"""}
                        ),
                        StructField(
                            "datapoint_number",
                            t.IntegerType(),
                            nullable=True,
                            metadata={"comment": "The index number of the data point within the test. Starting at 1."}
                        ),
                        StructField(
                            "cycle_number",
                            t.LongType(),
                            nullable=True,
                            metadata={"comment": """The index number of the cycle within the test. Starting at 1.
                                    Cycles are monotonically increasing and gapless (doesn’t skip any numbers).
                                    Special Note: if a cycle column is not observed, the default algorith mwill look for 
                                    the first charge datapoint after any discharge datapoint as the boundary for a new cycle"""}
                        ),
                        StructField(
                            "step_index",
                            t.LongType(),
                            metadata={"comment": "Program step number associated with each control step."}
                        ),
                        StructField(
                            "step_time",
                            t.FloatType(),
                            metadata={
                                "comment": "Elapsed time since the start of the start of the current step.",
                                "unit": SIUnit.MILLISECOND.to_dict()
                            }
                        )

                    ])
                )
            )
        ]),
       
    )
])

In [22]:
# create a delta table for fake test data
battery_test_data = (
    DeltaTable
    .createOrReplace(spark)
    .property("descrption", "battery test data")
    .addColumns(data_schema)
    .location("./temp/battery_test_data")
    .execute()
)

24/05/07 06:22:35 ERROR Utils: Aborting task
org.apache.spark.sql.delta.DeltaAnalysisException: [DELTA_NESTED_NOT_NULL_CONSTRAINT] The element type of the field test_data.measurements contains a NOT NULL constraint. Delta does not support NOT NULL constraints nested within arrays or maps. To suppress this error and silently ignore the specified constraints, set spark.databricks.delta.constraints.allowUnenforcedNotNull.enabled = true.
Parsed element type:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "current",
    "type" : "float",
    "nullable" : false,
    "metadata" : {
      "comment" : "The current in Ampere. The sign convention is positive for charge current and negative for discharge current.",
      "unit" : {
        "sign" : "A",
        "name" : "ampere",
        "dimension" : "electric current"
      }
    }
  }, {
    "name" : "voltage",
    "type" : "float",
    "nullable" : false,
    "metadata" : {
      "comment" : "The voltage in Volts",
      "unit" : {
     

AnalysisException: [DELTA_NESTED_NOT_NULL_CONSTRAINT] The element type of the field test_data.measurements contains a NOT NULL constraint. Delta does not support NOT NULL constraints nested within arrays or maps. To suppress this error and silently ignore the specified constraints, set spark.databricks.delta.constraints.allowUnenforcedNotNull.enabled = true.
Parsed element type:
{
  "type" : "struct",
  "fields" : [ {
    "name" : "current",
    "type" : "float",
    "nullable" : false,
    "metadata" : {
      "comment" : "The current in Ampere. The sign convention is positive for charge current and negative for discharge current.",
      "unit" : {
        "sign" : "A",
        "name" : "ampere",
        "dimension" : "electric current"
      }
    }
  }, {
    "name" : "voltage",
    "type" : "float",
    "nullable" : false,
    "metadata" : {
      "comment" : "The voltage in Volts",
      "unit" : {
        "sign" : "V",
        "name" : "volt",
        "dimension" : "electric potential"
      }
    }
  }, {
    "name" : "elapsed_time",
    "type" : "integer",
    "nullable" : false,
    "metadata" : {
      "comment" : "Elapsed time in milliseconds since `start_datetime`"
    }
  }, {
    "name" : "measurement_datetime",
    "type" : "timestamp",
    "nullable" : false,
    "metadata" : {
      "comment" : "The absolute datetime of the measurement for the individual data point. \n                                    ISO 8601 format. Must include timezone. If timezone is not specified, UTC is assumed"
    }
  }, {
    "name" : "datapoint_number",
    "type" : "integer",
    "nullable" : true,
    "metadata" : {
      "comment" : "The index number of the data point within the test. Starting at 1."
    }
  }, {
    "name" : "cycle_number",
    "type" : "long",
    "nullable" : true,
    "metadata" : {
      "comment" : "The index number of the cycle within the test. Starting at 1.\n                                    Cycles are monotonically increasing and gapless (doesn’t skip any numbers).\n                                    Special Note: if a cycle column is not observed, the default algorith mwill look for \n                                    the first charge datapoint after any discharge datapoint as the boundary for a new cycle"
    }
  }, {
    "name" : "step_index",
    "type" : "long",
    "nullable" : true,
    "metadata" : {
      "comment" : "Program step number associated with each control step."
    }
  }, {
    "name" : "step_time",
    "type" : "float",
    "nullable" : true,
    "metadata" : {
      "comment" : "Elapsed time since the start of the start of the current step.",
      "unit" : {
        "sign" : "ms",
        "name" : "millisecond",
        "dimension" : "time"
      }
    }
  } ]
}

In [27]:
# generate fake Data based on data_schema and insert into delta table
from pyspark import Row
import datetime

fake_data = spark.createDataFrame(
    data = [
        Row(
            test_id=uuid.uuid4().hex,
            cycler_info=Row(
                channel_number=1,
                cycler_id="some cycler id",
                server_version="some server version",
                client_version="some client version"
            ),
            device_info=Row(
                device_id="some device id",
                device_name="some device name",
                nominal_capacity=1.0,
                calibration_date=datetime.date.fromisoformat("2022-01-01")
            ),
            procedure=Row(
                procedure_id="some procedure id",
                procedure_name="some procedure name"
            ),
            project="some project",
            test_data=Row(
                start_datetime=datetime.datetime.fromisoformat("2022-01-01T00:00:00Z"),
                measurements=[
                    Row(
                        current=1.0,
                        voltage=1.0,
                        elapsed_time=1,
                        measurement_datetime=datetime.datetime.fromisoformat("2022-01-01T00:00:00Z"),
                        datapoint_number=1,
                        cycle_number=1,
                        step_index=1,
                        step_time=1.0
                    )
                ]
            )
        )
    ],
    schema=data_schema
)


fake_data.write.mode("append").format("delta").saveAsTable("battery_test_data")

24/05/07 06:40:52 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [35]:
# select from the table
selection = spark.sql("SELECT * FROM battery_test_data")

In [36]:
selection.select("test_data.measurements.current").show()

+-------+
|current|
+-------+
|  [1.0]|
+-------+

