In [1]:
import pyspark
import delta
from delta.tables import DeltaTable

In [2]:
# Session builder for the "nested-delta" app. The two settings register Delta
# Lake's SQL extension and make the Delta catalog the session's `spark_catalog`.
builder = pyspark.sql.SparkSession.builder.appName("nested-delta")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the delta helper add the Delta Lake packages to the builder, then start
# the session (or pick up one that is already running).
spark = (
    delta.configure_spark_with_delta_pip(builder)
    .getOrCreate()
)

24/05/04 09:05:28 WARN Utils: Your hostname, anders-silo-MB-air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.114 instead (on interface en0)
24/05/04 09:05:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/anders/.ivy2/cache
The jars for the packages stored in: /Users/anders/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ba70de9c-3c7d-4ddc-9de3-68a1bbd0f12c;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/anders/Library/Caches/pypoetry/virtualenvs/nested-delta-bqIhkT2k-py3.12/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 97ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-ba70de9c-3c7d-4ddc-9de3-68a1bbd0f12c
	confs: [default]
	0 artifacts copied, 3 already retrieved (0kB/3ms)
24/05/04 09:05:28 

# Data Structure

In [6]:
import pyspark.sql.functions as F
from pyspark.sql import types as t
import uuid
from enum import Enum
from delta.tables import *
from enum import Enum
from dataclasses import dataclass, field

In [15]:

@dataclass
class SIUnitDataMixin:
    """Data carried by every `SIUnit` member.

    Attributes:
        sign: Unit symbol, e.g. "Ah".
        unit: Spelled-out unit name, e.g. "ampere-hour". (Not called `name`
            because `Enum` already uses that attribute for the member name.)
        dimension: Physical quantity the unit measures, e.g. "electric current".
            Excluded from the repr; an empty string when not supplied.
    """

    sign: str
    unit: str
    dimension: str = field(repr=False, default="")


class SIUnit(SIUnitDataMixin, Enum):
    """Measurement units that can be attached to schema fields as metadata.

    Each member's value is a `(sign, unit, dimension)` tuple that is unpacked
    into the `SIUnitDataMixin` fields.
    """

    AMPERE = ("A", "ampere", "electric current")
    AMPERE_HOUR = ("Ah", "ampere-hour", "electric capacity")
    CELSIUS = ("°C", "celsius", "temperature")
    # Original (misspelled) member name, kept as an alias of CELSIUS so that
    # existing `SIUnit.CELCIUS` / `SIUnit["CELCIUS"]` lookups keep working.
    CELCIUS = CELSIUS
    MILLISECOND = ("ms", "millisecond", "time")
    VOLT = ("V", "volt", "electric potential")

    def to_dict(self):
        """Return the unit as a plain dict with keys `sign`, `name`, `dimension`.

        The `name` key holds the spelled-out unit (`self.unit`), not the enum
        member name.
        """
        return {
            "sign": self.sign,
            "name": self.unit,
            "dimension": self.dimension,
        }

In [16]:
# Quick look at the dict form of a unit, as produced by `to_dict()`.
SIUnit.AMPERE_HOUR.to_dict()

{'sign': 'Ah', 'name': 'ampere-hour', 'dimension': 'electric capacity'}

In [None]:
# Schema for one cycler test record: identifying metadata (cycler, device,
# procedure, project) plus the nested `test_data.measurements` time series.
# Every field carries a "comment" entry in its metadata; fields holding a
# physical quantity also carry a "unit" entry.
# All Spark types are referenced through the `t` alias so this cell does not
# depend on names that happen to leak in through a star import.
data_schema = t.StructType([
    t.StructField(
        "test_id",
        t.StringType(),
        nullable=False,
        metadata={"comment": "uuid identifier for the test"},
    ),
    t.StructField(
        "cycler_info",
        t.StructType([
            t.StructField(
                "channel_number",
                t.IntegerType(),
                nullable=False,
                metadata={"comment": "Which equipment channel number (or similar identifier) was used to run this test"},
            ),
            # NOTE(review): the comment calls this "optional" but the field is
            # declared non-nullable; confirm which one is intended.
            t.StructField(
                "cycler_id",
                t.StringType(),
                nullable=False,
                metadata={"comment": "An optional identifying string unique to the cycler across the organization"},
            ),
            t.StructField(
                "server_version",
                t.StringType(),
                nullable=True,
                metadata={"comment": "The version of the server software used to run the test"},
            ),
            t.StructField(
                "client_version",
                t.StringType(),
                nullable=True,
                metadata={"comment": "The version of the client software used to run the test"},
            ),
        ]),
        nullable=False,
        metadata={"comment": "Details about the cycler used to run the test"},
    ),
    t.StructField(
        "device_info",
        t.StructType([
            t.StructField(
                "device_id",
                t.StringType(),
                nullable=False,
                metadata={"comment": "Identifying string unique to the device/cell/pack across the organization"},
            ),
            t.StructField(
                "device_name",
                t.StringType(),
                nullable=False,
                metadata={"comment": "Descriptive name of the test device"},
            ),
            t.StructField(
                "nominal_capacity",
                t.FloatType(),
                nullable=True,
                metadata={
                    "comment": "The nominal capacity of the test device in Ampere Hours",
                    # `to_dict` is an instance method: it must be called on a
                    # member, not on the SIUnit class itself.
                    "unit": SIUnit.AMPERE_HOUR.to_dict(),
                },
            ),
            t.StructField(
                "calibration_date",
                t.DateType(),
                nullable=True,
                metadata={"comment": "The date the test device was calibrated"},
            ),
        ]),
        nullable=False,
        metadata={"comment": "Info about the device/cell/pack that was tested"},
    ),
    t.StructField(
        "procedure",
        t.StructType([
            t.StructField(
                "procedure_id",
                t.StringType(),
                nullable=False,
                metadata={"comment": "uuid identifier for the test procedure"},
            ),
            t.StructField(
                "procedure_name",
                t.StringType(),
                nullable=False,
                metadata={"comment": "Descriptive name of the test procedure"},
            ),
        ]),
        nullable=False,
        metadata={"comment": "Info about the test procedure"},
    ),
    t.StructField(
        "project",
        t.StringType(),
        nullable=True,
        metadata={"comment": "The name of the project the device belongs to"},
    ),
    t.StructField(
        "test_data",
        t.StructType([
            t.StructField(
                "start_datetime",
                t.TimestampType(),
                nullable=False,
                metadata={"comment": "The absolute datetime the test started. ISO 8601 format. Must include timezone. If timezone is not specified, UTC is assumed"},
            ),
            # The array must be wrapped in its own StructField: a StructType's
            # field list may only contain StructField objects.
            # NOTE(review): nullable=False mirrors the other mandatory
            # containers in this schema; relax it if tests without any
            # measurements must be storable.
            t.StructField(
                "measurements",
                t.ArrayType(
                    t.StructType([
                        t.StructField(
                            "current",
                            t.FloatType(),
                            nullable=False,
                            metadata={
                                "comment": "The current in Ampere. The sign convention is positive for charge current and negative for discharge current.",
                                "unit": SIUnit.AMPERE.to_dict(),
                            },
                        ),
                        t.StructField(
                            "voltage",
                            t.FloatType(),
                            nullable=False,
                            metadata={
                                "comment": "The voltage in Volts",
                                "unit": SIUnit.VOLT.to_dict(),
                            },
                        ),
                        # NOTE(review): a 32-bit integer of milliseconds
                        # overflows after roughly 24.8 days; consider
                        # LongType if tests can run longer than that.
                        t.StructField(
                            "elapsed_time",
                            t.IntegerType(),
                            nullable=False,
                            metadata={"comment": "Elapsed time in milliseconds since `start_datetime`"},
                        ),
                        # Long comments are built from adjacent string
                        # literals so that no newlines or source indentation
                        # end up inside the stored column comment.
                        t.StructField(
                            "measurement_datetime",
                            t.TimestampType(),
                            nullable=False,
                            metadata={"comment": (
                                "The absolute datetime of the measurement for the individual data point. "
                                "ISO 8601 format. Must include timezone. If timezone is not specified, UTC is assumed"
                            )},
                        ),
                        t.StructField(
                            "datapoint_number",
                            t.IntegerType(),
                            nullable=True,
                            metadata={"comment": "The index number of the data point within the test. Starting at 1."},
                        ),
                        t.StructField(
                            "cycle_number",
                            t.LongType(),
                            nullable=True,
                            metadata={"comment": (
                                "The index number of the cycle within the test. Starting at 1. "
                                "Cycles are monotonically increasing and gapless (doesn’t skip any numbers). "
                                "Special Note: if a cycle column is not observed, the default algorithm will look for "
                                "the first charge datapoint after any discharge datapoint as the boundary for a new cycle"
                            )},
                        ),
                        t.StructField(
                            "step_index",
                            t.LongType(),
                            nullable=True,  # StructField's default, spelled out for consistency
                            metadata={"comment": "Program step number associated with each control step."},
                        ),
                        t.StructField(
                            "step_time",
                            t.FloatType(),
                            nullable=True,  # StructField's default, spelled out for consistency
                            metadata={
                                "comment": "Elapsed time since the start of the current step.",
                                # Written as a literal (same keys as
                                # `SIUnit.to_dict()`) so this cell does not
                                # require a MILLISECOND member on SIUnit.
                                "unit": {
                                    "sign": "ms",
                                    "name": "millisecond",
                                    "dimension": "time",
                                },
                            },
                        ),
                    ])
                ),
                nullable=False,
                metadata={"comment": "Time series of measurements recorded during the test"},
            ),
        ]),
        nullable=False,
        metadata={"comment": "Test data"},
    ),
])