List state ttl #2

Closed · wants to merge 16 commits
11 changes: 11 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
@@ -3530,6 +3530,12 @@
],
"sqlState" : "0A000"
},
"STATEFUL_PROCESSOR_CANNOT_ASSIGN_TTL_IN_NO_TTL_MODE" : {
"message" : [
"State store operation=<operationType> on state=<stateName> does not support TTL in NoTTL() mode."
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_CANNOT_PERFORM_OPERATION_WITH_INVALID_HANDLE_STATE" : {
"message" : [
"Failed to perform stateful processor operation=<operationType> with invalid handle state=<handleState>."
@@ -4336,6 +4342,11 @@
"Removing column families with <stateStoreProvider> is not supported."
]
},
"STATE_STORE_TTL" : {
"message" : [
"State TTL with <stateStoreProvider> is not supported. Please use RocksDBStateStoreProvider."
]
},
"TABLE_OPERATION" : {
"message" : [
"Table <tableName> does not support <operation>. Please check the current catalog and namespace to make sure the qualified table name is expected, and also check the catalog implementation which is configured by \"spark.sql.catalog\"."
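The new `STATE_STORE_TTL` condition means state TTL is only supported with the RocksDB provider. A minimal sketch of the opt-in, assuming an active `SparkSession` named `spark` (the config key is Spark's standard state-store provider setting):

```scala
// State TTL requires RocksDBStateStoreProvider per the error class above;
// `spark` is assumed to be an active SparkSession.
spark.conf.set(
  "spark.sql.streaming.stateStore.providerClass",
  "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider")
```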
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.ProductEncoder
import org.apache.spark.sql.connect.common.UdfUtils
import org.apache.spark.sql.expressions.ScalarUserDefinedFunction
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StatefulProcessor, TimeoutMode}
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StatefulProcessor, TimeoutMode, TTLMode}

/**
* A [[Dataset]] has been logically grouped by a user specified grouping key. Users should not
@@ -830,12 +830,15 @@ class KeyValueGroupedDataset[K, V] private[sql] () extends Serializable {
* Instance of statefulProcessor whose functions will be invoked by the operator.
* @param timeoutMode
* The timeout mode of the stateful processor.
* @param ttlMode
* The TTL mode used to evict user state on TTL expiration.
* @param outputMode
* The output mode of the stateful processor. Defaults to APPEND mode.
*/
def transformWithState[U: Encoder](
statefulProcessor: StatefulProcessor[K, V, U],
timeoutMode: TimeoutMode,
ttlMode: TTLMode,
outputMode: OutputMode = OutputMode.Append()): Dataset[U] = {
throw new UnsupportedOperationException
}
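For context, a hedged sketch of the new call shape: `events` (a `Dataset[Event]`) and `CountProcessor` (a `StatefulProcessor[String, Event, Long]`) are hypothetical, and `TimeoutMode.NoTimeouts()` is assumed to exist alongside the TTL modes; only the argument order follows the signature above.

```scala
import org.apache.spark.sql.streaming.{OutputMode, TimeoutMode, TTLMode}

// Hypothetical: events is a Dataset[Event]; CountProcessor extends
// StatefulProcessor[String, Event, Long] and is defined elsewhere.
val counts = events
  .groupByKey(_.userId)
  .transformWithState(
    new CountProcessor(),
    TimeoutMode.NoTimeouts(),
    TTLMode.ProcessingTimeTTL(), // evict user state by processing time
    OutputMode.Update())
```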
2 changes: 2 additions & 0 deletions dev/checkstyle-suppressions.xml
@@ -60,6 +60,8 @@
files="sql/api/src/main/java/org/apache/spark/sql/streaming/TimeoutMode.java"/>
<suppress checks="MethodName"
files="sql/api/src/main/java/org/apache/spark/sql/streaming/Trigger.java"/>
<suppress checks="MethodName"
files="sql/api/src/main/java/org/apache/spark/sql/streaming/TTLMode.java"/>
<suppress checks="LineLength"
files="src/main/java/org/apache/spark/sql/api/java/*"/>
<suppress checks="IllegalImport"
4 changes: 4 additions & 0 deletions docs/sql-error-conditions-unsupported-feature-error-class.md
@@ -202,6 +202,10 @@ Creating multiple column families with `<stateStoreProvider>` is not supported.

Removing column families with `<stateStoreProvider>` is not supported.

## STATE_STORE_TTL

State TTL with `<stateStoreProvider>` is not supported. Please use RocksDBStateStoreProvider.

## TABLE_OPERATION

Table `<tableName>` does not support `<operation>`. Please check the current catalog and namespace to make sure the qualified table name is expected, and also check the catalog implementation which is configured by "spark.sql.catalog".
6 changes: 6 additions & 0 deletions docs/sql-error-conditions.md
@@ -2162,6 +2162,12 @@ The SQL config `<sqlConf>` cannot be found. Please verify that the config exists

Star (*) is not allowed in a select list when GROUP BY an ordinal position is used.

### STATEFUL_PROCESSOR_CANNOT_ASSIGN_TTL_IN_NO_TTL_MODE

[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)

State store operation=`<operationType>` on state=`<stateName>` does not support TTL in NoTTL() mode.

### STATEFUL_PROCESSOR_CANNOT_PERFORM_OPERATION_WITH_INVALID_HANDLE_STATE

[SQLSTATE: 42802](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
49 changes: 49 additions & 0 deletions sql/api/src/main/java/org/apache/spark/sql/streaming/TTLMode.java
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.streaming;

import org.apache.spark.annotation.Evolving;
import org.apache.spark.annotation.Experimental;
import org.apache.spark.sql.catalyst.plans.logical.*;

/**
 * Represents the TTL modes possible for the Dataset operation
 * {@code transformWithState}.
 */
@Experimental
@Evolving
public class TTLMode {

  /**
   * Specifies that there is no TTL for the user state. User state would not
   * be cleaned up by Spark automatically.
   */
  public static final TTLMode NoTTL() {
    return NoTTL$.MODULE$;
  }

  /**
   * Specifies that all TTL durations for user state are in processing time.
   */
  public static final TTLMode ProcessingTimeTTL() {
    return ProcessingTimeTTL$.MODULE$;
  }

  /**
   * Specifies that all TTL durations for user state are in event time.
   */
  public static final TTLMode EventTimeTTL() {
    return EventTimeTTL$.MODULE$;
  }
}
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.streaming.TTLMode

/** TTL types used in the transformWithState operator */
case object NoTTL extends TTLMode

case object ProcessingTimeTTL extends TTLMode

case object EventTimeTTL extends TTLMode
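These case objects are what the Java-side static factories return (for example `NoTTL$.MODULE$`), mirroring the existing `TimeoutMode` bridge. A hedged sketch of how planner code might branch on them (the helper is hypothetical):

```scala
import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTTL, NoTTL, ProcessingTimeTTL}
import org.apache.spark.sql.streaming.TTLMode

// Hypothetical helper: branch on the three TTL modes defined above.
def ttlEnabled(mode: TTLMode): Boolean = mode match {
  case NoTTL => false
  case ProcessingTimeTTL | EventTimeTTL => true
}
```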
@@ -16,6 +16,8 @@
*/
package org.apache.spark.sql.streaming

import java.time.Duration

import org.apache.spark.annotation.{Evolving, Experimental}

@Experimental
@@ -33,13 +35,13 @@ private[sql] trait ListState[S] extends Serializable {
def get(): Iterator[S]

/** Update the value of the list. */
def put(newState: Array[S]): Unit
def put(newState: Array[S], ttlDuration: Duration = Duration.ZERO): Unit

/** Append an entry to the list */
def appendValue(newState: S): Unit
def appendValue(newState: S, ttlDuration: Duration = Duration.ZERO): Unit

/** Append an entire list to the existing value */
def appendList(newState: Array[S]): Unit
def appendList(newState: Array[S], ttlDuration: Duration = Duration.ZERO): Unit

/** Removes this state for the given grouping key. */
def clear(): Unit
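A hedged sketch of a processor body using the new parameter; the helper is hypothetical, and `Duration.ZERO` (the default) keeps the existing no-TTL behavior:

```scala
import java.time.Duration

import org.apache.spark.sql.streaming.ListState

// Hypothetical helper: entries appended here expire 30 minutes past the
// batch processing time or watermark, depending on the query's TTLMode.
def recordClicks(state: ListState[String], clicks: Array[String]): Unit = {
  state.appendList(clicks, Duration.ofMinutes(30))
}
```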
@@ -30,10 +30,11 @@ import org.apache.spark.sql.Encoder
private[sql] trait StatefulProcessorHandle extends Serializable {

/**
* Function to create new or return existing single value state variable of given type
* Function to create new or return existing single value state variable of given type.
* The user must ensure to call this function only within the `init()` method of the
* StatefulProcessor.
* @param stateName - name of the state variable
*
* @param stateName - name of the state variable
* @param valEncoder - SQL encoder for state variable
* @tparam T - type of state variable
* @return - instance of ValueState of type T that can be used to store state persistently
@@ -18,6 +18,7 @@
package org.apache.spark.sql.streaming

import java.io.Serializable
import java.time.Duration

import org.apache.spark.annotation.{Evolving, Experimental}

@@ -42,8 +43,13 @@ private[sql] trait ValueState[S] extends Serializable {
/** Get the state if it exists as an option and None otherwise */
def getOption(): Option[S]

/** Update the value of the state. */
def update(newState: S): Unit
/**
* Update the value of the state.
*
* @param newState the new value
* @param ttlDuration the duration after which the state expires, measured from the
* current batch processing time (in processing-time TTL mode) or the current
* watermark (in event-time TTL mode)
*/
def update(newState: S, ttlDuration: Duration = Duration.ZERO): Unit

/** Remove this state. */
def clear(): Unit
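A hedged sketch of the same idea for single values, using only the methods shown above (the helper itself is hypothetical):

```scala
import java.time.Duration

import org.apache.spark.sql.streaming.ValueState

// Hypothetical helper: the updated count expires one hour past the batch
// processing time or watermark, per the query's TTLMode.
def bumpCount(state: ValueState[Long]): Long = {
  val next = state.getOption().getOrElse(0L) + 1
  state.update(next, Duration.ofHours(1))
  next
}
```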
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StatefulProcessor, TimeoutMode}
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, StatefulProcessor, TimeoutMode, TTLMode}
import org.apache.spark.sql.types._

object CatalystSerde {
@@ -574,6 +574,7 @@ object TransformWithState {
groupingAttributes: Seq[Attribute],
dataAttributes: Seq[Attribute],
statefulProcessor: StatefulProcessor[K, V, U],
ttlMode: TTLMode,
timeoutMode: TimeoutMode,
outputMode: OutputMode,
child: LogicalPlan): LogicalPlan = {
@@ -584,6 +585,7 @@
groupingAttributes,
dataAttributes,
statefulProcessor.asInstanceOf[StatefulProcessor[Any, Any, Any]],
ttlMode,
timeoutMode,
outputMode,
keyEncoder.asInstanceOf[ExpressionEncoder[Any]],
@@ -600,6 +602,7 @@ case class TransformWithState(
groupingAttributes: Seq[Attribute],
dataAttributes: Seq[Attribute],
statefulProcessor: StatefulProcessor[Any, Any, Any],
ttlMode: TTLMode,
timeoutMode: TimeoutMode,
outputMode: OutputMode,
keyEncoder: ExpressionEncoder[Any],
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.expressions.ReduceAggregator
import org.apache.spark.sql.internal.TypedAggUtils
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StatefulProcessor, TimeoutMode}
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StatefulProcessor, TimeoutMode, TTLMode}

/**
* A [[Dataset]] has been logically grouped by a user specified grouping key. Users should not
@@ -656,19 +656,22 @@ class KeyValueGroupedDataset[K, V] private[sql](
* @param statefulProcessor Instance of statefulProcessor whose functions will be invoked by the
* operator.
* @param timeoutMode The timeout mode of the stateful processor.
* @param ttlMode The TTL mode used to evict user state on TTL expiration
* @param outputMode The output mode of the stateful processor. Defaults to APPEND mode.
*
*/
private[sql] def transformWithState[U: Encoder](
statefulProcessor: StatefulProcessor[K, V, U],
timeoutMode: TimeoutMode,
ttlMode: TTLMode,
outputMode: OutputMode = OutputMode.Append()): Dataset[U] = {
Dataset[U](
sparkSession,
TransformWithState[K, V, U](
groupingAttributes,
dataAttributes,
statefulProcessor,
ttlMode,
timeoutMode,
outputMode,
child = logicalPlan
@@ -751,14 +751,15 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
case TransformWithState(
keyDeserializer, valueDeserializer, groupingAttributes,
dataAttributes, statefulProcessor, timeoutMode, outputMode,
dataAttributes, statefulProcessor, ttlMode, timeoutMode, outputMode,
keyEncoder, outputAttr, child) =>
val execPlan = TransformWithStateExec(
keyDeserializer,
valueDeserializer,
groupingAttributes,
dataAttributes,
statefulProcessor,
ttlMode,
timeoutMode,
outputMode,
keyEncoder,
@@ -917,10 +918,10 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
hasInitialState, planLater(initialState), planLater(child)
) :: Nil
case logical.TransformWithState(keyDeserializer, valueDeserializer, groupingAttributes,
dataAttributes, statefulProcessor, timeoutMode, outputMode, keyEncoder,
dataAttributes, statefulProcessor, ttlMode, timeoutMode, outputMode, keyEncoder,
outputObjAttr, child) =>
TransformWithStateExec.generateSparkPlanForBatchQueries(keyDeserializer, valueDeserializer,
groupingAttributes, dataAttributes, statefulProcessor, timeoutMode, outputMode,
groupingAttributes, dataAttributes, statefulProcessor, ttlMode, timeoutMode, outputMode,
keyEncoder, outputObjAttr, planLater(child)) :: Nil

case _: FlatMapGroupsInPandasWithState =>