In [None]:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
// Common prefix for every input, output and checkpoint path used below.
val base_path = "..."

// Reuse the already-running SparkSession if there is one; otherwise create
// a new session whose application name is "pm".
val spark = SparkSession.builder.appName("pm").getOrCreate()

// Columns expected in the error input: deviceID, timestamp and error.
// All three are declared as nullable strings.
val error_schema = StructType(Seq(
  StructField("deviceID", StringType),
  StructField("timestamp", StringType),
  StructField("error", StringType)
))

// Columns expected in the warning input: deviceID, timestamp and warning.
// All three are declared as nullable strings.
val warning_schema = StructType(Seq(
  StructField("deviceID", StringType),
  StructField("timestamp", StringType),
  StructField("warning", StringType)
))

/**
 * Maps a timestamp to the 1-based index of the fixed-width slot it falls in,
 * counting slots of `interval` units from `startTs`.
 *
 * `startTs` and `interval` are expressed in the same unit as `ts`. Read as
 * Unix seconds they are 2010-01-01T00:00:00Z and 15 minutes respectively, so
 * callers are expected to pass `ts` in seconds.
 *
 * The arithmetic is done entirely in `Long`. An earlier version wrapped the
 * result in `Math.round`, which for a `Long` argument resolves to
 * `Math.round(float)` and therefore squeezed the slot number through a 24-bit
 * float mantissa, silently corrupting values above 2^24.
 *
 * @param ts timestamp to classify
 * @return the slot index as an `Int`, saturated to the `Int` range
 */
val get_tsID = (ts: Long) => {
    val startTs = 1262304000L
    val interval = 900L
    // Long division truncates toward zero, so the quotient is already integral.
    val slot = (ts - startTs) / interval + 1L
    // Clamp instead of wrapping so out-of-range inputs still saturate.
    math.max(Int.MinValue.toLong, math.min(Int.MaxValue.toLong, slot)).toInt
}

// Wrap get_tsID as a Spark SQL user-defined function so it can be applied to DataFrame columns.
val get_tsID_UDF = udf(get_tsID)

// Streaming source of error events read from parquet.
//
// The raw "timestamp" column is kept as "unix_ts" and treated as epoch
// milliseconds: it is divided by 1000 before being turned into a proper
// timestamp column. get_tsID works with an epoch and interval expressed in
// seconds, so the same millisecond -> second conversion is applied before
// computing "tsID"; passing the raw millisecond value would make each slot
// 0.9 seconds wide instead of 15 minutes.
// NOTE(review): this assumes unix_ts really is in milliseconds, as the
// existing "/ 1000" implies - confirm against the input data.
val errors_df = spark
    .readStream
    .schema(error_schema)
    .parquet(base_path + "errors.parquet")
    .withColumnRenamed("timestamp", "unix_ts")
    .withColumn("timestamp", from_unixtime($"unix_ts" / 1000))
    .withColumn("timestamp", $"timestamp".cast("timestamp"))
    .withColumn("tsID", get_tsID_UDF(($"unix_ts" / 1000).cast("long")))


// Streaming source of warning events; prepared exactly like the error stream
// so that both sides carry comparable "timestamp" and "tsID" columns.
val warnings_df = spark
    .readStream
    .schema(warning_schema)
    .parquet(base_path + "warnings.parquet")
    .withColumnRenamed("timestamp", "unix_ts")
    .withColumn("timestamp", from_unixtime($"unix_ts" / 1000))
    .withColumn("timestamp", $"timestamp".cast("timestamp"))
    .withColumn("tsID", get_tsID_UDF(($"unix_ts" / 1000).cast("long")))



In [None]:
// Declare how late events may arrive on each stream (15 minutes of event time).
val errors_wtk = errors_df.withWatermark("timestamp", "15 minutes")
val warnings_wtk = warnings_df.withWatermark("timestamp", "15 minutes")

// Inner stream-stream join: pair every error with the warnings of the same
// device that share its tsID slot.
//
// The extra event-time range condition is what lets Spark use the watermarks
// to discard old join state; with only the equality predicates the buffered
// rows of both streams would be kept forever. tsID slots are meant to be at
// most 15 minutes wide, so two rows with the same tsID are always within
// 15 minutes of each other and the bound does not remove any match.
// NOTE(review): widen the interval if the tsID slot width is ever increased.
val errors_warnings = errors_wtk.as("err").join(
  warnings_wtk.as("war"),
  expr("""
    err.deviceID = war.deviceID AND
    err.tsID = war.tsID AND
    war.`timestamp` >= err.`timestamp` - interval 15 minutes AND
    war.`timestamp` <= err.`timestamp` + interval 15 minutes
    """)
)
.select("err.*", "war.warning")

// Append joined rows to a parquet sink and block until the query stops.
errors_warnings
  .writeStream
  .outputMode("append")
  .format("parquet")
  .option("checkpointLocation", base_path + "checkpoint/ew/")
  .option("path", base_path + "ew.parquet")
  .start()
  .awaitTermination()