In [1]:
// Open a streaming read from the TCP socket at 127.0.0.1:9876.
// The resulting frame exposes a single string column named `value`.
val source = spark.readStream
    .format("socket")
    .options(Map("host" -> "127.0.0.1", "port" -> "9876"))
    .load()

source = [value: string]


[value: string]

In [2]:
// read more at this url
// https://docs.databricks.com/spark/latest/dataframes-datasets/complex-nested-data.html#complex-nested-data-notebook

import org.apache.spark.sql.types._                         // include the Spark Types to define our schema
import org.apache.spark.sql.functions._                     // include the Spark helper functions

// Schema used to parse the JSON visit records. Every field is nullable,
// which is the default for StructField.
val jsonSchema = StructType(Seq(
    StructField("date", TimestampType),
    StructField("gender", StringType),
    StructField("icd10", StringType),
    StructField("patient_type", StringType),
    StructField("subdistrict", StringType)
))

jsonSchema = StructType(StructField(date,TimestampType,true), StructField(gender,StringType,true), StructField(icd10,StringType,true), StructField(patient_type,StringType,true), StructField(subdistrict,StringType,true))


StructType(StructField(date,TimestampType,true), StructField(gender,StringType,true), StructField(icd10,StringType,true), StructField(patient_type,StringType,true), StructField(subdistrict,StringType,true))

In [3]:
// Parse the raw JSON text in `value` using `jsonSchema`, then expand the
// parsed struct so that its fields become top-level columns.
val visits = source
    .select(from_json(col("value"), jsonSchema).alias("data"))
    .select(col("data.*"))

visits = [date: timestamp, gender: string ... 3 more fields]


[date: timestamp, gender: string ... 3 more fields]

In [33]:
import org.apache.spark.sql.functions._ // for `when` and `udf`

// UDF used for the `ILI` flag (presumably "influenza-like illness" -- confirm):
// returns 1 when the ICD-10 code is "A01.1" or "C01.2", and 0 for any other
// value, including null.
val ili = udf((code: String) => code match {
    case "A01.1" | "C01.2" => 1
    case _ => 0
})

// Turn every visit into a row of 0/1 indicator columns, keeping only the
// event time (`date`) and `subdistrict` alongside them. Each indicator is 1
// when the visit matches the listed code(s) and 0 otherwise; a null source
// value also yields 0.
//
// All flags are built from Spark's built-in column functions rather than a
// Scala UDF: a UDF is opaque to the query optimizer, while `isin` expresses
// the same membership test natively. The whole result is produced by a single
// projection instead of a chain of `withColumn` calls followed by a `select`.
val cases = {
    // 1 when `column` equals one of `accepted`, otherwise 0.
    def flag(column: String, accepted: String*) =
        when(col(column).isin(accepted: _*), 1).otherwise(0)

    visits.select(
        col("date"),
        col("subdistrict"),
        flag("icd10", "C01.2").as("flu"),
        flag("icd10", "B01.3").as("SARI"),
        flag("icd10", "A01.1").as("pneumonia"),
        // ILI covers the same two codes that the flu and pneumonia flags use.
        flag("icd10", "A01.1", "C01.2").as("ILI"),
        flag("patient_type", "IPD").as("IPD"),
        flag("patient_type", "OPD").as("OPD")
    )
}


// Per-subdistrict totals of every indicator column for each one-minute,
// non-overlapping window of the event time `date`. The window struct is
// itself exposed under the name `date`. A 5-second watermark on `date`
// bounds how late a record may arrive and still be counted.
val summary = {
    val minuteWindow = window(col("date"), "1 minutes").as("date")

    // One sum per indicator, each keeping the name of the column it totals.
    val totals = Seq("flu", "ILI", "SARI", "pneumonia", "IPD", "OPD")
        .map(name => sum(col(name)).as(name))

    cases
        .withWatermark("date", "5 seconds")
        .groupBy(minuteWindow, col("subdistrict"))
        .agg(totals.head, totals.tail: _*)
}

ili = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))
cases = [date: timestamp, subdistrict: string ... 6 more fields]
summary = [date: struct<start: timestamp, end: timestamp>, subdistrict: string ... 6 more fields]


[date: struct<start: timestamp, end: timestamp>, subdistrict: string ... 6 more fields]

In [34]:

import org.apache.spark.sql.streaming.Trigger
// Start the streaming query: every 5 seconds of processing time, print the
// summary rows that changed since the previous trigger ("update" output
// mode) to the console.
val query = {
    val everyFiveSeconds = Trigger.ProcessingTime("5 seconds")

    summary.writeStream
        .format("console")
        .outputMode(org.apache.spark.sql.streaming.OutputMode.Update())
        .trigger(everyFiveSeconds)
        .start()
}

query = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@46ad23be


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@46ad23be

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----------+---+---+----+---------+---+---+
|date|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+----+-----------+---+---+----+---------+---+---+
+----+-----------+---+---+----+---------+---+---+

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+-----------+---+---+----+---------+---+---+
|                date|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+--------------------+-----------+---+---+----+---------+---+---+
|[2020-03-08 14:22...|      92001|  0|  1|   0|        1|  0|  1|
|[2020-03-08 14:22...|      92002|  1|  2|   0|        1|  1|  1|
|[2020-03-08 14:22...|      92003|  0|  0|   1|        0|  1|  0|
+--------------------+-----------+---+---+----+---------+---+---+

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+-----------+---+---+---

In [35]:
// Stop the running streaming query so that it no longer prints batches to the console.
query.stop()