In [1]:
// Streaming DataFrame fed by the "socket" source listening on 127.0.0.1:9876.
// The resulting schema has a single string column named `value`.
val source = {
    val socketReader = spark.readStream.format("socket")
    socketReader
        .option("host", "127.0.0.1")
        .option("port", 9876)
        .load()
}

source = [value: string]


[value: string]

In [2]:
// See how to transform a column in a DataFrame:
// https://sparkbyexamples.com/spark/spark-dataframe-withcolumn/

import org.apache.spark.sql.functions.split

// Parse each comma-separated input line into named string columns.
// Field positions: 0 = date, 1 = gender, 2 = icd10, 3 = patient_type, 4 = subdistrict.
val visits = {
    // Keep the split result as a local Column expression instead of adding a
    // temporary column: the select below only emits the five named fields, so
    // no helper column has to be created and dropped afterwards.
    val fields = split($"value", "\\,")
    source.select(
        fields.getItem(0).as("date"),
        fields.getItem(1).as("gender"),
        fields.getItem(2).as("icd10"),
        fields.getItem(3).as("patient_type"),
        fields.getItem(4).as("subdistrict")
    )
}



visits = [date: string, gender: string ... 3 more fields]


[date: string, gender: string ... 3 more fields]

In [None]:

import org.apache.spark.sql.streaming.Trigger
// Debug sink: print the parsed `visits` rows to the console, appending newly
// arrived rows on a 5-second processing-time trigger.
val query = {
    val everyFiveSeconds = Trigger.ProcessingTime("5 seconds")
    visits.writeStream
        .format("console")
        .outputMode("append")
        .trigger(everyFiveSeconds)
        .start()
}




In [3]:
import org.apache.spark.sql.functions._ // for `when`and `udf`

// UDF that flags an ICD-10 code as ILI: returns 1 for "A01.1" or "C01.2",
// and 0 for anything else (a null code also falls through to 0).
val ili = udf((code: String) => code match {
    case "A01.1" | "C01.2" => 1
    case _                 => 0
})

// Turn each visit into a row of 0/1 indicator columns so they can be summed later.
//   flu       -> icd10 is "C01.2"
//   SARI      -> icd10 is "B01.3"
//   pneumonia -> icd10 is "A01.1"
//   ILI       -> icd10 is "A01.1" or "C01.2"
//   IPD / OPD -> patient_type is "IPD" / "OPD"
val cases = visits
    .withColumn("flu", when($"icd10" === "C01.2", 1).otherwise(0))
    .withColumn("SARI", when($"icd10" === "B01.3", 1).otherwise(0))
    .withColumn("pneumonia", when($"icd10" === "A01.1", 1).otherwise(0))
    // Native Column expression rather than the `ili` UDF: same 0/1 result
    // (a null code yields 0 in both), but it stays visible to the query optimizer
    // and is consistent with the other indicator columns.
    .withColumn("ILI", when($"icd10".isin("A01.1", "C01.2"), 1).otherwise(0))
    .withColumn("IPD", when($"patient_type" === "IPD", 1).otherwise(0))
    .withColumn("OPD", when($"patient_type" === "OPD", 1).otherwise(0))
    // Keep only the grouping keys and the indicator columns.
    .select($"date", $"subdistrict", $"flu", $"SARI", $"pneumonia", $"ILI", $"IPD", $"OPD")



ili = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))
cases = [date: string, subdistrict: string ... 6 more fields]


[date: string, subdistrict: string ... 6 more fields]

In [None]:

import org.apache.spark.sql.streaming.Trigger
// Debug sink: print the per-visit indicator rows in `cases` to the console,
// appending newly arrived rows on a 5-second processing-time trigger.
val query = {
    val everyFiveSeconds = Trigger.ProcessingTime("5 seconds")
    cases.writeStream
        .format("console")
        .outputMode("append")
        .trigger(everyFiveSeconds)
        .start()
}



In [4]:
// Per-subdistrict totals of every indicator column, sorted by subdistrict ascending.
val summary = {
    // Each indicator is summed and keeps its own name; the order here is the output column order.
    val totals = Seq("flu", "ILI", "SARI", "pneumonia", "IPD", "OPD")
        .map(name => sum(col(name)).as(name))
    cases
        .groupBy(col("subdistrict"))
        .agg(totals.head, totals.tail: _*)
        .orderBy(col("subdistrict").asc)
}

summary = [subdistrict: string, flu: bigint ... 5 more fields]


[subdistrict: string, flu: bigint ... 5 more fields]

In [5]:

import org.apache.spark.sql.streaming.Trigger
// Print the aggregated `summary` table to the console on a 5-second
// processing-time trigger. "complete" output mode writes the full result table each time.
val query = {
    val everyFiveSeconds = Trigger.ProcessingTime("5 seconds")
    summary.writeStream
        .format("console")
        .outputMode("complete")
        .trigger(everyFiveSeconds)
        .start()
}




query = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@533a363f


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@533a363f

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+---+---+----+---------+---+---+
|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+-----------+---+---+----+---------+---+---+
+-----------+---+---+----+---------+---+---+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+---+---+----+---------+---+---+
|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+-----------+---+---+----+---------+---+---+
|      92001|  0|  1|   0|        1|  1|  0|
|      92002|  0|  0|   1|        0|  1|  0|
|      92003|  1|  2|   2|        1|  2|  2|
+-----------+---+---+----+---------+---+---+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------+---+---+----+---------+---+---+
|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+-----------+---+---+----+---------+---+---+
|      92001|  0|  2|   1|        2|  2|  1|
|      92002|  0|  0|   1|      

In [6]:
// Stop every streaming query that is still running before shutting the session
// down, so the queries terminate cleanly instead of failing when the session disappears.
spark.streams.active.foreach(_.stop())
spark.stop()