In [1]:
val visits = spark.read.json("visit.json")

rowVisits = [date: string, gender: string ... 3 more fields]


[date: string, gender: string ... 3 more fields]

In [2]:
visits.printSchema()

root
 |-- date: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- icd10: string (nullable = true)
 |-- patient_type: string (nullable = true)
 |-- subdistrict: string (nullable = true)



In [4]:
visits.count

19

In [5]:
visits.show()

+-------------------+------+-----+------------+-----------+
|               date|gender|icd10|patient_type|subdistrict|
+-------------------+------+-----+------------+-----------+
|2020-03-12T00:00:00|     F|C01.2|         OPD|      92001|
|2020-03-11T00:00:00|     M|C01.2|         IPD|      92003|
|2020-03-03T00:00:00|     M|C01.2|         IPD|      92002|
|2020-03-14T00:00:00|     F|A01.1|         IPD|      92001|
|2020-03-01T00:00:00|     M|B01.3|         OPD|      92001|
|2020-03-02T00:00:00|     M|C01.2|         OPD|      92001|
|2020-03-14T00:00:00|     M|C01.2|         IPD|      92003|
|2020-03-16T00:00:00|     F|B01.3|         IPD|      92001|
|2020-03-11T00:00:00|     F|C01.2|         OPD|      92002|
|2020-03-12T00:00:00|     F|B01.3|         IPD|      92001|
|2020-03-04T00:00:00|     M|A01.1|         OPD|      92003|
|2020-03-14T00:00:00|     F|C01.2|         OPD|      92002|
|2020-03-02T00:00:00|     F|C01.2|         IPD|      92003|
|2020-03-03T00:00:00|     F|C01.2|      

In [6]:
visits.select($"date", $"subdistrict").show()

+-------------------+-----------+
|               date|subdistrict|
+-------------------+-----------+
|2020-03-12T00:00:00|      92001|
|2020-03-11T00:00:00|      92003|
|2020-03-03T00:00:00|      92002|
|2020-03-14T00:00:00|      92001|
|2020-03-01T00:00:00|      92001|
|2020-03-02T00:00:00|      92001|
|2020-03-14T00:00:00|      92003|
|2020-03-16T00:00:00|      92001|
|2020-03-11T00:00:00|      92002|
|2020-03-12T00:00:00|      92001|
|2020-03-04T00:00:00|      92003|
|2020-03-14T00:00:00|      92002|
|2020-03-02T00:00:00|      92003|
|2020-03-03T00:00:00|      92003|
|2020-03-16T00:00:00|      92001|
|2020-03-15T00:00:00|      92001|
|2020-03-16T00:00:00|      92003|
|2020-03-13T00:00:00|      92002|
|2020-03-05T00:00:00|      92001|
+-------------------+-----------+



In [7]:

import org.apache.spark.sql.functions._ // for `when`and `udf`

val ili = udf((s:String) => {
    if (s == "A01.1" || s == "C01.2") 1 else 0
})

val cases = visits.withColumn("flu", when($"icd10" === "C01.2", 1).otherwise(0))
    .withColumn("SARI", when($"icd10" === "B01.3", 1).otherwise(0))
    .withColumn("pneumonia", when($"icd10" === "A01.1", 1).otherwise(0))
    .withColumn("ILI", ili($"icd10"))
    .withColumn("IPD", when($"patient_type" === "IPD", 1).otherwise(0))
    .withColumn("OPD", when($"patient_type" === "OPD", 1).otherwise(0))
    .select($"date", $"subdistrict", $"flu", $"SARI", $"pneumonia", $"ILI", $"IPD", $"OPD")

cases.show()
    

+-------------------+-----------+---+----+---------+---+---+---+
|               date|subdistrict|flu|SARI|pneumonia|ILI|IPD|OPD|
+-------------------+-----------+---+----+---------+---+---+---+
|2020-03-12T00:00:00|      92001|  1|   0|        0|  1|  0|  1|
|2020-03-11T00:00:00|      92003|  1|   0|        0|  1|  1|  0|
|2020-03-03T00:00:00|      92002|  1|   0|        0|  1|  1|  0|
|2020-03-14T00:00:00|      92001|  0|   0|        1|  1|  1|  0|
|2020-03-01T00:00:00|      92001|  0|   1|        0|  0|  0|  1|
|2020-03-02T00:00:00|      92001|  1|   0|        0|  1|  0|  1|
|2020-03-14T00:00:00|      92003|  1|   0|        0|  1|  1|  0|
|2020-03-16T00:00:00|      92001|  0|   1|        0|  0|  1|  0|
|2020-03-11T00:00:00|      92002|  1|   0|        0|  1|  0|  1|
|2020-03-12T00:00:00|      92001|  0|   1|        0|  0|  1|  0|
|2020-03-04T00:00:00|      92003|  0|   0|        1|  1|  0|  1|
|2020-03-14T00:00:00|      92002|  1|   0|        0|  1|  0|  1|
|2020-03-02T00:00:00|    

ili = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))
cases = [date: string, subdistrict: string ... 6 more fields]


[date: string, subdistrict: string ... 6 more fields]

In [8]:
cases.groupBy($"date", $"subdistrict")
    .agg(sum($"flu") as "flu"
         ,sum($"ILI") as "ILI"
         ,sum($"SARI") as "SARI"
         ,sum($"pneumonia") as "pneumonia"
         ,sum($"IPD") as "IPD"
         ,sum($"OPD") as "OPD"
        )
.orderBy(asc("date"))
.show()

+-------------------+-----------+---+---+----+---------+---+---+
|               date|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+-------------------+-----------+---+---+----+---------+---+---+
|2020-03-01T00:00:00|      92001|  0|  0|   1|        0|  0|  1|
|2020-03-02T00:00:00|      92001|  1|  1|   0|        0|  0|  1|
|2020-03-02T00:00:00|      92003|  1|  1|   0|        0|  1|  0|
|2020-03-03T00:00:00|      92003|  1|  1|   0|        0|  0|  1|
|2020-03-03T00:00:00|      92002|  1|  1|   0|        0|  1|  0|
|2020-03-04T00:00:00|      92003|  0|  1|   0|        1|  0|  1|
|2020-03-05T00:00:00|      92001|  0|  1|   0|        1|  1|  0|
|2020-03-11T00:00:00|      92002|  1|  1|   0|        0|  0|  1|
|2020-03-11T00:00:00|      92003|  1|  1|   0|        0|  1|  0|
|2020-03-12T00:00:00|      92001|  1|  1|   1|        0|  1|  1|
|2020-03-13T00:00:00|      92002|  1|  1|   0|        0|  0|  1|
|2020-03-14T00:00:00|      92003|  1|  1|   0|        0|  1|  0|
|2020-03-14T00:00:00|    

In [9]:
cases.groupBy($"subdistrict")
    .agg(sum($"flu") as "flu"
         ,sum($"ILI") as "ILI"
         ,sum($"SARI") as "SARI"
         ,sum($"pneumonia") as "pneumonia"
         ,sum($"IPD") as "IPD"
         ,sum($"OPD") as "OPD"
        )
.orderBy(asc("subdistrict"))
.show()

+-----------+---+---+----+---------+---+---+
|subdistrict|flu|ILI|SARI|pneumonia|IPD|OPD|
+-----------+---+---+----+---------+---+---+
|      92001|  3|  6|   3|        3|  6|  3|
|      92002|  4|  4|   0|        0|  1|  3|
|      92003|  4|  5|   1|        1|  4|  2|
+-----------+---+---+----+---------+---+---+

